In [1]:
# Libraries
import pandas as pd
from pandas import DataFrame
from nltk import edit_distance
from fuzzywuzzy import fuzz
from datetime import datetime

In [2]:
# Get data
# df = pd.read_csv("cust10k.csv", delimiter="|")
file = "./affiliationstrings/affiliationstrings_ids.csv"
df = pd.read_csv(file, index_col=0)
df.columns=["entityname"]

In [3]:
df.head()

Unnamed: 0_level_0,entityname
id1,Unnamed: 1_level_1
7927,", IBM Almaden Research Center, 650 Harry Road,..."
7930,", IIT Bombay"
7987,", University of California, San Diego, USA"
5613,"28msec Inc., Zurich, Switzerland"
9530,"28msec, Inc."


In [4]:
df.loc[8544]

entityname    Department of Computer Engineering, Kyungpook ...
Name: 8544, dtype: object

In [5]:
df.shape

(2260, 1)

In [6]:
def setup():
    # Add a column to store grouping
    df["GroupA"] = None
    df["GroupB"] = None
    df["GroupC"] = None
    df["GroupA_id1"] = None
    df["GroupB_id1"] = None
    df["GroupC_id1"] = None
    df["GroupA_score"] = None
    df["GroupB_score"] = None
    df["GroupC_score"] = None

In [7]:
setup() # RUN THIS FIRST!!!

In [9]:
groups = ["GroupA", "GroupB", "GroupC"]

for g in groups:
    print(f"Starting: {g}")
    print("=" * 72)
    initial = datetime.now()
    df = df.sample(frac=1)
    # indices of names for looping
    list_idx = [idx for idx in df.index]
    
    # Loop through names
#     num_records = 100 # restrict for debug
#     for idx in list_idx[:num_records]: # restrict for debug
    
    for idx in list_idx:
#         print("-" * 72)
        current_name = df.loc[idx]['entityname']
#         print(f"idx:{idx} name to compare: {current_name}")
        
    
        if df.loc[idx][g] is None:
            idx_to_compare = list_idx[list_idx.index(idx)+1:]
            for i in idx_to_compare:
                if df.loc[i][g] is None:
                    compare_name = df.loc[i]["entityname"]
                    similarity_score = fuzz.ratio(current_name, compare_name)
                    if similarity_score > 80:
    #                     print(f"Similarity score:{similarity_score}")
                        df.loc[df.index == i, g] = current_name
                        df.loc[df.index == i, g+"_score"] = similarity_score
                        df.loc[df.index == i, g+"_id1"] = idx
#         else:
#             print(f"{current_name} already matched. Skipping.")
    
    
    
    final = datetime.now()
    total_time = final - initial
    print("=" * 72)
    print(f"Group {g} total time: {total_time}")

print("Saving groups...")
df.to_csv("edit_dist_dedup_groups.csv")
print("Save complete!")

Starting: GroupA
Group GroupA total time: 0:05:39.694824
Starting: GroupB
Group GroupB total time: 0:05:46.199762
Starting: GroupC
Group GroupC total time: 0:05:40.589986
Saving groups...
Save complete!


In [10]:
group = "GroupA"
print(f"Number of matches: {df[group].value_counts().sum()}")
print(f"Number of groups: {len(df[group].unique())}")
df[group].value_counts()[:10]

Number of matches: 1029
Number of groups: 448


IBM T. J. Watson Research Center, Hawthorne, NY                     22
Department of Computer Science, Fudan University, China             20
Hong Kong University of Science and Technology, Hong Kong, China    15
IBM Almaden Research Center, San Jose, CA                           13
AT&T Labs - Research, Florham Park, NJ, USA                         13
University of California Santa Cruz                                 12
AT & T Labs-Research                                                12
The University of Hong Kong, Hong Kong                              12
University of Illinois at Urbana-Champaign, Urbana, IL, USA          9
University of California, Santa Barbara, Santa Barbara, CA, USA      9
Name: GroupA, dtype: int64

In [11]:
df.loc[df["entityname"].str.contains("AT&T")].head()

Unnamed: 0_level_0,entityname,GroupA,GroupB,GroupC,GroupA_id1,GroupB_id1,GroupC_id1,GroupA_score,GroupB_score,GroupC_score
id1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
8948,"AT&T Labs-Research, Florham Park NJ","AT&T Labs - Research, Florham Park, NJ, USA","AT&T Labs--Research, Florham Park, NJ",,5865,3117,,90,97,
3142,AT&T Labs--Research,AT & T Labs-Research,AT&T Labs Research,,4046,454,,92,92,
6259,"AT&T Laboratories - Research, Florham Park, NJ...","AT&T Labs - Research, Florham Park, NJ, USA","AT&T Labs--Research, Florham Park, NJ","AT&T Labs-Research, Florham Park NJ",5865,3117,8948.0,91,82,81.0
1456,"AT&T Labs-Research, NJ",AT & T Labs-Research,AT&T Labs Research,AT&T Labs--Research,4046,454,3142.0,86,85,88.0
1702,AT&T Labs - Research,AT & T Labs-Research,AT&T Labs Research,AT&T Labs--Research,4046,454,3142.0,90,95,92.0


Remove duplicate permutations of tuples:

https://stackoverflow.com/questions/15352995/removing-permutations-from-a-list-of-tuples

In [39]:
matches_temp = list(set(list(zip(df["GroupA_id1"].index, df["GroupA_id1"].values)) + \
        list(zip(df["GroupB_id1"].index, df["GroupB_id1"].values)) + \
        list(zip(df["GroupC_id1"].index, df["GroupC_id1"].values))))

matches = []
for m in matches_temp:
    if m[0] is not None and m[1] is not None:
        matches.append(m)

# Remove permutation duplicates
matches = list(set(tuple(sorted(t)) for t in matches))

In [40]:
len(matches)

2019

In [41]:
matches[:10]

[(2268, 7422),
 (130, 8584),
 (8253, 9141),
 (1476, 8078),
 (1385, 7594),
 (1037, 6673),
 (48, 8992),
 (2323, 5566),
 (6064, 6278),
 (1974, 9593)]

In [52]:
df_matches = pd.DataFrame(matches)
df_matches.columns = ["entityid1", "entityid2"]
# df_matches.set_index("entityid1", inplace=True)
df_matches["match_string"] = df_matches.apply(lambda row: str(row["entityid1"])+"|"+str(row["entityid2"]), axis=1)
df_matches.head()

Unnamed: 0,entityid1,entityid2,match_string
0,2268,7422,2268|7422
1,130,8584,130|8584
2,8253,9141,8253|9141
3,1476,8078,1476|8078
4,1385,7594,1385|7594


In [43]:
# Get ground truth
file = "./affiliationstrings/affiliationstrings_mapping.csv"
df_truth = pd.read_csv(file, index_col=0, header=None, names=["entityid1", "entityid2"])

In [44]:
truth_tuples = list(df_truth.reset_index().to_records(index=False))
truth_unique = list(set(tuple(sorted(t)) for t in truth_tuples))
truth_unique[:10]

[(3460, 6963),
 (1808, 4095),
 (3265, 9562),
 (2551, 5771),
 (6605, 7876),
 (3842, 9044),
 (6150, 6549),
 (4208, 9532),
 (9136, 9376),
 (2927, 8872)]

In [45]:
len(truth_unique)

16408

In [53]:
df_truthunique = pd.DataFrame(truth_unique)
df_truthunique.columns = ["entityid1", "entityid2"]
# df_truthunique.set_index("entityid1", inplace=True)
df_truthunique["match_string"] = df_truthunique.apply(lambda row: str(row["entityid1"])+"|"+str(row["entityid2"]), axis=1)
df_truthunique.head()

Unnamed: 0,entityid1,entityid2,match_string
0,3460,6963,3460|6963
1,1808,4095,1808|4095
2,3265,9562,3265|9562
3,2551,5771,2551|5771
4,6605,7876,6605|7876


In [56]:
matches_list = list(df_matches["match_string"])
truth_list = list(df_truthunique["match_string"])

In [64]:
len(matches_list)

2019

In [67]:
good_matches = []
for m in matches_list:
    if m in truth_list:
        good_matches.append("Good match")
    else:
        good_matches.append("Bad match")

In [68]:
len(good_matches)

2019

In [69]:
df_matches["GoodBad"] = good_matches

In [70]:
df_matches.head()

Unnamed: 0,entityid1,entityid2,match_string,GoodBad
0,2268,7422,2268|7422,Good match
1,130,8584,130|8584,Good match
2,8253,9141,8253|9141,Good match
3,1476,8078,1476|8078,Bad match
4,1385,7594,1385|7594,Good match


In [71]:
df_matches["GoodBad"].value_counts()

Good match    1619
Bad match      400
Name: GoodBad, dtype: int64

In [72]:
found_truth = []
for m in truth_list:
    if m in matches_list:
        found_truth.append("Found")
    else:
        found_truth.append("Not found")

In [73]:
len(found_truth)

16408

In [75]:
df_truthunique["Found"] = found_truth

In [76]:
df_truthunique.head()

Unnamed: 0,entityid1,entityid2,match_string,Found
0,3460,6963,3460|6963,Not found
1,1808,4095,1808|4095,Not found
2,3265,9562,3265|9562,Not found
3,2551,5771,2551|5771,Not found
4,6605,7876,6605|7876,Not found


In [77]:
df_truthunique["Found"].value_counts()

Not found    14789
Found         1619
Name: Found, dtype: int64