In [1]:
# Libraries
import pandas as pd
from pandas import DataFrame
from nltk import edit_distance
from fuzzywuzzy import fuzz
from datetime import datetime

In [2]:
# Get data
# df = pd.read_csv("cust10k.csv", delimiter="|")
file = "./affiliationstrings/affiliationstrings_ids.csv"
df = pd.read_csv(file, index_col=0)
df.columns=["entityname"]

In [3]:
df.head()

Unnamed: 0_level_0,entityname
id1,Unnamed: 1_level_1
7927,", IBM Almaden Research Center, 650 Harry Road,..."
7930,", IIT Bombay"
7987,", University of California, San Diego, USA"
5613,"28msec Inc., Zurich, Switzerland"
9530,"28msec, Inc."


In [5]:
df.loc[8544]

entityname    Department of Computer Engineering, Kyungpook ...
Name: 8544, dtype: object

In [6]:
df.shape

(2260, 1)

In [7]:
def setup():
    # Add a column to store grouping
    df["GroupA"] = None
    df["GroupB"] = None
    df["GroupC"] = None
    df["GroupA_id1"] = None
    df["GroupB_id1"] = None
    df["GroupC_id1"] = None
    df["GroupA_score"] = None
    df["GroupB_score"] = None
    df["GroupC_score"] = None

In [8]:
setup() # RUN THIS FIRST!!!

In [9]:
groups = ["GroupA", "GroupB", "GroupC"]

for g in groups:
    print(f"Starting: {g}")
    print("=" * 72)
    initial = datetime.now()
    df = df.sample(frac=1)
    # indices of names for looping
    list_idx = [idx for idx in df.index]
    
    # Loop through names
    num_records = 100 # restrict for debug
    for idx in list_idx[:num_records]: # restrict for debug
    
    # for idx in list_idx:
        print("-" * 72)
        current_name = df.loc[idx]['entityname']
        print(f"idx:{idx} name to compare: {current_name}")
        
    
        if df.loc[idx][g] is None:
            idx_to_compare = list_idx[list_idx.index(idx)+1:]
            for i in idx_to_compare:
                if df.loc[i][g] is None:
                    compare_name = df.loc[i]["entityname"]
                    similarity_score = fuzz.ratio(current_name, compare_name)
                    if similarity_score > 80:
    #                     print(f"Similarity score:{similarity_score}")
                        df.loc[df.index == i, g] = current_name
                        df.loc[df.index == i, g+"_score"] = similarity_score
                        df.loc[df.index == i, g+"_id1"] = idx
        else:
            print(f"{current_name} already matched. Skipping.")
    
    
    
    final = datetime.now()
    total_time = final - initial
    print("=" * 72)
    print(f"Group {g} total time: {total_time}")

print("Saving groups...")
df.to_csv("edit_dist_dedup_groups.csv")
print("Save complete!")

Starting: GroupA
------------------------------------------------------------------------
idx:8876 name to compare: Department of Computer Science, University of Illinois at Chicago, Chicago, USA
------------------------------------------------------------------------
idx:1090 name to compare: UCLA Computer Science Department, Los Angeles, CA
------------------------------------------------------------------------
idx:8941 name to compare: Ecole Polytechnique Fédéral de Lausanne & Google, LTAA, Lausanne, Switzerland
------------------------------------------------------------------------
idx:4843 name to compare: UNC Chapel Hill
------------------------------------------------------------------------
idx:9141 name to compare: University of Melbourne Melbourne, Australia
------------------------------------------------------------------------
idx:7793 name to compare: INRIA Rocquencourt, 78153 Le Chesnay, France
------------------------------------------------------------------------
id

------------------------------------------------------------------------
idx:1180 name to compare: University of Konstanz, Konstanz, Germany
------------------------------------------------------------------------
idx:1640 name to compare: Database Research Group, Swiss Federal Institute of Technology (ETH), Zurich, Switzerland
------------------------------------------------------------------------
idx:9708 name to compare: Università della Basilicata -- Potenza, Italy
------------------------------------------------------------------------
idx:8739 name to compare: Pisa KDD Laboratory, ISTI---CNR, Pisa, Italy 56124
------------------------------------------------------------------------
idx:2203 name to compare: The Hebrew University of Jerusalem, Edmond J. Safra Campus Jerusalem, Israel
------------------------------------------------------------------------
idx:882 name to compare: San Diego Supercomputer Center, University of California, San Diego, CA
-----------------------------

------------------------------------------------------------------------
idx:5664 name to compare: Syracuse University, Syracuse, NY, USA
------------------------------------------------------------------------
idx:2271 name to compare: University of Michigan, Ann Arbor, Michigan
------------------------------------------------------------------------
idx:8795 name to compare: Computer and Information Science and Engineering, University of Florida, Gainesville, USA 32611
------------------------------------------------------------------------
idx:5403 name to compare: HKUST, Hong Kong, China
------------------------------------------------------------------------
idx:4967 name to compare: Xerox Research Centre, France
------------------------------------------------------------------------
idx:479 name to compare: Columbia University, New York, NY
------------------------------------------------------------------------
idx:8128 name to compare: Amazon.com, Seattle
---------------------

------------------------------------------------------------------------
idx:9231 name to compare: The Chinese University of Hong Kong, New Territories, Hong Kong
------------------------------------------------------------------------
idx:4737 name to compare: IBM Toronto Development Laboratory, ON, Canada
------------------------------------------------------------------------
idx:7655 name to compare: Institut für Informatik, Universität Freiburg, Freiburg, Germany
------------------------------------------------------------------------
idx:8661 name to compare: Department of EECS, University of Michigan, Ann Arbor, USA
------------------------------------------------------------------------
idx:2259 name to compare: Univ. Santa Clara
------------------------------------------------------------------------
idx:746 name to compare: The Ohio State University
------------------------------------------------------------------------
idx:532 name to compare: Department of EECS, University

------------------------------------------------------------------------
idx:8769 name to compare: CERIAS and Department of Computer Science, Purdue University, West Lafayette, USA
------------------------------------------------------------------------
idx:3126 name to compare: University of Ioannina, Greece
------------------------------------------------------------------------
idx:736 name to compare: Brown University
------------------------------------------------------------------------
idx:3524 name to compare: University of Florida, Gainesville, USA
------------------------------------------------------------------------
idx:2218 name to compare: Oracle
------------------------------------------------------------------------
idx:9354 name to compare: NICTA, University of Melbourne, Parkville, Australia
------------------------------------------------------------------------
idx:2838 name to compare: University of Maryland, College Park
-----------------------------------------

------------------------------------------------------------------------
idx:1686 name to compare: State University of New York at Buffalo
------------------------------------------------------------------------
idx:6990 name to compare: Japan Advanced Institute of Science and Technology
Japan Advanced Institute of Science and Technology already matched. Skipping.
------------------------------------------------------------------------
idx:7967 name to compare: The University of British Columbia, Vancouver, BC, Canada
------------------------------------------------------------------------
idx:8255 name to compare: Université Paris Dauphine, Paris, France
------------------------------------------------------------------------
idx:1350 name to compare: Laboratory for Foundations of Computer Science, University of Edinburgh, Edinburgh, UK
------------------------------------------------------------------------
idx:8861 name to compare: IBM Research, Yorktown Heights, NY
----------------

In [51]:
group = "GroupA"
print(f"Number of matches: {df[group].value_counts().sum()}")
print(f"Number of groups: {len(df[group].unique())}")
df[group].value_counts()[:10]

Number of matches: 285
Number of groups: 70


University of California, San Diego, CA                                                  18
University of Wisconsin, Madison                                                         15
Department of Computer Science and Engineering, University of Washington, Seattle, WA    14
IBM Almaden Research Center, San Jos, CA, USA                                            13
AT&T Labs-Research, NJ                                                                   12
University of Toronto, Toronto, Canada                                                    9
University of Texas at Arlington                                                          8
Google Inc., Mountain View, CA                                                            8
Max-Planck-Institut fü Informatik, Saarbrücken, Germany                                   7
University of Maryland, College Park, MD 20                                               7
Name: GroupA, dtype: int64

In [49]:
df.loc[df["entityname"].str.contains("AT&T")].head()

Unnamed: 0_level_0,entityname,GroupA,GroupB,GroupC,GroupA_id1,GroupB_id1,GroupC_id1,GroupA_score,GroupB_score,GroupC_score
id1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
8024,"AT&T Labs--Research, USA","AT&T Labs-Research, NJ",AT & T Labs-Research,,1456.0,4046.0,,87.0,82.0,
6156,"AT&T Labs, Florham Park, NJ, USA",,,,,,,,,
6764,"AT&T Labs, Research","AT&T Labs-Research, NJ",AT & T Labs-Research,,1456.0,4046.0,,83.0,87.0,
7105,"AT&T Research Laboratory, Florham Park, NJ",,,,,,,,,
103,AT&T Labs-Research,"AT&T Labs-Research, NJ",AT & T Labs-Research,"AT&T Labs--Research, USA",1456.0,4046.0,8024.0,90.0,95.0,86.0


In [37]:
matches_temp = list(set(list(zip(df["GroupA_id1"].index, df["GroupA_id1"].values)) + \
        list(zip(df["GroupB_id1"].index, df["GroupB_id1"].values)) + \
        list(zip(df["GroupC_id1"].index, df["GroupC_id1"].values))))

matches = []
for m in matches_temp:
    if m[0] is not None and m[1] is not None:
        matches.append(m)

In [38]:
len(matches)

739

In [48]:
matches[:10]

[(6026, 479),
 (8253, 9141),
 (135, 9779),
 (1037, 6673),
 (397, 467),
 (5820, 5900),
 (8359, 8814),
 (2990, 746),
 (8449, 7967),
 (7995, 8705)]

In [45]:
# Get ground truth
file = "./affiliationstrings/affiliationstrings_mapping.csv"
df_truth = pd.read_csv(file, index_col=0, header=None, names=["entityid1", "entityid2"])

In [47]:
df_truth.head()

Unnamed: 0_level_0,entityid2
entityid1,Unnamed: 1_level_1
7927,8445
7927,5887
7927,789
7927,3276
7927,2900
