In [1]:
import recordlinkage
import pandas as pd

In [2]:
# Read a 100-row window of the OFAC alt.csv download, skipping the first 2000
# lines. header=None gives the columns integer labels, so column 3 is renamed
# to 'name' for readability in the rest of the notebook.
data = (
    pd.read_csv(
        "https://www.treasury.gov/ofac/downloads/alt.csv",
        header=None,
        skiprows=2000,
        nrows=100,
    )
    .rename(columns={3: 'name'})
)
data.head()

Unnamed: 0,0,1,2,name,4
0,9769,9300,aka,"SHEYMAN, Viktor Vladimirovich",-0-
1,9769,9351,aka,"SHEIMAN, Victor Vladimirovich",-0-
2,9769,9352,aka,"SHEYMAN, Victor Vladimirovich",-0-
3,9769,9353,aka,"SHEIMAN, Viktor Vladimirovich",-0-
4,9769,9473,aka,"SHEYMAN, Viktar Uladzimiravich",-0-


In [3]:
# Check the (rows, columns) dimensions of the loaded slice.
data.shape

(100, 5)

In [4]:
# Show the last rows of the loaded slice.
data.tail()

Unnamed: 0,0,1,2,name,4
95,10011,9712,aka,"AL-BATHALI, Mubarak",-0-
96,10011,9713,aka,"AL BATHALI, Mubarak Mishkhis Sanad",-0-
97,10011,9714,aka,"AL-BATHALI, Mubarak Mishkhas Sanad",-0-
98,10011,9715,aka,"AL-BADHALI, Mubarak Mishkhis Sanad",-0-
99,10011,9716,aka,"AL-BAZALI, Mubarak Mishkhas Sanad",-0-


In [5]:
# Build the two sides of the self-comparison as single-column frames holding
# only the 'name' field. Selecting by label avoids repeating the positional
# magic number 3, and the explicit .copy() makes each side independent of
# `data` (and of each other) before new columns are added to them.
data_1 = data[['name']].copy()
data_2 = data[['name']].copy()

In [6]:
from recordlinkage.preprocessing import phonetic

# Add a Soundex encoding of the name to both sides of the comparison, so the
# codes can later be compared for exact equality.
for frame in (data_1, data_2):
    frame["phonetic_name"] = phonetic(frame["name"], "soundex")

In [7]:
# Preview the first rows of data_1 (name plus its phonetic_name code).
data_1.head()

Unnamed: 0,name,phonetic_name
0,"SHEYMAN, Viktor Vladimirovich",S551
1,"SHEIMAN, Victor Vladimirovich",S551
2,"SHEYMAN, Victor Vladimirovich",S551
3,"SHEIMAN, Viktor Vladimirovich",S551
4,"SHEYMAN, Viktar Uladzimiravich",S551


In [8]:
# Check the (rows, columns) dimensions of data_1.
data_1.shape

(100, 2)

In [9]:
# Pair every row position of data_1 with every row position of data_2
# (a full cartesian product, self-pairs included) as an unnamed two-level
# MultiIndex. from_product builds this directly, with the first level varying
# slowest, which matches the ordering of the equivalent nested loop.
candidate_links = pd.MultiIndex.from_product(
    [range(data_1.shape[0]), range(data_2.shape[0])]
)

In [10]:
# Peek at the first 15 candidate pairs.
candidate_links[:15]

MultiIndex([(0,  0),
            (0,  1),
            (0,  2),
            (0,  3),
            (0,  4),
            (0,  5),
            (0,  6),
            (0,  7),
            (0,  8),
            (0,  9),
            (0, 10),
            (0, 11),
            (0, 12),
            (0, 13),
            (0, 14)],
           )

In [11]:
# Comparison setup: each compare.<kind>(...) call below registers one output
# column, named by its `label`.
compare = recordlinkage.Compare()

# Reference for the `method` argument:
#   compare.string  -> 'jaro', 'jarowinkler', 'levenshtein', 'damerau_levenshtein',
#                      'qgram', 'cosine', 'smith_waterman', 'lcs'
#                      (default: 'levenshtein')
#   compare.numeric -> 'step', 'linear', 'exp', 'gauss', 'squared'
#                      (default: 'linear')

# Exact agreement on the phonetic code, plus a string similarity on the raw name.
compare.exact(
    left_on='phonetic_name',
    right_on='phonetic_name',
    label="phonetic_name_matched",
)
compare.string(
    left_on='name',
    right_on='name',
    method='levenshtein',
    label="name_matched",
)

# Score every candidate pair; the result is indexed by the (left, right) pair.
features = compare.compute(candidate_links, data_1, data_2)
features

Unnamed: 0,Unnamed: 1,phonetic_name_matched,name_matched
0,0,1,1.000000
0,1,1,0.931034
0,2,1,0.965517
0,3,1,0.965517
0,4,1,0.866667
...,...,...,...
99,95,0,0.484848
99,96,0,0.882353
99,97,0,0.941176
99,98,0,0.911765


In [12]:
# Flatten the (left, right) pair MultiIndex into 'level_0' / 'level_1' columns.
# This is a guarded rebind rather than reset_index(inplace=True), so re-running
# the cell neither stacks extra 'index' columns onto `features` nor raises.
if "level_0" not in features.columns:
    features = features.reset_index()

# Attach the left record (looked up by level_0), then the right record (looked
# up by level_1). The overlapping name/phonetic_name columns receive pandas'
# default '_x' / '_y' suffixes on the second merge.
final_merge = (
    features
    .merge(data_1, how='left', left_on='level_0', right_index=True)
    .merge(data_2, how='left', left_on='level_1', right_index=True)
)


In [13]:
# Allow up to 100 rows to be rendered, then list every pairing of left
# record 0, with the highest name similarity first.
pd.set_option('display.max_rows', 100)
final_merge.loc[final_merge['level_0'].eq(0)].sort_values(by='name_matched', ascending=False)

Unnamed: 0,level_0,level_1,phonetic_name_matched,name_matched,name_x,phonetic_name_x,name_y,phonetic_name_y
0,0,0,1,1.0,"SHEYMAN, Viktor Vladimirovich",S551,"SHEYMAN, Viktor Vladimirovich",S551
6,0,6,1,0.965517,"SHEYMAN, Viktor Vladimirovich",S551,"SHEYMAN, Viktar Vladimirovich",S551
2,0,2,1,0.965517,"SHEYMAN, Viktor Vladimirovich",S551,"SHEYMAN, Victor Vladimirovich",S551
3,0,3,1,0.965517,"SHEYMAN, Viktor Vladimirovich",S551,"SHEIMAN, Viktor Vladimirovich",S551
1,0,1,1,0.931034,"SHEYMAN, Viktor Vladimirovich",S551,"SHEIMAN, Victor Vladimirovich",S551
5,0,5,1,0.931034,"SHEYMAN, Viktor Vladimirovich",S551,"SHEIMAN, Viktar Vladimirovich",S551
8,0,8,1,0.9,"SHEYMAN, Viktor Vladimirovich",S551,"SHEYMAN, Viktor Uladzimiravich",S551
7,0,7,1,0.866667,"SHEYMAN, Viktor Vladimirovich",S551,"SHEIMAN, Viktor Uladzimiravich",S551
10,0,10,1,0.866667,"SHEYMAN, Viktor Vladimirovich",S551,"SHEYMAN, Victor Uladzimiravich",S551
4,0,4,1,0.866667,"SHEYMAN, Viktor Vladimirovich",S551,"SHEYMAN, Viktar Uladzimiravich",S551
