## Table of Contents:
* [Importing libraries, loading & transforming data](#first-bullet)
* [Comparing metrics for different distances](#second-bullet)
* [Data Profiler](#third-bullet)

## Importing libraries, loading & transforming data <a class="anchor" id="first-bullet"></a>

 - All packages required to run the notebook are imported
 - `dfA` and `dfB` are loaded from the FEBRL4 dataset bundled with the `recordlinkage` library
 - Helper functions are defined to compute record-linkage metrics and string similarities

In [1]:
from recordlinkage.preprocessing import phonetic
import pandas as pd
import recordlinkage, spacy, textdistance, collections, time, json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation/library warnings raised by later cells;
# consider narrowing the filter to specific categories once the notebook is stable.
import warnings
warnings.filterwarnings("ignore")

In [3]:
from recordlinkage.datasets import load_febrl4

# With return_links=True the loader yields both frames plus the index of true matches.
dfA, dfB, true_links = load_febrl4(return_links=True)

# Preview the first rows of each frame, ordered by record id.
for title, frame in (("Dataset A", dfA), ("Dataset B", dfB)):
    print(title)
    display(frame.sort_index().head())

Dataset A


Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-0-org,rachael,dent,1,knox street,lakewood estate,byford,4129,vic,19280722.0,1683994
rec-1-org,isabella,everett,25,pike place,rowethorpe,marsden,2152,nsw,19110816.0,6653129
rec-10-org,lachlan,reid,5,carrington road,legacy vlge,yagoona,2464,nsw,19500531.0,3232033
rec-100-org,hayden,stapley,38,tindale street,villa 2,cromer heights,4125,vic,,4620080
rec-1000-org,victoria,zbierski,70,wybalena grove,inverneath,paralowie,5065,nsw,19720503.0,1267612


Dataset B


Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-0-dup-0,rachael,dent,4.0,knox street,lakewood estate,byford,4129,vic,19280722.0,1683994
rec-1-dup-0,isabella,everett,25.0,pike mlace,rowethorpe,marsden,2152,nsw,19110816.0,6653129
rec-10-dup-0,lachlnn,reid,5.0,carrington road,legacy vlge,yagoona,2446,nsw,19500531.0,3232033
rec-100-dup-0,hayden,stapley,,tindale street,villa 2,cromer heights,4125,vic,,4620080
rec-1000-dup-0,victoria,zbierski,70.0,wybalena grove,inverbeath,paralowie,5065,nsw,19720503.0,1267612


In [4]:
# Report the size of the ground truth and of both datasets.
for label, collection in (
    ("Number of True Links", true_links),
    ("Number of records in Dataset A", dfA),
    ("Number of records in Dataset B", dfB),
):
    print(label, len(collection))

Number of True Links 5000
Number of records in Dataset A 5000
Number of records in Dataset B 5000


In [5]:
# Load the sentence-embedding model used for the "BERT" similarity scores in later cells.
# NOTE(review): presumably fetches the weights on first use — confirm network access.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [6]:
#BERT score calculation
start_bert = time.time()

# Embedding cache: given name -> encoded vector. Later cells (sims_distance) read
# from and add to this mapping, so it stays a module-level name.
mem = {}

# Take null-free copies instead of calling dropna(inplace=True) on a column
# selected from the frame; the source DataFrames are left untouched and the
# cell gives the same result every time it is re-run.
sentencesA = dfA['given_name'].dropna()
sentencesB = dfB['given_name'].dropna()

# Encode each distinct name once; dataset A is processed before dataset B.
for names in (sentencesA, sentencesB):
    for name in names:
        if name not in mem:
            mem[name] = model.encode(name)

end_bert = time.time()
print("Time to get encodings for all given names using BERT", end_bert-start_bert)

Time to get encodings for all given names using BERT 57.425655126571655


In [7]:
# Enrich both datasets with the helper columns used for blocking and comparison.
for frame in (dfA, dfB):
    # Soundex-encoded versions of the first and last names
    for name_column in ("given_name", "surname"):
        frame["phonetic_" + name_column] = phonetic(frame[name_column], "soundex")

    # "initials": first character of the given name followed by first character of the surname
    frame["initials"] = frame["given_name"].str[0] + frame["surname"].str[0]

    # Cast the social security number to a numeric dtype
    frame["soc_sec_id"] = pd.to_numeric(frame["soc_sec_id"])

In [8]:
# Build the candidate record pairs by blocking on the derived `initials` column.
# NOTE(review): presumably only pairs whose initials are equal become candidates, so
# true matches with a differing or missing initial are never compared — this looks
# consistent with the recall ceiling (~0.75) in the metric tables further down;
# confirm against the recordlinkage blocking documentation.
indexer = recordlinkage.Index()
indexer.block('initials')
candidate_links = indexer.index(dfA, dfB)

# Preview the first ten candidate pairs
print(candidate_links[0:10])

MultiIndex([('rec-1070-org', 'rec-2820-dup-0'),
            ('rec-1070-org',  'rec-684-dup-0'),
            ('rec-1070-org', 'rec-2942-dup-0'),
            ('rec-1070-org', 'rec-2283-dup-0'),
            ('rec-1070-org',  'rec-992-dup-0'),
            ('rec-1070-org', 'rec-3535-dup-0'),
            ('rec-1070-org', 'rec-2231-dup-0'),
            ('rec-1070-org', 'rec-1889-dup-0'),
            ('rec-1070-org', 'rec-2033-dup-0'),
            ('rec-1070-org', 'rec-4515-dup-0')],
           names=['rec_id_1', 'rec_id_2'])


In [9]:
from recordlinkage import Compare
# NOTE(review): this module-level instance is not referenced by any cell shown here;
# test_method creates its own Compare() on every call.
compare = Compare()

In [10]:
# Options for "string" method argument are 
# [‘jaro’, ‘jarowinkler’, ‘levenshtein’, ‘damerau_levenshtein’, 
# ‘qgram’, ‘cosine’, ‘smith_waterman’, ‘lcs’]. Default: ‘levenshtein’

# Options for "numeric" method argument are 
# [‘step’, ‘linear’, ‘exp’, ‘gauss’ or ‘squared’]. Default ‘linear’.

def test_method(i):
    """Evaluate one string-similarity method on the blocked candidate pairs.

    A fresh ``Compare`` object is configured with eight features: exact matches
    on the two phonetic columns, string similarity (using the given method) on
    five text columns, and a numeric comparison of ``soc_sec_id``. The feature
    values of each candidate pair are summed, and the pair is classified as a
    match when that sum is strictly greater than a threshold. Metrics are
    reported for every integer threshold from 0 up to (number of features - 1).

    Relies on the notebook-level ``candidate_links``, ``dfA``, ``dfB`` and
    ``true_links``.

    Parameters
    ----------
    i : str
        Name of the string method forwarded to ``Compare.string`` (for example
        ``'jaro'``); it is also appended to every feature label.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with the columns ``score``, ``records_count``,
        ``precision``, ``recall``, ``f1 score`` and ``accuracy``.
    """
    # The public parameter name is kept for callers; a descriptive alias is used
    # internally so the threshold loop below cannot shadow it.
    method = i

    compare = Compare()
    for column in ('phonetic_given_name', 'phonetic_surname'):
        compare.exact(column, column, label=column + '_' + method)
    for column in ('given_name', 'surname', 'suburb', 'state', 'address_1'):
        compare.string(column, column, method=method, label=column + '_' + method)
    compare.numeric("soc_sec_id", "soc_sec_id", label="soc_sec_id_" + method)

    features = compare.compute(candidate_links, dfA, dfB)

    # The per-pair totals do not depend on the threshold, so compute them once.
    pair_scores = features.sum(axis=1)
    total_pairs = len(candidate_links)

    rows = []
    # One threshold per feature column: 0 .. n_features - 1.
    for threshold in range(features.shape[1]):
        matches = features[pair_scores > threshold]
        rows.append((
            threshold,
            matches.shape[0],
            recordlinkage.precision(true_links, matches),
            recordlinkage.recall(true_links, matches),
            recordlinkage.fscore(true_links, matches),
            recordlinkage.accuracy(true_links, matches, total_pairs),
        ))

    return pd.DataFrame(
        rows,
        columns=['score', 'records_count', 'precision', 'recall', 'f1 score', 'accuracy'],
    )

In [11]:
from IPython.display import display_html
from itertools import chain,cycle
def display_side_by_side(*args,titles=cycle([''])):
    """Render several DataFrames in one HTML output, each under its own title.

    Parameters
    ----------
    *args : pandas.DataFrame
        Frames to render, in display order. Anything exposing ``to_html()`` works.
    titles : iterable of str, optional
        Titles paired with the frames in order. When fewer titles than frames
        are supplied, the remaining frames get a line break as their title.

    Returns
    -------
    None
        The assembled HTML is handed to ``display_html`` with ``raw=True``.
    """
    fragments = []
    for df, title in zip(args, chain(titles, cycle(['<br>']))):
        # Patch only the opening <table ...> tag. A blanket replace of the word
        # "table" would also rewrite the closing tag into invalid markup and
        # alter any cell text that happens to contain "table".
        table_html = df.to_html().replace('<table', '<table style="display:inline"', 1)
        fragments.append(
            '<th style="text-align:center"><td style="vertical-align:top">'
            + f'<h2>{title}</h2>'
            + table_html
            + '</td></th>'
        )
    display_html(''.join(fragments), raw=True)


## Comparing metrics for different distances <a class="anchor" id="second-bullet"></a>

 - Metrics such as precision, recall, F1 score and accuracy are calculated for various string-distance methods
 - Eight comparisons are used in this calculation: given_name, surname, phonetic_given_name, phonetic_surname, suburb, state, address_1 and soc_sec_id
 - Each comparison contributes a score between 0 and 1, so the summed score of a candidate pair ranges from 0 to 8; a pair is counted as a match when its summed score exceeds the threshold shown in the `score` column (0–7)

In [12]:
start_calc = time.time()

# One list drives both the evaluation and the table titles, so the two cannot drift apart.
distance_methods = ['jaro', 'jarowinkler', 'levenshtein', 'damerau_levenshtein', 'cosine']
display_side_by_side(*(test_method(name) for name in distance_methods),
                     titles=distance_methods)

end_calc = time.time()
# This cell times the metric computation, not the embedding step.
print("Time to compute metrics for all distance methods", end_calc - start_calc)

Unnamed: 0,score,records_count,precision,recall,f1 score,accuracy
0,0,103510,0.036054,0.7464,0.068786,0.023804
1,1,103473,0.036067,0.7464,0.06881,0.024162
2,2,88207,0.04231,0.7464,0.08008,0.171645
3,3,33391,0.111767,0.7464,0.194421,0.701217
4,4,8122,0.459246,0.746,0.568511,0.9453
5,5,4072,0.913065,0.7436,0.819665,0.984195
6,6,3598,0.987215,0.7104,0.826239,0.985567
7,7,2438,1.0,0.4876,0.655553,0.975249

Unnamed: 0,score,records_count,precision,recall,f1 score,accuracy
0,0,103510,0.036054,0.7464,0.068786,0.023804
1,1,103473,0.036067,0.7464,0.06881,0.024162
2,2,88412,0.042211,0.7464,0.079904,0.169665
3,3,33560,0.111204,0.7464,0.193568,0.699585
4,4,8210,0.454446,0.7462,0.564875,0.944469
5,5,4102,0.906387,0.7436,0.816963,0.983905
6,6,3603,0.985845,0.7104,0.825758,0.985518
7,7,2438,1.0,0.4876,0.655553,0.975249

Unnamed: 0,score,records_count,precision,recall,f1 score,accuracy
0,0,103510,0.036054,0.7464,0.068786,0.023804
1,1,67657,0.055161,0.7464,0.102729,0.370177
2,2,26051,0.143257,0.7464,0.240379,0.772128
3,3,8591,0.434408,0.7464,0.549187,0.940808
4,4,4373,0.852047,0.7452,0.79505,0.981441
5,5,3790,0.977309,0.7408,0.842776,0.986649
6,6,3496,0.999428,0.6988,0.822505,0.985431
7,7,2364,1.0,0.4728,0.642042,0.974534

Unnamed: 0,score,records_count,precision,recall,f1 score,accuracy
0,0,103510,0.036054,0.7464,0.068786,0.023804
1,1,68108,0.054795,0.7464,0.102096,0.36582
2,2,26267,0.142079,0.7464,0.238718,0.770042
3,3,8668,0.430549,0.7464,0.546093,0.940064
4,4,4386,0.849749,0.7454,0.794162,0.981335
5,5,3795,0.976285,0.741,0.842524,0.98662
6,6,3503,0.999429,0.7002,0.823474,0.985499
7,7,2385,1.0,0.477,0.645904,0.974737

Unnamed: 0,score,records_count,precision,recall,f1 score,accuracy
0,0,103510,0.036054,0.7464,0.068786,0.023804
1,1,46978,0.079441,0.7464,0.143599,0.569955
2,2,19306,0.193308,0.7464,0.307085,0.837291
3,3,7186,0.519343,0.7464,0.612506,0.954381
4,4,4138,0.900435,0.7452,0.815496,0.983712
5,5,3751,0.984537,0.7386,0.844018,0.986813
6,6,3459,1.0,0.6918,0.817827,0.985113
7,7,2258,1.0,0.4516,0.62221,0.97351


Time to get encodings for all distances 33.93990421295166


In [13]:
# !python -m spacy download en_core_web_sm
# Run the command above to download the model if spacy.load below raises an error.

nlp = spacy.load('en_core_web_sm')

In [14]:
def sims_distance(word1, word2):
    """Score a pair of strings with six similarity measures.

    The measures are four edit-based normalized similarities from
    ``textdistance``, the spaCy ``similarity`` of the two parsed strings
    (labelled 'Semantic'), and the cosine similarity of the two sentence
    embeddings (labelled 'BERT'). Embeddings are read from the notebook-level
    ``mem`` cache and added to it when missing.

    NOTE(review): the result column is named 'distance' but every value is a
    similarity (higher means more alike). Also, the 'Semantic' score depends on
    what vectors the loaded spaCy pipeline provides — presumably limited for a
    small model; confirm before interpreting it.

    Parameters
    ----------
    word1, word2 : str
        The two strings to compare.

    Returns
    -------
    pandas.DataFrame
        Columns ``method`` and ``distance``, one row per measure, values
        rounded to four decimals.
    """
    text_a, text_b = word1, word2
    doc_a = nlp(text_a)
    doc_b = nlp(text_b)

    # Edit-based measures all expose the same normalized_similarity interface.
    edit_measures = (
        ('damerau_levenshtein', textdistance.damerau_levenshtein),
        ('levenshtein', textdistance.levenshtein),
        ('jaro', textdistance.jaro),
        ('jarowinkler', textdistance.jaro_winkler),
    )
    rows = [
        (label, round(measure.normalized_similarity(text_a, text_b), 4))
        for label, measure in edit_measures
    ]

    rows.append(('Semantic', round(doc_a.similarity(doc_b), 4)))

    # Make sure both strings have a cached embedding before comparing them.
    for text in (text_a, text_b):
        if text not in mem:
            mem[text] = model.encode(text)

    bert_score = cosine_similarity([mem[text_a]], [mem[text_b]])[0][0]
    rows.append(('BERT', round(bert_score, 4)))

    return pd.DataFrame(rows, columns=['method', 'distance'])

In [15]:
# Example: two names that differ only by one trailing character.
sims_distance('michael', 'michaela')

Unnamed: 0,method,distance
0,damerau_levenshtein,0.875
1,levenshtein,0.875
2,jaro,0.9583
3,jarowinkler,0.975
4,Semantic,0.6458
5,BERT,0.8886


In [16]:
# Example: two words that differ only in their first character.
sims_distance('pink', 'sink')

Unnamed: 0,method,distance
0,damerau_levenshtein,0.75
1,levenshtein,0.75
2,jaro,0.8333
3,jarowinkler,0.8333
4,Semantic,0.4252
5,BERT,0.4542


In [17]:
# Example: two words that differ by a single repeated letter.
sims_distance('good', 'god')

Unnamed: 0,method,distance
0,damerau_levenshtein,0.75
1,levenshtein,0.75
2,jaro,0.9167
3,jarowinkler,0.9333
4,Semantic,0.4021
5,BERT,0.7265


In [18]:
# Example: two capitalized names with different spellings.
sims_distance("Geoff", "Jeff")

Unnamed: 0,method,distance
0,damerau_levenshtein,0.6
1,levenshtein,0.6
2,jaro,0.7833
3,jarowinkler,0.7833
4,Semantic,0.5794
5,BERT,0.7952


## Data Profiler <a class="anchor" id="third-bullet"></a>

 - The data profile is generated with DataProfiler, an open-source Python package provided by Capital One
 - DataProfiler is a Python library designed to make data analysis, monitoring and sensitive data detection easy
 
Sources:
 - https://www.capitalone.com/tech/open-source/basics-of-data-profiler/
 - https://github.com/capitalone/DataProfiler

In [19]:
from dataprofiler import Data, Profiler

In [20]:
# Profile dataset A. By this point dfA also carries the derived columns added earlier
# (phonetic_given_name, phonetic_surname, initials), so they are profiled as well.
profile = Profiler(dfA)

INFO:DataProfiler.profilers.profile_builder: Finding the Null values in the columns...  (with 7 processes)


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:06<00:00,  1.99it/s]


INFO:DataProfiler.profilers.profile_builder: Calculating the statistics...  (with 4 processes)


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:04<00:00,  2.62it/s]


In [21]:
# Generate the report, requesting the "pretty" output format from DataProfiler
# (the json module is not used here), then list the report's top-level sections.
report  = profile.report(report_options={"output_format":"pretty"})
report.keys()

dict_keys(['global_stats', 'data_stats'])

In [22]:
# Show only the 'global_stats' section of the report; the other top-level key,
# 'data_stats', is not displayed here.
report['global_stats']

{'samples_used': 5000,
 'column_count': 13,
 'row_count': 5000,
 'row_has_null_ratio': 0.1912,
 'row_is_null_ratio': 0.0,
 'unique_row_ratio': 1.0,
 'duplicate_row_count': 0,
 'file_type': "<class 'pandas.core.frame.DataFrame'>",
 'encoding': None,
 'correlation_matrix': None,
 'chi2_matrix': '[[ 1., nan,  0., nan, nan, nan, nan,  0., nan, nan,  0., nan,  0.], ... , [ 0., nan,  0., nan, nan, nan, nan,  0., nan, nan,  0., nan,  1.]]',
 'profile_schema': defaultdict(list,
             {'given_name': [0],
              'surname': [1],
              'street_number': [2],
              'address_1': [3],
              'address_2': [4],
              'suburb': [5],
              'postcode': [6],
              'state': [7],
              'date_of_birth': [8],
              'soc_sec_id': [9],
              'phonetic_given_name': [10],
              'phonetic_surname': [11],
              'initials': [12]}),
 'times': {'row_stats': 0.0101}}