# Table Of Contents<a class="anchor" id="zero-bullet"></a>:
* [Imports](#first-bullet)
* [Global Functions](#second-bullet)
* [Data Import](#third-bullet)
* [Benchmarking](#fourth-bullet)
 * [Levenshtein distance](#1)
 * [Levenshtein distance norm](#2)
 * [Damerau-Levenshtein distance](#3)
 * [Damerau-Levenshtein distance norm](#4)
 * [Jaro-Winkler Similarity](#5)
 * [Hamming distance](#6)
 * [Hamming distance norm](#7)
 * [Jaccard Similarity](#8)
 * [Jaccard Similarity Modified](#9)
 * [NYSIIS](#10)
 * [Soundex](#11)
 * [Double Metaphone](#12)
 * [Learnable Similarity Distance](#13)
* [Summary](#fifth-bullet)

## Imports<a class="anchor" id="first-bullet"></a>

[Back to the Table of Contents](#zero-bullet)

In [None]:
import pandas as pd
import numpy as np
import json
import time
from metaphone import doublemetaphone
import plotly.express as px
import jellyfish
import plotly.graph_objects as go

## Global Functions<a class="anchor" id="second-bullet"></a>

[Back to the Table of Contents](#zero-bullet)

In [None]:
def progress_line(i: int = 0, m: int = 100, num: int = 50) -> str:
    """
    Returns a progress bar as a string based on the given input parameters.

    Parameters:
    i (int): The current progress index. Default value is 0.
    m (int): The maximum progress index. Default value is 100.
    num (int): The number of characters to use for the progress bar. Default value is 50.

    Returns:
    str: A string of the form '[###---]<percent>%: <i> of <m>', where the part between the
    brackets is `num` characters wide and the percentage is rounded to two decimals.

    Example:
    >>> progress_line(30, 100, 20)
    '[######--------------]30.0%: 30 of 100'
    """
    # An empty workload (m == 0) is reported as 0% instead of raising ZeroDivisionError.
    percentage = i / m if m else 0.0
    # Clamp the number of filled cells so the bar keeps its width even when i > m or i < 0.
    hash_number = min(max(round(percentage * num), 0), num)
    loadbar = '[' + '#' * hash_number + '-' * (num - hash_number) + ']' + f'{round(percentage*100, 2)}%: {i} of {m}'
    return loadbar


def find_candidates(LHS: dict, RHS: dict, metric) -> pd.DataFrame:
    """
    Pair every left-hand side name with its closest right-hand side name.

    For each key of LHS the keys of RHS are scanned in order and the one with the
    smallest metric value is kept. On ties the first one encountered wins, and the scan
    stops early as soon as a distance of exactly 0 is found. Progress and the total
    run time are printed to stdout.

    Args:
        LHS (dict): Maps the names to look up to their labels (labels must be convertible with int()).
        RHS (dict): Maps the reference names to their labels (labels must be convertible with int()).
        metric (function): Called as metric(lhs_name, rhs_name); smaller values mean "closer".

    Returns:
        pd.DataFrame: One row per LHS name with the columns
        'lhs_name', 'rhs_name', 'dist', 'pair', 'lhs_label' and 'rhs_label'.

    """
    started_at = time.time()
    total = len(LHS)
    rows = []
    for done, lhs_name in enumerate(LHS.keys(), start=1):
        best_name, best_dist = '', np.inf
        for rhs_name in RHS.keys():
            current = metric(lhs_name, rhs_name)
            if not current < best_dist:
                continue
            best_name, best_dist = rhs_name, current
            if current == 0:
                # A distance of zero cannot be improved upon, so stop scanning.
                break
        rows.append((
            lhs_name,
            best_name,
            best_dist,
            (lhs_name, best_name),
            int(LHS.get(lhs_name)),
            int(RHS.get(best_name)),
        ))
        print(progress_line(done, total), end='\r')  # Overwrite the same console line with the progress
    print(f'\nSearching process took us {time.time() - started_at:.2f}')  # Elapsed wall-clock time
    return pd.DataFrame(rows, columns=['lhs_name', 'rhs_name', 'dist', 'pair', 'lhs_label', 'rhs_label'])

def find_duplicates(names):
    """
    Find duplicate names in a list.

    Every duplicated name is reported exactly once, together with the position of its
    second occurrence (the first position at which it is recognised as a duplicate).

    Args:
        names (list): List of (hashable) names to search for duplicates.

    Returns:
        list: List of (name, position) tuples, ordered by position.

    """
    duplicates = []
    seen = set()
    reported = set()  # Names already added to `duplicates`, so each one is listed only once

    for i, name in enumerate(names):
        if name not in seen:
            seen.add(name)
        elif name not in reported:
            reported.add(name)
            duplicates.append((name, i))

    return duplicates

## Data import<a class="anchor" id="third-bullet"></a>

[Back to the Table of Contents](#zero-bullet)

### Initialize LHS

In [None]:
path = r'validation_dataset.csv'

Import the validation dataset

In [None]:
val_df = pd.read_csv(path, encoding='utf-8')

In [None]:
val_df.head()

In [None]:
# Create a flat list of misspellings and a matching list of labels.
# The four mistake columns are flattened row by row, so the mistakes of one row end up next to
# each other; repeating every label 4 times keeps the labels aligned with that order.
misspellings = list(val_df[['mistake_1', 'mistake_2', 'mistake_3', 'mistake_4']].values.flatten())
misspelling_labels = np.repeat(val_df['label_2'], 4)

Check whether there are duplicates in the given data, i.e. whether a misspelled name happens to coincide with another name that is also present in the data. Such collisions create ambiguity during the matching phase, which we want to avoid during benchmarking.

In [None]:
duplicates = find_duplicates(misspellings)

if duplicates:
    print("Duplicate names found:")
    for name, position in duplicates:
        print(f"{name} - Position: {position}")
else:
    print("No duplicate names found.")


Create LHS dictionary

In [None]:
# Map every misspelling to its label; if a misspelling occurs more than once, the last label wins.
LHS = dict(zip(misspellings, misspelling_labels))

In [None]:
LHS

In [None]:
len(LHS)

### Initialize RHS

In [None]:
# Forward slashes are accepted on Windows as well as on POSIX systems, unlike the backslash.
path_rhs = 'RHS_2/labels_dict.json'
with open(path_rhs, 'r', encoding='utf-8') as f:
    RHS = json.load(f)

In [None]:
RHS

#### Full RHS

Note: import the full RHS for exploration purposes only. The data is too big to handle with the introduced algorithms.

In [None]:
# NOTE: this cell rebinds RHS and thereby replaces whatever RHS held before.
path_rhs = r'first_and_last.csv'
RHS = pd.read_csv(path_rhs)
rhs_last_names = RHS['last']
# Look up every last name in LHS; names that are not LHS keys get None.
RHS = {name: LHS.get(name) for name in rhs_last_names}
# Keep only the names whose looked-up label is not a Python int and give them the label 0.
# NOTE(review): names with a plain-int label are dropped by this filter, whereas numpy integer
# labels (e.g. np.int64) are not instances of int and are therefore reset to 0 — confirm the intent.
RHS = {name: 0 for name, label in RHS.items() if not isinstance(label, int)}
# Discard keys that are not strings (presumably NaN from empty CSV cells — verify).
RHS = {name: label for name, label in RHS.items() if isinstance(name, str)}

## Benchmarking the algorithms introduced in Chapter 2<a class="anchor" id="fourth-bullet"></a>

### Levenshtein distance<a class="anchor" id="1"></a>

[Back to the Table of Contents](#zero-bullet) --- [Next](#2)

In [None]:
lev = find_candidates(LHS=LHS, RHS=RHS, metric=jellyfish.levenshtein_distance)

Create normalized distances for the found matches

In [None]:
lev['normalized_dist'] = lev.apply(lambda row: 1 - (row['dist']/max((len(row['lhs_name']), (len(row['rhs_name']))))), axis=1)

Create indicator column, that shows if the match was correct

In [None]:
lev['match'] = lev.apply(lambda row: row['lhs_label'] == row['rhs_label'],axis=1)

Create visual insights

In [None]:
fig = go.Figure()
fig.add_trace(go.Box(x=list(lev[lev['match']]['dist']), y=list(lev[lev['match']]['normalized_dist']), name='True match', boxpoints='all'))
fig.add_trace(go.Box(x=list(lev[~lev['match']]['dist']), y=list(lev[~lev['match']]['normalized_dist']), name='False match', boxpoints='all'))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    xaxis_title="Edit distance",
    yaxis_title="Normalized edit distance",
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Levenshtein Distance",
)

In [None]:
fig.write_image("images/lev_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(lev[lev['match']]['dist']), name='True match'))
fig.add_trace(go.Histogram(x=list(lev[~lev['match']]['dist']), name='False match'))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    xaxis_title="Edit distance",
    yaxis_title="Aggregate",
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Levenshtein Distance",
)


In [None]:
fig.write_image("images/lev_dist_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(lev['match'])))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    xaxis_title="Name match",
    yaxis_title="Aggregate",
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
)

In [None]:
fig.write_image("images/lev_match_distr.png")

Calculate mismatches

In [None]:
mismatched = 1 - (lev[lev['match']].shape[0]/lev.shape[0])

In [None]:
print(f'Percentage of mismatched names: {mismatched * 100:.2f}%, ({lev[~lev["match"]].shape[0]})')

### Normalized Levenshtein distance<a class="anchor" id="2"></a>

[Back to the Table of Contents](#zero-bullet) --- [Previous](#1) --- [Next](#3)

In [None]:
def normalized_levenshtein(s1: str, s2: str) -> float:
    """
    Return the Levenshtein distance of two strings divided by the length of the longer one.

    Args:
        s1 (str): First string.
        s2 (str): Second string.

    Returns:
        float: jellyfish.levenshtein_distance(s1, s2) / max(len(s1), len(s2));
        0.0 when both strings are empty.

    """
    longest = max(len(s1), len(s2))
    if longest == 0:
        # Two empty strings are identical; return 0 instead of dividing by zero.
        return 0.0
    return jellyfish.levenshtein_distance(s1, s2) / longest

In [None]:
lev_n = find_candidates(LHS=LHS, RHS=RHS, metric=normalized_levenshtein)

Create regular distances for the found matches

In [None]:
lev_n['normalized_dist'] = lev_n.apply(lambda row: 1 - row['dist'], axis=1)
lev_n['dist'] = lev_n.apply(lambda row: row['dist'] * max(len(row['lhs_name']), len(row['rhs_name'])), axis=1)

Create indicator column, that shows if the match was correct

In [None]:
lev_n['match'] = lev_n.apply(lambda row: row['lhs_label'] == row['rhs_label'],axis=1)

Create visual insights

In [None]:
fig = go.Figure()
fig.add_trace(go.Box(x=list(lev_n[lev_n['match']]['dist']), y=list(lev_n[lev_n['match']]['normalized_dist']), name='True match', boxpoints='all'))
fig.add_trace(go.Box(x=list(lev_n[~lev_n['match']]['dist']), y=list(lev_n[~lev_n['match']]['normalized_dist']), name='False match', boxpoints='all'))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    #title="Levenshtein Distance",
    xaxis_title="Edit distance",
    yaxis_title="Normalized edit distance",
)

In [None]:
fig.write_image("images/lev_n_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(lev_n[lev_n['match']]['dist']), name='True match'))
fig.add_trace(go.Histogram(x=list(lev_n[~lev_n['match']]['dist']), name='False match'))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    xaxis_title="Edit distance",
    yaxis_title="Aggregate",
)


In [None]:
fig.write_image("images/lev_n_dist_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(lev_n['match'])))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=100,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
    xaxis_title="Name match",
    yaxis_title="Aggregate",
)

In [None]:
fig.write_image("images/lev_n_match_distr.png")

Calculate mismatches

In [None]:
mismatched = 1 - (lev_n[lev_n['match']].shape[0]/len(LHS))

In [None]:
print(f'Percentage of mismatched names: {mismatched * 100:.2f}%, ({lev_n[~lev_n["match"]].shape[0]})')

### Damerau-Levenshtein Distance<a class="anchor" id="3"></a>

[Back to the Table of Contents](#zero-bullet) --- [Previous](#2) --- [Next](#4)

In [None]:
dam_lev = find_candidates(LHS=LHS, RHS=RHS, metric=jellyfish.damerau_levenshtein_distance)

Create normalized distances for the found matches

In [None]:
dam_lev['normalized_dist'] = dam_lev.apply(lambda row: 1 - (row['dist']/max((len(row['lhs_name']), (len(row['rhs_name']))))), axis=1)

Create indicator column, that shows if the match was correct

In [None]:
dam_lev['match'] = dam_lev.apply(lambda row: row['lhs_label'] == row['rhs_label'],axis=1)

Create visual insights

In [None]:
# Start a fresh figure for the box plots of the true and false matches.
fig = go.Figure()
fig.add_trace(go.Box(x=list(dam_lev[dam_lev['match']]['dist']), y=list(dam_lev[dam_lev['match']]['normalized_dist']), name='True match', boxpoints='all'))
fig.add_trace(go.Box(x=list(dam_lev[~dam_lev['match']]['dist']), y=list(dam_lev[~dam_lev['match']]['normalized_dist']), name='False match', boxpoints='all'))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    #title="Levenshtein Distance",
    xaxis_title="Edit distance",
    yaxis_title="Normalized edit distance",
)

In [None]:
fig.write_image("images/dam_lev_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(dam_lev[dam_lev['match']]['dist']), name='True match'))
fig.add_trace(go.Histogram(x=list(dam_lev[~dam_lev['match']]['dist']), name='False match'))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    xaxis_title="Edit distance",
    yaxis_title="Aggregate",
)


In [None]:
fig.write_image("images/dam_lev_dist_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(dam_lev['match'])))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=100,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
    xaxis_title="Name match",
    yaxis_title="Aggregate",
)

In [None]:
fig.write_image("images/dam_lev_match_distr.png")

Calculate mismatches

In [None]:
mismatched = 1 - (dam_lev[dam_lev['match']].shape[0]/dam_lev.shape[0])

In [None]:
print(f'Percentage of mismatched names: {mismatched * 100:.2f}%, ({dam_lev[~dam_lev["match"]].shape[0]})')

### Normalized Damerau-Levenshtein Distance<a class="anchor" id="4"></a>

[Back to the Table of Contents](#zero-bullet) --- [Previous](#3) --- [Next](#5)

In [None]:
def normalized_dam_lev(s1: str, s2: str) -> float:
    """
    Return the Damerau-Levenshtein distance of two strings divided by the length of the longer one.

    Args:
        s1 (str): First string.
        s2 (str): Second string.

    Returns:
        float: jellyfish.damerau_levenshtein_distance(s1, s2) / max(len(s1), len(s2));
        0.0 when both strings are empty.

    """
    longest = max(len(s1), len(s2))
    if longest == 0:
        # Two empty strings are identical; return 0 instead of dividing by zero.
        return 0.0
    return jellyfish.damerau_levenshtein_distance(s1, s2) / longest

In [None]:
dam_lev_n = find_candidates(LHS=LHS, RHS=RHS, metric=normalized_dam_lev)

Create regular distances for the found matches

In [None]:
dam_lev_n['normalized_dist'] = dam_lev_n.apply(lambda row: 1 - row['dist'], axis=1)
dam_lev_n['dist'] = dam_lev_n.apply(lambda row: row['dist'] * max(len(row['lhs_name']), len(row['rhs_name'])), axis=1)

Create indicator column, that shows if the match was correct

In [None]:
dam_lev_n['match'] = dam_lev_n.apply(lambda row: row['lhs_label'] == row['rhs_label'],axis=1)

Create visual insights

In [None]:
fig = go.Figure()
fig.add_trace(go.Box(x=list(dam_lev_n[dam_lev_n['match']]['dist']), y=list(dam_lev_n[dam_lev_n['match']]['normalized_dist']), name='True match', boxpoints='all'))
fig.add_trace(go.Box(x=list(dam_lev_n[~dam_lev_n['match']]['dist']), y=list(dam_lev_n[~dam_lev_n['match']]['normalized_dist']), name='False match', boxpoints='all'))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    #title="Levenshtein Distance",
    xaxis_title="Edit distance",
    yaxis_title="Normalized edit distance",
)

In [None]:
fig.write_image("images/dam_lev_n_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(dam_lev_n[dam_lev_n['match']]['dist']), name='True match'))
fig.add_trace(go.Histogram(x=list(dam_lev_n[~dam_lev_n['match']]['dist']), name='False match'))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    xaxis_title="Edit distance",
    yaxis_title="Aggregate",
)


In [None]:
fig.write_image("images/dam_lev_n_dist_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(dam_lev_n['match'])))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=100,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
    xaxis_title="Name match",
    yaxis_title="Aggregate",
)

In [None]:
fig.write_image("images/dam_lev_n_match_distr.png")

Calculate mismatches

In [None]:
mismatched = 1 - (dam_lev_n[dam_lev_n['match']].shape[0]/len(LHS))

In [None]:
print(f'Percentage of mismatched names: {mismatched * 100:.2f}%, ({dam_lev_n[~dam_lev_n["match"]].shape[0]})')

### Jaro-Winkler similarity<a class="anchor" id="5"></a>

[Back to the Table of Contents](#zero-bullet) --- [Previous](#4) --- [Next](#6)

In [None]:
def jws(s1, s2):
    """
    Turn the Jaro-Winkler similarity into a distance.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        One minus jellyfish.jaro_winkler_similarity(s1, s2), so that smaller means "closer".

    """
    similarity = jellyfish.jaro_winkler_similarity(s1, s2)
    return 1 - similarity

Find the closest RHS candidate for every LHS name using the Jaro-Winkler distance

In [None]:
jw = find_candidates(LHS=LHS, RHS=RHS, metric=jws)

Create indicator column, that shows if the match was correct

In [None]:
# Flag the rows whose best candidate carries the same label as the looked-up name.
jw['match'] = jw['lhs_label'] == jw['rhs_label']

Create visual insights

In [None]:
# Start a fresh figure for the histograms of the true and false matches.
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(jw[jw['match']]['dist']), name='True match'))
fig.add_trace(go.Histogram(x=list(jw[~jw['match']]['dist']), name='False match'))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    xaxis_title="Edit distance",
    yaxis_title="Aggregate",
)


In [None]:
fig.write_image("images/jw_dist_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(jw['match'])))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=100,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
    xaxis_title="Name match",
    yaxis_title="Aggregate",
)

In [None]:
fig.write_image("images/jw_match_distr.png")

Calculate mismatches

In [None]:
mismatched = 1 - (jw[jw['match']].shape[0]/jw.shape[0])

In [None]:
print(f'Percentage of mismatched names: {mismatched * 100:.2f}%, ({jw[~jw["match"]].shape[0]})')

### Hamming distance<a class="anchor" id="6"></a>

[Back to the Table of Contents](#zero-bullet) --- [Previous](#5) --- [Next](#7)

In [None]:
ham = find_candidates(LHS=LHS, RHS=RHS, metric=jellyfish.hamming_distance)

Create normalized distances for the found matches

In [None]:
ham['normalized_dist'] = ham.apply(lambda row: 1 - (row['dist']/max((len(row['lhs_name']), (len(row['rhs_name']))))), axis=1)

In [None]:
ham['match'] = ham.apply(lambda row: row['lhs_label'] == row['rhs_label'],axis=1)

In [None]:
fig = go.Figure()
fig.add_trace(go.Box(x=list(ham[ham['match']]['dist']), y=list(ham[ham['match']]['normalized_dist']), name='True match', boxpoints='all'))
fig.add_trace(go.Box(x=list(ham[~ham['match']]['dist']), y=list(ham[~ham['match']]['normalized_dist']), name='False match', boxpoints='all'))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    #title="Levenshtein Distance",
    xaxis_title="Edit distance",
    yaxis_title="Normalized edit distance",
)

In [None]:
fig.write_image("images/ham_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(ham[ham['match']]['dist']), name='True match'))
fig.add_trace(go.Histogram(x=list(ham[~ham['match']]['dist']), name='False match'))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    xaxis_title="Edit distance",
    yaxis_title="Aggregate",
)


In [None]:
fig.write_image("images/ham_dist_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(ham['match'])))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=100,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
    xaxis_title="Name match",
    yaxis_title="Aggregate",
)

In [None]:
fig.write_image("images/ham_match_distr.png")

In [None]:
mismatched = 1 - (ham[ham['match']].shape[0]/len(LHS))

In [None]:
print(f'Percentage of mismatched names: {mismatched * 100:.2f}%, ({ham[~ham["match"]].shape[0]})')

### Normalized Hamming Distance<a class="anchor" id="7"></a>

[Back to the Table of Contents](#zero-bullet) --- [Previous](#6) --- [Next](#8)

In [None]:
def normalized_hamming(s1: str, s2: str) -> float:
    """
    Return the Hamming distance of two strings divided by the length of the longer one.

    Args:
        s1 (str): First string.
        s2 (str): Second string.

    Returns:
        float: jellyfish.hamming_distance(s1, s2) / max(len(s1), len(s2));
        0.0 when both strings are empty.

    """
    longest = max(len(s1), len(s2))
    if longest == 0:
        # Two empty strings are identical; return 0 instead of dividing by zero.
        return 0.0
    return jellyfish.hamming_distance(s1, s2) / longest

In [None]:
ham_n = find_candidates(LHS=LHS, RHS=RHS, metric=normalized_hamming)

In [None]:
ham_n['normalized_dist'] = ham_n.apply(lambda row: 1 - row['dist'], axis=1)
ham_n['dist'] = ham_n.apply(lambda row: row['dist'] * max(len(row['lhs_name']), len(row['rhs_name'])), axis=1)

In [None]:
ham_n['match'] = ham_n.apply(lambda row: row['lhs_label'] == row['rhs_label'],axis=1)

In [None]:
fig = go.Figure()
fig.add_trace(go.Box(x=list(ham_n[ham_n['match']]['dist']), y=list(ham_n[ham_n['match']]['normalized_dist']), name='True match', boxpoints='all'))
fig.add_trace(go.Box(x=list(ham_n[~ham_n['match']]['dist']), y=list(ham_n[~ham_n['match']]['normalized_dist']), name='False match', boxpoints='all'))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    #title="Levenshtein Distance",
    xaxis_title="Edit distance",
    yaxis_title="Normalized edit distance",
)

In [None]:
fig.write_image("images/ham_n_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(ham_n[ham_n['match']]['dist']), name='True match'))
fig.add_trace(go.Histogram(x=list(ham_n[~ham_n['match']]['dist']), name='False match'))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    xaxis_title="Edit distance",
    yaxis_title="Aggregate",
)


In [None]:
fig.write_image("images/ham_n_dist_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(ham_n['match'])))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=100,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
    xaxis_title="Name match",
    yaxis_title="Aggregate",
)

In [None]:
fig.write_image("images/ham_n_match_distr.png")

In [None]:
mismatched = 1 - (ham_n[ham_n['match']].shape[0]/len(LHS))

In [None]:
print(f'Percentage of mismatched names: {mismatched * 100:.2f}%, ({ham_n[~ham_n["match"]].shape[0]})')

### Jaccard similarity<a class="anchor" id="8"></a>

[Back to the Table of Contents](#zero-bullet) --- [Previous](#7) --- [Next](#9)

In [None]:
def jsm(s1, s2):
    """
    Return the Jaccard distance between the character sets of two strings.

    The strings are reduced to the sets of their distinct characters, so neither the
    order nor the multiplicity of the characters matters. The measure is symmetric in
    its arguments.

    Args:
        s1: First string (any iterable of hashable items works).
        s2: Second string (any iterable of hashable items works).

    Returns:
        1 - |intersection| / |union| of the two character sets; 0.0 when both are empty.

    """
    chars1, chars2 = set(s1), set(s2)
    union = chars1 | chars2
    if not union:
        # Two empty inputs are identical; return 0 instead of dividing by zero.
        return 0.0
    return 1 - len(chars1 & chars2) / len(union)

In [None]:
jac = find_candidates(LHS=LHS, RHS=RHS, metric=jsm)

In [None]:
# NOTE(review): `dist` holds the value returned by jsm, which already lies in [0, 1]; dividing it by
# the name length, as done for the edit distances, looks like a carry-over — confirm the intended scale.
jac['normalized_dist'] = jac.apply(lambda row: 1 - (row['dist']/max((len(row['lhs_name']), (len(row['rhs_name']))))), axis=1)

In [None]:
jac['match'] = jac.apply(lambda row: row['lhs_label'] == row['rhs_label'],axis=1)

In [None]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(jac[jac['match']]['dist']), y=list(jac[jac['match']]['normalized_dist']), name='True match', mode="markers"))
fig.add_trace(go.Scatter(x=list(jac[~jac['match']]['dist']), y=list(jac[~jac['match']]['normalized_dist']), name='False match', mode="markers"))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    #title="Levenshtein Distance",
    xaxis_title="Edit distance",
    yaxis_title="Normalized edit distance",
)

In [None]:
fig.write_image("images/jac_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(jac[jac['match']]['dist']), name='True match', nbinsx=6))
fig.add_trace(go.Histogram(x=list(jac[~jac['match']]['dist']), name='False match', nbinsx=6))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    xaxis_title="Edit distance",
    yaxis_title="Aggregate",
)

In [None]:
# NOTE(review): unlike the other figures in this notebook, this one is not written into "images/" — confirm.
fig.write_image("jac_dist_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(jac['match'])))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=100,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
    xaxis_title="Name match",
    yaxis_title="Aggregate",
)

In [None]:
fig.write_image("images/jac_match_distr.png")

In [None]:
fig.write_image("images/jac_match_distr.png")

In [None]:
mismatched = 1 - (jac[jac['match']].shape[0]/len(LHS))

In [None]:
print(f'Percentage of mismatched names: {mismatched * 100:.2f}%, ({jac[~jac["match"]].shape[0]})')

### Modified Jaccard similarity<a class="anchor" id="9"></a>

[Back to the Table of Contents](#zero-bullet) --- [Previous](#8) --- [Next](#10)

In [None]:
from collections import Counter

In [None]:
def jsm_m(s1, s2):
    """
    Return a multiplicity-aware character distance between two strings.

    Unlike a set-based Jaccard distance, repeated characters count: for every character
    the absolute difference of its number of occurrences in both strings is summed, and
    the sum is divided by the combined length of the two strings. The measure is
    symmetric in its arguments.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        float: sum(|count1(c) - count2(c)|) / (len(s1) + len(s2)), a value in [0, 1];
        0.0 when both strings are empty.

    """
    total = len(s1) + len(s2)
    if total == 0:
        # Two empty strings are identical; return 0 instead of dividing by zero.
        return 0.0
    counts1, counts2 = Counter(s1), Counter(s2)
    # A Counter returns 0 for missing keys, so characters unique to one string count in full.
    difference = sum(abs(counts1[ch] - counts2[ch]) for ch in counts1.keys() | counts2.keys())
    return difference / total

In [None]:
jac_n = find_candidates(LHS=LHS, RHS=RHS, metric=jsm_m)

In [None]:
# NOTE(review): `dist` holds the value returned by jsm_m, which already lies in [0, 1]; dividing it by
# the name length, as done for the edit distances, looks like a carry-over — confirm the intended scale.
jac_n['normalized_dist'] = jac_n.apply(lambda row: 1 - (row['dist']/max((len(row['lhs_name']), (len(row['rhs_name']))))), axis=1)

In [None]:
jac_n['match'] = jac_n.apply(lambda row: row['lhs_label'] == row['rhs_label'],axis=1)

In [None]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(jac_n[jac_n['match']]['dist']), y=list(jac_n[jac_n['match']]['normalized_dist']), name='True match', mode="markers"))
fig.add_trace(go.Scatter(x=list(jac_n[~jac_n['match']]['dist']), y=list(jac_n[~jac_n['match']]['normalized_dist']), name='False match', mode="markers"))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    #title="Levenshtein Distance",
    xaxis_title="Edit distance",
    yaxis_title="Normalized edit distance",
)

In [None]:
# Use a dedicated file name so that the figure of the plain Jaccard run is not overwritten.
fig.write_image("images/jac_n_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(jac_n[jac_n['match']]['dist']), name='True match', nbinsx=6))
fig.add_trace(go.Histogram(x=list(jac_n[~jac_n['match']]['dist']), name='False match', nbinsx=6))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    xaxis_title="Edit distance",
    yaxis_title="Aggregate",
)


In [None]:
# Name the file after the modified metric (jac_n), in line with the other *_n figures.
fig.write_image("images/jac_n_dist_distr.png")

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(jac_n['match'])))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=100,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
    xaxis_title="Name match",
    yaxis_title="Aggregate",
)

In [None]:
# Use a dedicated file name so that the figure of the plain Jaccard run is not overwritten.
fig.write_image("images/jac_n_match_distr.png")

In [None]:
mismatched = 1 - (jac_n[jac_n['match']].shape[0]/len(LHS))

In [None]:
print(f'Percentage of mismatched names: {mismatched * 100:.2f}%, ({jac_n[~jac_n["match"]].shape[0]})')

### NYSIIS<a class="anchor" id="10"></a>

[Back to the Table of Contents](#zero-bullet) --- [Previous](#9) --- [Next](#11)

In [None]:
LHS_nysiis = {i: (jellyfish.nysiis(name), int(label)) for i, (name, label) in enumerate(LHS.items())}
RHS_nysiis = {i: (jellyfish.nysiis(name), int(label)) for i, (name, label) in enumerate(RHS.items())}

In [None]:
LHS_nysiis = pd.DataFrame.from_dict(LHS_nysiis, columns=['lhs_name', 'lhs_label'], orient='index')
RHS_nysiis = pd.DataFrame.from_dict(RHS_nysiis, columns=['rhs_name', 'rhs_label'], orient='index')

In [None]:
nysiis = LHS_nysiis.merge(RHS_nysiis, left_on='lhs_name', right_on='rhs_name', how='inner')

In [None]:
nysiis['match'] = nysiis['lhs_label'] == nysiis['rhs_label']

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(nysiis['match'])))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=100,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
    xaxis_title="Name match",
    yaxis_title="Aggregate",
)

In [None]:
# LHS names without any RHS counterpart are absent from the inner-merged frame and therefore
# count as mismatches, because the share is taken relative to len(LHS).
# NOTE(review): the merge yields one row per (LHS, RHS) pair sharing a code, so this counts matching
# pairs rather than distinct LHS names — verify that a name cannot contribute more than one True row.
mismatched = 1 - (nysiis[nysiis['match']].shape[0]/len(LHS))
print(f'Percentage of mismatched names: {mismatched * 100:.2f}%, ({len(LHS)-nysiis[nysiis["match"]].shape[0]})')

### Soundex<a class="anchor" id="11"></a>

[Back to the Table of Contents](#zero-bullet) --- [Previous](#10) --- [Next](#12)

In [None]:
# Encode every name with Soundex; the rows are keyed by their position in the source dictionary.
LHS_soundex = {i: (jellyfish.soundex(name), int(label)) for i, (name, label) in enumerate(LHS.items())}
RHS_soundex = {i: (jellyfish.soundex(name), int(label)) for i, (name, label) in enumerate(RHS.items())}

In [None]:
LHS_soundex = pd.DataFrame.from_dict(LHS_soundex, columns=['lhs_name', 'lhs_label'], orient='index')
RHS_soundex = pd.DataFrame.from_dict(RHS_soundex, columns=['rhs_name', 'rhs_label'], orient='index')

In [None]:
soundex = LHS_soundex.merge(RHS_soundex, left_on='lhs_name', right_on='rhs_name', how='inner')

In [None]:
soundex['match'] = soundex['lhs_label'] == soundex['rhs_label']

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(soundex['match'])))
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=100,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
    xaxis_title="Name match",
    yaxis_title="Aggregate",
)

In [None]:
# LHS names without any RHS counterpart are absent from the inner-merged frame and therefore
# count as mismatches, because the share is taken relative to len(LHS).
# NOTE(review): the merge yields one row per (LHS, RHS) pair sharing a code, so this counts matching
# pairs rather than distinct LHS names — verify that a name cannot contribute more than one True row.
mismatched = 1 - (soundex[soundex['match']].shape[0]/len(LHS))
print(f'Percentage of mismatched names: {mismatched * 100:.2f}%, ({len(LHS)-soundex[soundex["match"]].shape[0]})')

### Double metaphone<a class="anchor" id="12"></a>

[Back to the Table of Contents](#zero-bullet) --- [Previous](#11) --- [Next](#13)

In [None]:
# Store the first two codes returned by doublemetaphone together with the label.
# The encoder is called only once per name instead of once per code.
LHS_dm = {i: (*doublemetaphone(name)[:2], int(label)) for i, (name, label) in enumerate(LHS.items())}
RHS_dm = {i: (*doublemetaphone(name)[:2], int(label)) for i, (name, label) in enumerate(RHS.items())}

In [None]:
LHS_dm = pd.DataFrame.from_dict(LHS_dm, columns=['lhs_name1', 'lhs_name2', 'lhs_label'], orient='index')
RHS_dm = pd.DataFrame.from_dict(RHS_dm, columns=['rhs_name1', 'rhs_name2', 'rhs_label'], orient='index')

In [None]:
def find_match(name_label: tuple, rhs: pd.DataFrame) -> pd.DataFrame:
    """
    Collect the RHS rows whose primary or secondary code equals a given code.

    Args:
        name_label (tuple): ``(code, label)`` pair. ``code`` is compared with
            both RHS code columns; ``label`` is copied into the result.
        rhs (pd.DataFrame): Frame with ``rhs_name1`` and ``rhs_name2`` columns.

    Returns:
        pd.DataFrame: The matching rows of ``rhs`` with a fresh 0..n-1 index
            and an additional ``lhs_label`` column. Rows matching on the
            primary code come first, followed by rows matching on the
            secondary code only; every RHS row appears at most once.
            An empty frame without columns is returned when ``code`` is ``''``.
    """
    code, label = name_label[0], name_label[1]
    if code == '':
        # An empty code means "no encoding"; it must not match other empty codes.
        return pd.DataFrame()

    primary = rhs['rhs_name1'] == code
    # Exclude rows already selected through the primary code so a row whose
    # two codes both equal `code` is not returned (and counted) twice.
    secondary_only = (rhs['rhs_name2'] == code) & ~primary

    match = pd.concat([rhs[primary], rhs[secondary_only]]).reset_index(drop=True)
    match['lhs_label'] = [label] * len(match)
    return match

In [None]:
# Look up every LHS code in RHS_dm -- all primary codes first, then all
# secondary codes -- and stack the per-name results in that order.
# NOTE(review): a pair that agrees on both its primary and its secondary code
# looks like it is emitted once per code column, i.e. twice; confirm whether
# the counts derived from `dm` should be de-duplicated.
dm = pd.concat(
    [
        find_match((code, label), RHS_dm)
        for code_column in ('lhs_name1', 'lhs_name2')
        for code, label in zip(LHS_dm[code_column], LHS_dm['lhs_label'])
    ]
).reset_index(drop=True)

In [None]:
# A candidate pair is a correct match when both sides carry the same label.
dm['match'] = dm['lhs_label'].eq(dm['rhs_label'])

In [None]:
# Histogram of the Double Metaphone candidate pairs, split by the boolean `match` flag.
fig = go.Figure(data=[go.Histogram(x=list(dm['match']))])
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin={'l': 50, 'r': 50, 'b': 50, 't': 100, 'pad': 4},
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
    xaxis_title="Name match",
    yaxis_title="Aggregate",
)

In [None]:
# Share of LHS names without a correctly labelled Double Metaphone candidate
# (number of True rows in `match`, relative to the size of LHS).
matched_count = int(dm['match'].sum())
mismatched = 1 - matched_count / len(LHS)
print(f'Percentage of mismatched names: {mismatched * 100:.2f}%, ({len(LHS) - matched_count})')

### Learnable similarity metric by dedupe.io<a class="anchor" id="13"></a>

[Back to the Table of Contents](#zero-bullet) --- [Previous](#12) --- [Next](#fifth-bullet)

In [None]:
from record_linker import find_matches

In [None]:
# One row per name; labels are stringified before being handed to the linker.
LHS_rl = pd.DataFrame(
    [(name, str(label)) for name, label in LHS.items()],
    columns=['name', 'label'],
)
RHS_rl = pd.DataFrame(
    [(name, str(label)) for name, label in RHS.items()],
    columns=['name', 'label'],
)

Note: The record linker is already pre-trained on this particular dataset. For any other data, the linker has to be trained anew (see record_linker.py).

In [None]:
# Link the two frames with the record linker imported from record_linker.py.
# NOTE(review): the return structure is not visible here; the following cells
# treat match[0] as an iterable of ((lhs_index, rhs_index), score) pairs --
# confirm against record_linker.find_matches.
match = find_matches(LHS_rl, RHS_rl)

In [None]:
# Split the (index_pair, score) records into two parallel tuples.
index, score = zip(*match[0])

In [None]:
# Split the (lhs_index, rhs_index) pairs into two parallel tuples.
lhs_index, rhs_index = zip(*index)

In [None]:
# Attach the linked RHS row and its score to each linked LHS row; rows that
# are not listed in lhs_index are not assigned here.
linked_rows = pd.Index(list(lhs_index))
LHS_rl.loc[linked_rows, 'rhs_index'] = rhs_index
LHS_rl.loc[linked_rows, 'dist'] = score

In [None]:
# Join each linked LHS row with the RHS row it points to (RHS is addressed by
# its index); shared column names get the _lhs / _rhs suffixes.
rl = pd.merge(
    LHS_rl,
    RHS_rl,
    left_on='rhs_index',
    right_index=True,
    suffixes=('_lhs', '_rhs'),
)

In [None]:
# A linked pair is a correct match when both sides carry the same label.
rl['match'] = rl['label_lhs'].eq(rl['label_rhs'])

In [None]:
# Histogram of the record-linker pairs, split by the boolean `match` flag.
fig = go.Figure(data=[go.Histogram(x=list(rl['match']))])
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin={'l': 50, 'r': 50, 'b': 50, 't': 100, 'pad': 4},
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
    xaxis_title="Name match",
    yaxis_title="Aggregate",
)

In [None]:
# Share of LHS names without a correctly labelled record-linker pair
# (number of True rows in `match`, relative to the size of LHS).
matched_count = int(rl['match'].sum())
mismatched = 1 - matched_count / len(LHS)
print(f'Percentage of mismatched names: {mismatched * 100:.2f}%, ({len(LHS) - matched_count})')

### Summary<a class="anchor" id="fifth-bullet"></a>

[Back to the Table of Contents](#zero-bullet) --- [Previous](#13)

Run this summary only after all of the dataframes listed below have been computed.

In [None]:
# The boolean `match` column of every benchmarked technique. The order matters:
# it has to line up with the technique labels used for the summary charts.
matches = [
    frame['match']
    for frame in (
        lev,
        lev_n,
        dam_lev,
        dam_lev_n,
        ham,
        ham_n,
        jw,
        jac,
        jac_n,
        rl,
        nysiis,
        soundex,
        dm,
    )
]

In [None]:
# One bar trace per technique, in the order of `matches`. The x axis carries
# the match outcome (True / False) and the y axis the number of pairs with
# that outcome, so the axis titles below are assigned accordingly.
fig = go.Figure()
for outcome in matches:
    # A dedicated loop name keeps the record-linker result bound to `match`
    # from being overwritten by this loop.
    counts = outcome.value_counts()
    fig.add_trace(go.Bar(x=counts.index.tolist(), y=counts.tolist()))

fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=100,
        pad=4
    ),
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
    xaxis_title="Name match",
    yaxis_title="Aggregate",
)
fig.update_traces(texttemplate='%{y}', textposition='outside')

In [None]:
# Display labels for the techniques, one per entry of `matches` and in the
# same order.
metrics = [
    'Lev',
    'Lev. norm',
    'Dam-Lev',
    'Dam-Lev. norm',
    'Ham',
    'Ham. norm',
    'Jaro-Wink.',
    'Jac.',
    'Jac. freq',
    'RLinker',
    'NYSIIS',
    'Soundex',
    'D. Metaphone',
]

In [None]:
# Number of correct (True) and incorrect (False) pairs per technique.
# Series.sum() counts the True values in one vectorized pass instead of
# iterating each Series element by element, and the False counts reuse the
# True counts rather than summing every Series a second time.
trues = [int(m.sum()) for m in matches]
falses = [len(m) - n_true for m, n_true in zip(matches, trues)]

In [None]:
# Bar chart comparing all techniques: one 'True' and one 'False' bar per
# technique label, with the bar value printed above each bar.
fig = go.Figure(
    data=[
        go.Bar(x=metrics, y=trues, name='True'),
        go.Bar(x=metrics, y=falses, name='False'),
    ]
)
fig.update_layout(
    autosize=False,
    width=1000,
    height=500,
    margin={'l': 50, 'r': 50, 'b': 50, 't': 100, 'pad': 4},
    colorway=px.colors.qualitative.Safe,
    paper_bgcolor="White",
    # title="Matching distribution",
    yaxis_title="Number matches",
    xaxis_title="Techniques",
    uniformtext_minsize=10,
    uniformtext_mode='show',
)
fig.update_traces(texttemplate='%{y:.5}', textposition='outside')