### Preprocessing

In [47]:
import pandas as pd

Names = pd.read_csv('nameslist.csv')
Names

Unnamed: 0,Names
0,Howard Thompson
1,Wayne Hensley
2,Candace Mercado
3,Mary Garcia
4,Yvonne Dews
...,...
8799,Polina Yanovich
8800,Oliver Yeung
8801,Fengpeng Yuan
8802,Edward Zaneski


In [48]:
import pandas as pd
import random
import jellyfish
import fuzzywuzzy.fuzz as fuzz
from sklearn.linear_model import LogisticRegression

# Read the CSV file into a DataFrame
names = pd.read_csv('nameslist.csv')

# Define the function to generate false spellings with lower accuracy
def generate_false_spelling(name):
    # Modify the logic inside this function to generate false spellings
    # For example, randomly change multiple characters in the name
    name_list = list(name)
    num_changes = random.randint(1, int(len(name_list) * 1))  # Randomly change 1 to 100% of characters
    for _ in range(num_changes):
        random_index = random.randint(0, len(name_list) - 1)
        name_list[random_index] = random.choice(list('abcdefghijklmnopqrstuvwxyz'))
    return ''.join(name_list)

# Create a new column 'FalseSpellings' with the generated false spellings
names['FalseSpellings'] = names['Names'].apply(lambda x: generate_false_spelling(x))

# Compute Levenshtein distance
levenshtein_distances = []
for name, false_spelling in zip(names['Names'], names['FalseSpellings']):
    levenshtein_distance = jellyfish.levenshtein_distance(name, false_spelling)
    levenshtein_distances.append(levenshtein_distance)

# Add Levenshtein distance to the DataFrame
names['LevenshteinDistance'] = levenshtein_distances

# Compute fuzzywuzzy percentages
fuzzy_percentages = []
for name, false_spelling in zip(names['Names'], names['FalseSpellings']):
    fuzzy_percentage = fuzz.ratio(name, false_spelling)
    fuzzy_percentages.append(fuzzy_percentage)

# Add fuzzywuzzy percentages to the DataFrame
names['FuzzyPercentages'] = fuzzy_percentages

#add match column
names['Match'] = names['Names'] == names['FalseSpellings']


Unnamed: 0,Names,FalseSpellings,LevenshteinDistance,FuzzyPercentages,Match
0,Howard Thompson,gowugd Thomeson,4,73,False
1,Wayne Hensley,izgnswHensoee,7,46,False
2,Candace Mercado,Caamhcxrqemozip,11,40,False
3,Mary Garcia,Maaz Gprciv,4,64,False
4,Yvonne Dews,Yvogne Dews,1,91,False
...,...,...,...,...,...
8799,Polina Yanovich,Poping iibivict,7,53,False
8800,Oliver Yeung,Olixer Yeunz,2,83,False
8801,Fengpeng Yuan,Fenupeng suaf,3,77,False
8802,Edward Zaneski,Ealarfdpannski,6,64,False


In [50]:
# show names where match == true
names[names['Match'] == True]

Unnamed: 0,Names,FalseSpellings,LevenshteinDistance,FuzzyPercentages,Match
382,Joann Alonzo,Joann Alonzo,0,100,True
616,Nancy Owens,Nancy Owens,0,100,True
811,Micah Lee,Micah Lee,0,100,True
1598,Mary Santos,Mary Santos,0,100,True
2277,Joe Meeks,Joe Meeks,0,100,True
2437,Ethel Fidler,Ethel Fidler,0,100,True
2946,Wanda Dickinson,Wanda Dickinson,0,100,True
3521,Virginia Somerville,Virginia Somerville,0,100,True
4192,Crystal Robbins,Crystal Robbins,0,100,True
4418,Rosetta Finch,Rosetta Finch,0,100,True


In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

 

# Prepare the features and target variable
X = names[['LevenshteinDistance', 'FuzzyPercentages']]
y = names['Match']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model
accuracy = model.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

# Predict the probabilities for new data
new_data = pd.DataFrame({
    'Names': ['John Smith', 'Ashlin Darius Govindasamy', 'Emily Johnson'],
    'FalseSpellings': ['John Smmith', 'Ashlin Darius Govindasamy', 'Emma Johnson'],    
})

new_data

Accuracy: 1.0


Unnamed: 0,Names,FalseSpellings
0,John Smith,John Smmith
1,Ashlin Darius Govindasamy,Ashlin Darius Govindasamy
2,Emily Johnson,Emma Johnson


In [55]:


# compute LevenshteinDistance
levenshtein_distances = []
for name, false_spelling in zip(new_data['Names'], new_data['FalseSpellings']):
    levenshtein_distance = jellyfish.levenshtein_distance(name, false_spelling)
    levenshtein_distances.append(levenshtein_distance)

# Add Levenshtein distance to the DataFrame
new_data['LevenshteinDistance'] = levenshtein_distances

# Compute fuzzywuzzy percentages
fuzzy_percentages = []
for name, false_spelling in zip(new_data['Names'], new_data['FalseSpellings']):
    fuzzy_percentage = fuzz.ratio(name, false_spelling)
    fuzzy_percentages.append(fuzzy_percentage)

# Add fuzzywuzzy percentages to the DataFrame
new_data['FuzzyPercentages'] = fuzzy_percentages

X_new = new_data[['LevenshteinDistance', 'FuzzyPercentages']]
predictions = model.predict_proba(X_new)[:, 1]
print("Name Match Probabilities:")
for name, prob in zip(new_data['Names'], predictions):
    print(f"{name}: {prob}")


Name Match Probabilities:
John Smith: 0.019742438257868516
Ashlin Darius Govindasamy: 0.9106135158183893
Emily Johnson: 1.8609785869046986e-10
