# Data Prep

## Data Loading

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Set this to True when you want to regenerate the Levenshtein Distance
#  otherwise will load csv file
REGEN = True

In [3]:
# Florida voter file
df = pd.read_csv('../dataverse_files/fl_reg_name_race.csv.gz')
df.dropna(subset=['name_first', 'name_last'], inplace=True)

sdf = df[df.race.isin(['multi_racial', 'native_indian', 'other', 'unknown']) == False]
del df

# Setting consistent case for names
sdf['name_first'] = sdf.name_first.str.title()
sdf['name_last'] = sdf.name_last.str.title()

sdf

Unnamed: 0,name_last,name_first,race
0,Walker,Elizabeth,nh_white
1,Palmer,Alton,nh_white
2,Mc Cleod,Alicia,nh_black
3,Scarborough,Dale,nh_white
4,Walker,Daniel,nh_white
...,...,...,...
13653888,Philpott,April,nh_white
13653889,Walters,William,nh_white
13653890,Sawyer,Matthew,nh_white
13653891,Thomas,Janine,nh_white


In [4]:
# check the different races filtered
sdf.race.value_counts()

nh_white    8714118
hispanic    2174408
nh_black    1847266
asian        253306
Name: race, dtype: int64

In [5]:
# Summing the count of each name & race combination
gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count'])

In [6]:
# creating a pivot table so that each name has a count of the # of races with that last name
gdf = gdf.pivot_table(values='count', columns='race',index='name_last')

# Converting NaN to zeros since that means there is no one that identifies with that race with that last name
gdf = gdf.fillna(0)

# Getting the totals of each last name
gdf['total_n'] = gdf.sum(axis=1)

gdf['total_norm'] = gdf['total_n']/np.max(gdf['total_n'])
gdf.reset_index(inplace=True)

In [7]:
gdf.head(15)

race,name_last,asian,hispanic,nh_black,nh_white,total_n,total_norm
0,Fleurime Michel,0.0,0.0,1.0,0.0,1.0,1e-05
1,Franklin,0.0,0.0,1.0,0.0,1.0,1e-05
2,Grant Cliatt,0.0,0.0,1.0,0.0,1.0,1e-05
3,Hassan,1.0,0.0,0.0,0.0,1.0,1e-05
4,King,0.0,1.0,0.0,0.0,1.0,1e-05
5,Williams,0.0,0.0,0.0,1.0,1.0,1e-05
6,0Kharitonenko,0.0,0.0,0.0,1.0,1.0,1e-05
7,1Amirthanayagam,1.0,0.0,0.0,0.0,1.0,1e-05
8,4R,0.0,0.0,0.0,1.0,1.0,1e-05
9,77348 Dancing Rochanavibhata,1.0,0.0,0.0,0.0,1.0,1e-05


In [8]:
races = sorted(sdf.race.unique().tolist())
races

['asian', 'hispanic', 'nh_black', 'nh_white']

In [9]:
def calc_prop(row):
    total = row['total_n']
    values = [(i/total) for i in row]
    return pd.Series(values)

In [10]:
temp = races
temp.append('total_n')
print(temp)
gdf[races] = gdf[temp].apply(calc_prop, axis=1)

['asian', 'hispanic', 'nh_black', 'nh_white', 'total_n']


In [11]:
gdf

race,name_last,asian,hispanic,nh_black,nh_white,total_n,total_norm
0,Fleurime Michel,0.0,0.0,1.0,0.0,1.0,0.000010
1,Franklin,0.0,0.0,1.0,0.0,1.0,0.000010
2,Grant Cliatt,0.0,0.0,1.0,0.0,1.0,0.000010
3,Hassan,1.0,0.0,0.0,0.0,1.0,0.000010
4,King,0.0,1.0,0.0,0.0,1.0,0.000010
...,...,...,...,...,...,...,...
849821,Zyzanski,0.0,0.0,0.0,1.0,1.0,0.000010
849822,Zyzdryn,0.0,0.0,0.0,1.0,1.0,0.000019
849823,Zyznomyrsky,0.0,0.0,0.0,1.0,1.0,0.000010
849824,Zzaman,1.0,0.0,0.0,0.0,1.0,0.000010


## Data Processing

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from keras.preprocessing import sequence


NGRAMS = 2
feature_len = 25

In [13]:
proto_df = gdf.sample(frac=0.01, random_state=10)
proto_df.shape

(8498, 7)

# Levenshtein Distance

In [14]:
# Copying the DataFrame and resetting the index so that its from 0-xxxx
train_df, test_df = train_test_split(proto_df, test_size=0.1)
train_df.reset_index(inplace=True)
train_df.drop('index',inplace=True, axis=1)
test_df.reset_index(inplace=True)
test_df.drop('index',inplace=True, axis=1)

In [15]:
import Levenshtein as lv

In [16]:
train_df

race,name_last,asian,hispanic,nh_black,nh_white,total_n,total_norm
0,Goyanes,0.0,0.852941,0.0,0.147059,1.0,0.000326
1,Morsics,0.0,0.000000,0.0,1.000000,1.0,0.000057
2,Kulkusky,0.0,0.000000,0.0,1.000000,1.0,0.000010
3,Pakbaz,0.0,0.000000,0.0,1.000000,1.0,0.000038
4,Chaze,0.0,0.000000,0.0,1.000000,1.0,0.000019
...,...,...,...,...,...,...,...
7643,Randall-Kepler,0.0,0.000000,1.0,0.000000,1.0,0.000010
7644,Ayala-Delvalle,0.0,1.000000,0.0,0.000000,1.0,0.000010
7645,Szestalo,0.0,0.000000,0.0,1.000000,1.0,0.000029
7646,Alka,0.0,0.000000,0.0,1.000000,1.0,0.000010


In [17]:
if (REGEN):
    # Creating Numpy Array to hold results
    dim = train_df.shape[0]

    lev_dist = np.zeros((dim,dim))
    for idx, row1 in train_df.iterrows():
        for j in range (idx, dim):
            if (idx == j):
                continue
            else:
                lev_dist[idx,j] = lv.distance(row1['name_last'],train_df.iloc[j]['name_last'])
                #print ('{} : {} -- {}: {} -- distance: {}'.format(idx, row1['name_last'], j, knn_df.iloc[j]['name_last'], lev_dist[idx,j]))
        if (idx % 500 == 0):
            print ('{} names were processed'.format(idx))
else:
    lev_dist = pd.read_csv('lev_distance_1per.csv').to_numpy()
    

0 names were processed
500 names were processed
1000 names were processed
1500 names were processed
2000 names were processed
2500 names were processed
3000 names were processed
3500 names were processed
4000 names were processed
4500 names were processed
5000 names were processed
5500 names were processed
6000 names were processed
6500 names were processed
7000 names were processed
7500 names were processed


In [18]:
# half filled out matrix
lev_dist

array([[ 0.,  5.,  8., ...,  8.,  6., 16.],
       [ 0.,  0.,  8., ...,  7.,  7., 15.],
       [ 0.,  0.,  0., ...,  8.,  6., 16.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  7., 15.],
       [ 0.,  0.,  0., ...,  0.,  0., 16.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [19]:
# fill out the bottom portion of the matrix
#  i.e. the distance between name[123] & name[345] is the
#  same as string[345] & name[123]

if (REGEN):
    for i in range(dim):
        for j in range (i, dim):
            if (i == j):
                continue
            else:
                lev_dist[j,i] = lev_dist[i,j]
    # Now the matrix is mirrored
    lev_dist.tofile('lev_distance_1per.csv',sep=',')

# Find K smallest values
i.e. the nearest k neighbors in our vector

In [20]:
def get_accuracy (name_df, levenstein_dist, k):
    # Get the nearest k values for the string
    #   we add +1 since 0 (the string itself)
    #   will be present in the diagnal value
    k +=1 
    values = np.argpartition(levenstein_dist, (k))
    final_pred = []
    for i in range(levenstein_dist.shape[0]):
        max_value = np.max(levenstein_dist[i][values[i][:k]])
        mask = (levenstein_dist[i] <= max_value) & (levenstein_dist[i] > 0)
        out = np.argwhere(mask)
        total_sum =  (name_df.iloc[out.reshape(-1)]['total_n'].sum())
        pred_white = (name_df.iloc[out.reshape(-1)]['nh_white'] * name_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_black = (name_df.iloc[out.reshape(-1)]['nh_black'] * name_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_hispanic = (name_df.iloc[out.reshape(-1)]['hispanic'] * name_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_asian = (name_df.iloc[out.reshape(-1)]['asian'] * name_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        predictions = [pred_asian, pred_hispanic, pred_black, pred_white]
        final_pred.append(races[predictions.index(max(predictions))])
    name_df['true_race'] = name_df[races].idxmax(axis=1)
    name_df['pred'] = final_pred
    return (name_df.loc[name_df['true_race'] == name_df['pred']]['pred'].count() / name_df.shape[0])
    

In [21]:
k_metrics = {
    3:0,
    5:0,
    7:0,
    10:0,
    15:0,
    20:0,
    100:0,
    300:0,
    500:0
}

In [22]:
for value, key in enumerate (k_metrics):
    value = get_accuracy (train_df, lev_dist, key)
    print ('For k={} the accuracy is: {}'.format(key,value))
    k_metrics[key] = value

For k=3 the accuracy is: 0.6561192468619247
For k=5 the accuracy is: 0.6596495815899581
For k=7 the accuracy is: 0.6586035564853556
For k=10 the accuracy is: 0.6555962343096234
For k=15 the accuracy is: 0.6489278242677824
For k=20 the accuracy is: 0.6423901673640168
For k=100 the accuracy is: 0.5983263598326359
For k=300 the accuracy is: 0.5618462343096234
For k=500 the accuracy is: 0.5479864016736402


In [23]:
k_metrics

{3: 0.6561192468619247,
 5: 0.6596495815899581,
 7: 0.6586035564853556,
 10: 0.6555962343096234,
 15: 0.6489278242677824,
 20: 0.6423901673640168,
 100: 0.5983263598326359,
 300: 0.5618462343096234,
 500: 0.5479864016736402}

In [24]:
def predict (name_df, corp_df, k):
    corp_name = corp_df['name_last']

    # Calculate the Levenshtein distance for the test set that we are trying to predict
    test_lev_dist = np.zeros((name_df.shape[0],corp_df.shape[0]))
    for idx, row1 in name_df.iterrows():
        for j in range(corp_df.shape[0]):
            test_lev_dist[idx,j] = lv.distance(row1['name_last'],corp_df.iloc[j]['name_last'])
        if (idx % 500 == 0):
            print ('{} names were processed'.format(idx))
    
    # Get accuracy of the model on the test set
    #   - taking the levenshtein distance calculated from test names to corpus names
    #   - finding the nearest training names to the test names
    #   - predicting the test race based on the training names
    k +=1 
    values = np.argpartition(test_lev_dist, (k))
    final_pred = []
    for i in range(test_lev_dist.shape[0]):
        max_value = np.max(test_lev_dist[i][values[i][:k]])
        mask = (test_lev_dist[i] <= max_value) & (test_lev_dist[i] > 0)
        out = np.argwhere(mask)
        total_sum =  (corp_df.iloc[out.reshape(-1)]['total_n'].sum())
        pred_white = (corp_df.iloc[out.reshape(-1)]['nh_white'] * corp_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_black = (corp_df.iloc[out.reshape(-1)]['nh_black'] * corp_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_hispanic = (corp_df.iloc[out.reshape(-1)]['hispanic'] * corp_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_asian = (corp_df.iloc[out.reshape(-1)]['asian'] * corp_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        predictions = [pred_asian, pred_hispanic, pred_black, pred_white]
        final_pred.append(races[predictions.index(max(predictions))])
        
    name_df['true_race'] = name_df[races].idxmax(axis=1)
    name_df['pred'] = final_pred
    return (name_df.loc[name_df['true_race'] == name_df['pred']]['pred'].count() / name_df.shape[0])

In [25]:
model_perf = predict(test_df, train_df, 5)
print ('Accuracy of model is: {:.2f}%'.format(model_perf))

0 names were processed
500 names were processed
Accuracy of model is: 0.66%
