# Data Prep

## Data Loading

In [1]:
import pandas as pd
import numpy as np
import collections
import Levenshtein as lv
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from keras.preprocessing import sequence
from tqdm import tqdm

In [2]:
# Set this to True when you want to regenerate the Levenshtein Distance
#  otherwise will load csv file
REGEN = True

In [3]:
# Florida voter file
df = pd.read_csv('../dataverse_files/fl_reg_name_race.csv.gz')
df.dropna(subset=['name_last'], inplace=True)

sdf = df[df.race.isin(['multi_racial', 'native_indian', 'other', 'unknown']) == False]
del df

# Setting consistent case for names
sdf['name_last'] = sdf.name_last.str.lower()

# Remove unrequired first name
sdf.drop('name_first', axis=1, inplace=True)

sdf

Unnamed: 0,name_last,race
0,walker,nh_white
1,palmer,nh_white
2,mc cleod,nh_black
3,scarborough,nh_white
4,walker,nh_white
...,...,...
13653888,philpott,nh_white
13653889,walters,nh_white
13653890,sawyer,nh_white
13653891,thomas,nh_white


In [4]:
# check the different races filtered
sdf.race.value_counts()

nh_white    8714118
hispanic    2174408
nh_black    1847266
asian        253306
Name: race, dtype: int64

In [5]:
# Summing the count of each name & race combination
gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count'])

In [6]:
# creating a pivot table so that each name has a count of the # of races with that last name
gdf = gdf.pivot_table(values='count', columns='race',index='name_last')

# Converting NaN to zeros since that means there is no one that identifies with that race with that last name
gdf = gdf.fillna(0)

# Getting the totals of each last name
gdf['total_n'] = gdf.sum(axis=1)
gdf.reset_index(inplace=True)

In [7]:
gdf.head(15)

race,name_last,asian,hispanic,nh_black,nh_white,total_n
0,fleurime michel,0.0,0.0,1.0,0.0,1.0
1,franklin,0.0,0.0,1.0,0.0,1.0
2,grant cliatt,0.0,0.0,1.0,0.0,1.0
3,hassan,1.0,0.0,0.0,0.0,1.0
4,king,0.0,1.0,0.0,0.0,1.0
5,williams,0.0,0.0,0.0,1.0,1.0
6,0kharitonenko,0.0,0.0,0.0,1.0,1.0
7,1amirthanayagam,1.0,0.0,0.0,0.0,1.0
8,4r,0.0,0.0,0.0,1.0,1.0
9,77348 dancing rochanavibhata,1.0,0.0,0.0,0.0,1.0


In [8]:
races = sorted(sdf.race.unique().tolist())
races

['asian', 'hispanic', 'nh_black', 'nh_white']

In [9]:
def calc_prop(row):
    total = row['total_n']
    values = [(i/total) for i in row]
    return pd.Series(values)

In [10]:
# Calculate the proportion of people with a particular last name
#  that identify with one of the 4 races
temp = races
temp.append('total_n')

gdf[races] = gdf[temp].apply(calc_prop, axis=1)

In [11]:
gdf

race,name_last,asian,hispanic,nh_black,nh_white,total_n
0,fleurime michel,0.0,0.0,1.0,0.0,1.0
1,franklin,0.0,0.0,1.0,0.0,1.0
2,grant cliatt,0.0,0.0,1.0,0.0,1.0
3,hassan,1.0,0.0,0.0,0.0,1.0
4,king,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...
849821,zyzanski,0.0,0.0,0.0,1.0,1.0
849822,zyzdryn,0.0,0.0,0.0,1.0,1.0
849823,zyznomyrsky,0.0,0.0,0.0,1.0,1.0
849824,zzaman,1.0,0.0,0.0,0.0,1.0


## Data Processing

In [12]:
proto_df = gdf.sample(frac=0.1, random_state=10)
proto_df.reset_index(inplace=True)
proto_df.drop('index', axis=1, inplace=True)
proto_df.shape

(84983, 6)

In [13]:
NGRAMS = 2
feature_len = 25

In [14]:
# build n-gram list
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
tfidf_transformer = TfidfTransformer()

# **********
# **** CHANGE THIS TO FULL DATAFRAME WHEN READY FOR FULL DATASET ****
a = vect.fit_transform(proto_df.name_last) 
tfidf = tfidf_transformer.fit_transform(a)
# **********

vocab = vect.vocabulary_

In [15]:
# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    words.append((a[:, c].sum(), b))

words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)

num_words = 731


In [16]:
def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

In [17]:
# build bi-grams from index of n-gram sequence
proto_df['n_gram'] = np.array(proto_df.name_last.apply(lambda c: find_ngrams(c, NGRAMS)))
proto_df['n_gram'] = (sequence.pad_sequences(proto_df['n_gram'], maxlen=feature_len)).tolist()
proto_df['tfidf'] = tfidf.toarray().tolist()

In [18]:
proto_df

race,name_last,asian,hispanic,nh_black,nh_white,total_n,n_gram,tfidf
0,bojin,0.000000,0.000000,0.000000,1.000000,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,owens-harvey,0.000000,0.000000,1.000000,0.000000,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,anelis,0.000000,0.000000,1.000000,0.000000,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,clavel rivera,0.000000,1.000000,0.000000,0.000000,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 21...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,ley,0.042705,0.241993,0.007117,0.708185,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...
84978,cioletti,0.000000,0.000000,0.000000,1.000000,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
84979,montillano,1.000000,0.000000,0.000000,0.000000,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
84980,hershbein,0.000000,0.000000,0.000000,1.000000,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
84981,delvalle rodriguez,0.000000,1.000000,0.000000,0.000000,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 72, 17, 181, 182, 43,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [19]:
# Since each row is a representation of a document, calculating the cosine similarity between the tf-idf matrix
#  should give us the cosine similarity between each vector (row) with the other vectors (rows)
#  the first row would be the cosine distance between vector 0 and vector 1,2,3,4.... n
#  this produces a dense matrix of each vector relative to the others with the diagnols being 
#  a comparison to itself
cos_sim = cosine_similarity(tfidf,tfidf)

In [20]:
# filter for all vectors that have a cosine similarity of <= 0.6 and > 0 
mask = np.logical_and(cos_sim <=0.6, cos_sim > 0)
sim_vector_idx = np.argwhere(mask)

In [21]:
# Getting the most common 100 names
#   returned results is a list of tuples of (record #, count)
common_names = collections.Counter(sim_vector_idx[:,1]).most_common(1000)

In [22]:
# Generating a list of of the names that should be passed on to Levenshtein Distance calculations
common_names_list = []
for i in range(len(common_names)):
    common_names_list.append(common_names[i][0])

In [23]:
proto_df.iloc[common_names_list]['name_last']

67132     calderon-candelario
30044    hernandez-valladares
17271     blanchard caballero
66949    manchester-arguelles
64591    martinez castellanos
                 ...         
37146      bernard louis jean
28882       anderson-stephens
4305      apichardpattanasiri
60671          marks-anderson
45889    chevalier de jimenez
Name: name_last, Length: 1000, dtype: object

# Levenshtein Distance

In [24]:
# Copying the DataFrame and resetting the index so that its from 0-xxxx
leven_df = (proto_df.iloc[common_names_list]).copy()
leven_df.reset_index(inplace=True)
leven_df.drop(['index','n_gram','tfidf'],axis=1, inplace=True)

In [25]:
leven_df

race,name_last,asian,hispanic,nh_black,nh_white,total_n
0,calderon-candelario,0.0,1.0,0.0,0.0,1.0
1,hernandez-valladares,0.0,1.0,0.0,0.0,1.0
2,blanchard caballero,0.0,1.0,0.0,0.0,1.0
3,manchester-arguelles,0.0,0.0,0.0,1.0,1.0
4,martinez castellanos,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...
995,bernard louis jean,0.0,0.0,1.0,0.0,1.0
996,anderson-stephens,0.0,0.0,1.0,0.0,1.0
997,apichardpattanasiri,1.0,0.0,0.0,0.0,1.0
998,marks-anderson,0.0,0.0,0.0,1.0,1.0


In [26]:
if (REGEN):
    # Creating Numpy Array to hold results
    dim = leven_df.shape[0]

    lev_dist = np.zeros((dim,dim))
    for idx, row1 in tqdm(leven_df.iterrows()):
        for j in range (idx, dim):
            if (idx == j):
                continue
            else:
                lev_dist[idx,j] = lv.distance(row1['name_last'],leven_df.iloc[j]['name_last'])
else:
    lev_dist = pd.read_csv('lev_distance_1per.csv').to_numpy()
    

1000it [01:02, 16.13it/s]


In [27]:
# half filled out matrix
lev_dist

array([[ 0., 15., 14., ..., 16., 11., 16.],
       [ 0.,  0., 15., ..., 16., 16., 16.],
       [ 0.,  0.,  0., ..., 13., 15., 18.],
       ...,
       [ 0.,  0.,  0., ...,  0., 16., 20.],
       [ 0.,  0.,  0., ...,  0.,  0., 17.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [28]:
# fill out the bottom portion of the matrix
#  i.e. the distance between name[123] & name[345] is the
#  same as string[345] & name[123]

if (REGEN):
    for i in range(dim):
        for j in range (i, dim):
            if (i == j):
                continue
            else:
                lev_dist[j,i] = lev_dist[i,j]
    # Now the matrix is mirrored
    lev_dist.tofile('lev_distance_1per.csv',sep=',')

# Find K smallest values
i.e. the nearest k neighbors in our vector

In [29]:
def get_accuracy (name_df, levenstein_dist, k):
    # Get the nearest k values for the string
    #   we add +1 since 0 (the string itself)
    #   will be present in the diagnal value
    k +=1 
    values = np.argpartition(levenstein_dist, (k))
    final_pred = []
    for i in range(levenstein_dist.shape[0]):
        max_value = np.max(levenstein_dist[i][values[i][:k]])
        mask = (levenstein_dist[i] <= max_value) & (levenstein_dist[i] > 0)
        out = np.argwhere(mask)
        total_sum =  (name_df.iloc[out.reshape(-1)]['total_n'].sum())
        pred_white = (name_df.iloc[out.reshape(-1)]['nh_white'] * name_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_black = (name_df.iloc[out.reshape(-1)]['nh_black'] * name_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_hispanic = (name_df.iloc[out.reshape(-1)]['hispanic'] * name_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_asian = (name_df.iloc[out.reshape(-1)]['asian'] * name_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        predictions = [pred_asian, pred_hispanic, pred_black, pred_white]
        final_pred.append(races[predictions.index(max(predictions))])
    name_df['true_race'] = name_df[races].idxmax(axis=1)
    name_df['pred'] = final_pred
    return (name_df.loc[name_df['true_race'] == name_df['pred']]['pred'].count() / name_df.shape[0])
    

In [30]:
k_metrics = {
    3:0,
    5:0,
    7:0,
    10:0,
    15:0,
    20:0,
    100:0,
    300:0,
    500:0
}

In [31]:
for value, key in enumerate (k_metrics):
    value = get_accuracy (leven_df, lev_dist, key)
    print ('For k={} the accuracy is: {}'.format(key,value))
    k_metrics[key] = value

For k=3 the accuracy is: 0.729
For k=5 the accuracy is: 0.732
For k=7 the accuracy is: 0.729
For k=10 the accuracy is: 0.72
For k=15 the accuracy is: 0.718
For k=20 the accuracy is: 0.718
For k=100 the accuracy is: 0.708
For k=300 the accuracy is: 0.708
For k=500 the accuracy is: 0.708


In [32]:
k_metrics

{3: 0.729,
 5: 0.732,
 7: 0.729,
 10: 0.72,
 15: 0.718,
 20: 0.718,
 100: 0.708,
 300: 0.708,
 500: 0.708}

In [37]:
def predict (name_df, corp_df, k):
    corp_name = corp_df['name_last']

    # Calculate the Levenshtein distance for the test set that we are trying to predict
    test_lev_dist = np.zeros((name_df.shape[0],corp_df.shape[0]))
    for idx, row1 in tqdm(name_df.iterrows()):
        for j in range(corp_df.shape[0]):
            test_lev_dist[idx,j] = lv.distance(row1['name_last'],corp_df.iloc[j]['name_last'])
    
    # Get accuracy of the model on the test set
    #   - taking the levenshtein distance calculated from test names to corpus names
    #   - finding the nearest training names to the test names
    #   - predicting the test race based on the training names
    k +=1 
    values = np.argpartition(test_lev_dist, (k))
    final_pred = []
    for i in tqdm(range(test_lev_dist.shape[0])):
        max_value = np.max(test_lev_dist[i][values[i][:k]])
        mask = (test_lev_dist[i] <= max_value) & (test_lev_dist[i] > 0)
        out = np.argwhere(mask)
        total_sum =  (corp_df.iloc[out.reshape(-1)]['total_n'].sum())
        pred_white = (corp_df.iloc[out.reshape(-1)]['nh_white'] * corp_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_black = (corp_df.iloc[out.reshape(-1)]['nh_black'] * corp_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_hispanic = (corp_df.iloc[out.reshape(-1)]['hispanic'] * corp_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_asian = (corp_df.iloc[out.reshape(-1)]['asian'] * corp_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        predictions = [pred_asian, pred_hispanic, pred_black, pred_white]
        final_pred.append(races[predictions.index(max(predictions))])
        
    name_df['true_race'] = name_df[races].idxmax(axis=1)
    name_df['pred'] = final_pred
    return (name_df.loc[name_df['true_race'] == name_df['pred']]['pred'].count() / name_df.shape[0])

In [38]:
model_perf = predict(proto_df, leven_df, 5)
print ('Accuracy of model is: {:.2f}%'.format(model_perf*100))

84983it [3:02:02,  7.78it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 84983/84983 [04:36<00:00, 307.79it/s]


Accuracy of model is: 24.96%
