# Data Prep

## Data Loading

In [1]:
import pandas as pd
import numpy as np
import collections
import Levenshtein as lv
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from keras.preprocessing import sequence
from tqdm import tqdm

In [2]:
# Set this to True when you want to regenerate the Levenshtein Distance
#  otherwise will load csv file
REGEN = True

In [3]:
# Florida voter file
df = pd.read_csv('../dataverse_files/fl_reg_name_race.csv.gz')
df.dropna(subset=['name_last'], inplace=True)

sdf = df[df.race.isin(['multi_racial', 'native_indian', 'other', 'unknown']) == False]
del df

# Setting consistent case for names
sdf['name_last'] = sdf.name_last.str.lower()

# Remove unrequired first name
sdf.drop('name_first', axis=1, inplace=True)

sdf

Unnamed: 0,name_last,race
0,walker,nh_white
1,palmer,nh_white
2,mc cleod,nh_black
3,scarborough,nh_white
4,walker,nh_white
...,...,...
13653888,philpott,nh_white
13653889,walters,nh_white
13653890,sawyer,nh_white
13653891,thomas,nh_white


In [4]:
# check the different races filtered
sdf.race.value_counts()

nh_white    8714118
hispanic    2174408
nh_black    1847266
asian        253306
Name: race, dtype: int64

In [5]:
# Summing the count of each name & race combination
gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count'])

In [6]:
# creating a pivot table so that each name has a count of the # of races with that last name
gdf = gdf.pivot_table(values='count', columns='race',index='name_last')

# Converting NaN to zeros since that means there is no one that identifies with that race with that last name
gdf = gdf.fillna(0)

# Getting the totals of each last name
gdf['total_n'] = gdf.sum(axis=1)
gdf.reset_index(inplace=True)

In [7]:
gdf.head(15)

race,name_last,asian,hispanic,nh_black,nh_white,total_n
0,fleurime michel,0.0,0.0,1.0,0.0,1.0
1,franklin,0.0,0.0,1.0,0.0,1.0
2,grant cliatt,0.0,0.0,1.0,0.0,1.0
3,hassan,1.0,0.0,0.0,0.0,1.0
4,king,0.0,1.0,0.0,0.0,1.0
5,williams,0.0,0.0,0.0,1.0,1.0
6,0kharitonenko,0.0,0.0,0.0,1.0,1.0
7,1amirthanayagam,1.0,0.0,0.0,0.0,1.0
8,4r,0.0,0.0,0.0,1.0,1.0
9,77348 dancing rochanavibhata,1.0,0.0,0.0,0.0,1.0


In [8]:
races = sorted(sdf.race.unique().tolist())
races

['asian', 'hispanic', 'nh_black', 'nh_white']

In [9]:
def calc_prop(row):
    total = row['total_n']
    values = [(i/total) for i in row]
    return pd.Series(values)

In [10]:
# Calculate the proportion of people with a particular last name
#  that identify with one of the 4 races
temp = races.copy()
temp.append('total_n')

gdf[temp] = gdf[temp].apply(calc_prop, axis=1)

In [11]:
def get_race_idx(val, races):
    race_idx = races.index(val)
    return race_idx

In [13]:
gdf['true_race'] = gdf[races].idxmax(axis=1)
gdf['true_race'] = gdf['true_race'].apply(lambda c: get_race_idx(c,races))

In [14]:
gdf

race,name_last,asian,hispanic,nh_black,nh_white,total_n,true_race
0,fleurime michel,0.0,0.0,1.0,0.0,1.0,2
1,franklin,0.0,0.0,1.0,0.0,1.0,2
2,grant cliatt,0.0,0.0,1.0,0.0,1.0,2
3,hassan,1.0,0.0,0.0,0.0,1.0,0
4,king,0.0,1.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...
849821,zyzanski,0.0,0.0,0.0,1.0,1.0,3
849822,zyzdryn,0.0,0.0,0.0,1.0,1.0,3
849823,zyznomyrsky,0.0,0.0,0.0,1.0,1.0,3
849824,zzaman,1.0,0.0,0.0,0.0,1.0,0


## Data Processing

In [15]:
proto_df = gdf.groupby('true_race', group_keys=False).apply(lambda x: x.sample(frac=.25, random_state=10))
proto_df.reset_index(inplace=True)
proto_df.drop('index', axis=1, inplace=True)
proto_df.shape

(212457, 7)

In [16]:
# Check that our sampling has generated proportionate representation in all classes
proto_df.true_race.value_counts()

3    129126
1     53493
2     22116
0      7722
Name: true_race, dtype: int64

In [17]:
NGRAMS = 2
feature_len = 25

In [18]:
# build n-gram list
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
tfidf_transformer = TfidfTransformer()

# **********
# **** CHANGE THIS TO FULL DATAFRAME WHEN READY FOR FULL DATASET ****
a = vect.fit_transform(proto_df.name_last) 
tfidf = tfidf_transformer.fit_transform(a)
# **********

vocab = vect.vocabulary_

In [19]:
# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    words.append((a[:, c].sum(), b))

words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)

num_words = 778


In [20]:
def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

In [69]:
# build bi-grams from index of n-gram sequence
proto_df['n_gram'] = np.array(proto_df.name_last.apply(lambda c: find_ngrams(c, NGRAMS)))
proto_df['n_gram'] = (sequence.pad_sequences(proto_df['n_gram'], maxlen=feature_len)).tolist()
proto_df['tfidf'] = tfidf.toarray().tolist()

In [70]:
proto_df

race,name_last,asian,hispanic,nh_black,nh_white,total_n,true_race,n_gram,tfidf
0,adichirayil,1.0,0.000000,0.0,0.000000,1.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,morillo encisco,1.0,0.000000,0.0,0.000000,1.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 11, 12, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.40391457325644736,..."
2,hular,1.0,0.000000,0.0,0.000000,1.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,fuze,0.5,0.000000,0.0,0.500000,1.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,balasingam,1.0,0.000000,0.0,0.000000,1.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...
212452,robu,0.0,0.000000,0.0,1.000000,1.0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
212453,redisch,0.0,0.000000,0.0,1.000000,1.0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
212454,clute,0.0,0.010638,0.0,0.989362,1.0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
212455,matecki,0.0,0.000000,0.0,1.000000,1.0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [106]:
train_df, test_df = train_test_split(proto_df, test_size=.1)
train_df, valid_df = train_test_split(train_df, test_size=.2)
print('Training set size: {}'.format(train_df.shape))
print('Validation set size: {}'.format(valid_df.shape))
print('Test set size: {}'.format(test_df.shape))

Training set size: (152968, 9)
Validation set size: (38243, 9)
Test set size: (21246, 9)


In [129]:
def cos_sim(row, corpus_df, filter):
    orig_vector = np.array(row['tfidf']).reshape(1,-1)
    cossim = np.zeros(corpus_df.shape[0])
    j = 0
    for idx, row in corpus_df.iterrows():
        sim = cosine_similarity(orig_vector,np.array(row['tfidf']).reshape(1,-1))
        cossim[j] = sim
        j+=1
        
    return np.argwhere(cossim >= filter).reshape(-2,).tolist()

In [130]:
indices = cos_sim(valid_df.iloc[3], proto_df, 0.6)

In [131]:
indices

[89200, 138562]

In [132]:
def check_k(test_df, corpus_df, k):
    for idx, row in test_df.iterrows():
        indices = cos_sim(row, corpus_df, filter = .6)
        filtered_corpus_df = corpus_df.iloc[indices]
        # alternate filter = [pick top 5k]
        #calculate levenshtein between i and filtered_corpus_df
        #pick top k
        #do weighted average and produce the results
        print (filtered_corpus_df)
        return

In [19]:
# Since each row is a representation of a document, calculating the cosine similarity between the tf-idf matrix
#  should give us the cosine similarity between each vector (row) with the other vectors (rows)
#  the first row would be the cosine distance between vector 0 and vector 1,2,3,4.... n
#  this produces a dense matrix of each vector relative to the others with the diagnols being 
#  a comparison to itself
cos_sim = cosine_similarity(tfidf,tfidf)

In [69]:
# filter for all vectors that have a cosine similarity of <= 0.6 and > 0 
mask = np.logical_and(cos_sim <=0.6, cos_sim > 0)
sim_vector_idx = np.argwhere(cos_sim >=0.6)

In [70]:
# Getting the most common 100 names
#   returned results is a list of tuples of (record #, count)
common_names = collections.Counter(sim_vector_idx[:,1]).most_common(5000)

In [71]:
# Generating a list of of the names that should be passed on to Levenshtein Distance calculations
common_names_list = []
for i in range(len(common_names)):
    common_names_list.append(common_names[i][0])

In [72]:
proto_df.iloc[common_names_list]['name_last']

21519    rodriguez rodrigue
57616             lodriguez
11503          rodriguez c.
53499             rodriguea
82151      perez  rodriguez
                ...        
64570        rosero morales
81175                artino
11344                hassey
16459     meadows-rodriguez
29563                   nea
Name: name_last, Length: 5000, dtype: object

# Levenshtein Distance

In [73]:
# Copying the DataFrame and resetting the index so that its from 0-xxxx
leven_df = (proto_df.iloc[common_names_list]).copy()
leven_df.reset_index(inplace=True)
leven_df.drop(['index','n_gram','tfidf'],axis=1, inplace=True)

In [74]:
leven_df

race,name_last,asian,hispanic,nh_black,nh_white,total_n
0,rodriguez rodrigue,0.000000,1.000000,0.0,0.000000,1.0
1,lodriguez,0.000000,1.000000,0.0,0.000000,1.0
2,rodriguez c.,0.000000,1.000000,0.0,0.000000,1.0
3,rodriguea,0.000000,1.000000,0.0,0.000000,1.0
4,perez rodriguez,0.000000,1.000000,0.0,0.000000,1.0
...,...,...,...,...,...,...
4995,rosero morales,0.000000,1.000000,0.0,0.000000,1.0
4996,artino,0.000000,0.000000,0.0,1.000000,1.0
4997,hassey,0.034483,0.034483,0.0,0.931034,1.0
4998,meadows-rodriguez,0.000000,0.000000,0.0,1.000000,1.0


In [76]:
if (REGEN):
    # Creating Numpy Array to hold results
    dim = leven_df.shape[0]

    lev_dist = np.zeros((dim,dim))
    for idx, row1 in tqdm(leven_df.iterrows()):
        for j in range (idx, dim):
            if (idx == j):
                continue
            else:
                lev_dist[idx,j] = lv.distance(row1['name_last'],leven_df.iloc[j]['name_last'])
else:
    lev_dist = pd.read_csv('lev_distance_1per.csv').to_numpy()
    

5000it [25:18,  3.29it/s] 


In [77]:
# half filled out matrix
lev_dist

array([[ 0., 10.,  8., ..., 17., 11., 17.],
       [ 0.,  0.,  4., ...,  8.,  9.,  8.],
       [ 0.,  0.,  0., ..., 11., 11., 11.],
       ...,
       [ 0.,  0.,  0., ...,  0., 14.,  5.],
       [ 0.,  0.,  0., ...,  0.,  0., 15.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [78]:
# fill out the bottom portion of the matrix
#  i.e. the distance between name[123] & name[345] is the
#  same as string[345] & name[123]

if (REGEN):
    for i in range(dim):
        for j in range (i, dim):
            if (i == j):
                continue
            else:
                lev_dist[j,i] = lev_dist[i,j]
    # Now the matrix is mirrored
    lev_dist.tofile('lev_distance_1per.csv',sep=',')

# Find K smallest values
i.e. the nearest k neighbors in our vector

In [79]:
def get_accuracy (name_df, levenstein_dist, k):
    # Get the nearest k values for the string
    #   we add +1 since 0 (the string itself)
    #   will be present in the diagnal value
    k +=1 
    values = np.argpartition(levenstein_dist, (k))
    final_pred = []
    for i in tqdm(range(levenstein_dist.shape[0])):
        max_value = np.max(levenstein_dist[i][values[i][:k]])
        mask = (levenstein_dist[i] <= max_value) & (levenstein_dist[i] > 0)
        out = np.argwhere(mask)
        total_sum =  (name_df.iloc[out.reshape(-1)]['total_n'].sum())
        pred_white = (name_df.iloc[out.reshape(-1)]['nh_white'] * name_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_black = (name_df.iloc[out.reshape(-1)]['nh_black'] * name_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_hispanic = (name_df.iloc[out.reshape(-1)]['hispanic'] * name_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_asian = (name_df.iloc[out.reshape(-1)]['asian'] * name_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        predictions = [pred_asian, pred_hispanic, pred_black, pred_white]
        final_pred.append(races[predictions.index(max(predictions))])
    name_df['true_race'] = name_df[races].idxmax(axis=1)
    name_df['pred'] = final_pred
    return (classification_report(name_df['true_race'],name_df['pred']))    

In [80]:
k_metrics = {
    3:0,
    5:0,
    7:0,
    10:0,
    15:0,
    20:0,
}

In [81]:
train_df, test_df = train_test_split(proto_df.sample(frac=0.1, random_state=10), test_size=0.1)
train_df.reset_index(inplace=True)
train_df.drop(['index','n_gram','tfidf'],axis=1, inplace=True)
test_df.reset_index(inplace=True)
test_df.drop(['index','n_gram','tfidf'],axis=1, inplace=True)

In [82]:
for value, key in enumerate (k_metrics):
    value = get_accuracy (leven_df, lev_dist, key)
    print ('For k={}:\n {}\n\n-----------------------\n'.format(key,value))
    k_metrics[key] = value

100%|████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:16<00:00, 305.43it/s]


For k=3:
               precision    recall  f1-score   support

       asian       0.14      0.01      0.02        89
    hispanic       0.95      0.89      0.92      2427
    nh_black       0.62      0.40      0.48       295
    nh_white       0.81      0.93      0.86      2189

    accuracy                           0.86      5000
   macro avg       0.63      0.56      0.57      5000
weighted avg       0.85      0.86      0.85      5000


-----------------------



100%|████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:16<00:00, 308.44it/s]


For k=5:
               precision    recall  f1-score   support

       asian       0.00      0.00      0.00        89
    hispanic       0.96      0.89      0.92      2427
    nh_black       0.67      0.38      0.49       295
    nh_white       0.80      0.94      0.87      2189

    accuracy                           0.87      5000
   macro avg       0.61      0.55      0.57      5000
weighted avg       0.85      0.87      0.85      5000


-----------------------



100%|████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:16<00:00, 311.04it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


For k=7:
               precision    recall  f1-score   support

       asian       0.00      0.00      0.00        89
    hispanic       0.96      0.88      0.92      2427
    nh_black       0.69      0.38      0.49       295
    nh_white       0.80      0.95      0.87      2189

    accuracy                           0.87      5000
   macro avg       0.61      0.55      0.57      5000
weighted avg       0.86      0.87      0.85      5000


-----------------------



100%|████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:16<00:00, 304.59it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


For k=10:
               precision    recall  f1-score   support

       asian       0.00      0.00      0.00        89
    hispanic       0.96      0.88      0.92      2427
    nh_black       0.69      0.38      0.49       295
    nh_white       0.80      0.95      0.87      2189

    accuracy                           0.86      5000
   macro avg       0.61      0.55      0.57      5000
weighted avg       0.85      0.86      0.85      5000


-----------------------



100%|████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:15<00:00, 316.47it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


For k=15:
               precision    recall  f1-score   support

       asian       0.00      0.00      0.00        89
    hispanic       0.96      0.88      0.92      2427
    nh_black       0.67      0.38      0.48       295
    nh_white       0.79      0.95      0.86      2189

    accuracy                           0.86      5000
   macro avg       0.61      0.55      0.57      5000
weighted avg       0.85      0.86      0.85      5000


-----------------------



100%|████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:17<00:00, 293.80it/s]


For k=20:
               precision    recall  f1-score   support

       asian       0.00      0.00      0.00        89
    hispanic       0.96      0.87      0.91      2427
    nh_black       0.67      0.38      0.49       295
    nh_white       0.79      0.95      0.86      2189

    accuracy                           0.86      5000
   macro avg       0.61      0.55      0.57      5000
weighted avg       0.85      0.86      0.85      5000


-----------------------



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [83]:
def predict (name_df, corp_df, k):
    corp_name = corp_df['name_last']

    # Calculate the Levenshtein distance for the test set that we are trying to predict
    test_lev_dist = np.zeros((name_df.shape[0],corp_df.shape[0]))
    for idx, row1 in tqdm(name_df.iterrows()):
        for j in range(corp_df.shape[0]):
            test_lev_dist[idx,j] = lv.distance(row1['name_last'],corp_df.iloc[j]['name_last'])
    
    # Get accuracy of the model on the test set
    #   - taking the levenshtein distance calculated from test names to corpus names
    #   - finding the nearest training names to the test names
    #   - predicting the test race based on the training names
    k +=1 
    values = np.argpartition(test_lev_dist, (k))
    final_pred = []
    for i in tqdm(range(test_lev_dist.shape[0])):
        max_value = np.max(test_lev_dist[i][values[i][:k]])
        mask = (test_lev_dist[i] <= max_value) & (test_lev_dist[i] > 0)
        out = np.argwhere(mask)
        total_sum =  (corp_df.iloc[out.reshape(-1)]['total_n'].sum())
        pred_white = (corp_df.iloc[out.reshape(-1)]['nh_white'] * corp_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_black = (corp_df.iloc[out.reshape(-1)]['nh_black'] * corp_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_hispanic = (corp_df.iloc[out.reshape(-1)]['hispanic'] * corp_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        pred_asian = (corp_df.iloc[out.reshape(-1)]['asian'] * corp_df.iloc[out.reshape(-1)]['total_n']).sum() / total_sum
        predictions = [pred_asian, pred_hispanic, pred_black, pred_white]
        final_pred.append(races[predictions.index(max(predictions))])
        
    name_df['true_race'] = name_df[races].idxmax(axis=1)
    name_df['pred'] = final_pred
    return (classification_report(name_df['true_race'],name_df['pred'])) 

In [85]:
model_perf = predict(test_df, leven_df, 3)

850it [09:06,  1.56it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 850/850 [00:03<00:00, 248.55it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [86]:
print(model_perf)

              precision    recall  f1-score   support

       asian       0.00      0.00      0.00        28
    hispanic       0.69      0.63      0.66       206
    nh_black       0.62      0.10      0.17       102
    nh_white       0.72      0.91      0.81       514

    accuracy                           0.72       850
   macro avg       0.51      0.41      0.41       850
weighted avg       0.68      0.72      0.67       850



In [None]:
0 1  2  3
1 .1 .2 .7
2 .1 .3 .7
3 .7 .2 .1