# Data Prep

## Data Loading

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Florida voter file
df = pd.read_csv('../dataverse_files/fl_reg_name_race.csv.gz')
df.dropna(subset=['name_first', 'name_last'], inplace=True)

sdf = df[df.race.isin(['multi_racial', 'native_indian', 'other', 'unknown']) == False]
del df

# Setting consistent case for names
sdf['name_first'] = sdf.name_first.str.title()
sdf['name_last'] = sdf.name_last.str.title()

sdf

Unnamed: 0,name_last,name_first,race
0,Walker,Elizabeth,nh_white
1,Palmer,Alton,nh_white
2,Mc Cleod,Alicia,nh_black
3,Scarborough,Dale,nh_white
4,Walker,Daniel,nh_white
...,...,...,...
13653888,Philpott,April,nh_white
13653889,Walters,William,nh_white
13653890,Sawyer,Matthew,nh_white
13653891,Thomas,Janine,nh_white


In [3]:
# check the different races filtered
sdf.race.value_counts()

nh_white    8714118
hispanic    2174408
nh_black    1847266
asian        253306
Name: race, dtype: int64

In [4]:
# Summing the count of each name & race combination
gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count'])

In [5]:
# creating a pivot table so that each name has a count of the # of races with that last name
gdf = gdf.pivot_table(values='count', columns='race',index='name_last')

# Converting NaN to zeros since that means there is no one that identifies with that race with that last name
gdf = gdf.fillna(0)

# Getting the totals of each last name
gdf['total_n'] = gdf.sum(axis=1)

gdf['total_norm'] = gdf['total_n']/np.max(gdf['total_n'])
gdf.reset_index(inplace=True)

In [6]:
gdf.head(15)

race,name_last,asian,hispanic,nh_black,nh_white,total_n,total_norm
0,Fleurime Michel,0.0,0.0,1.0,0.0,1.0,1e-05
1,Franklin,0.0,0.0,1.0,0.0,1.0,1e-05
2,Grant Cliatt,0.0,0.0,1.0,0.0,1.0,1e-05
3,Hassan,1.0,0.0,0.0,0.0,1.0,1e-05
4,King,0.0,1.0,0.0,0.0,1.0,1e-05
5,Williams,0.0,0.0,0.0,1.0,1.0,1e-05
6,0Kharitonenko,0.0,0.0,0.0,1.0,1.0,1e-05
7,1Amirthanayagam,1.0,0.0,0.0,0.0,1.0,1e-05
8,4R,0.0,0.0,0.0,1.0,1.0,1e-05
9,77348 Dancing Rochanavibhata,1.0,0.0,0.0,0.0,1.0,1e-05


In [7]:
races = sdf.race.unique().tolist()
races

['nh_white', 'nh_black', 'hispanic', 'asian']

In [8]:
def calc_prop(row):
    total = row['total_n']
    values = [(i/total) for i in row]
    return pd.Series(values)

In [9]:
temp = races
temp.append('total_n')
print(temp)
gdf[races] = gdf[temp].apply(calc_prop, axis=1)

['nh_white', 'nh_black', 'hispanic', 'asian', 'total_n']


In [10]:
gdf

race,name_last,asian,hispanic,nh_black,nh_white,total_n,total_norm
0,Fleurime Michel,0.0,0.0,1.0,0.0,1.0,0.000010
1,Franklin,0.0,0.0,1.0,0.0,1.0,0.000010
2,Grant Cliatt,0.0,0.0,1.0,0.0,1.0,0.000010
3,Hassan,1.0,0.0,0.0,0.0,1.0,0.000010
4,King,0.0,1.0,0.0,0.0,1.0,0.000010
...,...,...,...,...,...,...,...
849821,Zyzanski,0.0,0.0,0.0,1.0,1.0,0.000010
849822,Zyzdryn,0.0,0.0,0.0,1.0,1.0,0.000019
849823,Zyznomyrsky,0.0,0.0,0.0,1.0,1.0,0.000010
849824,Zzaman,1.0,0.0,0.0,0.0,1.0,0.000010


In [11]:
gdf.drop('total_n', axis=1, inplace=True)
gdf.columns

Index(['name_last', 'asian', 'hispanic', 'nh_black', 'nh_white', 'total_norm'], dtype='object', name='race')

## Data Processing

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from keras.preprocessing import sequence


NGRAMS = 2
feature_len = 25

In [13]:
proto_df = gdf.sample(frac=0.01, random_state=10)
proto_df.shape

(8498, 6)

# Levenshtein Distance

In [14]:
# Copying the DataFrame and resetting the index so that its from 0-xxxx
knn_df = pd.DataFrame(proto_df)
knn_df.reset_index(inplace=True)
knn_df.drop('index',inplace=True, axis=1)

In [15]:
import Levenshtein as lv

In [16]:
knn_df

race,name_last,asian,hispanic,nh_black,nh_white,total_norm
0,Bojin,0.000000,0.000000,0.000000,1.000000,0.000029
1,Owens-Harvey,0.000000,0.000000,1.000000,0.000000,0.000010
2,Anelis,0.000000,0.000000,1.000000,0.000000,0.000010
3,Clavel Rivera,0.000000,1.000000,0.000000,0.000000,0.000010
4,Ley,0.042705,0.241993,0.007117,0.708185,0.002692
...,...,...,...,...,...,...
8493,Moreno-Sevilla,0.000000,1.000000,0.000000,0.000000,0.000010
8494,Falco-Sennish,0.000000,0.000000,0.000000,1.000000,0.000010
8495,Mox,0.000000,0.000000,0.000000,1.000000,0.000038
8496,Quinonez Camacho,0.000000,1.000000,0.000000,0.000000,0.000010


In [17]:
# Creating Numpy Array to hold results
dim = knn_df.shape[0]

lev_dist = np.zeros((dim,dim))
for idx, row1 in knn_df.iterrows():
    for j in range (idx, dim):
        if (idx == j):
            continue
        else:
            lev_dist[idx,j] = lv.distance(row1['name_last'],knn_df.iloc[j]['name_last'])
            #print ('{} : {} -- {}: {} -- distance: {}'.format(idx, row1['name_last'], j, knn_df.iloc[j]['name_last'], lev_dist[idx,j]))
    if (idx % 500 == 0):
        print ('{} names were processed'.format(idx))

0 names were processed
500 names were processed
1000 names were processed
1500 names were processed
2000 names were processed
2500 names were processed
3000 names were processed
3500 names were processed
4000 names were processed
4500 names were processed
5000 names were processed
5500 names were processed
6000 names were processed
6500 names were processed
7000 names were processed
7500 names were processed
8000 names were processed


In [18]:
# half filled out matrix
lev_dist

array([[ 0., 12.,  5., ...,  4., 15.,  5.],
       [ 0.,  0., 11., ..., 12., 14., 11.],
       [ 0.,  0.,  0., ...,  6., 14.,  5.],
       ...,
       [ 0.,  0.,  0., ...,  0., 15.,  5.],
       [ 0.,  0.,  0., ...,  0.,  0., 14.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [19]:
# fill out the bottom portion of the matrix
#  i.e. the distance between name[123] & name[345] is the
#  same as string[345] & name[123]
for i in range(dim):
    for j in range (i, dim):
        if (i == j):
            continue
        else:
            lev_dist[j,i] = lev_dist[i,j]

In [23]:
# Now the matrix is mirrored
lev_dist.tofile('lev_distance_1per.csv',sep=',')