# Data Prep

## Data Loading

In [1]:
import collections
import Levenshtein as lv
import multiprocessing as mp
import numpy as np
import pandas as pd
#from pathos.multiprocessing import ProcessingPool as Pool

from predict import predict_race
from predict import check_k
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
# Set REGEN to True to regenerate the Levenshtein distances;
# when False, the previously saved CSV file is loaded instead.
# NOTE(review): REGEN is not referenced in the cells shown here — confirm it is still needed.
REGEN = True

In [3]:
# Florida voter file
df = pd.read_csv('../dataverse_files/fl_reg_name_race.csv.gz')

# Rows without a last name cannot be used for name-based classification
df = df.dropna(subset=['name_last'])

# Keep only the race categories that are modelled. The explicit .copy() makes
# `sdf` an independent frame, so the column assignment below modifies `sdf`
# itself instead of triggering pandas' SettingWithCopyWarning on a slice of `df`.
sdf = df.loc[~df['race'].isin(['multi_racial', 'native_indian', 'other', 'unknown'])].copy()
del df

# Setting consistent case for names
sdf['name_last'] = sdf['name_last'].str.lower()

# Remove unrequired first name
sdf = sdf.drop(columns='name_first')

sdf

Unnamed: 0,name_last,race
0,walker,nh_white
1,palmer,nh_white
2,mc cleod,nh_black
3,scarborough,nh_white
4,walker,nh_white
...,...,...
13653888,philpott,nh_white
13653889,walters,nh_white
13653890,sawyer,nh_white
13653891,thomas,nh_white


In [4]:
# Count how many voter records remain for each race category after filtering
sdf.race.value_counts()

nh_white    8714118
hispanic    2174408
nh_black    1847266
asian        253306
Name: race, dtype: int64

In [5]:
# Count the voter records for every (last name, race) combination.
# The resulting 'count' values are reshaped into one column per race by the
# pivot_table call in the next cell.
gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count'])

In [6]:
# Reshape to one row per last name with one column per race holding the number
# of voters of that race. A missing (name, race) pair means nobody with that
# last name identifies with that race, so NaN becomes 0.
gdf = gdf.pivot_table(values='count', index='name_last', columns='race').fillna(0)

# Total number of voters carrying each last name (sum across the race columns)
gdf['total_n'] = gdf.sum(axis=1)

# Turn name_last back into an ordinary column
gdf = gdf.reset_index()

In [7]:
# Preview the first 15 rows of the per-name race counts
gdf.head(15)

race,name_last,asian,hispanic,nh_black,nh_white,total_n
0,fleurime michel,0.0,0.0,1.0,0.0,1.0
1,franklin,0.0,0.0,1.0,0.0,1.0
2,grant cliatt,0.0,0.0,1.0,0.0,1.0
3,hassan,1.0,0.0,0.0,0.0,1.0
4,king,0.0,1.0,0.0,0.0,1.0
5,williams,0.0,0.0,0.0,1.0,1.0
6,0kharitonenko,0.0,0.0,0.0,1.0,1.0
7,1amirthanayagam,1.0,0.0,0.0,0.0,1.0
8,4r,0.0,0.0,0.0,1.0,1.0
9,77348 dancing rochanavibhata,1.0,0.0,0.0,0.0,1.0


In [8]:
# Alphabetically sorted list of the race labels present in sdf.
# A label's position in this list is used as its integer class id (see get_race_idx).
races = sorted(sdf.race.unique().tolist())
races

['asian', 'hispanic', 'nh_black', 'nh_white']

In [9]:
def calc_prop(row):
    """Divide every entry of ``row`` by the row's ``'total_n'`` entry.

    Args:
        row: A pandas Series (or mapping-like row) that contains a
            ``'total_n'`` entry.

    Returns:
        A new ``pd.Series`` holding the quotients in the same order as
        ``row``, indexed 0..len(row)-1 (the original labels are not kept).
    """
    denominator = row['total_n']
    proportions = []
    for count in row:
        proportions.append(count / denominator)
    return pd.Series(proportions)

In [10]:
# Calculate, for every last name, the proportion of people with that name
#  that identify with each of the 4 races (total_n itself becomes 1.0).
temp = races + ['total_n']

# Vectorised division of each selected column by the row's total. This gives
# the same values as applying calc_prop row by row, without a Python-level
# call per last name, and keeps the column labels aligned by name.
gdf[temp] = gdf[temp].div(gdf['total_n'], axis=0)

In [11]:
def get_race_idx(val, races):
    """Return the position of ``val`` within the sequence ``races``.

    Args:
        val: The race label to look up.
        races: Sequence of race labels supporting ``.index``.

    Returns:
        The zero-based index of the first occurrence of ``val``.

    Raises:
        ValueError: If ``val`` is not present in ``races``.
    """
    return races.index(val)

In [12]:
# Label each last name with its majority race: take the race column holding
# the largest proportion, then store that label's position in `races` as the
# integer class id.
majority_label = gdf[races].idxmax(axis=1)
gdf['true_race'] = [races.index(label) for label in majority_label]

In [13]:
# Display gdf with the per-race proportions and the integer true_race label
gdf

race,name_last,asian,hispanic,nh_black,nh_white,total_n,true_race
0,fleurime michel,0.0,0.0,1.0,0.0,1.0,2
1,franklin,0.0,0.0,1.0,0.0,1.0,2
2,grant cliatt,0.0,0.0,1.0,0.0,1.0,2
3,hassan,1.0,0.0,0.0,0.0,1.0,0
4,king,0.0,1.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...
849821,zyzanski,0.0,0.0,0.0,1.0,1.0,3
849822,zyzdryn,0.0,0.0,0.0,1.0,1.0,3
849823,zyznomyrsky,0.0,0.0,0.0,1.0,1.0,3
849824,zzaman,1.0,0.0,0.0,0.0,1.0,0


## Data Processing

In [14]:
# Draw half of the last names from every true_race class (fixed seed per class)
# and renumber the rows 0..n-1, discarding the original row labels.
proto_df = (
    gdf.groupby('true_race', group_keys=False)
       .apply(lambda grp: grp.sample(frac=.5, random_state=10))
       .reset_index(drop=True)
)
proto_df.shape

(424914, 7)

In [15]:
# Check that our sampling has generated proportionate representation in all classes
# (number of sampled last names per true_race class id)
proto_df.true_race.value_counts()

3    258253
1    106986
2     44231
0     15444
Name: true_race, dtype: int64

In [16]:
# Length of the character n-grams passed to CountVectorizer's ngram_range (2 = bigrams)
NGRAMS = 2
# NOTE(review): feature_len is not used in the cells shown here — presumably a
# fixed feature/sequence length; confirm it is still needed.
feature_len = 25

In [17]:
# Build the character n-gram vocabulary and the TF-IDF weights.
# CountVectorizer settings: character n-grams of exactly NGRAMS characters,
# case preserved (lowercase=False); n-grams that occur in more than 30% of the
# names (max_df=0.3) or in fewer than 3 names (min_df=3) are left out.
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
tfidf_transformer = TfidfTransformer()

# **********
# **** CHANGE THIS TO FULL DATAFRAME WHEN READY FOR FULL DATASET ****
# NOTE(review): both transformers are fitted on all of proto_df, i.e. before the
# train/validation/test split made later in the notebook, so held-out rows
# contribute to the vocabulary and IDF weights — confirm this is intended.
a = vect.fit_transform(proto_df.name_last) 
tfidf = tfidf_transformer.fit_transform(a)
# **********

# Mapping of n-gram -> column index in the count matrix `a`
vocab = vect.vocabulary_

In [18]:
# sort n-gram by freq (highest -> lowest)
# A single column-wise sum over the sparse count matrix yields the total count
# of every n-gram at once, instead of slicing and summing one column per
# vocabulary entry.
ngram_totals = np.asarray(a.sum(axis=0)).ravel()

# (total count, n-gram) pairs, most frequent first. The previous version built
# these pairs but never sorted them, so the order did not reflect frequency.
words = sorted(((ngram_totals[col], ngram) for ngram, col in vocab.items()), reverse=True)

# The position of an n-gram in words_list is the index find_ngrams returns for it
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)

num_words = 820


In [19]:
def find_ngrams(text, n, vocabulary=None):
    """Map every character n-gram of ``text`` to its position in a vocabulary.

    Args:
        text: String to split into overlapping character n-grams.
        n: Length of each n-gram.
        vocabulary: Optional list of known n-grams. When omitted, the
            notebook-level ``words_list`` is used, as before.

    Returns:
        A list with one integer per n-gram of ``text``, in order of
        appearance. N-grams missing from the vocabulary are encoded as 0.
    """
    if vocabulary is None:
        vocabulary = words_list

    indices = []
    for chars in zip(*[text[i:] for i in range(n)]):
        ngram = ''.join(chars)
        try:
            idx = vocabulary.index(ngram)
        except ValueError:
            # Only "not in vocabulary" is treated as unknown; other errors
            # (e.g. an undefined vocabulary) now surface instead of silently
            # producing zeros.
            # NOTE(review): 0 is also the index of the first vocabulary entry,
            # so unknown n-grams are indistinguishable from it.
            idx = 0
        indices.append(idx)
    return indices

In [20]:
# build tf-idf vectors: store each name's TF-IDF row as a plain Python list
# in a new 'tfidf' column
# NOTE(review): toarray() densifies the whole sparse matrix before it is turned
# into nested lists, which can need a lot of memory at this size — confirm it fits.
proto_df['tfidf'] = tfidf.toarray().tolist()

In [21]:
# Inspect proto_df with the new 'tfidf' column
proto_df

race,name_last,asian,hispanic,nh_black,nh_white,total_n,true_race,tfidf
0,adichirayil,1.0,0.0,0.000000,0.000000,1.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,morillo encisco,1.0,0.0,0.000000,0.000000,1.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.403..."
2,hular,1.0,0.0,0.000000,0.000000,1.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,fuze,0.5,0.0,0.000000,0.500000,1.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,balasingam,1.0,0.0,0.000000,0.000000,1.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...
424909,antonaccio,0.0,0.0,0.000000,1.000000,1.0,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
424910,bremke,0.0,0.0,0.000000,1.000000,1.0,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
424911,sevareid,0.0,0.0,0.000000,1.000000,1.0,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
424912,decatur,0.0,0.0,0.046512,0.953488,1.0,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [22]:
# Hold out 5% of the names for testing, then 5% of the remainder for validation.
# A fixed random_state (the same seed used when sampling proto_df) makes the
# splits, and therefore every metric reported below, reproducible across runs.
train_df, test_df = train_test_split(proto_df, test_size=.05, random_state=10)
train_df, valid_df = train_test_split(train_df, test_size=.05, random_state=10)

# reset_index() without drop=True keeps each row's proto_df label in an
# 'index' column, as the in-place version did.
train_df = train_df.reset_index()
valid_df = valid_df.reset_index()
test_df = test_df.reset_index()

print('Training set size: {}'.format(train_df.shape))
print('Validation set size: {}'.format(valid_df.shape))
print('Test set size: {}'.format(test_df.shape))

Training set size: (383484, 9)
Validation set size: (20184, 9)
Test set size: (21246, 9)


# Find K smallest values
i.e. the k nearest neighbors of each name in our vector space

In [23]:
# Candidate neighbour counts (k), each mapped to a placeholder value of 0
# that is replaced once that k has been evaluated.
k_metrics = dict.fromkeys((3, 5), 0)

In [24]:
# Ground-truth class ids of the validation names, in row order.
# Reading the column directly avoids building a Series per row with iterrows().
true_list = valid_df['true_race'].tolist()

In [25]:
%%time
for value, key in enumerate (k_metrics):
    #print ('{} -- {}'.format(key, value))
    result = check_k(valid_df, train_df, key, 0.6)
    
    pred_list = np.array(result).reshape(-1)
    pred_list = pred_list.tolist()
    
    value = classification_report(true_list, pred_list)
    
    print ('for value of k: {} \n{}'.format(key, value))
    k_metrics[key] = value

for value of k: 3 
              precision    recall  f1-score   support

           0       0.51      0.23      0.32       701
           1       0.81      0.77      0.79      5034
           2       0.59      0.39      0.46      2148
           3       0.81      0.90      0.85     12301

    accuracy                           0.79     20184
   macro avg       0.68      0.57      0.61     20184
weighted avg       0.77      0.79      0.78     20184

for value of k: 5 
              precision    recall  f1-score   support

           0       0.54      0.19      0.29       701
           1       0.82      0.76      0.79      5034
           2       0.61      0.38      0.47      2148
           3       0.80      0.91      0.85     12301

    accuracy                           0.79     20184
   macro avg       0.69      0.56      0.60     20184
weighted avg       0.78      0.79      0.78     20184

CPU times: user 6min 57s, sys: 16min 43s, total: 23min 40s
Wall time: 15h 37min 38s


# Test Set evaluation

In [26]:
%%time
# Run check_k for the held-out test set against the training set with k = 3.
# NOTE(review): 0.6 is the same value passed in the validation sweep; its
# meaning is defined by check_k in predict.py — confirm.
result = check_k(test_df, train_df, 3, 0.6)

CPU times: user 3min 38s, sys: 8min 49s, total: 12min 28s
Wall time: 8h 42min 37s


In [28]:
# Flatten the check_k output into a plain list of predictions
pred_list = np.array(result).reshape(-1).tolist()

# Ground-truth class ids of the test names, in row order.
# Reading the column directly avoids building a Series per row with iterrows().
true_list = test_df['true_race'].tolist()

value = classification_report(true_list, pred_list)

print('for value of k: {} \n{}'.format(3, value))

for value of k: 3 
              precision    recall  f1-score   support

           0       0.53      0.22      0.32       774
           1       0.80      0.77      0.78      5405
           2       0.57      0.38      0.46      2267
           3       0.80      0.89      0.84     12800

    accuracy                           0.78     21246
   macro avg       0.67      0.57      0.60     21246
weighted avg       0.77      0.78      0.77     21246

