# Data Prep

## Data Loading

In [1]:
import collections
import Levenshtein as lv
import multiprocessing as mp
import numpy as np
import pandas as pd
#from pathos.multiprocessing import ProcessingPool as Pool

from predict import predict_race
from predict import check_k
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
# Set this to True when you want to regenerate the Levenshtein Distance
#  otherwise will load csv file
REGEN = True

In [3]:
# Florida voter file
df = pd.read_csv('../dataverse_files/fl_reg_name_race.csv.gz')
df.dropna(subset=['name_last'], inplace=True)

sdf = df[df.race.isin(['multi_racial', 'native_indian', 'other', 'unknown']) == False]
del df

# Setting consistent case for names
sdf['name_last'] = sdf.name_last.str.lower()

# Remove unrequired first name
sdf.drop('name_first', axis=1, inplace=True)

sdf

Unnamed: 0,name_last,race
0,walker,nh_white
1,palmer,nh_white
2,mc cleod,nh_black
3,scarborough,nh_white
4,walker,nh_white
...,...,...
13653888,philpott,nh_white
13653889,walters,nh_white
13653890,sawyer,nh_white
13653891,thomas,nh_white


In [4]:
# check the different races filtered
sdf.race.value_counts()

nh_white    8714118
hispanic    2174408
nh_black    1847266
asian        253306
Name: race, dtype: int64

In [5]:
# Summing the count of each name & race combination
gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count'])

In [6]:
# creating a pivot table so that each name has a count of the # of races with that last name
gdf = gdf.pivot_table(values='count', columns='race',index='name_last')

# Converting NaN to zeros since that means there is no one that identifies with that race with that last name
gdf = gdf.fillna(0)

# Getting the totals of each last name
gdf['total_n'] = gdf.sum(axis=1)
gdf.reset_index(inplace=True)

In [7]:
gdf.head(15)

race,name_last,asian,hispanic,nh_black,nh_white,total_n
0,fleurime michel,0.0,0.0,1.0,0.0,1.0
1,franklin,0.0,0.0,1.0,0.0,1.0
2,grant cliatt,0.0,0.0,1.0,0.0,1.0
3,hassan,1.0,0.0,0.0,0.0,1.0
4,king,0.0,1.0,0.0,0.0,1.0
5,williams,0.0,0.0,0.0,1.0,1.0
6,0kharitonenko,0.0,0.0,0.0,1.0,1.0
7,1amirthanayagam,1.0,0.0,0.0,0.0,1.0
8,4r,0.0,0.0,0.0,1.0,1.0
9,77348 dancing rochanavibhata,1.0,0.0,0.0,0.0,1.0


In [8]:
races = sorted(sdf.race.unique().tolist())
races

['asian', 'hispanic', 'nh_black', 'nh_white']

In [9]:
def calc_prop(row):
    total = row['total_n']
    values = [(i/total) for i in row]
    return pd.Series(values)

In [10]:
# Calculate the proportion of people with a particular last name
#  that identify with one of the 4 races
temp = races.copy()
temp.append('total_n')

gdf[temp] = gdf[temp].apply(calc_prop, axis=1)

In [11]:
def get_race_idx(val, races):
    race_idx = races.index(val)
    return race_idx

In [12]:
gdf['true_race'] = gdf[races].idxmax(axis=1)
gdf['true_race'] = gdf['true_race'].apply(lambda c: get_race_idx(c,races))

In [13]:
gdf

race,name_last,asian,hispanic,nh_black,nh_white,total_n,true_race
0,fleurime michel,0.0,0.0,1.0,0.0,1.0,2
1,franklin,0.0,0.0,1.0,0.0,1.0,2
2,grant cliatt,0.0,0.0,1.0,0.0,1.0,2
3,hassan,1.0,0.0,0.0,0.0,1.0,0
4,king,0.0,1.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...
849821,zyzanski,0.0,0.0,0.0,1.0,1.0,3
849822,zyzdryn,0.0,0.0,0.0,1.0,1.0,3
849823,zyznomyrsky,0.0,0.0,0.0,1.0,1.0,3
849824,zzaman,1.0,0.0,0.0,0.0,1.0,0


## Data Processing

In [14]:
proto_df = gdf.groupby('true_race', group_keys=False).apply(lambda x: x.sample(frac=.01, random_state=10))
proto_df.reset_index(inplace=True)
proto_df.drop('index', axis=1, inplace=True)
proto_df.shape

(8499, 7)

In [15]:
# Check that our sampling has generated proportionate representation in all classes
proto_df.true_race.value_counts()

3    5165
1    2140
2     885
0     309
Name: true_race, dtype: int64

In [16]:
NGRAMS = 2
feature_len = 25

In [17]:
# build n-gram list
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
tfidf_transformer = TfidfTransformer()

# **********
# **** CHANGE THIS TO FULL DATAFRAME WHEN READY FOR FULL DATASET ****
a = vect.fit_transform(proto_df.name_last) 
tfidf = tfidf_transformer.fit_transform(a)
# **********

vocab = vect.vocabulary_

In [18]:
# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    words.append((a[:, c].sum(), b))

words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)

num_words = 581


In [19]:
def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

In [20]:
# build tf-idf vectors
proto_df['tfidf'] = tfidf.toarray().tolist()

In [21]:
proto_df

race,name_last,asian,hispanic,nh_black,nh_white,total_n,true_race,tfidf
0,adichirayil,1.0,0.000000,0.0,0.000000,1.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,morillo encisco,1.0,0.000000,0.0,0.000000,1.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.4001901060838748, ..."
2,hular,1.0,0.000000,0.0,0.000000,1.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,fuze,0.5,0.000000,0.0,0.500000,1.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,balasingam,1.0,0.000000,0.0,0.000000,1.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...
8494,wals,0.0,0.000000,0.0,1.000000,1.0,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8495,judelle,0.0,0.000000,0.0,1.000000,1.0,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8496,grandjean,0.0,0.000000,0.0,1.000000,1.0,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8497,des plaines,0.0,0.000000,0.0,1.000000,1.0,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [22]:
train_df, test_df = train_test_split(proto_df, test_size=.05)
train_df, valid_df = train_test_split(train_df, test_size=.05)

train_df.reset_index(inplace=True)
valid_df.reset_index(inplace=True)
test_df.reset_index(inplace=True)

print('Training set size: {}'.format(train_df.shape))
print('Validation set size: {}'.format(valid_df.shape))
print('Test set size: {}'.format(test_df.shape))

Training set size: (7670, 9)
Validation set size: (404, 9)
Test set size: (425, 9)


In [30]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [109]:
%reload_ext autoreload

In [110]:
%%time
result = check_k(valid_df, train_df, 3, 0.6)

CPU times: user 1.78 s, sys: 686 ms, total: 2.47 s
Wall time: 58.4 s


In [111]:
pred_list = []
for i in result:
    print(i.get())
    pred_list.append(i.get())

AttributeError: 'list' object has no attribute 'get'

In [100]:
race_list = []
for idx, row in valid_df.iterrows():
    race_list.append(row['true_race'])

In [108]:
print(classification_report(race_list,pred_list))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        17
           1       0.73      0.38      0.50        99
           2       0.11      0.64      0.19        42
           3       0.70      0.30      0.42       246

    accuracy                           0.35       404
   macro avg       0.39      0.33      0.28       404
weighted avg       0.62      0.35      0.40       404



# Find K smallest values
i.e. the nearest k neighbors in our vector

In [None]:
valid_df['pred_race'] = pd.Series(int)

In [None]:
k_metrics = {
    3:0,
    5:0,
    7:0,
}

In [None]:
pool_key = list(k_metrics.keys())

In [None]:
result_list = []
def log_result(result):
    result_list.append(result)

In [None]:
if __name__ == '__main__':
    result_list1 = pool.starmap_async(check_k(valid_df,train_df,3), check_k(valid_df,train_df,5))

In [None]:
result_list1

In [None]:
print(result_list1.get(timeout=1))

# Test Set evaluation

In [None]:
value = check.check_k(test_df, train_df, 3)
print ('for value of k: {} \n{}'.format(3, value))