# Data Prep

## Data Loading

In [1]:
import pandas as pd
import numpy as np
import collections
import Levenshtein as lv
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from keras.preprocessing import sequence
from tqdm import tqdm

In [2]:
# Set this to True when you want to regenerate the Levenshtein Distance
#  otherwise will load csv file
REGEN = True

In [3]:
# Florida voter file
df = pd.read_csv('../dataverse_files/fl_reg_name_race.csv.gz')
df.dropna(subset=['name_last'], inplace=True)

sdf = df[df.race.isin(['multi_racial', 'native_indian', 'other', 'unknown']) == False]
del df

# Setting consistent case for names
sdf['name_last'] = sdf.name_last.str.lower()

# Remove unrequired first name
sdf.drop('name_first', axis=1, inplace=True)

sdf

Unnamed: 0,name_last,race
0,walker,nh_white
1,palmer,nh_white
2,mc cleod,nh_black
3,scarborough,nh_white
4,walker,nh_white
...,...,...
13653888,philpott,nh_white
13653889,walters,nh_white
13653890,sawyer,nh_white
13653891,thomas,nh_white


In [4]:
# check the different races filtered
sdf.race.value_counts()

nh_white    8714118
hispanic    2174408
nh_black    1847266
asian        253306
Name: race, dtype: int64

In [5]:
# Summing the count of each name & race combination
gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count'])

In [6]:
# creating a pivot table so that each name has a count of the # of races with that last name
gdf = gdf.pivot_table(values='count', columns='race',index='name_last')

# Converting NaN to zeros since that means there is no one that identifies with that race with that last name
gdf = gdf.fillna(0)

# Getting the totals of each last name
gdf['total_n'] = gdf.sum(axis=1)
gdf.reset_index(inplace=True)

In [7]:
gdf.head(15)

race,name_last,asian,hispanic,nh_black,nh_white,total_n
0,fleurime michel,0.0,0.0,1.0,0.0,1.0
1,franklin,0.0,0.0,1.0,0.0,1.0
2,grant cliatt,0.0,0.0,1.0,0.0,1.0
3,hassan,1.0,0.0,0.0,0.0,1.0
4,king,0.0,1.0,0.0,0.0,1.0
5,williams,0.0,0.0,0.0,1.0,1.0
6,0kharitonenko,0.0,0.0,0.0,1.0,1.0
7,1amirthanayagam,1.0,0.0,0.0,0.0,1.0
8,4r,0.0,0.0,0.0,1.0,1.0
9,77348 dancing rochanavibhata,1.0,0.0,0.0,0.0,1.0


In [8]:
races = sorted(sdf.race.unique().tolist())
races

['asian', 'hispanic', 'nh_black', 'nh_white']

In [9]:
def calc_prop(row):
    total = row['total_n']
    values = [(i/total) for i in row]
    return pd.Series(values)

In [10]:
# Calculate the proportion of people with a particular last name
#  that identify with one of the 4 races
temp = races.copy()
temp.append('total_n')

gdf[temp] = gdf[temp].apply(calc_prop, axis=1)

In [11]:
def get_race_idx(val, races):
    race_idx = races.index(val)
    return race_idx

In [12]:
gdf['true_race'] = gdf[races].idxmax(axis=1)
gdf['true_race'] = gdf['true_race'].apply(lambda c: get_race_idx(c,races))

In [13]:
gdf

race,name_last,asian,hispanic,nh_black,nh_white,total_n,true_race
0,fleurime michel,0.0,0.0,1.0,0.0,1.0,2
1,franklin,0.0,0.0,1.0,0.0,1.0,2
2,grant cliatt,0.0,0.0,1.0,0.0,1.0,2
3,hassan,1.0,0.0,0.0,0.0,1.0,0
4,king,0.0,1.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...
849821,zyzanski,0.0,0.0,0.0,1.0,1.0,3
849822,zyzdryn,0.0,0.0,0.0,1.0,1.0,3
849823,zyznomyrsky,0.0,0.0,0.0,1.0,1.0,3
849824,zzaman,1.0,0.0,0.0,0.0,1.0,0


## Data Processing

In [14]:
proto_df = gdf.groupby('true_race', group_keys=False).apply(lambda x: x.sample(frac=.01, random_state=10))
proto_df.reset_index(inplace=True)
proto_df.drop('index', axis=1, inplace=True)
proto_df.shape

(8499, 7)

In [15]:
# Check that our sampling has generated proportionate representation in all classes
proto_df.true_race.value_counts()

3    5165
1    2140
2     885
0     309
Name: true_race, dtype: int64

In [16]:
NGRAMS = 2
feature_len = 25

In [17]:
# build n-gram list
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
tfidf_transformer = TfidfTransformer()

# **********
# **** CHANGE THIS TO FULL DATAFRAME WHEN READY FOR FULL DATASET ****
a = vect.fit_transform(proto_df.name_last) 
tfidf = tfidf_transformer.fit_transform(a)
# **********

vocab = vect.vocabulary_

In [18]:
# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    words.append((a[:, c].sum(), b))

words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)

num_words = 581


In [19]:
def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

In [20]:
# build bi-grams from index of n-gram sequence
proto_df['n_gram'] = np.array(proto_df.name_last.apply(lambda c: find_ngrams(c, NGRAMS)))
proto_df['n_gram'] = (sequence.pad_sequences(proto_df['n_gram'], maxlen=feature_len)).tolist()
proto_df['tfidf'] = tfidf.toarray().tolist()

In [21]:
proto_df

race,name_last,asian,hispanic,nh_black,nh_white,total_n,true_race,n_gram,tfidf
0,adichirayil,1.0,0.000000,0.0,0.000000,1.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,morillo encisco,1.0,0.000000,0.0,0.000000,1.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 11, 12, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.4001901060838748, ..."
2,hular,1.0,0.000000,0.0,0.000000,1.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,fuze,0.5,0.000000,0.0,0.500000,1.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,balasingam,1.0,0.000000,0.0,0.000000,1.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...
8494,wals,0.0,0.000000,0.0,1.000000,1.0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8495,judelle,0.0,0.000000,0.0,1.000000,1.0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8496,grandjean,0.0,0.000000,0.0,1.000000,1.0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8497,des plaines,0.0,0.000000,0.0,1.000000,1.0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [22]:
train_df, test_df = train_test_split(proto_df, test_size=.1)
train_df, valid_df = train_test_split(train_df, test_size=.2)

train_df.reset_index(inplace=True)
valid_df.reset_index(inplace=True)
test_df.reset_index(inplace=True)

print('Training set size: {}'.format(train_df.shape))
print('Validation set size: {}'.format(valid_df.shape))
print('Test set size: {}'.format(test_df.shape))

Training set size: (6119, 10)
Validation set size: (1530, 10)
Test set size: (850, 10)


In [23]:
def cos_sim(row_data, corpus_df, filt = 0.6):
    orig_vector = np.array(row_data['tfidf']).reshape(1,-1)
    cossim = np.zeros(corpus_df.shape[0])
    j = 0
    for idx, row in corpus_df.iterrows():
        sim = cosine_similarity(orig_vector,np.array(row['tfidf']).reshape(1,-1))
        cossim[j] = sim
        j+=1
        
    return np.argwhere(cossim >= filt).reshape(-2,).tolist()

In [24]:
def calc_leven(orig_string, filt_df):
    lev_dist = {}
    for idx, row in filt_df.iterrows():
        lev = lv.distance(orig_string, row['name_last'])
        lev_dist[idx] = lev
    return lev_dist

In [25]:
def check_k(test_df, corpus_df, k):
    final_pred = []
    for idx, row in tqdm(test_df.iterrows()):
        indices = cos_sim(row, corpus_df, filt = .6)
        filtered_corpus_df = corpus_df.iloc[indices]

        lev_dist = calc_leven(row['name_last'], filtered_corpus_df)
        values = np.array(list(lev_dist.values()))
        keys = np.array(list(lev_dist.keys()))
                        
        if (k < values.shape[0]):
            filt_values = np.argpartition(values, k)
        elif (values.shape[0] == 0):
            indices = cos_sim(row, corpus_df, filt = .1)
            filtered_corpus_df = corpus_df.iloc[indices]

            lev_dist = calc_leven(row['name_last'], filtered_corpus_df)
            values = np.array(list(lev_dist.values()))
            keys = np.array(list(lev_dist.keys()))
            filt_values = np.argpartition(values, k)
        else:
            filt_values = values.shape[0] - 1
        
        if (type(filt_values) == np.any):
            max_value  = np.max(values[filt_values[:k]])
        else:
            max_value = np.max(values[filt_values])
        
        mask = (values <= max_value) & (values > 0)
        mask_idx = np.argwhere(mask).reshape(-1)
        df_idx = keys[mask_idx]
        
        total_sum =  (corpus_df.iloc[df_idx]['total_n'].sum())
        pred_white = (corpus_df.iloc[df_idx]['nh_white'] * corpus_df.iloc[df_idx]['total_n']).sum() / total_sum
        pred_black = (corpus_df.iloc[df_idx]['nh_black'] * corpus_df.iloc[df_idx]['total_n']).sum() / total_sum
        pred_hispanic = (corpus_df.iloc[df_idx]['hispanic'] * corpus_df.iloc[df_idx]['total_n']).sum() / total_sum
        pred_asian = (corpus_df.iloc[df_idx]['asian'] * corpus_df.iloc[df_idx]['total_n']).sum() / total_sum
        predictions = [pred_asian, pred_hispanic, pred_black, pred_white]
        
        final_pred.append(predictions.index(max(predictions)))

    test_df['pred_race'] = final_pred
    
    return classification_report(test_df['true_race'], test_df['pred_race'])

# Find K smallest values
i.e. the nearest k neighbors in our vector

In [26]:
valid_df['pred_race'] = pd.Series(int)

In [27]:
k_metrics = {
    3:0,
    5:0,
    7:0,
    10:0,
}

In [28]:
for value, key in enumerate (k_metrics):
    #print ('{} -- {}'.format(key, value))
    value = check_k(valid_df, train_df, key)
    print ('for value of k: {} \n{}'.format(key, value))
    k_metrics[key] = value

1530it [1:45:01,  4.12s/it]


for value of k: 3 
              precision    recall  f1-score   support

           0       0.20      0.06      0.09        66
           1       0.81      0.48      0.60       387
           2       0.36      0.09      0.14       176
           3       0.67      0.92      0.78       901

    accuracy                           0.68      1530
   macro avg       0.51      0.39      0.40      1530
weighted avg       0.65      0.68      0.63      1530



1530it [1:45:19,  4.13s/it]


for value of k: 5 
              precision    recall  f1-score   support

           0       0.19      0.06      0.09        66
           1       0.81      0.48      0.60       387
           2       0.36      0.09      0.14       176
           3       0.67      0.92      0.78       901

    accuracy                           0.68      1530
   macro avg       0.51      0.39      0.40      1530
weighted avg       0.65      0.68      0.63      1530



1530it [1:44:14,  4.09s/it]


for value of k: 7 
              precision    recall  f1-score   support

           0       0.19      0.06      0.09        66
           1       0.81      0.48      0.60       387
           2       0.36      0.09      0.14       176
           3       0.67      0.92      0.78       901

    accuracy                           0.68      1530
   macro avg       0.51      0.39      0.40      1530
weighted avg       0.65      0.68      0.63      1530



1530it [1:45:16,  4.13s/it]

for value of k: 10 
              precision    recall  f1-score   support

           0       0.19      0.06      0.09        66
           1       0.81      0.48      0.60       387
           2       0.36      0.09      0.14       176
           3       0.67      0.92      0.78       901

    accuracy                           0.68      1530
   macro avg       0.51      0.39      0.40      1530
weighted avg       0.65      0.68      0.63      1530




