In [105]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

# Data Prep

In [2]:
# Florida voter file
df = pd.read_csv('../dataverse_files/fl_reg_name_race.csv.gz')
df.dropna(subset=['name_first', 'name_last'], inplace=True)

sdf = df[df.race.isin(['multi_racial', 'native_indian', 'other', 'unknown']) == False]
del df

# Setting consistent case for names
sdf['name_first'] = sdf.name_first.str.title()
sdf['name_last'] = sdf.name_last.str.title()

sdf

Unnamed: 0,name_last,name_first,race
0,Walker,Elizabeth,nh_white
1,Palmer,Alton,nh_white
2,Mc Cleod,Alicia,nh_black
3,Scarborough,Dale,nh_white
4,Walker,Daniel,nh_white
...,...,...,...
13653888,Philpott,April,nh_white
13653889,Walters,William,nh_white
13653890,Sawyer,Matthew,nh_white
13653891,Thomas,Janine,nh_white


In [3]:
# check the different races filtered
sdf.race.value_counts()

nh_white    8714118
hispanc     2174408
nh_black    1847266
asian        253306
Name: race, dtype: int64

In [69]:
# Summing the count of each name & race combination
gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count']).reset_index()

In [70]:
# creating a pivot table so that each name has a count of the # of races with that last name
gdf = gdf.pivot_table(values='count', columns='race',index='name_last')

# Converting NaN to zeros since that means there is no one that identifies with that race with that last name
gdf = gdf.fillna(0)

# Getting the totals of each last name
gdf['total_n'] = gdf.sum(axis=1)

gdf['total_norm'] = gdf['total_n']/np.max(gdf['total_n'])
gdf['name_last'] = gdf.index

In [71]:
gdf[:15]

race,asian,hispanc,nh_black,nh_white,total_n,total_norm,name_last
name_last,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Fleurime Michel,0.0,0.0,1.0,0.0,1.0,1e-05,Fleurime Michel
Franklin,0.0,0.0,1.0,0.0,1.0,1e-05,Franklin
Grant Cliatt,0.0,0.0,1.0,0.0,1.0,1e-05,Grant Cliatt
Hassan,1.0,0.0,0.0,0.0,1.0,1e-05,Hassan
King,0.0,1.0,0.0,0.0,1.0,1e-05,King
Williams,0.0,0.0,0.0,1.0,1.0,1e-05,Williams
0Kharitonenko,0.0,0.0,0.0,1.0,1.0,1e-05,0Kharitonenko
1Amirthanayagam,1.0,0.0,0.0,0.0,1.0,1e-05,1Amirthanayagam
4R,0.0,0.0,0.0,1.0,1.0,1e-05,4R
77348 Dancing Rochanavibhata,1.0,0.0,0.0,0.0,1.0,1e-05,77348 Dancing Rochanavibhata


In [75]:
races = sdf.race.unique().tolist()
races

['nh_white', 'nh_black', 'hispanc', 'asian']

In [76]:
# converting races to proportions
for r in races:
    gdf[r] = gdf[r]/gdf['total_n']

In [77]:
# checking some last names that identify as Asian and some other race
df_mask = (gdf['asian'] < 1) & (gdf['asian'] > 0)
filt_df = gdf[df_mask]
filt_df

race,asian,hispanc,nh_black,nh_white,total_n,total_norm,name_last
name_last,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aamir,0.187500,0.000000,0.000000,0.062500,4.0,0.000038,Aamir
Aanonsen,0.005917,0.000000,0.000000,0.071006,13.0,0.000125,Aanonsen
Aaron,0.000004,0.000028,0.000458,0.000983,679.0,0.006506,Aaron
Aasim,0.500000,0.000000,0.000000,0.000000,2.0,0.000019,Aasim
Ababa,0.500000,0.000000,0.000000,0.000000,2.0,0.000019,Ababa
...,...,...,...,...,...,...,...
Zych,0.000118,0.000591,0.000000,0.010161,92.0,0.000881,Zych
Zylinski,0.008264,0.000000,0.000000,0.082645,11.0,0.000105,Zylinski
Zymberi,0.111111,0.000000,0.000000,0.222222,3.0,0.000029,Zymberi
Zyung,0.500000,0.000000,0.000000,0.000000,2.0,0.000019,Zyung


# Data Processing

In [110]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from keras.preprocessing import sequence
from sklearn.metrics import classification_report

NGRAMS = 2
feature_len = 25

In [12]:
# build n-gram list
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
a = vect.fit_transform(sdf.name_last)
vocab = vect.vocabulary_

# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    #print(b, c, a[:, c].sum())
    words.append((a[:, c].sum(), b))
    #break
words = sorted(words, reverse=True)
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)

def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

num_words = 1423


In [78]:
gdf.columns

Index(['asian', 'hispanc', 'nh_black', 'nh_white', 'total_n', 'total_norm',
       'name_last'],
      dtype='object', name='race')

In [79]:
gdf.drop('total_n', axis=1, inplace=True)

In [83]:
y_arr = np.array(gdf.iloc[:,:-1])
y_arr

array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        9.58138911e-06],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        9.58138911e-06],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        9.58138911e-06],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        9.58138911e-06],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        9.58138911e-06],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        9.58138911e-06]])

In [84]:
# build X from index of n-gram sequence
X = np.array(gdf.name_last.apply(lambda c: find_ngrams(c, NGRAMS)))

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y_arr, test_size=0.3, random_state=10)

In [101]:
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)

# KNNRegressor Model

In [115]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [118]:
knn = KNeighborsRegressor()

param = {'n_neighbors': range(3,8)}
clf = GridSearchCV(knn, param, verbose=2)

clf.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ......................................n_neighbors=3; total time=22.7min
[CV] END ......................................n_neighbors=3; total time=21.8min
[CV] END ......................................n_neighbors=3; total time=19.6min
[CV] END ......................................n_neighbors=3; total time=27.7min
[CV] END ......................................n_neighbors=3; total time=26.4min
[CV] END ......................................n_neighbors=4; total time=29.3min
[CV] END ......................................n_neighbors=4; total time=28.1min
[CV] END ......................................n_neighbors=4; total time=23.8min
[CV] END ......................................n_neighbors=4; total time=25.7min
[CV] END ......................................n_neighbors=4; total time=32.0min
[CV] END ......................................n_neighbors=5; total time=26.0min
[CV] END ......................................n_

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': range(3, 8)}, verbose=2)

In [119]:
clf.best_params_

{'n_neighbors': 7}

In [125]:
knn = clf.best_estimator_

In [126]:
y_pred = knn.predict(X_test)

In [127]:
y_pred[:5]

array([[0.00000000e+00, 2.85714286e-02, 3.90625000e-03, 3.66927083e-01,
        4.65381757e-05],
       [0.00000000e+00, 2.85714286e-01, 4.28571429e-01, 2.85714286e-01,
        9.58138911e-06],
       [1.33588939e-05, 1.42939829e-01, 1.42875139e-01, 1.22132140e-01,
        5.83095966e-04],
       [0.00000000e+00, 7.25274725e-01, 0.00000000e+00, 1.42857143e-01,
        2.60066276e-05],
       [1.43314286e-01, 4.42539683e-03, 2.28571429e-04, 4.61387488e-01,
        7.11760334e-05]])

In [128]:
print (mean_squared_error(y_test, y_pred))

0.06891562246641425
