In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

In [30]:
# Florida voter file
df = pd.read_csv('../dataverse_files/fl_reg_name_race.csv.gz')
df.dropna(subset=['name_first', 'name_last'], inplace=True)

sdf = df[df.race.isin(['multi_racial', 'native_indian', 'other', 'unknown']) == False]
del df

# Setting consistent case for names
sdf['name_first'] = sdf.name_first.str.title()
sdf['name_last'] = sdf.name_last.str.title()

sdf

Unnamed: 0,name_last,name_first,race
0,Walker,Elizabeth,nh_white
1,Palmer,Alton,nh_white
2,Mc Cleod,Alicia,nh_black
3,Scarborough,Dale,nh_white
4,Walker,Daniel,nh_white
...,...,...,...
13653888,Philpott,April,nh_white
13653889,Walters,William,nh_white
13653890,Sawyer,Matthew,nh_white
13653891,Thomas,Janine,nh_white


In [33]:
# check the different races filtered
sdf.race.value_counts()

nh_white    8714118
hispanc     2174408
nh_black    1847266
asian        253306
Name: race, dtype: int64

In [58]:
# Summing the count of each name & race combination
gdf = sdf.groupby(['name_last','race'])['race'].agg(['count']).reset_index()

In [61]:
# creating a pivot table so that each name has a count of the # of races with that last name
gdf = gdf.pivot_table(values='count', columns='race',index='name_last')

In [72]:
# Converting NaN to zeros since that means there is no one that identifies with that race with that last name
gdf = gdf.fillna(0)

# Getting the totals of each last name
gdf['total_n'] = gdf.sum(axis=1)

In [75]:
gdf[:15]

race,asian,hispanc,nh_black,nh_white,total_n
name_last,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fleurime Michel,0.0,0.0,1.0,0.0,1.0
Franklin,0.0,0.0,1.0,0.0,1.0
Grant Cliatt,0.0,0.0,1.0,0.0,1.0
Hassan,1.0,0.0,0.0,0.0,1.0
King,0.0,1.0,0.0,0.0,1.0
Williams,0.0,0.0,0.0,1.0,1.0
0Kharitonenko,0.0,0.0,0.0,1.0,1.0
1Amirthanayagam,1.0,0.0,0.0,0.0,1.0
4R,0.0,0.0,0.0,1.0,1.0
77348 Dancing Rochanavibhata,1.0,0.0,0.0,0.0,1.0


In [82]:
races = sdf.race.unique().tolist()
races

['nh_white', 'nh_black', 'hispanc', 'asian']

In [83]:
# converting races to proportions
for r in races:
    gdf[r] = gdf[r]/gdf['total_n']

In [90]:
# checking some last names that identify as Asian and some other race
df_mask = (gdf['asian'] < 1) & (gdf['asian'] > 0)
filt_df = gdf[df_mask]
filt_df

race,asian,hispanc,nh_black,nh_white,total_n
name_last,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Aamir,0.750000,0.000000,0.000000,0.250000,4.0
Aanonsen,0.076923,0.000000,0.000000,0.923077,13.0
Aaron,0.002946,0.019146,0.310751,0.667158,679.0
Abad,0.079012,0.777778,0.007407,0.135802,405.0
Abadi,0.080000,0.400000,0.040000,0.480000,25.0
...,...,...,...,...,...
Zwolinski,0.029412,0.000000,0.000000,0.970588,34.0
Zych,0.010870,0.054348,0.000000,0.934783,92.0
Zylinski,0.090909,0.000000,0.000000,0.909091,11.0
Zymberi,0.333333,0.000000,0.000000,0.666667,3.0
