## Predict Race/Ethnicity from Unseen Last Name Using KNN (Popular Names)

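Using the Florida voter registration data, we build a KNN classifier that predicts the race/ethnicity of an **unseen** last name. We estimate distance between names using cosine distance across bi-char (character-bigram) tokens of the name. Note: in this "Popular Names" notebook, the prediction code below uses the *k* most popular last names in the training set as the neighbors of every name instead of computing cosine distances.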

In [1]:
import collections
import multiprocessing as mp
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

### Read in the data + Normalize Last Name

In [2]:
# Florida voter file (only the last name and the self-reported race are needed)
df = pd.read_csv('./dataverse_files/fl_reg_name_race.csv.gz', usecols=['name_last', 'race'])
df = df.dropna(subset=['name_last'])

# We assume unknown as missing at random, so those rows are dropped.
# .copy() makes sdf an independent frame; without it the column assignment
# below writes into a slice of df and triggers SettingWithCopyWarning.
sdf = df[~df.race.isin(['unknown'])].copy()
del df

# Setting consistent case for names (trim surrounding whitespace, lower-case)
sdf['name_last'] = sdf.name_last.str.strip().str.lower()

sdf

Unnamed: 0,name_last,race
0,walker,nh_white
1,palmer,nh_white
2,mc cleod,nh_black
3,scarborough,nh_white
4,walker,nh_white
...,...,...
13653889,walters,nh_white
13653890,sawyer,nh_white
13653891,thomas,nh_white
13653892,campbell,multi_racial


In [3]:
# Number of voters left in each race category after the 'unknown' filter
sdf['race'].value_counts()

nh_white         8714118
hispanic         2174408
nh_black         1847266
asian             253306
other             208250
multi_racial       94119
native_indian      45459
Name: race, dtype: int64

In [4]:
# Count the voters for each (last name, race) combination.
# size().reset_index(name='count') always yields a flat frame with the columns
# name_last / race / count, which is what the pivot_table call that follows
# needs. Aggregating the grouping column itself with as_index=False and a list
# of functions produces differently shaped results across pandas versions.
gdf = sdf.groupby(['name_last', 'race']).size().reset_index(name='count')

In [5]:
# Reshape to one row per last name with one column per race holding the count.
# A name/race pair that never occurs comes out of the pivot as NaN, meaning
# nobody with that last name identifies with that race, so it becomes zero.
gdf = (
    gdf.pivot_table(values='count', columns='race', index='name_last')
    .fillna(0)
    .reset_index()
)
gdf.head(15)

race,name_last,asian,hispanic,multi_racial,native_indian,nh_black,nh_white,other
0,0kharitonenko,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1amirthanayagam,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4r,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,77348 dancing rochanavibhata,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,a de feria,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,a de fernandez,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,a f r stephenson,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,a felix,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,a ghaffar,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,a malivert,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [6]:
# Alphabetically ordered list of the race labels present in the data
races = sdf['race'].unique().tolist()
races.sort()
races

['asian',
 'hispanic',
 'multi_racial',
 'native_indian',
 'nh_black',
 'nh_white',
 'other']

In [7]:
%%time
# Getting the totals of each last name
gdf['total_n'] = gdf[races].sum(axis=1)
gdf.loc[:, races] = gdf.loc[:, races].div(gdf.total_n, axis=0)
gdf

CPU times: user 86.3 ms, sys: 36 ms, total: 122 ms
Wall time: 81.9 ms


race,name_last,asian,hispanic,multi_racial,native_indian,nh_black,nh_white,other,total_n
0,0kharitonenko,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1amirthanayagam,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,4r,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,77348 dancing rochanavibhata,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,a de feria,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
868653,zyzanski,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
868654,zyzdryn,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
868655,zyznomyrsky,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
868656,zzaman,0.5,0.0,0.0,0.0,0.0,0.0,0.5,2.0


In [8]:
def get_race_idx(val, races):
    """Return the position of the label `val` within the list `races`.

    Raises ValueError (from list.index) when `val` is not in `races`.
    """
    return races.index(val)

In [9]:
# For one set of analyses, the 'true race/ethnicity' of a name is its modal race:
# the race column with the highest proportion, stored as its position in `races`.

gdf['true_race'] = gdf[races].idxmax(axis=1).map(lambda label: get_race_idx(label, races))
gdf.head()

race,name_last,asian,hispanic,multi_racial,native_indian,nh_black,nh_white,other,total_n,true_race
0,0kharitonenko,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,5
1,1amirthanayagam,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
2,4r,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,5
3,77348 dancing rochanavibhata,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,a de feria,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1


## Data Processing

In [10]:
# Shuffle the rows within each true_race class (same seed for every class) and
# stack the classes in sorted key order.
# Iterating over the groups, rather than calling groupby(...).apply(...), keeps
# the `true_race` column in the result on every pandas version: passing the
# grouping column to the applied function is deprecated since pandas 2.2.
# ignore_index=True replaces the former reset_index() + drop('index') pair.
proto_df = pd.concat(
    [grp.sample(frac=1, random_state=10) for _, grp in gdf.groupby('true_race')],
    ignore_index=True,
)
proto_df.shape

(868658, 10)

In [11]:
# Number of last names per class (true_race id) after the per-class shuffle
proto_df['true_race'].value_counts()

5    514516
1    213834
4     87831
0     30058
6     12600
2      7538
3      2281
Name: true_race, dtype: int64

In [12]:
# Hold out 5% of the names for testing, then 5% of the remainder for validation
train_df, test_df = train_test_split(proto_df, test_size=.05, random_state=21)
train_df, valid_df = train_test_split(train_df, test_size=.05, random_state=21)

# Renumber the rows of each split; the previous row labels are kept in an 'index' column
train_df = train_df.reset_index()
valid_df = valid_df.reset_index()
test_df = test_df.reset_index()

print('Training set size: {}'.format(train_df.shape))
print('Validation set size: {}'.format(valid_df.shape))
print('Test set size: {}'.format(test_df.shape))

Training set size: (783963, 11)
Validation set size: (41262, 11)
Test set size: (43433, 11)


## Find the best K

In [13]:
# The 25 training last names with the largest voter counts, most frequent first
top_25 = train_df.sort_values(by="total_n", ascending=False).iloc[:25]

In [14]:
from sklearn.metrics import mean_squared_error

# Predict race from the k most popular training names (no cosine distance is computed in this notebook)

def predict_popular_race(arg):
    """Predict the race of one name from the k most popular training names.

    `arg` is a tuple ``(idx, row_data, test_df, corpus_df, k)``:

    * ``idx``       -- row label of the name inside ``test_df``
    * ``row_data``  -- the row itself; must contain one entry per label in ``races``
    * ``test_df``   -- frame that receives the results (modified in place)
    * ``corpus_df`` -- accepted for interface compatibility; not used here
    * ``k``         -- how many of the rows of the global ``top_25`` to average

    The prediction is the position (within ``races``) of the largest mean
    proportion over the first ``k`` rows of ``top_25``; it does not depend on
    the name being scored. Because ``top_25`` holds 25 rows, any ``k`` above 25
    behaves like 25.

    Side effects: writes ``pred_race`` and ``rmse`` into ``test_df`` at ``idx``.
    Returns the predicted race position.
    """
    # reading the tuple passed on by the calling function
    idx, row_data, test_df, corpus_df, k = arg

    # Take the k most popular names; top_25 is already sorted by popularity
    popular_df = top_25.head(k)

    true_prob = row_data[races].to_numpy(dtype=float)
    pred_prob = popular_df[races].mean()

    # Root-mean-squared error computed with NumPy instead of
    # mean_squared_error(..., squared=False), whose `squared` argument is
    # deprecated in recent scikit-learn releases.
    # As in the original code the last entry of `races` is left out of the
    # error -- NOTE(review): presumably because the proportions sum to one; confirm.
    diff = true_prob[:-1] - pred_prob.to_numpy(dtype=float)[:-1]
    rmse = float(np.sqrt(np.mean(diff ** 2)))

    pred_race = pred_prob.argmax()
    test_df.loc[idx, 'pred_race'] = pred_race
    test_df.loc[idx, 'rmse'] = rmse
    return pred_race

def check_popular_k(test_df, corpus_df, k, n_workers=12):
    """Run predict_popular_race on every row of `test_df` with a thread pool.

    Parameters
    ----------
    test_df : DataFrame whose rows are scored; predict_popular_race also writes
        its per-row results into this frame.
    corpus_df : forwarded unchanged to predict_popular_race.
    k : number of popular names to use, forwarded to predict_popular_race.
    n_workers : number of worker threads (default 12, the previously
        hard-coded value).

    Returns
    -------
    list
        A one-element list holding the list of per-row predictions, in row
        order. The nesting is kept because callers flatten the result with
        ``np.array(result).reshape(-1)``.
    """
    # Imported explicitly: `import multiprocessing as mp` alone does not load
    # the `pool` submodule, so `mp.pool.ThreadPool` only works when some other
    # library happens to have imported it already.
    from multiprocessing.pool import ThreadPool

    tasks = [(idx, row, test_df, corpus_df, k) for idx, row in test_df.iterrows()]

    # The context manager releases the worker threads even if a task raises.
    with ThreadPool(processes=n_workers) as pool:
        predictions = pool.map(predict_popular_race, tasks)

    return [predictions]


In [15]:
# Candidate values of k, each mapped to a placeholder that is later replaced
# by the classification report obtained for that k.
k_metrics = dict.fromkeys((3, 5, 25), 0)

In [16]:
# Lookup from the integer class id back to its race label
replacement = dict(enumerate([
    'asian',
    'hispanic',
    'multi_racial',
    'native_indian',
    'nh_black',
    'nh_white',
    'other',
]))

In [17]:
# True class id of every validation name, in row order
true_list = valid_df['true_race'].tolist()

In [18]:
%%time
for value, key in enumerate (k_metrics):
    #print ('{} -- {}'.format(key, value))
    result = check_popular_k(valid_df, train_df, key)
    
    pred_list = np.array(result).reshape(-1)
    pred_list = pred_list.tolist()
    
    true_list = pd.Series(true_list).replace(replacement).to_list()
    pred_list = pd.Series(pred_list).replace(replacement).to_list()
    
    value = classification_report(true_list, pred_list, zero_division = 0)
    
    avg_rmse = valid_df.rmse.mean()
    print ('for value of k: {} avg. rmse: {}\n{}'.format(key, avg_rmse, value))
    k_metrics[key] = value

for value of k: 3 avg. rmse: 0.3101065444259398
               precision    recall  f1-score   support

        asian       0.00      0.00      0.00      1464
     hispanic       0.00      0.00      0.00     10265
 multi_racial       0.00      0.00      0.00       357
native_indian       0.00      0.00      0.00       109
     nh_black       0.00      0.00      0.00      4226
     nh_white       0.59      1.00      0.74     24182
        other       0.00      0.00      0.00       659

     accuracy                           0.59     41262
    macro avg       0.08      0.14      0.11     41262
 weighted avg       0.34      0.59      0.43     41262

for value of k: 5 avg. rmse: 0.304229119941816
               precision    recall  f1-score   support

        asian       0.00      0.00      0.00      1464
     hispanic       0.00      0.00      0.00     10265
 multi_racial       0.00      0.00      0.00       357
native_indian       0.00      0.00      0.00       109
     nh_black       0

## Test Set Evaluation

In [19]:
%%time
result = check_popular_k(test_df, train_df, 25)

CPU times: user 3min 41s, sys: 44.2 s, total: 4min 25s
Wall time: 3min 46s


In [20]:
# Flatten the nested prediction list returned by check_popular_k
pred_list = np.array(result).reshape(-1).tolist()

# True class id of every test name, in row order
true_list = test_df['true_race'].tolist()

# Translate class ids to race labels for a readable report
true_list = pd.Series(true_list).replace(replacement).to_list()
pred_list = pd.Series(pred_list).replace(replacement).to_list()

value = classification_report(true_list, pred_list, zero_division = 0)

avg_rmse = test_df.rmse.mean()
# The test-set run above called check_popular_k with k=25, so 25 is what is
# reported here (the label previously said 5).
print ('for value of k: {} avg. rmse: {}\n{}'.format(25, avg_rmse, value))


for value of k: 5 avg. rmse: 0.29617408930912986
               precision    recall  f1-score   support

        asian       0.00      0.00      0.00      1447
     hispanic       0.00      0.00      0.00     10729
 multi_racial       0.00      0.00      0.00       364
native_indian       0.00      0.00      0.00       114
     nh_black       0.00      0.00      0.00      4386
     nh_white       0.59      1.00      0.74     25732
        other       0.00      0.00      0.00       661

     accuracy                           0.59     43433
    macro avg       0.08      0.14      0.11     43433
 weighted avg       0.35      0.59      0.44     43433

