## Predict Race/Ethnicity from Unseen Last Name Using KNN (Cosine Distance)

Using the Florida Voting Registration data, we build a knn classifier that predicts the ethnicity of an **unseen** name. We estimate distance between names using cosine distance across bi-char tokens of the name.

In [1]:
import collections
import multiprocessing as mp
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
!pwd

/home/jupyter/notebooks/ethnicolr


### Read in the data + Normalize Last Name

In [3]:
# Florida voter file
df = pd.read_csv('data/fl_reg_name_race_2022.csv.gz', usecols=['name_last', 'race'])
df.dropna(subset=['name_last'], inplace=True)

# We assume unknown as missing at random
sdf = df[df.race.isin(['unknown']) == False]
del df

# Setting consistent case for names
sdf['name_last'] = sdf.name_last.str.strip().str.lower()

sdf

Unnamed: 0,name_last,race
0,hessler-smith,nh_white
1,rogers,nh_white
2,bartolome,nh_white
3,bailey,nh_white
4,carlson,nh_white
...,...,...
15455105,ballew,nh_white
15455106,watts,nh_white
15455107,mcrae,nh_white
15455108,ward,nh_white


In [4]:
# check the different races filtered
sdf.race.value_counts()

nh_white         9446770
hispanic         2722579
nh_black         2086582
asian             329034
other             290262
multi_racial       85888
native_indian      48158
Name: race, dtype: int64

In [5]:
# Summing the count of each name & race combination
gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count'])

In [6]:
# creating a pivot table so that each name has a count of the # of races with that last name
gdf = gdf.pivot_table(values='count', columns='race',index='name_last')

# Converting NaN to zeros since that means there is no one that identifies with that race with that last name
gdf = gdf.fillna(0)

# Getting the totals of each last name
gdf['total_n'] = gdf.sum(axis=1)
gdf.reset_index(inplace=True)

In [7]:
gdf.head(15)

race,name_last,asian,hispanic,multi_racial,native_indian,nh_black,nh_white,other,total_n
0,*,957.0,8847.0,416.0,239.0,10408.0,53943.0,1009.0,75819.0
1,a,0.0,1.0,0.0,0.0,0.0,2.0,0.0,3.0
2,a arup,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,a bitang,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,a de feria,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
5,a f r stephenson,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
6,a felix,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
7,a ghaffar,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
8,a latif,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9,a lauture,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [8]:
races = sorted(sdf.race.unique().tolist())
races

['asian',
 'hispanic',
 'multi_racial',
 'native_indian',
 'nh_black',
 'nh_white',
 'other']

In [9]:
%%time
gdf.iloc[:, 1:] = gdf.iloc[:, 1:].div(gdf.total_n, axis=0)
gdf

CPU times: user 145 ms, sys: 64.3 ms, total: 209 ms
Wall time: 185 ms


race,name_last,asian,hispanic,multi_racial,native_indian,nh_black,nh_white,other,total_n
0,*,0.012622,0.116686,0.005487,0.003152,0.137274,0.711471,0.013308,1.0
1,a,0.000000,0.333333,0.000000,0.000000,0.000000,0.666667,0.000000,1.0
2,a arup,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0
3,a bitang,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,1.0
4,a de feria,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
...,...,...,...,...,...,...,...,...,...
1056787,zyzanski,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0
1056788,zyzdryn,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0
1056789,zyznomyrsky,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0
1056790,zzaman,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.0


In [10]:
def get_race_idx(val, races):
    race_idx = races.index(val)
    return race_idx

In [11]:
# for one set of analysis, we define 'true race/ethincity' = where max prob (so modal race = true race)
gdf['true_race'] = gdf[races].idxmax(axis=1)
gdf['true_race'] = gdf['true_race'].apply(lambda c: get_race_idx(c,races))

In [12]:
gdf

race,name_last,asian,hispanic,multi_racial,native_indian,nh_black,nh_white,other,total_n,true_race
0,*,0.012622,0.116686,0.005487,0.003152,0.137274,0.711471,0.013308,1.0,5
1,a,0.000000,0.333333,0.000000,0.000000,0.000000,0.666667,0.000000,1.0,5
2,a arup,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0,5
3,a bitang,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,1.0,4
4,a de feria,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,1
...,...,...,...,...,...,...,...,...,...,...
1056787,zyzanski,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0,5
1056788,zyzdryn,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0,5
1056789,zyznomyrsky,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0,5
1056790,zzaman,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.0,6


## Data Processing

In [13]:
proto_df = gdf.groupby('true_race', group_keys=False).apply(lambda x: x.sample(frac=1, random_state=10))
proto_df.reset_index(inplace=True)
proto_df.drop('index', axis=1, inplace=True)
proto_df.shape

(1056792, 10)

In [14]:
# Check that our sampling has generated proportionate representation in all classes
proto_df.true_race.value_counts()

5    561509
1    324712
4    103749
0     36794
6     21295
2      6231
3      2502
Name: true_race, dtype: int64

In [15]:
%%time
# build n-gram list
NGRAMS = 2
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=.005, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
tfidf_transformer = TfidfTransformer()

a = vect.fit_transform(proto_df.name_last) 
tfidf = tfidf_transformer.fit_transform(a)

vocab = vect.vocabulary_

CPU times: user 35 s, sys: 1.53 s, total: 36.6 s
Wall time: 37.2 s


In [16]:
words = []
for b in vocab:
    c = vocab[b]
    words.append((a[:, c].sum(), b))

words_list = [w[1] for w in words]
print(words_list[1:10])
num_words = len(words_list)
print("num_words = %d" % num_words)

['ho', 'om', 'mm', 'ma', 'ac', 'ch', 'ha', 'ck', 'ja']
num_words = 322


In [17]:
def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

In [18]:
proto_df['tfidf_index'] = proto_df.index

In [19]:
proto_df

race,name_last,asian,hispanic,multi_racial,native_indian,nh_black,nh_white,other,total_n,true_race,tfidf_index
0,phommachack,1.000,0.0,0.0,0.0,0.0,0.000,0.00,1.0,0,0
1,janiola,1.000,0.0,0.0,0.0,0.0,0.000,0.00,1.0,0,1
2,sondagar,1.000,0.0,0.0,0.0,0.0,0.000,0.00,1.0,0,2
3,chayangkoor,1.000,0.0,0.0,0.0,0.0,0.000,0.00,1.0,0,3
4,siriphanthong,0.625,0.0,0.0,0.0,0.0,0.125,0.25,1.0,0,4
...,...,...,...,...,...,...,...,...,...,...,...
1056787,iporac,0.000,0.0,0.0,0.0,0.0,0.000,1.00,1.0,6,1056787
1056788,fonseca sierra,0.000,0.0,0.0,0.0,0.0,0.000,1.00,1.0,6,1056788
1056789,seebarrana,0.000,0.0,0.0,0.0,0.0,0.000,1.00,1.0,6,1056789
1056790,fonseca-nader,0.000,0.0,0.0,0.0,0.0,0.000,1.00,1.0,6,1056790


In [20]:
train_df, test_df = train_test_split(proto_df, test_size=.05)
train_df, valid_df = train_test_split(train_df, test_size=.05)

train_df.reset_index(inplace=True)
valid_df.reset_index(inplace=True)
test_df.reset_index(inplace=True)

print('Training set size: {}'.format(train_df.shape))
print('Validation set size: {}'.format(valid_df.shape))
print('Test set size: {}'.format(test_df.shape))

Training set size: (953754, 12)
Validation set size: (50198, 12)
Test set size: (52840, 12)


## Find the best K

In [21]:
# Predict Race Using Cosine Distance

def predict_cosine_race(arg):
    # reading the tuple passed on by the calling function
    idx, row_data, test_df, corpus_df, corp_vector, k = arg
    
    # resizing the tf-idf (1, m) & corpus vectors to be (n, m)
    #  n = number of samples
    #  m = number of dimentions
    orig_vector = tfidf[row_data['tfidf_index']].reshape(1, -1)

    # calculating the cosine similarity beteween the name vector
    #   and the corpus vectors.  Then filtering for only values
    #   that are greater that what was passed on
    cossim = cosine_similarity(orig_vector, corp_vector)
        
    # Order by cosine distance and pick top k
    cossim_df = corpus_df.iloc[np.flip(cossim.flatten().argsort())[:k]]
    
    pred_race = cossim_df[races].mean().argmax()
    test_df.loc[idx, 'pred_race'] = pred_race
        
    return pred_race

def check_cosine_k(test_df, corpus_df, k):
    results = []

    num_cpu = mp.cpu_count() 
    pool = mp.pool.ThreadPool(processes=8)

    corp_vector = tfidf[corpus_df['tfidf_index']]

    # for idx, row in tqdm(test_df.iterrows()):
    r = pool.map(predict_cosine_race, [(idx, row, test_df, corpus_df, corp_vector, k)
                                for idx, row in test_df.iterrows()])
    results.append(r)

    pool.close()
    pool.join()

    return results


In [22]:
k_metrics = {
    3:0,
    5:0,
    25:0,
}

In [23]:
# Convert numbers to strings
replacement = {0: 'asian', 1: 'hispanic', 2: 'multi_racial', 3: 'native_indian', 4: 'nh_black', 5: 'nh_white', 6: 'other'}

In [24]:
true_list = []
for idx, row in valid_df.iterrows():
    true_list.append(row['true_race'])

In [25]:
%%time
for value, key in enumerate (k_metrics):
    #print ('{} -- {}'.format(key, value))
    result = check_cosine_k(valid_df, train_df, key)
    
    pred_list = np.array(result).reshape(-1)
    pred_list = pred_list.tolist()
    
    true_list = pd.Series(true_list).replace(replacement).to_list()
    pred_list = pd.Series(pred_list).replace(replacement).to_list()
    
    value = classification_report(true_list, pred_list, zero_division = 0)
    
    print ('for value of k: {} \n{}'.format(key, value))
    k_metrics[key] = value

for value of k: 3 
               precision    recall  f1-score   support

        asian       0.35      0.24      0.28      1722
     hispanic       0.84      0.82      0.83     15330
 multi_racial       0.01      0.00      0.01       266
native_indian       0.00      0.00      0.00       128
     nh_black       0.53      0.40      0.46      4893
     nh_white       0.78      0.87      0.82     26824
        other       0.16      0.04      0.07      1035

     accuracy                           0.76     50198
    macro avg       0.38      0.34      0.35     50198
 weighted avg       0.74      0.76      0.75     50198

for value of k: 5 
               precision    recall  f1-score   support

        asian       0.50      0.19      0.28      1722
     hispanic       0.87      0.82      0.84     15330
 multi_racial       0.00      0.00      0.00       266
native_indian       0.00      0.00      0.00       128
     nh_black       0.56      0.39      0.46      4893
     nh_white       0.7

# Test Set evaluation

In [26]:
%%time
result = check_cosine_k(test_df, train_df, 5)

CPU times: user 10h 55min 11s, sys: 1h 27min 38s, total: 12h 22min 49s
Wall time: 5h 26min 46s


In [27]:
pred_list = np.array(result).reshape(-1)
pred_list = pred_list.tolist()

true_list = []
for idx, row in test_df.iterrows():
    true_list.append(row['true_race'])

true_list = pd.Series(true_list).replace(replacement).to_list()
pred_list = pd.Series(pred_list).replace(replacement).to_list()
    
value = classification_report(true_list, pred_list, zero_division = 0)

print ('for value of k: {} \n{}'.format(5, value))

for value of k: 5 
               precision    recall  f1-score   support

        asian       0.48      0.20      0.29      1821
     hispanic       0.87      0.81      0.84     16180
 multi_racial       0.12      0.00      0.01       327
native_indian       0.00      0.00      0.00       124
     nh_black       0.54      0.39      0.45      5063
     nh_white       0.77      0.90      0.83     28239
        other       0.19      0.03      0.05      1086

     accuracy                           0.78     52840
    macro avg       0.42      0.33      0.35     52840
 weighted avg       0.75      0.78      0.75     52840

