## Predict Race/Ethnicity from Unseen Last Name Using KNN (Cosine Distance)

Using the Florida voter registration data, we build a k-nearest-neighbors (KNN) classifier that predicts the race/ethnicity associated with an **unseen** last name. We measure how close two names are with the cosine similarity of their TF-IDF-weighted character-bigram vectors.

In [1]:
import collections
import multiprocessing as mp
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
# Load the Florida voter-registration last-name data.
# Columns used later in this notebook: `name_last` (the name text) and `race` (the label).
df = pd.read_csv('data/fl_2022_LastName.csv.gz')

In [3]:
# Distinct labels of the `race` column in alphabetical order.
# The position of a label in this list is the class index produced by the classifier below.
races = sorted(df['race'].unique().tolist())
races

['asian', 'hispanic', 'nh_black', 'nh_white', 'other']

In [6]:
%%time
# build n-gram list
NGRAMS = 2
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=.005, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
tfidf_transformer = TfidfTransformer()

a = vect.fit_transform(df.name_last) 
tfidf = tfidf_transformer.fit_transform(a)

vocab = vect.vocabulary_

CPU times: user 8.86 s, sys: 1.55 s, total: 10.4 s
Wall time: 10.4 s


In [7]:
# Total count of every retained n-gram, obtained with a single column-wise
# reduction rather than slicing the sparse matrix once per vocabulary entry.
ngram_totals = np.asarray(a.sum(axis=0)).ravel()

# (total count, n-gram) pairs in the iteration order of `vocab`.
words = [(ngram_totals[col], ngram) for ngram, col in vocab.items()]

# NOTE: `words_list` follows the iteration order of the `vocab` dict, which is
# not the column order of `a` / `tfidf`; use `vocab[ngram]` for column indices.
words_list = [ngram for _, ngram in words]
print(words_list[1:10])
num_words = len(words_list)
print("num_words = %d" % num_words)

['Ar', 'ru', ' B', 'Bi', 'it', 'ta', 'an', 'ng', ' D']
num_words = 394


In [8]:
def find_ngrams(text, n):
    """Encode ``text`` as the ``words_list`` positions of its character n-grams.

    Parameters
    ----------
    text : str
        The string to tokenise with a sliding window of step 1.
    n : int
        Length of each n-gram.

    Returns
    -------
    list of int
        One entry per n-gram, in order of appearance; empty when ``text`` is
        shorter than ``n``. An n-gram that is absent from the module-level
        ``words_list`` is encoded as 0, which is also the position of the
        first entry of ``words_list`` - callers cannot tell the two apart.
    """
    indices = []
    # Zipping the n shifted copies of the text yields every window of n characters.
    for chars in zip(*[text[i:] for i in range(n)]):
        ngram = ''.join(chars)
        try:
            indices.append(words_list.index(ngram))
        except ValueError:
            # Only "not in the vocabulary" is expected here; any other error
            # (including KeyboardInterrupt) must propagate.
            indices.append(0)
    return indices

In [10]:
# Store each row's index label in a column so that, after the rows are shuffled
# into train/validation/test frames, a row can still be matched to its row of `tfidf`.
# NOTE(review): this assumes `df` still has its default 0..n-1 index, so that
# labels equal row positions in `tfidf` - confirm if `df` is ever filtered or re-indexed.
df['tfidf_index'] = df.index

In [11]:
# Fixed seed so that the split, and every metric derived from it, is the same
# after a kernel restart.
SPLIT_SEED = 42

# Hold out 5% of the names for testing, then 5% of the remainder for validation.
train_df, test_df = train_test_split(df, test_size=.05, random_state=SPLIT_SEED)
train_df, valid_df = train_test_split(train_df, test_size=.05, random_state=SPLIT_SEED)

# Renumber the rows 0..n-1. The previous index is deliberately kept as an
# `index` column (no drop=True), so the column layout is unchanged.
train_df = train_df.reset_index()
valid_df = valid_df.reset_index()
test_df = test_df.reset_index()

print('Training set size: {}'.format(train_df.shape))
print('Validation set size: {}'.format(valid_df.shape))
print('Test set size: {}'.format(test_df.shape))

Training set size: (953621, 11)
Validation set size: (50191, 11)
Test set size: (52833, 11)


## Find the best K

In [12]:
# Predict Race Using Cosine Distance

def predict_cosine_race(arg):
    """Predict the class index of one name from its k most similar corpus names.

    Parameters
    ----------
    arg : tuple
        ``(idx, row_data, test_df, corpus_df, corp_vector, k)``:

        * ``row_data`` - the row being classified; its ``tfidf_index`` entry
          selects the row of the module-level ``tfidf`` matrix to use.
        * ``corpus_df`` - reference frame holding one column per entry of
          the module-level ``races`` list.
        * ``corp_vector`` - the TF-IDF rows of ``corpus_df``, in the same
          row order as ``corpus_df``.
        * ``k`` - number of nearest neighbours to average over.
        * ``idx`` and ``test_df`` are accepted so existing callers keep
          working, but are not used: this function runs on worker threads
          and must not write to a shared DataFrame.

    Returns
    -------
    int
        Position in ``races`` of the column with the highest mean over the
        k most similar corpus rows.
    """
    _idx, row_data, _test_df, corpus_df, corp_vector, k = arg

    # TF-IDF vector of the query name as a (1, m) matrix, m = number of dimensions.
    orig_vector = tfidf[row_data['tfidf_index']].reshape(1, -1)

    # Cosine similarity between the query name and every corpus name.
    cossim = cosine_similarity(orig_vector, corp_vector)

    # Sort ascending, reverse, and keep the k most similar corpus rows.
    nearest = np.flip(cossim.flatten().argsort())[:k]
    cossim_df = corpus_df.iloc[nearest]

    # Average the per-race columns over the neighbours and pick the largest.
    return cossim_df[races].mean().argmax()


def check_cosine_k(test_df, corpus_df, k, processes=8):
    """Classify every row of ``test_df`` against ``corpus_df`` with k neighbours.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Rows to classify; needs a ``tfidf_index`` column. A ``pred_race``
        column holding the predicted class index is written to it.
    corpus_df : pandas.DataFrame
        Reference rows; needs ``tfidf_index`` and one column per entry of
        the module-level ``races`` list.
    k : int
        Number of nearest neighbours.
    processes : int, optional
        Number of worker threads (default 8, the previously hard-coded value).

    Returns
    -------
    list
        A one-element list wrapping the list of predicted class indices, in
        the row order of ``test_df``.
    """
    # Imported explicitly: `multiprocessing.pool` is a submodule and is not
    # guaranteed to be reachable as an attribute after `import multiprocessing`.
    from multiprocessing.pool import ThreadPool

    corp_vector = tfidf[corpus_df['tfidf_index']]

    tasks = [(idx, row, test_df, corpus_df, corp_vector, k)
             for idx, row in test_df.iterrows()]

    # The context manager releases the worker threads even if a task raises.
    with ThreadPool(processes=processes) as pool:
        predictions = pool.map(predict_cosine_race, tasks)

    # Record the predictions once, from the calling thread, instead of letting
    # every worker write to the shared frame concurrently.
    test_df['pred_race'] = predictions

    return [predictions]


In [13]:
# Candidate neighbour counts to evaluate. Each value starts as the placeholder 0
# and is replaced with the classification report obtained for that k.
k_metrics = dict.fromkeys((3, 5, 25), 0)

In [15]:
# Ground-truth labels of the validation rows, in row order.
# Reading the column directly avoids building a Series per row with iterrows().
true_list = valid_df['race'].tolist()

In [16]:
# Class index -> label. Derived from `races` so that it always agrees with the
# column order the classifier takes its argmax over.
replacement = dict(enumerate(races))

In [18]:
%%time
for value, key in enumerate (k_metrics):
    #print ('{} -- {}'.format(key, value))
    result = check_cosine_k(valid_df, train_df, key)
    
    pred_list = np.array(result).reshape(-1)
    pred_list = pred_list.tolist()
    
    true_list = pd.Series(true_list).replace(replacement).to_list()
    pred_list = pd.Series(pred_list).replace(replacement).to_list()
    
    value = classification_report(true_list, pred_list, zero_division = 0)
    
    print ('for value of k: {} \n{}'.format(key, value))
    k_metrics[key] = value

for value of k: 3 
              precision    recall  f1-score   support

       asian       0.34      0.24      0.28      1657
    hispanic       0.85      0.82      0.83     15573
    nh_black       0.52      0.39      0.44      4974
    nh_white       0.78      0.87      0.82     26644
       other       0.16      0.05      0.08      1343

    accuracy                           0.76     50191
   macro avg       0.53      0.47      0.49     50191
weighted avg       0.74      0.76      0.75     50191

for value of k: 5 
              precision    recall  f1-score   support

       asian       0.48      0.19      0.27      1657
    hispanic       0.87      0.82      0.85     15573
    nh_black       0.55      0.38      0.45      4974
    nh_white       0.77      0.90      0.83     26644
       other       0.20      0.04      0.06      1343

    accuracy                           0.78     50191
   macro avg       0.57      0.47      0.49     50191
weighted avg       0.76      0.78      

## Test Set evaluation

In [19]:
%%time
# Classify the held-out test rows against the training corpus with k=5.
# NOTE(review): k=5 is presumably the value picked from the validation sweep - confirm.
result = check_cosine_k(test_df, train_df, 5)

CPU times: user 5h 46min 16s, sys: 47min 32s, total: 6h 33min 48s
Wall time: 1h 19min 7s


In [20]:
# check_cosine_k returns a nested list; flatten it to one class index per
# test row, then translate indices to label names.
pred_list = np.array(result).reshape(-1).tolist()
pred_list = pd.Series(pred_list).replace(replacement).to_list()

# Ground-truth labels of the test rows, in row order. They are already
# strings, so they need no index -> label mapping.
true_list = test_df['race'].tolist()

value = classification_report(true_list, pred_list, zero_division = 0)

print ('for value of k: {} \n{}'.format(5, value))

for value of k: 5 
              precision    recall  f1-score   support

       asian       0.47      0.19      0.27      1812
    hispanic       0.88      0.82      0.85     16243
    nh_black       0.56      0.38      0.45      5145
    nh_white       0.77      0.90      0.83     28183
       other       0.22      0.04      0.07      1450

    accuracy                           0.78     52833
   macro avg       0.58      0.47      0.49     52833
weighted avg       0.76      0.78      0.76     52833

