## Predict Race/Ethnicity of Unseen Last Name Using KNN (Cosine Distance and Levenshtein)

In [1]:
import collections
import Levenshtein as lv
import multiprocessing as mp
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
# Florida voter file
df = pd.read_csv('./dataverse_files/fl_reg_name_race.csv.gz', usecols=['name_last', 'race'])
df.dropna(subset=['name_last'], inplace=True)

# We assume unknown as missing at random
sdf = df[df.race.isin(['unknown']) == False]
del df

# Setting consistent case for names
sdf['name_last'] = sdf.name_last.str.strip().str.lower()

sdf

#sdf = df[df.race.isin(['multi_racial', 'native_indian', 'other', 'unknown']) == False]

Unnamed: 0,name_last,race
0,walker,nh_white
1,palmer,nh_white
2,mc cleod,nh_black
3,scarborough,nh_white
4,walker,nh_white
...,...,...
13653889,walters,nh_white
13653890,sawyer,nh_white
13653891,thomas,nh_white
13653892,campbell,multi_racial


In [3]:
# check the different races filtered
sdf.race.value_counts()

nh_white         8714118
hispanic         2174408
nh_black         1847266
asian             253306
other             208250
multi_racial       94119
native_indian      45459
Name: race, dtype: int64

In [4]:
# Summing the count of each name & race combination
gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count'])

In [5]:
# creating a pivot table so that each name has a count of the # of races with that last name
gdf = gdf.pivot_table(values='count', columns='race',index='name_last')

# Converting NaN to zeros since that means there is no one that identifies with that race with that last name
gdf = gdf.fillna(0)

# Getting the totals of each last name
#gdf['total_n'] = gdf.sum(axis=1)
gdf.reset_index(inplace=True)

In [6]:
gdf.head(15)

race,name_last,asian,hispanic,multi_racial,native_indian,nh_black,nh_white,other
0,0kharitonenko,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1amirthanayagam,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4r,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,77348 dancing rochanavibhata,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,a de feria,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,a de fernandez,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,a f r stephenson,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,a felix,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,a ghaffar,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,a malivert,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [7]:
races = sorted(sdf.race.unique().tolist())
races

['asian',
 'hispanic',
 'multi_racial',
 'native_indian',
 'nh_black',
 'nh_white',
 'other']

In [8]:
def get_race_idx(val, races):
    race_idx = races.index(val)
    return race_idx

In [9]:
# for one set of analysis, we define 'true race/ethincity' = where max prob (so modal race = true race)

gdf['true_race'] = gdf[races].idxmax(axis=1)
gdf['true_race'] = gdf['true_race'].apply(lambda c: get_race_idx(c,races))

In [10]:
gdf

race,name_last,asian,hispanic,multi_racial,native_indian,nh_black,nh_white,other,true_race
0,0kharitonenko,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5
1,1amirthanayagam,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,4r,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5
3,77348 dancing rochanavibhata,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,a de feria,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...
868653,zyzanski,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5
868654,zyzdryn,0.0,0.0,0.0,0.0,0.0,2.0,0.0,5
868655,zyznomyrsky,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5
868656,zzaman,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0


## Data Processing

In [11]:
proto_df = gdf.groupby('true_race', group_keys=False).apply(lambda x: x.sample(frac=1, random_state=10))
proto_df.reset_index(inplace=True)
proto_df.drop('index', axis=1, inplace=True)
proto_df.shape

(868658, 9)

In [12]:
# Check that our sampling has generated proportionate representation in all classes
proto_df.true_race.value_counts()

5    514516
1    213834
4     87831
0     30058
6     12600
2      7538
3      2281
Name: true_race, dtype: int64

In [13]:
%%time
# build n-gram list
NGRAMS = 2
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=.005, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
tfidf_transformer = TfidfTransformer()

a = vect.fit_transform(proto_df.name_last) 
tfidf = tfidf_transformer.fit_transform(a)

vocab = vect.vocabulary_

CPU times: user 6.67 s, sys: 50.6 ms, total: 6.72 s
Wall time: 6.73 s


In [14]:
words = []
for b in vocab:
    c = vocab[b]
    words.append((a[:, c].sum(), b))

words_list = [w[1] for w in words]
print(words_list[1:10])
num_words = len(words_list)
print("num_words = %d" % num_words)

['er', 'rs', 'sa', 'ab', 'ba', 'al', 'pa', 'at', 'tt']
num_words = 320


In [15]:
def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

In [16]:
# build tf-idf vectors
proto_df['tfidf_index'] = proto_df.index

In [17]:
proto_df

race,name_last,asian,hispanic,multi_racial,native_indian,nh_black,nh_white,other,true_race,tfidf_index
0,bersabal,5.0,0.0,0.0,0.0,0.0,1.0,2.0,0,0
1,pattana,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
2,patankar,5.0,1.0,1.0,0.0,0.0,0.0,0.0,0,2
3,kalapa,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3
4,pao,18.0,14.0,0.0,0.0,1.0,6.0,3.0,0,4
...,...,...,...,...,...,...,...,...,...,...
868653,maravall,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6,868653
868654,tisoh,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6,868654
868655,basanta perez,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6,868655
868656,marcenaro vizquerra,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6,868656


In [18]:
train_df, test_df = train_test_split(proto_df, test_size=.05)
train_df, valid_df = train_test_split(train_df, test_size=.05)

train_df.reset_index(inplace=True)
valid_df.reset_index(inplace=True)
test_df.reset_index(inplace=True)

print('Training set size: {}'.format(train_df.shape))
print('Validation set size: {}'.format(valid_df.shape))
print('Test set size: {}'.format(test_df.shape))

Training set size: (783963, 11)
Validation set size: (41262, 11)
Test set size: (43433, 11)


# Find the Best K
i.e. the nearest k neighbors in our vector

In [19]:

def predict_race(arg):
    # reading the tuple passed on by the calling function
    idx, row_data, test_df, corpus_df, corp_vector, k, filt = arg

    # resizing the tf-idf (1, m) & corpus vectors to be (n, m)
    #  n = number of samples
    #  m = number of dimentions
    orig_vector = tfidf[row_data['tfidf_index']].reshape(1, -1)
    # corp_vector = np.array([x for x in corpus_df['tfidf']])

    # calculating the cosine similarity beteween the name vector
    #   and the corpus vectors.  Then filtering for only values
    #   that are greater that what was passed on
    cossim = cosine_similarity(orig_vector, corp_vector)
    filt_result = np.argwhere(cossim >= filt).reshape(-1)

    # if we don't get any matches on cosine similarity >= "value"
    #    we open up the critiria to 0.1 to get something
    if (len(filt_result) == 0):
        # this is to handle if we still are not getting anything
        #  after opening up cosine similarity.  Just return the
        #  most common class, which is nh_white/3
        return 3

    # filtering the corpus dataframe to only inclue the items
    #   that met the cosine similarity filter
    filtered_corpus_df = corpus_df.iloc[filt_result]

    # calculate the levenshtein distance between our vector
    #   and the filtered corpus vectors.
    # Levenshtein is an expensive operation so we don't
    #   want to calculate it for every name in the corpus
    lev_dist = calc_leven_vector(row_data['name_last'],
                          filtered_corpus_df['name_last'])

    # The calc_leven function returns a dictionary
    #  we seperate the keys from the values into arrays
    #  that we can use which names are the most similar
    #  i.e. smallest levenstein distance
    values = np.array(list(lev_dist.values()))
    keys = np.array(list(lev_dist.keys()))

    if (k < values.shape[0]):
        # This is when k is smaller than the size of the
        #   values array, we can partition it by the smallest
        #   k values
        filt_values = np.argpartition(values, k)
        max_value = np.max(values[filt_values[:k]])
    else:
        # Otherwise whatever the filt_value are will be the
        #   k nearest neighbors to our string
        filt_values = values.shape[0] - 1
        max_value = np.max(values[filt_values])

    # if (isinstance(filt_values, np.ndarray)):
    #     max_value = np.max(values[filt_values[:k]])
    # else:
    #     max_value = np.max(values[filt_values])

    mask = (values <= max_value) & (values > 0)
    mask_idx = np.argwhere(mask).reshape(-1)
    df_idx = keys[mask_idx]    
    
    filter_df = corpus_df.iloc[df_idx]

    pred_race = filter_df[races].mean().argmax()
    
    test_df.loc[test_df['name_last'] == row_data['name_last'],
                'pred_race'] = pred_race

    return pred_race


def calc_leven(orig_string, filt_df):
    lev_dist = {}
    if not (isinstance(filt_df, str)):
        for idx, row in filt_df.iteritems():
            lev = lv.distance(orig_string, row)
            lev_dist[idx] = lev
    else:
        lev = lv.distance(orig_string, filt_df)
        lev_dist[0] = lev
    return lev_dist

def calc_leven_vector(orig_string, filt_df):
    if not (isinstance(filt_df, str)):
        lev_dist = filt_df.apply(lambda c: lv.distance(orig_string, c))
        return lev_dist.to_dict()
    else:
        lev = lv.distance(orig_string, filt_df)
        return {0: lev}

def check_k(test_df, corpus_df, k, filt):
    results = []

    num_cpu = mp.cpu_count() 
    pool = mp.pool.ThreadPool(processes=8)

    corp_vector = tfidf[corpus_df['tfidf_index']]

    # for idx, row in tqdm(test_df.iterrows()):
    r = pool.map(predict_race, [(idx, row, test_df, corpus_df, corp_vector, k, filt)
                                for idx, row in test_df.iterrows()])
    results.append(r)

    pool.close()
    pool.join()

    return results

In [20]:
k_metrics = {
    3:0,
    5:0,
    25:0
}

In [21]:
# Convert numbers to strings
replacement = {0: 'asian', 1: 'hispanic', 2: 'multi_racial', 3: 'native_indian', 4: 'nh_black', 5: 'nh_white', 6: 'other'}

In [22]:
true_list = []
for idx, row in valid_df.iterrows():
    true_list.append(row['true_race'])

In [23]:
%%time
for value, key in enumerate (k_metrics):
    #print ('{} -- {}'.format(key, value))
    result = check_k(valid_df, train_df, key, 0.6)
    
    pred_list = np.array(result).reshape(-1)
    pred_list = pred_list.tolist()
    
    true_list = pd.Series(true_list).replace(replacement).to_list()
    pred_list = pd.Series(pred_list).replace(replacement).to_list()
    
    value = classification_report(true_list, pred_list, zero_division = 0)
    
    print ('for value of k: {} \n{}'.format(key, value))
    k_metrics[key] = value

for value of k: 3 
               precision    recall  f1-score   support

        asian       0.48      0.24      0.32      1411
     hispanic       0.81      0.80      0.80     10232
 multi_racial       0.06      0.01      0.01       334
native_indian       0.02      0.02      0.02       129
     nh_black       0.54      0.42      0.47      4177
     nh_white       0.80      0.89      0.84     24384
        other       0.15      0.02      0.03       595

     accuracy                           0.77     41262
    macro avg       0.41      0.34      0.36     41262
 weighted avg       0.75      0.77      0.75     41262

for value of k: 5 
               precision    recall  f1-score   support

        asian       0.48      0.21      0.29      1411
     hispanic       0.81      0.79      0.80     10232
 multi_racial       0.00      0.00      0.00       334
native_indian       0.03      0.02      0.02       129
     nh_black       0.55      0.39      0.45      4177
     nh_white       0.7

# Test Set evaluation

In [24]:
%%time
result = check_k(test_df, train_df, 3, 0.6)

CPU times: user 1h 11min 32s, sys: 11min 55s, total: 1h 23min 27s
Wall time: 33min 29s


In [25]:
pred_list = np.array(result).reshape(-1)
pred_list = pred_list.tolist()

true_list = []
for idx, row in test_df.iterrows():
    true_list.append(row['true_race'])

true_list = pd.Series(true_list).replace(replacement).to_list()
pred_list = pd.Series(pred_list).replace(replacement).to_list()

value = classification_report(true_list, pred_list, zero_division = 0)

print(value)
print()

               precision    recall  f1-score   support

        asian       0.48      0.23      0.31      1510
     hispanic       0.80      0.80      0.80     10634
 multi_racial       0.09      0.01      0.01       389
native_indian       0.00      0.00      0.00       119
     nh_black       0.54      0.43      0.48      4398
     nh_white       0.80      0.89      0.84     25733
        other       0.12      0.02      0.03       650

     accuracy                           0.77     43433
    macro avg       0.41      0.34      0.35     43433
 weighted avg       0.74      0.77      0.75     43433


