## Predict Race/Ethnicity from Unseen Last Name Using Bigram MinHash LSH

Using the Florida Voting Registration data, we build a knn classifier that predicts the ethnicity of an **unseen** name. We estimate distance between names using cosine distance across bi-char tokens of the name.

In [55]:
import collections
import multiprocessing as mp
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from tqdm import tqdm

from datasketch import MinHashLSHForest, MinHash

### Read in the data + Normalize Last Name

In [2]:
# Florida voter file
df = pd.read_csv('data/fl_reg_name_race_2022.csv.gz', usecols=['name_last', 'race'])
df.dropna(subset=['name_last'], inplace=True)

# We assume unknown as missing at random
sdf = df[df.race.isin(['unknown']) == False]
del df

# Setting consistent case for names
sdf['name_last'] = sdf.name_last.str.strip().str.lower()

sdf

Unnamed: 0,name_last,race
0,hessler-smith,nh_white
1,rogers,nh_white
2,bartolome,nh_white
3,bailey,nh_white
4,carlson,nh_white
...,...,...
15455105,ballew,nh_white
15455106,watts,nh_white
15455107,mcrae,nh_white
15455108,ward,nh_white


In [3]:
# check the different races filtered
sdf.race.value_counts()

nh_white         9446770
hispanic         2722579
nh_black         2086582
asian             329034
other             290262
multi_racial       85888
native_indian      48158
Name: race, dtype: int64

In [4]:
# Summing the count of each name & race combination
gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count'])

In [5]:
# creating a pivot table so that each name has a count of the # of races with that last name
gdf = gdf.pivot_table(values='count', columns='race',index='name_last')

# Converting NaN to zeros since that means there is no one that identifies with that race with that last name
gdf = gdf.fillna(0)

# Getting the totals of each last name
gdf['total_n'] = gdf.sum(axis=1)
gdf.reset_index(inplace=True)

In [6]:
gdf.head(15)

race,name_last,asian,hispanic,multi_racial,native_indian,nh_black,nh_white,other,total_n
0,*,957.0,8847.0,416.0,239.0,10408.0,53943.0,1009.0,75819.0
1,a,0.0,1.0,0.0,0.0,0.0,2.0,0.0,3.0
2,a arup,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,a bitang,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,a de feria,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
5,a f r stephenson,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
6,a felix,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
7,a ghaffar,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
8,a latif,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9,a lauture,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [7]:
races = sorted(sdf.race.unique().tolist())
races

['asian',
 'hispanic',
 'multi_racial',
 'native_indian',
 'nh_black',
 'nh_white',
 'other']

In [8]:
%%time
gdf.iloc[:, 1:] = gdf.iloc[:, 1:].div(gdf.total_n, axis=0)
gdf

CPU times: user 29.8 ms, sys: 58.4 ms, total: 88.2 ms
Wall time: 58.6 ms


race,name_last,asian,hispanic,multi_racial,native_indian,nh_black,nh_white,other,total_n
0,*,0.012622,0.116686,0.005487,0.003152,0.137274,0.711471,0.013308,1.0
1,a,0.000000,0.333333,0.000000,0.000000,0.000000,0.666667,0.000000,1.0
2,a arup,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0
3,a bitang,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,1.0
4,a de feria,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
...,...,...,...,...,...,...,...,...,...
1056787,zyzanski,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0
1056788,zyzdryn,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0
1056789,zyznomyrsky,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0
1056790,zzaman,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.0


In [9]:
def get_race_idx(val, races):
    race_idx = races.index(val)
    return race_idx

In [10]:
# for one set of analysis, we define 'true race/ethincity' = where max prob (so modal race = true race)
gdf['true_race'] = gdf[races].idxmax(axis=1)
gdf['true_race'] = gdf['true_race'].apply(lambda c: get_race_idx(c,races))

In [11]:
gdf

race,name_last,asian,hispanic,multi_racial,native_indian,nh_black,nh_white,other,total_n,true_race
0,*,0.012622,0.116686,0.005487,0.003152,0.137274,0.711471,0.013308,1.0,5
1,a,0.000000,0.333333,0.000000,0.000000,0.000000,0.666667,0.000000,1.0,5
2,a arup,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0,5
3,a bitang,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,1.0,4
4,a de feria,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,1
...,...,...,...,...,...,...,...,...,...,...
1056787,zyzanski,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0,5
1056788,zyzdryn,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0,5
1056789,zyznomyrsky,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.0,5
1056790,zzaman,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.0,6


## Data Processing

In [12]:
proto_df = gdf.groupby('true_race', group_keys=False).apply(lambda x: x.sample(frac=1, random_state=10))
proto_df.reset_index(inplace=True)
proto_df.drop('index', axis=1, inplace=True)
proto_df.shape

(1056792, 10)

In [13]:
# Check that our sampling has generated proportionate representation in all classes
proto_df.true_race.value_counts()

5    561509
1    324712
4    103749
0     36794
6     21295
2      6231
3      2502
Name: true_race, dtype: int64

In [14]:
train_df, test_df = train_test_split(proto_df, test_size=.05)
train_df, valid_df = train_test_split(train_df, test_size=.05)

train_df.reset_index(inplace=True)
valid_df.reset_index(inplace=True)
test_df.reset_index(inplace=True)

print('Training set size: {}'.format(train_df.shape))
print('Validation set size: {}'.format(valid_df.shape))
print('Test set size: {}'.format(test_df.shape))

Training set size: (953754, 11)
Validation set size: (50198, 11)
Test set size: (52840, 11)


In [15]:
def get_bigrams(text):
    bigrams = []
    for i in range(len(text) - 1):
        bigram = text[i:i + 2]
        bigrams.append(bigram)
    return bigrams

In [16]:
num_perm = 256  # Specify the desired number of permutations
forest = MinHashLSHForest(num_perm=num_perm)

# Step 3: Iterate over each row in the sparse matrix
for i in range(train_df.shape[0]):
    # Step 4: Create a new MinHash object for each row
    minhash = MinHash(num_perm=num_perm)
    set_of_bigrams = get_bigrams(train_df.name_last[i])
    for term in set_of_bigrams:
        minhash.update(term.encode('utf-8'))

    key = f"{i}"
    forest.add(key, minhash)

In [17]:
# Index the forest
forest.index()

# Query the forest to find the k-nearest neighbors for a given query document
query = test_df.name_last[0]
query_minhash = MinHash(num_perm=256)
query_terms = get_bigrams(test_df.name_last[0])
for j in query_terms:
    query_minhash.update(j.encode('utf-8'))

# Step 11: Find the nearest neighbors using KNN search
k = 5  # Number of nearest neighbors to retrieve
result = forest.query(query_minhash, k)

print(query)
# Step 12: Print the nearest neighbors
for key in result:
    print("Nearest neighbor:", train_df.name_last[int(key)])

birdyshaw
Nearest neighbor: adyshov
Nearest neighbor: shaudys
Nearest neighbor: birdashaw
Nearest neighbor: houdyshell
Nearest neighbor: woodyshek


## Find the best K

In [52]:
from sklearn.metrics import classification_report

def estimate_knn_performance(forest, test_df, k_values):
    performance = {}

    for k in k_values:
        correct_predictions = 0
        total_examples = 0
        predicted_labels = []

        for i in range(len(test_df)):
            query_minhash = MinHash(num_perm=256)
            query_terms = get_bigrams(test_df.name_last[i])
            for term in query_terms:
                query_minhash.update(term.encode('utf-8'))

            result = forest.query(query_minhash, k)
            label_counts = {}

            for j in result:
                try:
                    index = int(j)
                    predicted_label = train_df.true_race[index]

                    if predicted_label in label_counts:
                        label_counts[predicted_label] += 1
                    else:
                        label_counts[predicted_label] = 1

                except (KeyError, ValueError):
                    continue

            # Determine the predicted label based on the majority count
            predicted_label = max(label_counts, key=label_counts.get)
            predicted_labels.append(predicted_label)

            # Compare the predicted label with the true label
            true_label = test_df.true_race[i]
            if predicted_label == true_label:
                correct_predictions += 1

            total_examples += 1

        accuracy = correct_predictions / total_examples
        performance[k] = accuracy
        report = classification_report(pd.Series(test_df.true_race).replace(replacement).to_list(), pd.Series(predicted_labels).replace(replacement).to_list(), zero_division='warn')
        print(f"Classification Report (k={k}):\n{report}\n")

    return performance

In [53]:
%%time
k_values = [3, 5, 25]
performance = estimate_knn_performance(forest, valid_df, k_values)
print(performance)

Classification Report (k=3):
               precision    recall  f1-score   support

        asian       0.30      0.20      0.24      1707
     hispanic       0.76      0.79      0.78     15490
 multi_racial       0.02      0.01      0.01       334
native_indian       0.00      0.00      0.00       112
     nh_black       0.42      0.32      0.36      4942
     nh_white       0.76      0.81      0.78     26609
        other       0.09      0.04      0.06      1004

     accuracy                           0.71     50198
    macro avg       0.34      0.31      0.32     50198
 weighted avg       0.69      0.71      0.70     50198


Classification Report (k=5):
               precision    recall  f1-score   support

        asian       0.37      0.15      0.22      1707
     hispanic       0.77      0.80      0.79     15490
 multi_racial       0.00      0.00      0.00       334
native_indian       0.00      0.00      0.00       112
     nh_black       0.46      0.29      0.36      4942
  

  _warn_prf(average, modifier, msg_start, len(result))


Classification Report (k=25):
               precision    recall  f1-score   support

        asian       0.49      0.06      0.10      1707
     hispanic       0.76      0.79      0.78     15490
 multi_racial       0.00      0.00      0.00       334
native_indian       0.00      0.00      0.00       112
     nh_black       0.55      0.18      0.28      4942
     nh_white       0.73      0.88      0.80     26609
        other       0.00      0.00      0.00      1004

     accuracy                           0.73     50198
    macro avg       0.36      0.27      0.28     50198
 weighted avg       0.69      0.73      0.69     50198


{3: 0.7138929837842145, 5: 0.7303279015100204, 25: 0.7313438782421611}
CPU times: user 13min 52s, sys: 0 ns, total: 13min 52s
Wall time: 13min 51s


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
estimate_knn_performance(forest, test_df, [5])

Classification Report (k=5):
               precision    recall  f1-score   support

        asian       0.38      0.16      0.22      1870
     hispanic       0.77      0.80      0.79     16243
 multi_racial       0.00      0.00      0.00       298
native_indian       0.00      0.00      0.00       114
     nh_black       0.45      0.31      0.37      5048
     nh_white       0.75      0.84      0.80     28200
        other       0.13      0.02      0.03      1067

     accuracy                           0.73     52840
    macro avg       0.36      0.30      0.31     52840
 weighted avg       0.70      0.73      0.71     52840




{5: 0.7327592732778199}