## Predict Race/Ethnicity from Unseen Full Name Using KNN LSH MinHash Parallelized

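Using the Florida voter registration data, we build a k-nearest-neighbors (KNN) classifier that predicts the race/ethnicity of an **unseen** full name. Each name is represented by its character bigrams, and neighbors are retrieved with a MinHash LSH forest, which approximates the Jaccard similarity between those bigram sets.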

In [1]:
import collections
import multiprocessing as mp
from multiprocessing import Pool
from functools import partial

import tempfile
import pickle
import os

import numpy as np
import pandas as pd

from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from datasketch import MinHashLSHForest, MinHash

In [2]:
# Florida voter registration splits; only the name and the race label are read.
train_df, val_df, test_df = (
    pd.read_csv(path, usecols=['full_name', 'race'])
    for path in (
        'data/fl_2022_FullName_train.csv.gz',
        'data/fl_2022_FullName_val.csv.gz',
        'data/fl_2022_FullName_test.csv.gz',
    )
)

In [3]:
# Keep 70% of every race group (fixed seed) and renumber the rows 0..n-1.
train_df = (
    train_df
    .groupby('race', group_keys=False)
    .apply(lambda group: group.sample(frac=.7, random_state=10))
    .reset_index(drop=True)
)
train_df.shape

(5050426, 2)

In [4]:
# Report the (rows, columns) shape of each split, one per line.
print(
    'Training set size: {}'.format(train_df.shape),
    'Validation set size: {}'.format(val_df.shape),
    'Test set size: {}'.format(test_df.shape),
    sep='\n',
)

Training set size: (5050426, 2)
Validation set size: (901862, 2)
Test set size: (901862, 2)


## Find the best K

In [5]:
def get_bigrams(text):
    """Return every overlapping two-element slice of ``text``, in order.

    A sequence of length ``n`` yields ``n - 1`` slices; inputs shorter than
    two elements yield an empty list. Duplicates are kept.
    """
    return [text[start:start + 2] for start in range(len(text) - 1)]

In [6]:
def create_minhash(row):
    """Build the MinHash signature of a single name.

    The name is split into bigrams with ``get_bigrams`` and each bigram is
    fed, UTF-8 encoded, into a ``MinHash`` sized by the module-level
    ``num_perm``.

    Parameters
    ----------
    row : the name to hash (passed straight to ``get_bigrams``).

    Returns
    -------
    MinHash
        Signature updated with every bigram of ``row``.
    """
    bigram_tokens = get_bigrams(row)
    signature = MinHash(num_perm=num_perm)
    for token in bigram_tokens:
        signature.update(token.encode('utf-8'))
    return signature

In [7]:
num_processes = mp.cpu_count()
num_perm = 256  # Specify the desired number of permutations

# Hashing a single name is very little work, so sending rows to the workers one
# at a time (imap's default chunksize=1) spends most of the time on inter-process
# traffic. A larger chunksize ships rows in batches; imap still yields results in
# input order, so minhashes[i] continues to correspond to train_df row i.
# mininterval throttles progress-bar redraws, which is intended to keep the
# notebook below the IOPub message rate limit.
with Pool(processes=num_processes) as pool:
    minhashes = list(tqdm(
        pool.imap(create_minhash, train_df.full_name, chunksize=1000),
        total=train_df.shape[0],
        desc='Processing rows',
        unit='row',
        mininterval=5,
    ))

# Index the forest. Each key is the row's position in train_df (as a string),
# which is how query results are mapped back to training labels.
forest = MinHashLSHForest(num_perm=num_perm)
for i, minhash in enumerate(minhashes):
    key = f"{i}"
    forest.add(key, minhash)
forest.index()

Processing rows:  15%|█████████████▉                                                                            | 781605/5050426 [17:59<1:47:58, 658.92row/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing rows:  74%|█████████████████████████████████████████████████████████████████▊                       | 3735104/5050426 [1:24:03<29:38, 739.38row/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing rows:  83%|████████████████████████████

In [8]:
def estimate_knn_performance(forest, test_df, k_values, batch_size=100):
    """Measure classification accuracy on ``test_df`` for each candidate ``k``.

    For every ``k`` the frame is processed in consecutive batches. Each batch
    is handed to ``process_query_batch``, which pickles its
    ``(predicted_labels, true_labels)`` pair to a temporary file; the pair is
    read back, accumulated, and the file removed. After all batches the
    accuracy is stored and a classification report is printed, with labels
    mapped through the module-level ``replacement`` dictionary.

    Parameters
    ----------
    forest : accepted for interface compatibility. NOTE: this function does
        not forward it to ``process_query_batch``.
    test_df : DataFrame of labelled examples to score (sliced positionally
        into batches and passed to ``process_query_batch``).
    k_values : iterable of neighbour counts to evaluate.
    batch_size : int, maximum number of rows per batch (default 100).

    Returns
    -------
    dict
        ``{k: accuracy}``. When ``test_df`` has no rows the accuracy is
        ``nan`` and no report is printed.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    performance = {}
    total_examples = len(test_df)
    # Ceiling division: number of batches needed to cover every row.
    num_batches = (total_examples + batch_size - 1) // batch_size

    for k in k_values:
        predicted_labels = []
        true_labels = []

        with tempfile.TemporaryDirectory() as temp_dir:
            batch_starts = range(0, total_examples, batch_size)

            for batch_idx, start in tqdm(enumerate(batch_starts), desc=f'Processing queries (k={k})', total=num_batches):
                # Positional slicing keeps row order, so predictions line up
                # with the true labels after concatenation.
                batch = test_df.iloc[start:start + batch_size]
                result_file = f'{temp_dir}/results_batch_{batch_idx}.pickle'
                process_query_batch(batch, k, result_file)

                # Read batch results from disk. The file was written by
                # process_query_batch just above, so unpickling it is trusted.
                with open(result_file, 'rb') as f:
                    batch_predicted_labels, batch_true_labels = pickle.load(f)

                predicted_labels.extend(batch_predicted_labels)
                true_labels.extend(batch_true_labels)

                # Delete the result file after reading
                os.remove(result_file)

        if total_examples == 0:
            # Nothing to score: avoid dividing by zero and skip the report.
            performance[k] = float('nan')
            print(f"No examples to evaluate (k={k}); skipping classification report.\n")
            continue

        correct_predictions = sum(pred == true for pred, true in zip(predicted_labels, true_labels))
        accuracy = correct_predictions / total_examples
        performance[k] = accuracy

        report = classification_report(pd.Series(true_labels).replace(replacement).to_list(),
                                       pd.Series(predicted_labels).replace(replacement).to_list(),
                                       zero_division='warn')
        # The report covers all batches, so no individual batch file is named.
        print(f"Classification Report (k={k}):\n{report}\n")

    return performance

def process_query_batch(rows, k, result_file):
    """Predict a label for every row of ``rows`` and pickle the outcome.

    Each ``full_name`` is hashed with ``create_minhash`` (so queries use the
    same ``num_perm`` as the indexed signatures) and looked up in the
    module-level ``forest``. The predicted label is the most common training
    label among the returned neighbours; training labels are read from the
    module-level ``train_df.race``. If no usable neighbour is returned, the
    most frequent training label is used instead.

    Parameters
    ----------
    rows : DataFrame with ``full_name`` and ``race`` columns.
    k : number of neighbours requested from the forest (cast to ``int``).
    result_file : path that receives the pickled tuple
        ``(predicted_labels, true_labels)``, both lists in row order.
    """
    # Forest keys are the stringified positions of the training rows, so a
    # positional label array is the matching lookup table.
    train_labels = train_df.race.to_numpy()
    n_train = len(train_labels)
    fallback_label = None  # computed only if some query needs it

    predicted_labels = []
    for name in rows['full_name']:
        neighbour_keys = forest.query(create_minhash(name), int(k))

        label_counts = collections.Counter()
        for key in neighbour_keys:
            try:
                position = int(key)
            except ValueError:
                # Key is not a row position; ignore this neighbour.
                continue
            if 0 <= position < n_train:
                label_counts[train_labels[position]] += 1

        if label_counts:
            # Majority vote; ties go to the label encountered first.
            predicted_label = max(label_counts, key=label_counts.get)
        else:
            # Default to the most frequent training label when the query
            # returned nothing usable.
            if fallback_label is None:
                fallback_label = train_df.race.mode().iloc[0]
            predicted_label = fallback_label

        predicted_labels.append(predicted_label)

    true_labels = rows['race'].tolist()

    # Write batch results to disk
    with open(result_file, 'wb') as f:
        pickle.dump((predicted_labels, true_labels), f)

In [9]:
# Display names substituted for label values before the classification report.
replacement = {
    0: 'asian',
    1: 'hispanic',
    3: 'nh_black',
    4: 'nh_white',
    5: 'other',
}

In [12]:
%%time
# Candidate neighbour counts, scored on the validation split.
k_values = [10, 25]
performance = estimate_knn_performance(forest=forest, test_df=val_df, k_values=k_values)
print(performance)

Processing queries (k=10): 100%|███████████████████████████████████████████████████████████████████████████████████████| 9019/9019 [1:25:37<00:00,  1.76it/s]


Classification Report (k=10) - Batch /tmp/tmpawqiqp7r/results_batch_9018.pickle:
              precision    recall  f1-score   support

       asian       0.64      0.18      0.28     25755
    hispanic       0.72      0.65      0.68    163525
    nh_black       0.58      0.31      0.40    133471
    nh_white       0.75      0.91      0.82    552738
       other       0.31      0.02      0.04     26373

    accuracy                           0.73    901862
   macro avg       0.60      0.41      0.44    901862
weighted avg       0.70      0.73      0.70    901862




Processing queries (k=25): 100%|███████████████████████████████████████████████████████████████████████████████████████| 9019/9019 [1:30:06<00:00,  1.67it/s]


Classification Report (k=25) - Batch /tmp/tmp0a57yd9d/results_batch_9018.pickle:
              precision    recall  f1-score   support

       asian       0.70      0.13      0.22     25755
    hispanic       0.73      0.62      0.67    163525
    nh_black       0.63      0.24      0.35    133471
    nh_white       0.73      0.93      0.82    552738
       other       0.36      0.01      0.02     26373

    accuracy                           0.72    901862
   macro avg       0.63      0.39      0.42    901862
weighted avg       0.70      0.72      0.68    901862


{10: 0.7281368989934158, 25: 0.7246330369834852}
CPU times: user 2h 59min 5s, sys: 0 ns, total: 2h 59min 5s
Wall time: 2h 56min 9s


In [13]:
# Final evaluation on the held-out test split with k=10.
estimate_knn_performance(forest=forest, test_df=test_df, k_values=[10])

Processing queries (k=10): 100%|███████████████████████████████████████████████████████████████████████████████████████| 9019/9019 [1:28:33<00:00,  1.70it/s]


Classification Report (k=10) - Batch /tmp/tmpav_drr0h/results_batch_9018.pickle:
              precision    recall  f1-score   support

       asian       0.64      0.18      0.28     25756
    hispanic       0.72      0.65      0.68    163525
    nh_black       0.58      0.31      0.40    133471
    nh_white       0.75      0.91      0.82    552737
       other       0.30      0.02      0.03     26373

    accuracy                           0.73    901862
   macro avg       0.60      0.41      0.44    901862
weighted avg       0.70      0.73      0.70    901862




{10: 0.7275869257159078}