### RF ML Model (to aid basic interpretation)

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [2]:
# Florida voter
df = pd.read_csv("train_validation_test/fl_2022_fullname.csv.gz",
                 usecols=['full_name', 'race', 'race_code'])

In [3]:
# stratified sample
proto_df = df.groupby('race', group_keys=False).apply(lambda x: x.sample(frac=.2, random_state=10))
proto_df.reset_index(inplace=True)
proto_df.drop('index', axis=1, inplace=True)
proto_df.shape

(1803724, 3)

In [4]:
race_id_df = proto_df[['race', 'race_code']].drop_duplicates().sort_values('race_code')
race_to_id = dict(race_id_df.values)
id_to_race = dict(race_id_df[['race_code', 'race']].values)
id_to_race

{0: 'asian', 1: 'hispanic', 2: 'nh_black', 3: 'nh_white', 4: 'other'}

In [6]:
X = proto_df.full_name
y = proto_df.race_code

In [7]:
NGRAMS = 2
#vect = TfidfVectorizer(analyzer='char', sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, NGRAMS), lowercase=False)
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=5, ngram_range=(1, NGRAMS), lowercase=False) 

features = vect.fit_transform(X).toarray()
labels = y
features.shape

(1803724, 1338)

In [8]:
from sklearn.feature_selection import chi2

N = 5
for race_code, race in id_to_race.items():
  features_chi2 = chi2(features, y == race_code)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(vect.get_feature_names_out())[indices]
  bigrams = [v for v in feature_names if len(v) == 2]
  print("# '{}':".format(race))
  print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))

# 'asian':
  . Most correlated bigrams:
       . Zh
       . g 
       . uy
       . ng
       . Ng
# 'hispanic':
  . Most correlated bigrams:
       . ue
       . o 
       . a 
       . z 
       . ez
# 'nh_black':
  . Most correlated bigrams:
       . s 
       . sh
       . z 
       . a 
       . o 
# 'nh_white':
  . Most correlated bigrams:
       . ue
       . o 
       . a 
       . z 
       . ez
# 'other':
  . Most correlated bigrams:
       . Bh
       . aj
       . ah
       . ha
       . Kh


In [9]:
# Split train and test dataset
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

X_train_vect = vect.fit_transform(X_train)
feature_names = vect.get_feature_names_out()

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

clf = RandomForestClassifier(n_estimators = 4,
                             max_samples  = .5, 
                             criterion    = 'entropy', 
                             random_state = 21)
clf.fit(X_train_vect, y_train)

X_test = vect.transform(X_test).toarray()
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [11]:
from sklearn.inspection import permutation_importance

# Asian
class_index = 0  
result = permutation_importance(clf, vect.transform(X_train).toarray(), y_train == class_index, n_repeats=1, max_samples = 1000, random_state=42)

# Get the feature importance scores
importance_scores = result.importances_mean

# Get the indices of features sorted by importance in descending order
sorted_indices = importance_scores.argsort()[::-1]

In [12]:
# Print the feature importance scores and corresponding feature names
for idx in sorted_indices[0:20]:
    print(f"Feature: {feature_names[idx]}, Importance Score: {importance_scores[idx]}")

Feature: C, Importance Score: 0.0012266436309883925
Feature: ez, Importance Score: 0.0002266436309883925
Feature: rr, Importance Score: 0.0002266436309883925
Feature: gu, Importance Score: 0.0002266436309883925
Feature: -, Importance Score: 0.0002266436309883925
Feature: ma, Importance Score: 0.0002266436309883925
Feature: n-, Importance Score: 0.0002266436309883925
Feature: s , Importance Score: 0.0002266436309883925
Feature: a , Importance Score: 0.0002266436309883925
Feature:  J, Importance Score: 0.0002266436309883925
Feature: Ce, Importance Score: 0.0002266436309883925
Feature: ue, Importance Score: 0.0002266436309883925
Feature: R , Importance Score: -0.0007733563690116075
Feature: Si, Importance Score: -0.0007733563690116075
Feature: Sc, Importance Score: -0.0007733563690116075
Feature: Se, Importance Score: -0.0007733563690116075
Feature: Sf, Importance Score: -0.0007733563690116075
Feature: Sg, Importance Score: -0.0007733563690116075
Feature: Sh, Importance Score: -0.00077335

In [13]:
# Hispanic
class_index = 1  
result = permutation_importance(clf, vect.transform(X_train).toarray(), y_train == class_index, n_repeats=1, max_samples = 1000, random_state=42)

# Get the feature importance scores
importance_scores = result.importances_mean

# Get the indices of features sorted by importance in descending order
sorted_indices = importance_scores.argsort()[::-1]

In [14]:
# Print the feature importance scores and corresponding feature names
for idx in sorted_indices[0:20]:
    print(f"Feature: {feature_names[idx]}, Importance Score: {importance_scores[idx]}")

Feature: A, Importance Score: 0.03821364413480721
Feature: k, Importance Score: 0.03521364413480721
Feature: a , Importance Score: 0.032213644134807234
Feature: an, Importance Score: 0.032213644134807234
Feature: K, Importance Score: 0.031213644134807234
Feature: W, Importance Score: 0.031213644134807234
Feature: ha, Importance Score: 0.031213644134807234
Feature: ar, Importance Score: 0.031213644134807234
Feature: ne, Importance Score: 0.030213644134807233
Feature: n , Importance Score: 0.030213644134807233
Feature: o , Importance Score: 0.030213644134807233
Feature: z , Importance Score: 0.030213644134807233
Feature: ll, Importance Score: 0.030213644134807233
Feature: al, Importance Score: 0.030213644134807233
Feature: ia, Importance Score: 0.029213644134807232
Feature: do, Importance Score: 0.029213644134807232
Feature: el, Importance Score: 0.029213644134807232
Feature: w, Importance Score: 0.029213644134807232
Feature: g , Importance Score: 0.02821364413480723
Feature: g, Importan

## Confusion Matrix

In [16]:
target_names = list(df.race.astype('category').cat.categories)
print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       asian       0.37      0.26      0.31     10302
    hispanic       0.69      0.72      0.70     65410
    nh_black       0.49      0.42      0.45     53389
    nh_white       0.78      0.84      0.80    221095
       other       0.14      0.02      0.03     10549

    accuracy                           0.71    360745
   macro avg       0.49      0.45      0.46    360745
weighted avg       0.69      0.71      0.70    360745

[[  2712   1411    804   5159    216]
 [   828  46858   2204  15360    160]
 [   806   3224  22324  26771    264]
 [  2246  14743  18599 184948    559]
 [   765   1613   1609   6371    191]]


## Save model

In [None]:
joblib.dump(clf, "models/rf_fullname_interp.joblib", compress=3)  # compression is ON!