### Random Forest (RF) ML Model (to aid basic interpretation)

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# visualisations
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style("whitegrid")
sns.set(rc = {'figure.figsize':(15, 10)})

In [None]:
%%time
NGRAMS = 2
SAMPLE = 1000000

# Florida voter
df = pd.read_csv('./data/fl_reg_name_race_2022.csv.gz')
df.dropna(subset=['name_first', 'name_last'], inplace=True)
df['race'] = df.race.map({'native_indian': 'other', 'asian': 'asian', 'nh_black': 'nh_black', 'hispanic': 'hispanic', 'nh_white': 'nh_white', 'other': 'other', 'multi_racial': 'other', 'unknown': 'unknown'})
df

In [None]:
# Balanced sample: draw the same number of rows from every race bucket,
# leaving out voters whose race is 'unknown'.
sdf = (
    df[~df['race'].isin(['unknown'])]
    .groupby(['race'])
    .sample(int(SAMPLE / 5), random_state=21)
)
# The full frame is no longer needed; drop the reference to free memory.
del df

# Additional features: trim whitespace and title-case both name parts.
for col in ('name_first', 'name_last'):
    sdf[col] = sdf[col].str.strip().str.title()
sdf

In [None]:
# Number of sampled rows (non-null last names) in each race bucket.
sdf.groupby('race')[['name_last']].count()

In [None]:
# Number of distinct last names in each race bucket.
sdf.groupby('race')[['name_last']].nunique()

## Preprocessing the input data

In [None]:
# Model input is the single string "<last name> <first name>".
sdf['name'] = sdf['name_last'] + ' ' + sdf['name_first']

# Integer class label: codes are assigned in order of first appearance.
sdf['race_id'] = pd.factorize(sdf['race'])[0]

X, y = sdf['name'], sdf['race_id']

In [None]:
# One row per (race, race_id) pair, ordered by id, plus lookups in both directions.
race_id_df = sdf[['race', 'race_id']].drop_duplicates().sort_values('race_id')
race_to_id = dict(zip(race_id_df['race'], race_id_df['race_id']))
id_to_race = dict(zip(race_id_df['race_id'], race_id_df['race']))

In [None]:
# Character n-gram counts (1..NGRAMS) over the full sample, used only for the
# chi-squared exploration in the next cell.
#vect = TfidfVectorizer(analyzer='char', sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, NGRAMS), lowercase=False)
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=5, ngram_range=(1, NGRAMS), lowercase=False)

# Keep the matrix sparse: chi2() accepts sparse input, and a dense copy of
# every sampled row x every n-gram can exhaust memory.
features = vect.fit_transform(sdf.name)
labels = sdf.race_id
features.shape

In [None]:
from sklearn.feature_selection import chi2

# For each race (one-vs-rest), list the N bigrams with the highest chi-squared score.
N = 5
vocabulary = np.array(vect.get_feature_names_out())  # same for every race
for race, race_id in sorted(race_to_id.items()):
    chi2_scores = chi2(features, labels == race_id)[0]
    # argsort is ascending, so the strongest features sit at the end.
    feature_names = vocabulary[np.argsort(chi2_scores)]
    #unigrams = [v for v in feature_names if len(v) == 1]
    bigrams = [token for token in feature_names if len(token) == 2]
    top_bigrams = bigrams[-N:]
    print("# '{}':".format(race))
    #print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(top_bigrams)))
    

In [None]:
# Hold out 20% of the names for testing, keeping class proportions equal.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)

# Build the character n-gram vocabulary from the training names only.
#vect = TfidfVectorizer(analyzer='char', sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, NGRAMS), lowercase=False)
vect = CountVectorizer(
    analyzer='char',
    ngram_range=(1, NGRAMS),
    min_df=3,
    max_df=0.3,
    lowercase=False,
)
X_train_vect = vect.fit_transform(X_train)
feature_names = vect.get_feature_names_out()

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

clf = RandomForestClassifier(n_estimators=40, criterion = 'entropy', random_state=21)
clf.fit(X_train_vect, y_train)

X_test = vect.transform(X_test).toarray()
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
from sklearn.inspection import permutation_importance

# Asian -- resolve the class id by name rather than assuming which integer
# factorize() happened to assign to it.
class_index = race_to_id['asian']

# permutation_importance requires a dense array, so pick the 10,000 rows to
# score first and densify only those instead of the whole training matrix.
rng = np.random.RandomState(42)
n_train_rows = X_train_vect.shape[0]
row_idx = rng.choice(n_train_rows, size=min(10000, n_train_rows), replace=False)
X_perm = X_train_vect[row_idx].toarray()
# One-vs-rest target: True where the voter belongs to the class of interest.
y_perm = (y_train.iloc[row_idx] == class_index).to_numpy()

# The target is boolean but clf predicts race ids, so the default scorer
# (clf.score) would compare True/False against 0..4 and return a meaningless
# number. Score the one-vs-rest decision instead: did the model say
# "this class" exactly when the target says so?
result = permutation_importance(
    clf,
    X_perm,
    y_perm,
    scoring=lambda est, X_, y_: np.mean((est.predict(X_) == class_index) == y_),
    n_repeats=1,
    random_state=42,
)

# Get the feature importance scores
importance_scores = result.importances_mean

# Get the indices of features sorted by importance in descending order
sorted_indices = importance_scores.argsort()[::-1]

In [None]:
# Display the race-name -> race_id mapping, for reference when reading the
# class_index values used in the neighbouring cells.
race_to_id

In [None]:
# Print the feature importance scores and corresponding feature names
for idx in sorted_indices[0:20]:
    print(f"Feature: {feature_names[idx]}, Importance Score: {importance_scores[idx]}")

In [None]:
# Hispanic -- resolve the class id by name rather than assuming which integer
# factorize() happened to assign to it.
class_index = race_to_id['hispanic']

# permutation_importance requires a dense array, so pick the 10,000 rows to
# score first and densify only those instead of the whole training matrix.
rng = np.random.RandomState(42)
n_train_rows = X_train_vect.shape[0]
row_idx = rng.choice(n_train_rows, size=min(10000, n_train_rows), replace=False)
X_perm = X_train_vect[row_idx].toarray()
# One-vs-rest target: True where the voter belongs to the class of interest.
y_perm = (y_train.iloc[row_idx] == class_index).to_numpy()

# The target is boolean but clf predicts race ids, so the default scorer
# (clf.score) would compare True/False against 0..4 and return a meaningless
# number. Score the one-vs-rest decision instead: did the model say
# "this class" exactly when the target says so?
result = permutation_importance(
    clf,
    X_perm,
    y_perm,
    scoring=lambda est, X_, y_: np.mean((est.predict(X_) == class_index) == y_),
    n_repeats=1,
    random_state=42,
)

# Get the feature importance scores
importance_scores = result.importances_mean

# Get the indices of features sorted by importance in descending order
sorted_indices = importance_scores.argsort()[::-1]

In [None]:
# Print the feature importance scores and corresponding feature names
for idx in sorted_indices[0:20]:
    print(f"Feature: {feature_names[idx]}, Importance Score: {importance_scores[idx]}")

## Classification Report and Confusion Matrix

In [None]:
# Take the display names from the same id -> race mapping that produced the
# labels. Alphabetically sorted category names only line up with the
# factorize() codes when the races happen to first appear in alphabetical
# order, so relying on them can silently mislabel every row of the report.
ordered_ids = sorted(id_to_race)
target_names = [id_to_race[race_id] for race_id in ordered_ids]

# Pass the label order explicitly so report rows and matrix rows/columns are
# guaranteed to follow target_names.
print(classification_report(y_test, y_pred, labels=ordered_ids, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=ordered_ids))

## Save model

In [None]:
# Persist the fitted forest to disk; compress=3 turns joblib compression on.
# NOTE(review): only the classifier is written here; the fitted vectoriser
# (`vect`) is not saved by this cell -- confirm whether consumers need it.
joblib.dump(
    value=clf,
    filename="fl_voter_name_2022_rf_interp_model.joblib",
    compress=3,
)