### RF ML Model (to aid basic interpretation)

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# visualisations
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style("whitegrid")
sns.set(rc = {'figure.figsize':(15, 10)})

In [2]:
%%time
NGRAMS = 2
SAMPLE = 1000000

# Florida voter
df = pd.read_csv('instate_processed_clean.csv.gz')

CPU times: user 2min 28s, sys: 35.9 s, total: 3min 4s
Wall time: 3min 3s


In [3]:
df.shape

(421190808, 3)

In [4]:
## Let's sample as my df is pretty big (and let's overwrite)
df = df.sample(n = 400000, replace=False, random_state=31415)

## Preprocessing the input data

In [5]:
# concat last name and first name
df['state_id'] = df.state.factorize()[0]
X = df.last_name
y = df.state_id

In [6]:
state_id_df = df[['state', 'state_id']].drop_duplicates().sort_values('state_id')
state_to_id = dict(state_id_df.values)
id_to_state = dict(state_id_df[['state_id', 'state']].values)

In [7]:
#vect = TfidfVectorizer(analyzer='char', sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, NGRAMS), lowercase=False)
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=10, ngram_range=(1, NGRAMS)) 

features = vect.fit_transform(df.last_name).toarray()
labels = df.state_id
features.shape

(400000, 455)

In [10]:
#from sklearn.feature_selection import chi2

#N = 5
#for state, state_id in sorted(state_to_id.items()):
#  features_chi2 = chi2(features, labels == state_id)
#  indices = np.argsort(features_chi2[0])
#  feature_names = np.array(vect.get_feature_names_out())[indices]
#  unigrams = [v for v in feature_names if len(v) == 1]
#  bigrams = [v for v in feature_names if len(v) == 2]
#  print("# '{}':".format(state))
#  print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
#  print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))
    

In [11]:
# Split train and test dataset
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

# build n-gram list

vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(1, NGRAMS), lowercase=False) 
#vect = TfidfVectorizer(analyzer='char', sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, NGRAMS), lowercase=False)
X_train = vect.fit_transform(X_train)

In [12]:
%%time
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

#Create a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=100, criterion = 'entropy', random_state=21)
#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train, y_train)

X_test = vect.transform(X_test).toarray()
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6733125
CPU times: user 14min 35s, sys: 53.3 ms, total: 14min 35s
Wall time: 14min 35s


## Confusion Matrix

In [14]:
target_names = list(df.state.astype('category').cat.categories)
print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

     andaman       0.57      0.71      0.63     12784
      andhra       0.83      0.90      0.86     15529
   arunachal       0.39      0.21      0.28      3536
       assam       0.67      0.32      0.43      3432
       bihar       0.46      0.69      0.55     13036
  chandigarh       0.42      0.04      0.08      2325
       dadra       0.76      0.23      0.36      3815
       daman       0.66      0.56      0.60      1816
       delhi       0.77      0.62      0.69      1780
         goa       0.81      0.84      0.83      4434
         guj       0.90      0.83      0.86      4296
         har       0.92      0.94      0.93      6703
         jha       0.81      0.66      0.72      1073
          jk       0.67      0.47      0.55      1219
         kar       0.76      0.64      0.70       189
      kerala       0.86      0.75      0.80      1233
 maharashtra       0.39      0.06      0.10      1115
     manipur       0.90    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Save model

In [None]:
joblib.dump(clf, "instate_lname_rf_model.joblib", compress=3)  # compression is ON!