# Model Selection

## Preparing data for modeling

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import statsmodels.api as sm
%matplotlib inline

In [42]:
# read in CSV 
df = pd.read_csv('clean_data/2019-fec-contr-census.csv', index_col=0)
df.head()

Unnamed: 0,cand_nm,contbr_zip,contb_receipt_amt,converted_date,income,target
0,"Sanders, Bernard",20001,100.0,201906,85976.0,2
1,"Sanders, Bernard",20001,3.0,201906,85976.0,2
2,"Sanders, Bernard",20001,27.0,201906,85976.0,2
3,"Sanders, Bernard",20007,3.0,201903,119267.0,2
4,"Sanders, Bernard",20001,27.0,201906,85976.0,2


In [43]:
y = df.target

In [44]:
df.contbr_zip = df.contbr_zip.astype(str)
X_feats = ['contbr_zip',
           'converted_date',
           'contb_receipt_amt']
X = pd.get_dummies(df[X_feats],
                   drop_first=True)
print(X.shape)
X.head()

(11502, 23)


Unnamed: 0,converted_date,contb_receipt_amt,contbr_zip_20002,contbr_zip_20003,contbr_zip_20004,contbr_zip_20005,contbr_zip_20006,contbr_zip_20007,contbr_zip_20008,contbr_zip_20009,...,contbr_zip_20015,contbr_zip_20016,contbr_zip_20017,contbr_zip_20018,contbr_zip_20019,contbr_zip_20020,contbr_zip_20024,contbr_zip_20032,contbr_zip_20036,contbr_zip_20037
0,201906,100.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,201906,3.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,201906,27.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,201903,3.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,201906,27.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state=0, 
                                                    stratify=y, 
                                                    test_size=0.2
                                                   )

In [46]:
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)

## Logistic Regression for multi-class problem

In [47]:
clf = LogisticRegression(  C=2, 
                           solver='lbfgs', 
                           multi_class='multinomial', 
                           max_iter=1000, 
                           penalty='l2',
                           class_weight={0: .89 ,1: .78, 2: .98, 3: .87})

In [48]:
clf.fit(scaled_X_train, y_train)
y_hat_train = clf.predict(scaled_X_train)
clf.score(scaled_X_train, y_train)

0.44027823062710575

In [49]:
# create confusion matrix
cm = confusion_matrix(y_train, y_hat_train)
conf_matrix = pd.DataFrame(cm, index=['actual Biden',
                                      'actual Buttigieg',
                                      'actual Sanders',
                                      'actual Warren'], 
                           columns= ['Biden', 
                                     'Buttigieg',
                                     'Sanders',
                                     'Warren'])
conf_matrix

Unnamed: 0,Biden,Buttigieg,Sanders,Warren
actual Biden,120,435,293,289
actual Buttigieg,86,1197,539,1174
actual Sanders,18,241,992,930
actual Warren,31,488,626,1742


In [50]:
scaled_X_test = scaler.transform(X_test)
clf.score(scaled_X_test, y_test)

0.4211212516297262

## Logistic regression optimizing for Sanders metrics

In [51]:
clf_sanders = LogisticRegression(  C=0.01, 
                                   solver='saga', 
                                   multi_class='multinomial', 
                                   max_iter=1000,
                                  penalty = 'l2',
                                   class_weight={0: .4 ,1: .2, 2: 1.8, 3: .4})
clf_sanders.fit(scaled_X_train, y_train)
y_hat_train = clf_sanders.predict(scaled_X_train)

In [52]:
# create confusion matrix
cm = confusion_matrix(y_train, y_hat_train)
conf_matrix = pd.DataFrame(cm, index=['actual Biden',
                                      'actual Buttigieg',
                                      'actual Sanders',
                                      'actual Warren'], 
                           columns= ['Biden', 
                                     'Buttigieg',
                                     'Sanders',
                                     'Warren'])
conf_matrix

Unnamed: 0,Biden,Buttigieg,Sanders,Warren
actual Biden,143,45,945,4
actual Buttigieg,168,158,2665,5
actual Sanders,9,10,2161,1
actual Warren,13,14,2860,0


In [53]:
# sanders metric! 
corr = cm[2][2]
total = sum(cm[2])
true_pos = corr/total
print('Predicted Correctly:', corr,'\n', 
      'Total Sanders:', total,'\n', 
      'True Pos:', round(100*true_pos,2))

# predicted sanders and it wasn't 
incorrect = cm[:,2]
#print(incorrect)
false_pos = (sum(incorrect) - corr)/ sum(incorrect)
print('False Pos:', round(100*false_pos, 2))

Predicted Correctly: 2161 
 Total Sanders: 2181 
 True Pos: 99.08
False Pos: 74.96


## K-Nearest Neighbors

In [54]:
knn = KNeighborsClassifier(n_neighbors = 3, 
                           weights = 'uniform',
                           algorithm = 'auto',
                           leaf_size = 30, 
                           p = 2, 
                           metric = 'minkowski')

In [55]:
# fit knn to scaled train data
knn.fit(scaled_X_train, y_train)
y_hat_train = knn.predict(scaled_X_train)
score = accuracy_score(y_train, y_hat_train)

# print KNN train score
print('KNN:', score)

KNN: 0.597543745245082


In [56]:
# scale all X data for cross-validation 
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X.shape

(11502, 23)

In [57]:
param_grid = {'n_neighbors': [20, 40, 60],
              'weights': ['distance'],
              'algorithm': ['auto'], 
              'leaf_size': [3, 5],
              'p':[1]
             }

# call GridSearchCV with knn estimator 
clf = GridSearchCV(
    knn, 
    param_grid, 
    n_jobs=-1,
    scoring='accuracy',
    return_train_score=True,
    verbose=1,
    cv=3
    )

# fit model to scaled data 
clf.fit(scaled_X, y)
# print best params and score
print(clf.best_params_)
clf.best_score_

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   46.2s finished


{'algorithm': 'auto', 'leaf_size': 5, 'n_neighbors': 20, 'p': 1, 'weights': 'distance'}


0.4194053208137715

In [58]:
# run split test data using knn and best params
knn = KNeighborsClassifier(n_neighbors = 20, 
                           weights = 'distance',
                           algorithm = 'auto',
                           leaf_size = 5, 
                           p = 1, 
                           metric = 'minkowski')

scaled_X_test = scaler.transform(X_test)
knn.fit(scaled_X_train, y_train)
y_hat_test = knn.predict(scaled_X_test)
score = accuracy_score(y_test, y_hat_test)

# print test score 
print('KNN:', score)

KNN: 0.46631899174272057
