# Model Selection

## Preparing data for modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
# read in CSV 
df = pd.read_csv('clean_data/2019-fec-contr-census.csv', 
                 index_col=0
                )

# target variable set as candidate class 
y = df.target

# predictor values of zip, date, and contribution amount 
df.contbr_zip = df.contbr_zip.astype(str)
X_feats = ['contbr_zip',
           'converted_date',
           'contb_receipt_amt']
X = pd.get_dummies(df[X_feats],
                   drop_first=True)

In [3]:
# split train and test data 
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state=0, 
                                                    stratify=y, 
                                                    test_size=0.2
                                                   )
# scale train data 
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)

# scale test data 
scaled_X_test = scaler.transform(X_test)

# scale all X data for cross-validation 
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X.shape

(11502, 23)

## Logistic regression optimizing for multi-class problem

In [4]:
# logistic regression model 
lr = LogisticRegression(C=2, 
                           solver='lbfgs', 
                           multi_class='multinomial', 
                           max_iter=1000, 
                           penalty='l2',
                           class_weight={0: .89 ,1: .78, 2: .98, 3: .87}
                       )

# fit train data to model 
lr.fit(scaled_X_train, y_train)

# predict train values 
y_hat_train = lr.predict(scaled_X_train)
print('Logistic regression train score:', 
      round(lr.score(scaled_X_train, y_train),2)
     )

Logistic regression train score: 0.44


In [5]:
# predict and print score 
y_hat_test = lr.predict(scaled_X_test)
print('Logistic regression test score:', 
      round(lr.score(scaled_X_test, y_test),2)
     )

Logistic regression test score: 0.42


In [6]:
# create confusion matrix for test results 
cm = confusion_matrix(y_test, y_hat_test)
conf_matrix = pd.DataFrame(cm, 
                           index=['actual Biden',
                                  'actual Buttigieg',
                                  'actual Sanders',
                                  'actual Warren'], 
                           columns=['Biden', 
                                    'Buttigieg',
                                    'Sanders',
                                    'Warren']
                          )
conf_matrix

Unnamed: 0,Biden,Buttigieg,Sanders,Warren
actual Biden,25,99,83,77
actual Buttigieg,13,297,136,304
actual Sanders,8,43,246,248
actual Warren,9,140,172,401


## Logistic regression optimizing for single class

In [7]:
# logistic regression model 
lr_sanders = LogisticRegression(C=0.01, 
                                solver='saga', 
                                multi_class='multinomial', 
                                max_iter=1000,
                                penalty = 'l2',
                                class_weight={0: .4 ,1: .2, 2: 1.8, 3: .4}
                               )

# fit model 
lr_sanders.fit(scaled_X_train, y_train)

# predict test 
y_hat_test = lr_sanders.predict(scaled_X_test)

In [8]:
# create confusion matrix
cm = confusion_matrix(y_test, y_hat_test)
conf_matrix = pd.DataFrame(cm, index=['actual Biden',
                                      'actual Buttigieg',
                                      'actual Sanders',
                                      'actual Warren'], 
                           columns= ['Biden', 
                                     'Buttigieg',
                                     'Sanders',
                                     'Warren']
                          )
conf_matrix

Unnamed: 0,Biden,Buttigieg,Sanders,Warren
actual Biden,42,11,229,2
actual Buttigieg,36,34,678,2
actual Sanders,0,1,544,0
actual Warren,8,7,706,1


In [9]:
# sanders metrics
corr = cm[2][2]
total = sum(cm[2])
true_pos = corr/total

incorrect = cm[:,2]
false_pos = (sum(incorrect) - corr)/ sum(incorrect)

print('Predicted Sanders and was true Sanders count:', corr,'\n', 
      'Total true Sanders count:', total,'\n', 
      'True positive rate:', round(100*true_pos,2),'\n',
      'False positive rate:', round(100*false_pos, 2)
     )

Predicted Sanders and was true Sanders count: 544 
 Total true Sanders count: 545 
 True positive rate: 99.82 
 False positive rate: 74.78


## K-Nearest Neighbors (KNN) for multi-class

In [10]:
# knn model 
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': [20, 40, 60],
              'weights': ['distance'],
              'algorithm': ['auto'], 
              'leaf_size': [3, 5],
              'p':[1]
             }

# call GridSearchCV with knn estimator 
gs = GridSearchCV(knn, 
                  param_grid, 
                  n_jobs=-1,
                  scoring='accuracy',
                  cv=3
                 )

# fit model 
gs.fit(scaled_X, y)

# print best params and score from gridsearchcv
gs.best_params_
print('KNN gridsearchcv best score:', 
      round(gs.best_score_,2)
     )

KNN gridsearchcv best score: 0.42


## Support Vector Machines (SVM) for multi-class

In [11]:
# run svm with linear kernel 
svm = SVC(kernel='linear', 
          class_weight='balanced'
         )

# fit model to scaled train data
svm.fit(scaled_X_train, y_train)

# predict and print train score
y_hat_train = svm.predict(scaled_X_train)
score = accuracy_score(y_train, y_hat_train)
print('SVM linear train train score:', 
      round(score,2)
     )

SVM linear train train score: 0.36


In [12]:
# run test data 
svm.fit(scaled_X_train, y_train)
y_hat_test = svm.predict(scaled_X_test)
score = accuracy_score(y_test, y_hat_test)

# print test score 
print('SVM linear test score:', 
      round(score,2)
     )

SVM linear test score: 0.36


In [13]:
# run svm with radial basis function 
svm = SVC(kernel='rbf', 
          class_weight='balanced'
         )

# fit to scaled train data
svm.fit(scaled_X_train, y_train)

# predict and print train score
y_hat_train = svm.predict(scaled_X_train)
score = accuracy_score(y_train, y_hat_train)
print('SVM rbf train score:', 
      round(score,2)
     )

SVM rbf train score: 0.38


In [14]:
# run test data 
svm.fit(scaled_X_train, y_train)

# print test score 
y_hat_test = svm.predict(scaled_X_test)
score = accuracy_score(y_test, y_hat_test)
print('SVM rbf test score:', 
      round(score, 2)
     )

SVM rbf test score: 0.37


## Random Forest (RF) for multi-class

In [15]:
# run train data 
rf = RandomForestClassifier()

# fit model to train data
rf.fit(scaled_X_train, y_train)

# print train score 
y_hat_train = rf.predict(scaled_X_train)
score = accuracy_score(y_train, y_hat_train)
print('Random forest train score:', 
      round(score,2)
     )

Random forest train score: 0.7




In [16]:
# predict and print test score 
y_hat_test = rf.predict(scaled_X_test)
score = accuracy_score(y_test, y_hat_test)
print('Random forest test score:', 
      round(score,2)
     )

Random forest test score: 0.54
