# Notebook 1

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import re
import seaborn as sns
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
import sys
import os
# Make the project's model helpers importable. The path is relative, so this
# only resolves when the kernel's working directory is the notebook's folder.
sys.path.append('../src/models')
from predict_model import create_perf_metrics

In [3]:
# Load the predictor matrix (X) and the target labels (y) from their pickles.
#
# sys.path only influences `import`; it has no effect on where open() looks.
# The append is kept for backward compatibility, but the files are now located
# explicitly: first in the processed-data directory, then (as before) in the
# current working directory.
sys.path.append('../data/processed/')
DATA_DIR = '../data/processed'


def _find_pickle(filename):
    """Return the path of `filename` inside DATA_DIR, or the bare name if absent there."""
    candidate = os.path.join(DATA_DIR, filename)
    return candidate if os.path.isfile(candidate) else filename


# SECURITY: pickle.load can execute arbitrary code -- only load files that were
# produced by this project's own processing step.
with open(_find_pickle('predictor.pickle'), 'rb') as file:
    X = pickle.load(file)
with open(_find_pickle('target.pickle'), 'rb') as file:
    y = pickle.load(file)

In [4]:
# Split the data into train (80%) and test (20%) sets, stratified on the target.
# random_state is fixed so the split -- and every metric computed from it in the
# cells below -- is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)

## Logistic regression

In [5]:
# L2-regularised logistic regression (C=2) solved with lbfgs; max_iter is raised
# to 1000. fit() returns the estimator itself, so construction and training
# are chained into one expression.
log_reg = LogisticRegression(
    penalty='l2',
    C=2,
    solver='lbfgs',
    multi_class='auto',
    max_iter=1000,
).fit(X_train, y_train)

# Predicted labels for the held-out test set.
y_pred_test = log_reg.predict(X_test)

In [6]:
# call performance metric function
# create_perf_metrics comes from ../src/models/predict_model.py. Judging by the
# cell output it prints a confusion matrix and the share of correct Clinton /
# Trump predictions; what it returns is not visible here (TODO confirm).
log_reg_metrics = create_perf_metrics(y_test, y_pred_test)

                Pred Nonvoter/Other  Pred Clinton  Pred Trump
Nonvoter/Other                   18            96          81
Clinton                          19           635          55
Trump                            10            79         607 


 89.60000000000001 percent that were predicted Clinton were actually Clinton
 87.2 percent that were predicted Trump were actually Trump



In [7]:
# Store each group's coefficients as {feature name: coefficient} dicts.
# Rows 0, 1 and 2 of coef_ are taken to be Nonvoter/Other, Clinton and Trump
# respectively -- presumably the order of log_reg.classes_; verify before
# interpreting the signs.
other_list, clinton_list, trump_list = (log_reg.coef_[row] for row in range(3))
other_coef, clinton_coef, trump_coef = (
    dict(zip(X.columns, weights))
    for weights in (other_list, clinton_list, trump_list)
)

## Random forest

In [8]:
# Random forest with 100 trees. random_state is fixed so the fitted forest and
# the metrics reported for it are reproducible from run to run.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
# Note: y_pred_test is reused for every model; it now holds the forest's predictions.
y_pred_test = rf.predict(X_test)

In [9]:
# Score the random forest on the test set. y_pred_test holds the forest's
# predictions at this point because the name is reassigned by each model cell,
# so this must run right after the random-forest fit/predict cell.
rf_metrics = create_perf_metrics(y_test, y_pred_test)

                Pred Nonvoter/Other  Pred Clinton  Pred Trump
Nonvoter/Other                    3           101          91
Clinton                          13           633          63
Trump                             9            72         615 


 89.3 percent that were predicted Clinton were actually Clinton
 88.4 percent that were predicted Trump were actually Trump



## K-nearest neighbors

In [None]:
# Baseline k-nearest-neighbours classifier using scikit-learn's default settings.
# fit() returns the estimator, so it is trained in the same expression.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Predicted labels for the held-out test set.
y_pred_test = knn.predict(X_test)

In [None]:
# Score the KNN model on the test set. y_pred_test is reassigned by each model
# cell, so this must run right after the KNN fit/predict cell.
knn_metrics = create_perf_metrics(y_test, y_pred_test)

## Support vector machines

In [None]:
# Support vector classifier with a linear kernel; all other settings are the
# scikit-learn defaults. fit() returns the estimator, so training is chained.
svm = SVC(kernel='linear').fit(X_train, y_train)

# Predicted labels for the held-out test set.
y_pred_test = svm.predict(X_test)

In [None]:
# call performance metric function
# y_pred_test is reassigned by each model cell, so this must run right after
# the SVM fit/predict cell for the metrics to describe the SVM.
svm_metrics = create_perf_metrics(y_test, y_pred_test)

## K-means clustering

In [None]:
# Cluster all observations into two groups.
# random_state pins the centroid initialisation, so the arbitrary cluster ids
# (0 / 1) stay the same between runs; n_init=10 spells out the long-standing
# default explicitly so results do not shift with the scikit-learn version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(X)

labels = kmeans.labels_

# Plot the two cluster centres projected onto the first two feature columns.
plt.figure(figsize=(5,5))
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='black', s=300)
plt.title('K-means cluster centers')
plt.xlabel('first feature')
plt.ylabel('second feature')

y_pred = kmeans.predict(X)

# Cluster-quality scores. They were previously computed and thrown away (only a
# cell's last expression is displayed), so keep and print them.
silhouette = metrics.silhouette_score(X, labels, metric='euclidean')
calinski_harabasz = metrics.calinski_harabasz_score(X, labels)
print('silhouette score:', silhouette)
print('Calinski-Harabasz score:', calinski_harabasz)

# df_target was never defined in this notebook, so the cell failed on a fresh
# kernel. Rebuild it from the loaded target.
# NOTE(review): assumes y is the 1-D presvote16post_2016 label vector
# (1 = Clinton, 2 = Trump, as the filters below imply) -- confirm.
df_target = pd.DataFrame({'presvote16post_2016': np.asarray(y).ravel()})

# Share df_target's index so the join lines rows up one-to-one.
y_pred_df = pd.DataFrame(y_pred, columns=['predicted'], index=df_target.index)
d = df_target.join(y_pred_df)

# Per-column counts of Trump voters in cluster 0 and Clinton voters in cluster 1.
# NOTE(review): the Clinton line previously had no cluster filter and therefore
# counted every Clinton voter; the filter was added to mirror the Trump line.
# Which cluster id corresponds to which candidate is not guaranteed by k-means
# -- check the counts before drawing conclusions.
num_cluster_trump = d.loc[(d.presvote16post_2016 == 2) & (d.predicted == 0)].count()
num_cluster_clinton = d.loc[(d.presvote16post_2016 == 1) & (d.predicted == 1)].count()