## Data cleaning, feature engineering, and modeling

In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [None]:
# Load the raw voter-survey CSV; the path is relative to the working directory.
df = pd.read_csv('data/raw/voter-survey-december16.csv')

In [None]:
pattern = '(imiss_).{1}(_2016)'


def return_issue_cols(df):
    """Return the names of the columns of *df* that match ``pattern``.

    A name qualifies when it contains ``imiss_``, then any single character,
    then ``_2016`` (the match may occur anywhere in the name). The result
    keeps the order of ``df.columns``.
    """
    return [name for name in df.columns if re.search(pattern, name)]

# Names of the columns in the loaded survey frame that match the issue pattern.
issue_col_list = return_issue_cols(df)

In [None]:
# Target column list: a single column, kept as a list so that indexing
# `df` with it yields a one-column DataFrame rather than a Series.
target_col_list = ['presvote16post_2016']
df_target = df[target_col_list]

In [None]:
def fill_nans(df):
    """Return a copy of *df* with every missing value replaced by 0.

    ``DataFrame.fillna`` is not in-place by default, so its result must be
    returned; calling it and discarding the result leaves the NaNs in place.
    The input frame itself is not modified.
    """
    return df.fillna(0)

In [None]:
def to_str_dummies(df):
    """One-hot encode every column of *df* after casting its values to str.

    Casting first makes each distinct value, numeric or not, a category, so
    every column is expanded into ``<column>_<value>`` indicator columns.
    """
    as_text = df.astype(str)
    return pd.get_dummies(as_text)

In [None]:
# Absolute correlation of every column with the target, strongest first.
# NOTE(review): `df_dummies` is not assigned in any cell shown here and must
# still contain a 'presvote16post_2016' column — confirm how it is built
# before running the notebook top to bottom.
df_dummies.corr()['presvote16post_2016'].abs().sort_values(ascending=False)

In [None]:
# Predictors are all remaining columns once the target has been removed.
df_predictors = df_dummies.drop(columns=['presvote16post_2016'])

## K-means clustering

In [None]:
# Feature matrix for clustering (same object as df_predictors, not a copy).
X = df_predictors

In [None]:
# Two clusters, fitted on the predictor matrix.
# random_state pins the centroid initialisation so cluster ids (and any
# counts derived from them) are reproducible between runs; n_init=10 pins
# the number of restarts, whose library default differs across
# scikit-learn versions.
k_means = KMeans(n_clusters=2, n_init=10, random_state=42)
k_means.fit(X)

In [None]:
# Cluster index assigned to each row of X during fitting.
labels = k_means.labels_

In [None]:
# Index of the nearest fitted centroid for each row of X (the training data).
y_pred = k_means.predict(X)

In [None]:
# Draw the fitted centroids, using only their first two feature coordinates
# as the x and y positions.
cl_centers = k_means.cluster_centers_
plt.figure(figsize=(5, 5))
plt.scatter(cl_centers[:, 0], cl_centers[:, 1], c='black', s=300)

In [None]:
# Mean silhouette coefficient of the clustering, using Euclidean distance.
# `metrics` is the sklearn.metrics module (`from sklearn import metrics`).
metrics.silhouette_score(X, labels, metric='euclidean')

In [None]:
# Calinski-Harabasz score of the clustering.
# `metrics` is the sklearn.metrics module (`from sklearn import metrics`).
metrics.calinski_harabasz_score(X, labels)

In [None]:
# Re-select the target column and flatten it to a 1-D array.
df_target = df[target_col_list]
print(df_target.shape)
df_target_arr = np.ravel(df_target)

In [None]:
# Inspection cell. As the last expression, only `y_pred.shape` would be
# echoed by a notebook; the bare `df_target_arr` line has no visible effect.
df_target_arr
y_pred.shape

In [None]:
# Wrap the cluster assignments in a single-column frame (default integer
# index) so they can be joined to the other frames.
dataset = pd.DataFrame({'y_pred': y_pred})

In [None]:
# Preview the first rows of the cluster-assignment frame.
dataset.head()

In [None]:
# Put predictors, the target and the cluster assignment side by side.
# DataFrame.join aligns rows on the index, so the three frames are matched
# by index label.
target_pred_df = df_predictors.join(df_target).join(dataset)
target_pred_df.head(10)

In [None]:
# Number of rows whose target code equals 2.
# Optional extra filter, currently disabled: & (target_pred_df['y_pred'] == 0)
is_code_two = target_pred_df['presvote16post_2016'] == 2
count = int(is_code_two.sum())
print(count)


In [None]:
# NOTE(review): hand-typed ratio; the operands are presumably counts copied
# from earlier runs of the counting cell — confirm their meaning and origin.
2578/3545

In [None]:
# NOTE(review): hand-typed ratio; the operands are presumably counts copied
# from earlier runs of the counting cell — confirm their meaning and origin.
2795/3479

## Logistic regression

In [None]:
# Features and target for the supervised models; y is still a one-column
# DataFrame at this point.
X = df_predictors
y = df_target
X.head()

In [None]:
# Missing-value check: whether any predictor column contains NaN, then the
# NaN count of the target (only the last expression is echoed by a notebook).
X.isna().sum().any()
y.isna().sum()

In [None]:
# Replace a missing target value with 0, then flatten to the 1-D array
# form that scikit-learn expects for labels.
y = np.ravel(y.fillna(0))

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the class
# proportions the same in both parts; random_state fixes the shuffle so the
# scores reported below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)

In [None]:
# L2-regularised logistic regression (C=2 is weaker regularisation than the
# default C=1). The `multi_class` argument is omitted: it is deprecated in
# recent scikit-learn releases, and 'auto' was already the default.
# max_iter is raised so lbfgs has room to converge.
lr = LogisticRegression(C=2, penalty='l2',
                        solver='lbfgs',
                        max_iter=1000)
lr.fit(X_train, y_train)
y_pred_test = lr.predict(X_test)

In [None]:
# Show the name(s) of the target column(s).
df_target.columns

In [None]:
# Confusion matrix on the held-out set: rows are actual classes, columns are
# predicted classes. The eight labels are applied positionally, so they
# assume eight classes in sorted order.
cm = confusion_matrix(y_test, y_pred_test)
actual_names = ['None', 'Clinton', 'Trump', 'Johnson', 'Stein',
                'McMullin', 'Other', 'Did not vote']
predicted_names = ['Pred None', 'Pred Clinton', 'Pred Trump', 'Pred Johnson',
                   'Pred Stein', 'Pred McMullin', 'Pred Other',
                   'Pred Did Not Vote']
conf_matrix = pd.DataFrame(cm, index=actual_names, columns=predicted_names)
conf_matrix

In [None]:
# correct Clinton
correct = cm[1][1]
total = sum(cm[1])
pred_correct = correct/total
pred_correct

In [None]:
# correct Trump
correct = cm[2][2]
total = sum(cm[2])
pred_correct = correct/total
pred_correct

In [None]:
# Number of coefficients in row 1 of the fitted coefficient matrix.
len(lr.coef_[1])

In [None]:
# Map each feature name to its coefficient in row 0 of lr.coef_.
# NOTE(review): row 0 belongs to the first class in lr.classes_ only; the
# coefficient rows of the other classes are not included — confirm this is
# the class of interest.
coef_dict = dict(zip(X.columns, lr.coef_[0]))
coef_dict

## Feature importance

In [None]:
# Recursive feature elimination driven by the logistic-regression model.
# Eliminating down to a single feature gives every feature its own rank.
predictors = X_train
selector = RFE(lr, n_features_to_select=1).fit(predictors, y_train)

In [None]:
# Number of feature columns in the training set.
len(X_train.columns)

In [None]:
# RFE rank of each feature, in column order. The bare `len(order)` has no
# visible effect in a notebook; only the final `order` is echoed.
order = selector.ranking_
len(order)
order

In [None]:
# List the predictor column names.
df_predictors.columns

In [None]:
# One-column frame of RFE ranks indexed by feature name. Built once: the
# result does not depend on any row of df_predictors, so no loop is needed.
rank = np.array(order)
feat = np.array(df_predictors.columns)
rank_feat = pd.DataFrame(rank, index=feat, columns=['rank'])

In [None]:
# After an ascending sort, tail(10) shows the ten largest rank numbers.
# RFE gives rank 1 to the selected (best) feature, so these are the features
# eliminated first.
# NOTE(review): if the most important features were wanted, head(10) may
# have been intended — confirm.
rank_feat.sort_values(by='rank').tail(10)

## Cross-validation

In [None]:
# Rebuild the target from df_target: `y` was flattened to a NumPy array
# earlier in the notebook, and arrays have no `fillna` method.
y = df_target.fillna(0)

In [None]:
# True if any predictor column contains a missing value.
X.isna().sum().any()

In [None]:
# Flatten the target to a 1-D array and show its shape.
y = np.ravel(y)
y.shape

In [None]:
# 5-fold cross-validation of the logistic-regression model on the full data;
# each fold's score comes from the estimator's default scorer.
scores = cross_val_score(lr, X, y, cv=5)

In [None]:
# Per-fold scores from the cross-validation above.
scores

## Random forest

In [None]:
# Random forest with default hyper-parameters, scored on the held-out set.
# random_state fixes bootstrap sampling and feature sub-sampling so the
# accuracy is reproducible between runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_test = rf.predict(X_test)
score = accuracy_score(y_test, y_pred_test)

In [None]:
# Held-out accuracy computed in the previous cell.
score

In [None]:
# 5-fold cross-validation of the random forest on the full data.
scores = cross_val_score(rf, X, y, cv=5)

In [None]:
# Per-fold scores from the cross-validation above.
scores

In [None]:
# k-nearest-neighbours with default settings, evaluated with 5-fold
# cross-validation on the full data.
knn = KNeighborsClassifier()
scores = cross_val_score(knn, X, y, cv=5)
scores

In [None]:
# Linear-kernel support-vector classifier: fit on the training split, then
# report accuracy on the held-out split.
svm = SVC(kernel='linear').fit(X_train, y_train)
y_pred_test = svm.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred_test)
score

In [None]:
# 5-fold cross-validation of the current `svm` estimator on the full data.
scores = cross_val_score(svm, X, y, cv=5)
scores

In [None]:
# Same linear SVC, but with class_weight='balanced' so that classes are
# weighted inversely to their frequency; scored on the held-out split.
svm = SVC(kernel='linear', class_weight='balanced').fit(X_train, y_train)
y_pred_test = svm.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred_test)
score