# Machine Learning with Voter Preferences

In [1]:
# Arshiya Sabzevari, Colton Ragland, Noumik Thadani, Trevor Huis in 't Veld

In [2]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings emitted by the libraries
# used further down are hidden as well -- consider narrowing it.
warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time
# Model-selection helpers used by the modelling cells below
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [3]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

## Data Prep
Cleaning the rest of our data set to best fit the needs of our analysis

In [4]:
# Read the survey extract and, in the same step, discard every row that
# contains at least one NaN value
data = pd.read_csv("cces18_common_vv2.csv").dropna()
print(data.shape)
data.head()

(31945, 31)


Unnamed: 0,birthyr,gender,educ,race,marstat,region,National Economy,Income,Trump Approval,Congress Approval,...,M4A,Repeal ACA,Paris Accord,TPP,Ideology,sexuality,inputstate,EPA CO2 Regulate,Iran Deal,Party
0,1950,2,6,2,3.0,1,4.0,3.0,4.0,4.0,...,1.0,2.0,2.0,2.0,2.0,1.0,34,1.0,2.0,1.0
3,1970,2,5,1,6.0,2,4.0,3.0,4.0,3.0,...,1.0,2.0,2.0,2.0,4.0,1.0,27,1.0,2.0,1.0
4,1971,2,4,1,3.0,4,1.0,1.0,1.0,2.0,...,2.0,1.0,1.0,1.0,4.0,1.0,8,2.0,1.0,3.0
5,1957,2,5,3,1.0,4,3.0,2.0,4.0,4.0,...,1.0,2.0,2.0,2.0,2.0,1.0,53,1.0,2.0,1.0
6,1973,2,5,1,3.0,4,4.0,5.0,4.0,4.0,...,1.0,2.0,2.0,2.0,3.0,1.0,53,1.0,2.0,1.0


In [24]:
# Class label: the 'Party' column
data_Y = data['Party']

# Features: every remaining column except the two excluded below.
data_X = data.drop(columns=[
    'Party',      # the class label itself
    'Ideology',   # removed due to high correlation with the class label
    'Vote 2016',  # all inputs are '1' (everyone voted in the 2016 election);
                  # already feature engineered earlier - cleaned up here
])
print(data_X.shape)
print(data_Y.shape)
data_X.head()

(31945, 28)
(31945,)


Unnamed: 0,birthyr,gender,educ,race,marstat,region,National Economy,Income,Trump Approval,Congress Approval,...,Abortion-General,Wall,M4A,Repeal ACA,Paris Accord,TPP,sexuality,inputstate,EPA CO2 Regulate,Iran Deal
0,1950,2,6,2,3.0,1,4.0,3.0,4.0,4.0,...,1.0,2.0,1.0,2.0,2.0,2.0,1.0,34,1.0,2.0
3,1970,2,5,1,6.0,2,4.0,3.0,4.0,3.0,...,1.0,2.0,1.0,2.0,2.0,2.0,1.0,27,1.0,2.0
4,1971,2,4,1,3.0,4,1.0,1.0,1.0,2.0,...,2.0,1.0,2.0,1.0,1.0,1.0,1.0,8,2.0,1.0
5,1957,2,5,3,1.0,4,3.0,2.0,4.0,4.0,...,1.0,2.0,1.0,2.0,2.0,2.0,1.0,53,1.0,2.0
6,1973,2,5,1,3.0,4,4.0,5.0,4.0,4.0,...,1.0,2.0,1.0,2.0,2.0,2.0,1.0,53,1.0,2.0


## Data Exploration
Taking another look at our dataset to better understand its features

In [25]:
# For each feature, print the sorted set of distinct values it takes
for col, values in data_X.items():
    distinct = sorted(values.unique())
    print(col, ':', distinct)

birthyr : [1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000]
gender : [1, 2]
educ : [1, 2, 3, 4, 5, 6]
race : [1, 2, 3, 4, 5, 6, 7, 8]
marstat : [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
region : [1, 2, 3, 4]
National Economy : [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
Income : [1.0, 2.0, 3.0, 4.0, 5.0]
Trump Approval : [1.0, 2.0, 3.0, 4.0, 5.0]
Congress Approval : [1.0, 2.0, 3.0, 4.0, 5.0]
Supreme Court Approval : [1.0, 2.0, 3.0, 4.0, 5.0]
Governor Approval : [1.0, 2.0, 3.0, 4.0, 5.0]
Legislature Approval : [1.0, 2.0, 3.0, 4.0, 5.0]
Rep Approval : [1.0, 2.0, 3.0, 4.0, 5.0]
Senator 1 Approval : [1.0, 2.

## Neural Network
Using a Neural Network `MLPClassifier` & measuring its accuracy to assess our model

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Scaling lives inside the pipeline so it is fitted together with the
# classifier every time the pipeline is fitted.
scaler = StandardScaler()
# Fixed seed: the network's weight initialisation is random, so without it the
# reported accuracy changes on every run.
clf = MLPClassifier(random_state=42)
pipeline = Pipeline(steps = [('scaler', scaler), ('clf', clf)])

# Hyper-parameters to search. The 'clf__' prefix routes each entry to the
# 'clf' step of the pipeline.
param_grid = {'clf__hidden_layer_sizes': [10, 20, 30, 40, 50, 60],
             'clf__activation': ['logistic', 'tanh', 'relu']}

# Inner 5-fold cross-validation: picks the best hyper-parameter combination
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

# Outer 5-fold cross-validation: scores the tuned search itself, so the
# accuracy is measured on folds that played no part in the tuning.
scores = cross_val_score(grid_search, data_X, data_Y, cv=5)
print("Accuracy:", scores.mean()*100)

## Final Model
Using the Neural Network model & classifier to create a final model that predicts newly entered records & gives insight into voter preferences, decisions & outlooks

In [None]:
import pickle

# Same pipeline and search space as the evaluation cell, this time fitted on
# the complete data set to produce the model that will be saved.
scaler = StandardScaler()
# Fixed seed so that re-running the cell reproduces the same fitted model
clf = MLPClassifier(random_state=42)
pipeline = Pipeline(steps = [('scaler', scaler), ('clf', clf)])

param_grid = {'clf__hidden_layer_sizes': [10, 20, 30, 40, 50, 60],
             'clf__activation': ['logistic', 'tanh', 'relu']}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)
final = grid_search.fit(data_X, data_Y)

#final model
final_model = final

# Persist the fitted search object; the with-block guarantees the file is
# flushed and closed even if pickling fails part-way.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(final_model, model_file)

### Record Selection
Selecting a random combination of inputs to test in our finalized model

In [26]:
import random

# Draw one value per feature, picked at random from the values that feature
# takes in the data
for col, values in data_X.items():
    options = values.unique()
    print(col, ':', random.choice(options))

birthyr : 1961
gender : 1
educ : 4
race : 6
marstat : 6.0
region : 4
National Economy : 4.0
Income : 3.0
Trump Approval : 5.0
Congress Approval : 3.0
Supreme Court Approval : 3.0
Governor Approval : 2.0
Legislature Approval : 2.0
Rep Approval : 1.0
Senator 1 Approval : 4.0
Senator 2 Approval : 2.0
President Vote 2016 : 1.0
Assault Rifle Ban : 2.0
Abortion-General : 2.0
Wall : 1.0
M4A : 1.0
Repeal ACA : 2.0
Paris Accord : 2.0
TPP : 2.0
sexuality : 2.0
inputstate : 15
EPA CO2 Regulate : 1.0
Iran Deal : 2.0


In [None]:
# One hand-entered respondent; the values are listed in the same order as the
# columns of data_X.
record = [[
    1961, 1, 4, 6, 6.0, 4,
    4.0, 3.0, 5.0, 3.0, 3.0, 2.0,
    2.0, 1.0, 4.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 15, 1.0,
    2.0,
]]

# Attach the training column names so that a record with the wrong number of
# values fails loudly here instead of being silently mis-predicted.
record_frame = pd.DataFrame(record, columns=data_X.columns)

# load the model
# NOTE: pickle.load can execute arbitrary code -- only load a file that was
# written by this notebook, never one from an untrusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Text shown for each predicted class code; any other code prints 'Not sure'
party_names = {
    1: 'Democrat',
    2: 'Republican',
    3: 'Independent',
    4: 'Other',
}

prediction = loaded_model.predict(record_frame)
# predict() returns one label per input row; there is exactly one row here
print(party_names.get(prediction[0], 'Not sure'))