In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import csv
import json
import os
import contextily as cx
from collections import defaultdict
import re
from typing import Callable

from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

from zipfile import ZipFile
zf = ZipFile('data.zip', 'r')
zf.extractall('.')

# more readable exceptions
%pip install --quiet iwut
%load_ext iwut
%wut on



Note: you may need to restart the kernel to use updated packages.


## Data Cleaning

In the survey, we collect features of voters (such as primary language and smoker or not)and their attitudes towards 3 propositions. We'll load our data and drop N/A values.

In [75]:
voting_data = pd.read_csv("data/voting-data.csv")
voting_data

Unnamed: 0,Timestamp,"California Proposition 1, Right to Reproductive Freedom Amendment (2022)","California Proposition 30, Tax on Income Above $2 Million for Zero-Emissions Vehicles and Wildfire Prevention Initiative (2022)","California Proposition 31, Flavored Tobacco Products Ban Referendum (2022)",What is your ethnicity?,My primary language is...,What is your average family income?,What is your gender?,What is your age?,What is your party self-identification?,...,How important are the following issues to you? [Healthcare],How important are the following issues to you? [Environment],How important are the following issues to you? [Education],How important are the following issues to you? [Women's Rights],Do you have healthcare?,Do you smoke?,I am registered to vote.,I plan to vote in the November 2022 election.,Unnamed: 20,Unnamed: 21
0,2022/07/20 11:16:12 PM MDT,Yes,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,...,,,,,Yes,No,,,5.0,5.0
1,2022/07/20 11:36:03 PM MDT,Yes,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,...,,,,,Yes,No,,,5.0,5.0
2,2022/07/21 9:42:16 AM MDT,Yes,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,...,Important,Very Important,Very Important,Very Important,Yes,No,No,Yes,,
3,2022/07/21 9:49:05 AM MDT,Yes,Yes,Yes,Caucasian,Spanish,"$50,000- $100,000",Female,65+,Republican,...,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes,,
4,2022/07/21 12:19:15 PM MDT,No,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,...,Important,Important,Important,Important,Yes,No,Yes,Yes,,
5,2022/07/21 1:33:39 PM MDT,Yes,Yes,Yes,Caucasian,English,"$20,000 - $50,000",Female,65+,Democrat,...,Very Important,Very Important,Important,Very Important,Yes,No,Yes,Yes,,
6,2022/07/21 4:02:15 PM MDT,Yes,Yes,Yes,Asian,English,"$0 - $20,000",Male,18-25,Democrat,...,Very Important,Important,Important,Very Important,Yes,No,Yes,Yes,,
7,2022/07/21 4:12:39 PM MDT,Yes,Yes,No,Asian,English,"$100,000 - $400,000",Female,18-25,Democrat,...,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes,,
8,2022/07/21 4:23:52 PM MDT,Yes,Yes,No,Asian,Chinese,"$100,000 - $400,000",Female,18-25,Independent,...,Somewhat Important,Somewhat Unimportant,Important,Somewhat Important,Yes,No,No,Maybe,,
9,2022/07/21 5:50:12 PM MDT,Yes,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Male,65+,Democrat,...,Important,Important,Somewhat Important,Important,Yes,No,Yes,Yes,,


In [76]:
#Clean the data by dropping N/A Values
voting_data_dropped = voting_data.drop(columns = ["Timestamp", "Unnamed: 20", "Unnamed: 21"])

In [100]:
# Rename Propositions to make it easier to understand
voting_data_simplified = voting_data_dropped.rename(columns={"California Proposition 1, Right to Reproductive Freedom Amendment (2022)": "Prop-1", "California Proposition 30, Tax on Income Above $2 Million for Zero-Emissions Vehicles and Wildfire Prevention Initiative (2022)": "Prop-30", "California Proposition 31, Flavored Tobacco Products Ban Referendum (2022)":"Prop-31"})
voting_data_simplified

Unnamed: 0,Prop-1,Prop-30,Prop-31,What is your ethnicity?,My primary language is...,What is your average family income?,What is your gender?,What is your age?,What is your party self-identification?,What is your education background?,How important are the following issues to you? [Cost of Living],How important are the following issues to you? [Healthcare],How important are the following issues to you? [Environment],How important are the following issues to you? [Education],How important are the following issues to you? [Women's Rights],Do you have healthcare?,Do you smoke?,I am registered to vote.,I plan to vote in the November 2022 election.
0,Yes,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,Bachelor's degree (for example: BA. BS),,,,,,Yes,No,,
1,Yes,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,"Master's degree (for example: MA, MS, MEng, ME...",,,,,,Yes,No,,
2,Yes,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,High school,Important,Important,Very Important,Very Important,Very Important,Yes,No,No,Yes
3,Yes,Yes,Yes,Caucasian,Spanish,"$50,000- $100,000",Female,65+,Republican,Bachelor's degree (for example: BA. BS),Very Important,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes
4,No,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,Some college,Important,Important,Important,Important,Important,Yes,No,Yes,Yes
5,Yes,Yes,Yes,Caucasian,English,"$20,000 - $50,000",Female,65+,Democrat,"Master's degree (for example: MA, MS, MEng, ME...",Very Important,Very Important,Very Important,Important,Very Important,Yes,No,Yes,Yes
6,Yes,Yes,Yes,Asian,English,"$0 - $20,000",Male,18-25,Democrat,Some college,Very Important,Very Important,Important,Important,Very Important,Yes,No,Yes,Yes
7,Yes,Yes,No,Asian,English,"$100,000 - $400,000",Female,18-25,Democrat,Bachelor's degree (for example: BA. BS),Very Important,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes
8,Yes,Yes,No,Asian,Chinese,"$100,000 - $400,000",Female,18-25,Independent,Some college,Somewhat Important,Somewhat Important,Somewhat Unimportant,Important,Somewhat Important,Yes,No,No,Maybe
9,Yes,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Male,65+,Democrat,Ph.D. or higher,Somewhat Unimportant,Important,Important,Somewhat Important,Important,Yes,No,Yes,Yes


## Prepare input and output for Machine Learning
We transform categorical data to numerical values, extract features as input and answers to propositions as output.


In [102]:
# This dataframe is used as input 
voting_data_input = voting_data_simplified.drop(columns = ["Prop-1", "Prop-30", "Prop-31"])
voting_data_input

Unnamed: 0,What is your ethnicity?,My primary language is...,What is your average family income?,What is your gender?,What is your age?,What is your party self-identification?,What is your education background?,How important are the following issues to you? [Cost of Living],How important are the following issues to you? [Healthcare],How important are the following issues to you? [Environment],How important are the following issues to you? [Education],How important are the following issues to you? [Women's Rights],Do you have healthcare?,Do you smoke?,I am registered to vote.,I plan to vote in the November 2022 election.
0,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,Bachelor's degree (for example: BA. BS),,,,,,Yes,No,,
1,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,"Master's degree (for example: MA, MS, MEng, ME...",,,,,,Yes,No,,
2,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,High school,Important,Important,Very Important,Very Important,Very Important,Yes,No,No,Yes
3,Caucasian,Spanish,"$50,000- $100,000",Female,65+,Republican,Bachelor's degree (for example: BA. BS),Very Important,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes
4,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,Some college,Important,Important,Important,Important,Important,Yes,No,Yes,Yes
5,Caucasian,English,"$20,000 - $50,000",Female,65+,Democrat,"Master's degree (for example: MA, MS, MEng, ME...",Very Important,Very Important,Very Important,Important,Very Important,Yes,No,Yes,Yes
6,Asian,English,"$0 - $20,000",Male,18-25,Democrat,Some college,Very Important,Very Important,Important,Important,Very Important,Yes,No,Yes,Yes
7,Asian,English,"$100,000 - $400,000",Female,18-25,Democrat,Bachelor's degree (for example: BA. BS),Very Important,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes
8,Asian,Chinese,"$100,000 - $400,000",Female,18-25,Independent,Some college,Somewhat Important,Somewhat Important,Somewhat Unimportant,Important,Somewhat Important,Yes,No,No,Maybe
9,Caucasian,English,"$100,000 - $400,000",Male,65+,Democrat,Ph.D. or higher,Somewhat Unimportant,Important,Important,Somewhat Important,Important,Yes,No,Yes,Yes


In [90]:
# Encoding all features into dummies variables
voting_data_input_dummies = pd.get_dummies(voting_data_input)
voting_data_input_dummies

Unnamed: 0,What is your ethnicity?_Asian,What is your ethnicity?_Black or African American,What is your ethnicity?_Caucasian,What is your ethnicity?_Hispanic or Latino(a),My primary language is..._Chinese,My primary language is..._English,My primary language is..._Spanish,"What is your average family income?_$0 - $20,000","What is your average family income?_$100,000 - $400,000","What is your average family income?_$20,000 - $50,000",...,How important are the following issues to you? [Women's Rights]_Somewhat Unimportant,How important are the following issues to you? [Women's Rights]_Very Important,Do you have healthcare?_No,Do you have healthcare?_Yes,Do you smoke?_No,Do you smoke?_Yes,I am registered to vote._No,I am registered to vote._Yes,I plan to vote in the November 2022 election._Maybe,I plan to vote in the November 2022 election._Yes
0,0,0,1,0,0,1,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0
1,0,0,1,0,0,1,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0
2,0,0,1,0,0,1,0,0,1,0,...,0,1,0,1,1,0,1,0,0,1
3,0,0,1,0,0,0,1,0,0,0,...,0,1,0,1,1,0,0,1,0,1
4,0,0,1,0,0,1,0,0,1,0,...,0,0,0,1,1,0,0,1,0,1
5,0,0,1,0,0,1,0,0,0,1,...,0,1,0,1,1,0,0,1,0,1
6,1,0,0,0,0,1,0,1,0,0,...,0,1,0,1,1,0,0,1,0,1
7,1,0,0,0,0,1,0,0,1,0,...,0,1,0,1,1,0,0,1,0,1
8,1,0,0,0,1,0,0,0,1,0,...,0,0,0,1,1,0,1,0,1,0
9,0,0,1,0,0,1,0,0,1,0,...,0,0,0,1,1,0,0,1,0,1


In [101]:
# Process output as numerical values 
map_values = {'Yes': 1, 'No': 0}
voting_data_simplified['Prop-1'] = voting_data_simplified['Prop-1'].map(map_values)
voting_data_simplified['Prop-30'] = voting_data_simplified['Prop-30'].map(map_values)
voting_data_simplified['Prop-31'] = voting_data_simplified['Prop-31'].map(map_values)
voting_data_simplified


Unnamed: 0,Prop-1,Prop-30,Prop-31,What is your ethnicity?,My primary language is...,What is your average family income?,What is your gender?,What is your age?,What is your party self-identification?,What is your education background?,How important are the following issues to you? [Cost of Living],How important are the following issues to you? [Healthcare],How important are the following issues to you? [Environment],How important are the following issues to you? [Education],How important are the following issues to you? [Women's Rights],Do you have healthcare?,Do you smoke?,I am registered to vote.,I plan to vote in the November 2022 election.
0,1,1,1,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,Bachelor's degree (for example: BA. BS),,,,,,Yes,No,,
1,1,1,1,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,"Master's degree (for example: MA, MS, MEng, ME...",,,,,,Yes,No,,
2,1,1,1,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,High school,Important,Important,Very Important,Very Important,Very Important,Yes,No,No,Yes
3,1,1,1,Caucasian,Spanish,"$50,000- $100,000",Female,65+,Republican,Bachelor's degree (for example: BA. BS),Very Important,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes
4,0,1,1,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,Some college,Important,Important,Important,Important,Important,Yes,No,Yes,Yes
5,1,1,1,Caucasian,English,"$20,000 - $50,000",Female,65+,Democrat,"Master's degree (for example: MA, MS, MEng, ME...",Very Important,Very Important,Very Important,Important,Very Important,Yes,No,Yes,Yes
6,1,1,1,Asian,English,"$0 - $20,000",Male,18-25,Democrat,Some college,Very Important,Very Important,Important,Important,Very Important,Yes,No,Yes,Yes
7,1,1,0,Asian,English,"$100,000 - $400,000",Female,18-25,Democrat,Bachelor's degree (for example: BA. BS),Very Important,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes
8,1,1,0,Asian,Chinese,"$100,000 - $400,000",Female,18-25,Independent,Some college,Somewhat Important,Somewhat Important,Somewhat Unimportant,Important,Somewhat Important,Yes,No,No,Maybe
9,1,1,1,Caucasian,English,"$100,000 - $400,000",Male,65+,Democrat,Ph.D. or higher,Somewhat Unimportant,Important,Important,Somewhat Important,Important,Yes,No,Yes,Yes


In [97]:
voting_output = voting_data_simplified[['Prop-1', 'Prop-30', 'Prop-31']]
voting_output 

Unnamed: 0,Prop-1,Prop-30,Prop-31
0,1,1,1
1,1,1,1
2,1,1,1
3,1,1,1
4,0,1,1
5,1,1,1
6,1,1,1
7,1,1,0
8,1,1,0
9,1,1,1


## Machine Learning
We split dataset into 80% training data and 20% test data. We want use some features of the voter to predict their attitudes on proposition 1, 30, and 31.

## Explore: Use PCA to figure out dominant features
We're curious about whether any feature is a dominant one for predicting the result. We apply PCA on data for proposition 1.
Result: Even the most important feature could only explain 18% of the variance in data. Therefore, we need more than 1 feature to make predictions on answer to propositions. Thus we reject models like simple linear regression, and consider models like multiple linear regression.

In [12]:
mod0.coef_

array([ 0.18605042, -0.04816337, -0.14445485,  0.0065678 ,  0.03493356,
       -0.01134358, -0.02358998,  0.10756694, -0.169619  ,  0.00576545,
       -0.11115083,  0.16743745, -0.10266631,  0.10266631, -0.21087993,
        0.10188283, -0.03990794,  0.14890504,  0.19088406, -0.01322981,
       -0.01444857, -0.16320568,  0.04288345,  0.04672681,  0.09910368,
        0.04672681,  0.02099101, -0.25643177, -0.00872387, -0.13659875,
        0.06350494,  0.00475017,  0.02107773,  0.00825543, -0.11115083,
        0.00475017,  0.02159109,  0.00794989, -0.1028954 , -0.00371308,
       -0.12232082, -0.03709001,  0.08234333, -0.08597585, -0.06038147,
        0.00794989, -0.04251393,  0.10385385,  0.08387196, -0.08387196,
        0.1028954 , -0.1028954 ,  0.09910368, -0.17617119, -0.09053397,
        0.01346647])

In [13]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca = pca.fit(X_train1)
comp = pca.components_

In [14]:
pca.explained_variance_ratio_

array([0.18618873, 0.17490958])

In [15]:
comp_x = comp[0]
comp_y = comp[1]
X_train1.columns[np.argmax(abs(comp_x))]

'How important are the following issues to you? [Healthcare]_Important'

In [16]:
X_train1.columns[np.argmax(abs(comp_y))]

'What is your gender?_Female'

## Model 1: Logistic Regression
We apply logistic regression because our goal is to predict a binary categorical value from as a linear function of features.
Result: test scores for all propositions are negative, implying that linear regression behave poorly in predicting the outcome. 

In [129]:
# Machine Learning
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.linear_model import LogisticRegression

#Test on Prop1
X1 = voting_data_input_dummies
y1 = voting_output["Prop-1"]


from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)


mod1 = LogisticRegression(random_state=42).fit(X_train1,y_train1)
train1_score = mod1.score(X_train1,y_train1)
y_pred1 = mod1.predict(X_test1)
test1_score = r2_score(y_pred1,y_test1)
print("train score for Prop-1:"+ str(train1_score)+", test score for Prop-1:" + str(test1_score))

train score for Prop-1:0.9473684210526315, test score for Prop-1:-0.24999999999999956


In [114]:
#Test on Prop30
X2 = voting_data_input_dummies
y2 = voting_output["Prop-30"]


from sklearn.model_selection import train_test_split
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

mod2 = LogisticRegression(random_state=42).fit(X_train2,y_train2)
train2_score = mod2.score(X_train2,y_train2)
y_pred_2 = mod2.predict(X_test2)
test2_score = r2_score(y_pred_2,y_test2)
print("train score for Prop-30:"+ str(train2_score)+", test score for Prop-30:" + str(test2_score))

train score for Prop-30:1.0, test score for Prop-30:1.0


In [115]:
#Test on Prop31
X3 = voting_data_input_dummies
y3 = voting_data_simplified["Prop-31"]


from sklearn.model_selection import train_test_split
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.2, random_state=42)

mod3 = LogisticRegression(random_state=42).fit(X_train3,y_train3)
train3_score = mod3.score(X_train3,y_train3)
y_pred_3 = mod3.predict(X_test3)
test3_score = r2_score(y_pred_3,y_test3)
print("train score for Prop-31:"+ str(train3_score)+", test score for Prop-31:" + str(test3_score))

train score for Prop-31:1.0, test score for Prop-31:0.0


## Model 2: Decision Tree
A decision tree is a tree of questions that must be answered in sequence to yield a predicted classification. The test score is fairly well.

In [125]:
voting_features = voting_data_simplified.iloc[:, 3:]

In [137]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

classifier1 = DecisionTreeClassifier(random_state=0)
classifier1.fit(X_train1, y_train1)

y_pred_test1 = classifier1.predict(X_test1)
y_pred_train1 = classifier1.predict(X_train1)
print("train score for Prop-1:", accuracy_score(y_train1, y_pred_train1), " Test Score for Prop-1:",accuracy_score(y_test1, y_pred_test1))

train score for Prop-1: 1.0  Test Score for Prop-1: 0.8


In [139]:
classifier2 = DecisionTreeClassifier(random_state=0)
classifier2.fit(X_train2, y_train2)

y_pred_test2 = classifier2.predict(X_test2)
y_pred_train2 = classifier2.predict(X_train2)
print("train score for Prop-30:", accuracy_score(y_train2, y_pred_train2), " Test Score for Prop-30:",accuracy_score(y_test2, y_pred_test2))

train score for Prop-30: 1.0  Test Score for Prop-30: 0.8


In [140]:
classifier3 = DecisionTreeClassifier(random_state=0)
classifier3.fit(X_train3, y_train3)

y_pred_test3 = classifier2.predict(X_test3)
y_pred_train3 = classifier2.predict(X_train3)
print("train score for Prop-31:", accuracy_score(y_train3, y_pred_train3), " Test Score for Prop-31:",accuracy_score(y_test3, y_pred_test3))

train score for Prop-31: 0.6842105263157895  Test Score for Prop-31: 0.6


## Model 3: Random Forests

Though decision trees are pretty good, we'd love to see if problem of overfitting could be overrided by random forest.

Procedures: Bagging: Short for Bootstrap AGGregatING. Generate bootstrap resamples of training data. Fit one model for each resample. Final model = average predictions of each small model

Result: For propositions 1 and 30, we have perfect test results. For Proposition 31, the test score remains 0.6. Overall, it performs the best among all three models.


In [141]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [142]:
classifier_rf_prop1 = RandomForestClassifier(random_state=42,oob_score=True)
classifier_rf_prop1 = classifier_rf_prop1.fit(X_train1, y_train1)
train_score_prop1 = accuracy_score(classifier_rf_prop1.predict(X_train1),y_train1)
# The out-of-bag (OOB) error is the average error for each calculated using predictions 
# from the trees that do not contain in their respective bootstrap sample. 
pred_prop1 = classifier_rf_prop1.predict(X_test1)
test_score_prop1 = accuracy_score(pred_prop1,y_test1)
print("train score for Prop-1:"+ str(train_score_prop1)+", test score for Prop-1:" + str(test_score_prop1))

train score for Prop-1:1.0, test score for Prop-1:1.0


In [143]:
classifier_rf_prop30 = RandomForestClassifier(random_state=42, oob_score=True)
classifier_rf_prop30 = classifier_rf_prop30.fit(X_train2, y_train2)
train_score_prop30 = classifier_rf_prop30.oob_score_
pred_prop30 = classifier_rf_prop30.predict(X_test2)
test_score_prop30 = accuracy_score(pred_prop30,y_test2)
print("train score for Prop-30:"+ str(train_score_prop30)+", test score for Prop-30:" + str(test_score_prop30))

train score for Prop-30:0.8421052631578947, test score for Prop-30:1.0


In [144]:
classifier_rf_prop31 = RandomForestClassifier(random_state=42, oob_score=True)
classifier_rf_prop31 = classifier_rf_prop31.fit(X_train3, y_train3)
train_score_prop31 = classifier_rf_prop31.oob_score_
pred_prop31 = classifier_rf_prop31.predict(X_test3)
test_score_prop31 = accuracy_score(pred_prop31,y_test3)
print("train score for Prop-31:"+ str(train_score_prop31)+", test score for Prop-31:" + str(test_score_prop31))

train score for Prop-31:0.631578947368421, test score for Prop-31:0.6


## We should proceeed with random forests.