In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import csv
import json
import os
import contextily as cx
from collections import defaultdict
import re
from typing import Callable

from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

from zipfile import ZipFile
zf = ZipFile('data.zip', 'r')
zf.extractall('.')

# more readable exceptions
%pip install --quiet iwut
%load_ext iwut
%wut on



Note: you may need to restart the kernel to use updated packages.


In [2]:
voting_data = pd.read_csv("data/voting-data.csv")
voting_data

Unnamed: 0,Timestamp,"California Proposition 1, Right to Reproductive Freedom Amendment (2022)","California Proposition 30, Tax on Income Above $2 Million for Zero-Emissions Vehicles and Wildfire Prevention Initiative (2022)","California Proposition 31, Flavored Tobacco Products Ban Referendum (2022)",What is your ethnicity?,My primary language is...,What is your average family income?,What is your gender?,What is your age?,What is your party self-identification?,...,How important are the following issues to you? [Healthcare],How important are the following issues to you? [Environment],How important are the following issues to you? [Education],How important are the following issues to you? [Women's Rights],Do you have healthcare?,Are you a smoker?,I am registered to vote.,I plan to vote in the November 2022 election.,Unnamed: 20,Unnamed: 21
0,2022/07/20 11:16:12 PM MDT,Yes,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,...,,,,,Yes,No,,,5.0,5.0
1,2022/07/20 11:36:03 PM MDT,Yes,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,...,,,,,Yes,No,,,5.0,5.0
2,2022/07/21 9:42:16 AM MDT,Yes,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,...,Important,Very Important,Very Important,Very Important,Yes,No,No,Yes,,
3,2022/07/21 9:49:05 AM MDT,Yes,Yes,Yes,Caucasian,Spanish,"$50,000- $100,000",Female,65+,Republican,...,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes,,
4,2022/07/21 12:19:15 PM MDT,No,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,...,Important,Important,Important,Important,Yes,No,Yes,Yes,,
5,2022/07/21 1:33:39 PM MDT,Yes,Yes,Yes,Caucasian,English,"$20,000 - $50,000",Female,65+,Democrat,...,Very Important,Very Important,Important,Very Important,Yes,No,Yes,Yes,,
6,2022/07/21 4:02:15 PM MDT,Yes,Yes,Yes,Asian,English,"$0 - $20,000",Male,18-25,Democrat,...,Very Important,Important,Important,Very Important,Yes,No,Yes,Yes,,
7,2022/07/21 4:12:39 PM MDT,Yes,Yes,No,Asian,English,"$100,000 - $400,000",Female,18-25,Democrat,...,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes,,
8,2022/07/21 4:23:52 PM MDT,Yes,Yes,No,Asian,Chinese,"$100,000 - $400,000",Female,18-25,Independent,...,Somewhat Important,Somewhat Unimportant,Important,Somewhat Important,Yes,No,No,Maybe,,


In [3]:
#Clean the data by dropping N/A Values
voting_data_dropped = voting_data.drop(columns = ["Timestamp", "Unnamed: 20", "Unnamed: 21"])

In [4]:
# Rename Propositions to make it easier to understand
voting_data_simplified = voting_data_dropped.rename(columns={"California Proposition 1, Right to Reproductive Freedom Amendment (2022)": "Prop-1", "California Proposition 30, Tax on Income Above $2 Million for Zero-Emissions Vehicles and Wildfire Prevention Initiative (2022)": "Prop-30", "California Proposition 31, Flavored Tobacco Products Ban Referendum (2022)":"Prop-31"})
voting_data_simplified

Unnamed: 0,Prop-1,Prop-30,Prop-31,What is your ethnicity?,My primary language is...,What is your average family income?,What is your gender?,What is your age?,What is your party self-identification?,What is your education background?,How important are the following issues to you? [Cost of Living],How important are the following issues to you? [Healthcare],How important are the following issues to you? [Environment],How important are the following issues to you? [Education],How important are the following issues to you? [Women's Rights],Do you have healthcare?,Are you a smoker?,I am registered to vote.,I plan to vote in the November 2022 election.
0,Yes,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,Bachelor's degree (for example: BA. BS),,,,,,Yes,No,,
1,Yes,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,"Master's degree (for example: MA, MS, MEng, ME...",,,,,,Yes,No,,
2,Yes,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,High school,Important,Important,Very Important,Very Important,Very Important,Yes,No,No,Yes
3,Yes,Yes,Yes,Caucasian,Spanish,"$50,000- $100,000",Female,65+,Republican,Bachelor's degree (for example: BA. BS),Very Important,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes
4,No,Yes,Yes,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,Some college,Important,Important,Important,Important,Important,Yes,No,Yes,Yes
5,Yes,Yes,Yes,Caucasian,English,"$20,000 - $50,000",Female,65+,Democrat,"Master's degree (for example: MA, MS, MEng, ME...",Very Important,Very Important,Very Important,Important,Very Important,Yes,No,Yes,Yes
6,Yes,Yes,Yes,Asian,English,"$0 - $20,000",Male,18-25,Democrat,Some college,Very Important,Very Important,Important,Important,Very Important,Yes,No,Yes,Yes
7,Yes,Yes,No,Asian,English,"$100,000 - $400,000",Female,18-25,Democrat,Bachelor's degree (for example: BA. BS),Very Important,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes
8,Yes,Yes,No,Asian,Chinese,"$100,000 - $400,000",Female,18-25,Independent,Some college,Somewhat Important,Somewhat Important,Somewhat Unimportant,Important,Somewhat Important,Yes,No,No,Maybe


In [5]:
# One-hot-encoding trial
# from sklearn.preprocessing import OneHotEncoder
# rt = voting_data.copy()
# enc = OneHotEncoder(handle_unknown='ignore')
# enc_df = pd.DataFrame(enc.fit_transform(rt[['What is your education background?']]).toarray())
# rt.join(enc_df)

In [6]:
# This dataframe is used as input 
voting_data_input = voting_data_simplified.drop(columns = ["Prop-1", "Prop-30", "Prop-31"])
voting_data_input

Unnamed: 0,What is your ethnicity?,My primary language is...,What is your average family income?,What is your gender?,What is your age?,What is your party self-identification?,What is your education background?,How important are the following issues to you? [Cost of Living],How important are the following issues to you? [Healthcare],How important are the following issues to you? [Environment],How important are the following issues to you? [Education],How important are the following issues to you? [Women's Rights],Do you have healthcare?,Are you a smoker?,I am registered to vote.,I plan to vote in the November 2022 election.
0,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,Bachelor's degree (for example: BA. BS),,,,,,Yes,No,,
1,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,"Master's degree (for example: MA, MS, MEng, ME...",,,,,,Yes,No,,
2,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,High school,Important,Important,Very Important,Very Important,Very Important,Yes,No,No,Yes
3,Caucasian,Spanish,"$50,000- $100,000",Female,65+,Republican,Bachelor's degree (for example: BA. BS),Very Important,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes
4,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,Some college,Important,Important,Important,Important,Important,Yes,No,Yes,Yes
5,Caucasian,English,"$20,000 - $50,000",Female,65+,Democrat,"Master's degree (for example: MA, MS, MEng, ME...",Very Important,Very Important,Very Important,Important,Very Important,Yes,No,Yes,Yes
6,Asian,English,"$0 - $20,000",Male,18-25,Democrat,Some college,Very Important,Very Important,Important,Important,Very Important,Yes,No,Yes,Yes
7,Asian,English,"$100,000 - $400,000",Female,18-25,Democrat,Bachelor's degree (for example: BA. BS),Very Important,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes
8,Asian,Chinese,"$100,000 - $400,000",Female,18-25,Independent,Some college,Somewhat Important,Somewhat Important,Somewhat Unimportant,Important,Somewhat Important,Yes,No,No,Maybe


In [7]:
# Encoding all features into dummies variables
voting_data_input_dummies = pd.get_dummies(voting_data_input)
voting_data_input_dummies

Unnamed: 0,What is your ethnicity?_Asian,What is your ethnicity?_Caucasian,My primary language is..._Chinese,My primary language is..._English,My primary language is..._Spanish,"What is your average family income?_$0 - $20,000","What is your average family income?_$100,000 - $400,000","What is your average family income?_$20,000 - $50,000","What is your average family income?_$50,000- $100,000",What is your gender?_Female,...,How important are the following issues to you? [Education]_Very Important,How important are the following issues to you? [Women's Rights]_Important,How important are the following issues to you? [Women's Rights]_Somewhat Important,How important are the following issues to you? [Women's Rights]_Very Important,Do you have healthcare?_Yes,Are you a smoker?_No,I am registered to vote._No,I am registered to vote._Yes,I plan to vote in the November 2022 election._Maybe,I plan to vote in the November 2022 election._Yes
0,0,1,0,1,0,0,1,0,0,1,...,0,0,0,0,1,1,0,0,0,0
1,0,1,0,1,0,0,1,0,0,1,...,0,0,0,0,1,1,0,0,0,0
2,0,1,0,1,0,0,1,0,0,1,...,1,0,0,1,1,1,1,0,0,1
3,0,1,0,0,1,0,0,0,1,1,...,1,0,0,1,1,1,0,1,0,1
4,0,1,0,1,0,0,1,0,0,1,...,0,1,0,0,1,1,0,1,0,1
5,0,1,0,1,0,0,0,1,0,1,...,0,0,0,1,1,1,0,1,0,1
6,1,0,0,1,0,1,0,0,0,0,...,0,0,0,1,1,1,0,1,0,1
7,1,0,0,1,0,0,1,0,0,1,...,1,0,0,1,1,1,0,1,0,1
8,1,0,1,0,0,0,1,0,0,1,...,0,0,1,0,1,1,1,0,1,0


In [8]:
# Process output as numerical values 
map_values = {'Yes': 1, 'No': 0}
voting_data_simplified['Prop-1'] = voting_data_simplified['Prop-1'].map(map_values)
voting_data_simplified['Prop-30'] = voting_data_simplified['Prop-30'].map(map_values)
voting_data_simplified['Prop-31'] = voting_data_simplified['Prop-31'].map(map_values)
voting_data_simplified

Unnamed: 0,Prop-1,Prop-30,Prop-31,What is your ethnicity?,My primary language is...,What is your average family income?,What is your gender?,What is your age?,What is your party self-identification?,What is your education background?,How important are the following issues to you? [Cost of Living],How important are the following issues to you? [Healthcare],How important are the following issues to you? [Environment],How important are the following issues to you? [Education],How important are the following issues to you? [Women's Rights],Do you have healthcare?,Are you a smoker?,I am registered to vote.,I plan to vote in the November 2022 election.
0,1,1,1,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,Bachelor's degree (for example: BA. BS),,,,,,Yes,No,,
1,1,1,1,Caucasian,English,"$100,000 - $400,000",Female,65+,Democrat,"Master's degree (for example: MA, MS, MEng, ME...",,,,,,Yes,No,,
2,1,1,1,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,High school,Important,Important,Very Important,Very Important,Very Important,Yes,No,No,Yes
3,1,1,1,Caucasian,Spanish,"$50,000- $100,000",Female,65+,Republican,Bachelor's degree (for example: BA. BS),Very Important,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes
4,0,1,1,Caucasian,English,"$100,000 - $400,000",Female,18-25,Democrat,Some college,Important,Important,Important,Important,Important,Yes,No,Yes,Yes
5,1,1,1,Caucasian,English,"$20,000 - $50,000",Female,65+,Democrat,"Master's degree (for example: MA, MS, MEng, ME...",Very Important,Very Important,Very Important,Important,Very Important,Yes,No,Yes,Yes
6,1,1,1,Asian,English,"$0 - $20,000",Male,18-25,Democrat,Some college,Very Important,Very Important,Important,Important,Very Important,Yes,No,Yes,Yes
7,1,1,0,Asian,English,"$100,000 - $400,000",Female,18-25,Democrat,Bachelor's degree (for example: BA. BS),Very Important,Very Important,Very Important,Very Important,Very Important,Yes,No,Yes,Yes
8,1,1,0,Asian,Chinese,"$100,000 - $400,000",Female,18-25,Independent,Some college,Somewhat Important,Somewhat Important,Somewhat Unimportant,Important,Somewhat Important,Yes,No,No,Maybe


In [9]:
# Machine Learning

#Test on Prop1
from sklearn.metrics import r2_score
X1 = voting_data_input_dummies
y1 = voting_data_simplified["Prop-1"]


from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)
mod0 = LinearRegression().fit(X_train1,y_train1)
train1_score = mod0.score(X_train1,y_train1)
y_pred = mod0.predict(X_test1)
test1_score = r2_score(y_pred,y_test1)
print("train score for Prop-1:"+ str(train1_score)+", test score for Prop-1:" + str(test1_score))

train score for Prop-1:1.0, test score for Prop-1:-0.7829311277308089


In [10]:
#Test on Prop1
from sklearn.metrics import r2_score
X2 = voting_data_input_dummies
y2 = voting_data_simplified["Prop-30"]


from sklearn.model_selection import train_test_split
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)
mod2 = LinearRegression().fit(X_train2,y_train2)
train2_score = mod2.score(X_train2,y_train2)
y_pred_2 = mod2.predict(X_test2)
test2_score = r2_score(y_pred_2,y_test2)
print("train score for Prop-30:"+ str(train2_score)+", test score for Prop-30:" + str(test2_score))

train score for Prop-30:1.0, test score for Prop-30:1.0


In [12]:
#Test on Prop31
from sklearn.metrics import r2_score
X3 = voting_data_input_dummies
y3 = voting_data_simplified["Prop-31"]


from sklearn.model_selection import train_test_split
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.2, random_state=42)
mod3 = LinearRegression().fit(X_train3,y_train3)
train3_score = mod3.score(X_train3,y_train3)
y_pred_3 = mod3.predict(X_test3)
test3_score = r2_score(y_pred_3,y_test3)
print("train score for Prop-31:"+ str(train3_score)+", test score for Prop-31:" + str(test3_score))

train score for Prop-31:1.0, test score for Prop-31:-724.1182934774081
