In [1]:
# imports and loading clean data
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt

from IPython.display import display, Markdown

from modules.lv_utils import load_households, load_voters

# load the data
households = load_households('data_clean/20180725_fullset_households_district3.csv')
voters = load_voters('data_clean/20180725_fullset_voters_district3.csv')
elections = pd.read_csv('data_clean/20180621_election_data.csv')


Blog/article on manipulating multi_index: https://jakevdp.github.io/PythonDataScienceHandbook/03.05-hierarchical-indexing.html

In [2]:
voters.shape

(13307, 148)

In [3]:
print(voters.columns[0:75])
print(voters.columns[75:])

Index(['Vid', 'Abbr', 'Precinct', 'PrecinctSub', 'Party', 'PartyMain',
       'RegDate', 'PAV', 'RegDateOriginal', 'E6_110816', 'E5_060716',
       'E4_110414', 'E3_060314', 'E2_110612', 'E1_060512', 'District',
       'VScore', 'VScorePos', 'VScorePct', 'BirthYear',
       'OldestInHouseBirthYear', 'IsOldestInHouse', 'havePhone',
       'BirthPlaceState', 'BirthPlaceStateRegion', 'BirthPlaceCountry',
       'BirthPlaceCountryRegion', 'Gender', 'sameMailAddress', 'MailCountry',
       'isApt', 'Zip', 'StreetType', 'EmailProvider', 'E5_060716BT',
       'E1_060512BT', 'Hid', 'cHid', 'E34_nVotesPos', 'E34_nVotes',
       'E34_nVotesPct', 'E56_nVotesPos', 'E56_nVotes', 'E56_nVotesPct',
       'E78_nVotesPos', 'E78_nVotes', 'E78_nVotesPct', 'E12_nVotesPos',
       'E12_nVotes', 'E12_nVotesPct', 'E14_nVotesPos', 'E14_nVotes',
       'E14_nVotesPct', 'E16_nVotesPos', 'E16_nVotes', 'E16_nVotesPct',
       'Eap_nVotesPos', 'Eap_nVotes', 'Eap_nVotesPct', 'Eag_nVotesPos',
       'Eag_nVotes', 'E

In [4]:
#full col list
data_cols = ['Abbr', 'Precinct', 'PrecinctSub', 'Party', 'PartyMain',
       'RegDate', 'PAV', 'RegDateOriginal','VScorePct', 'nVScorePctInHH', 'BirthYear',
       'OldestInHouseBirthYear', 'IsOldestInHouse', 'havePhone',
       'BirthPlaceState', 'BirthPlaceStateRegion', 'BirthPlaceCountry',
       'BirthPlaceCountryRegion', 'Gender', 'sameMailAddress', 'MailCountry',
       'isApt', 'Zip', 'StreetType', 'EmailProvider','nVotersInHH', 'PAVCode', 'nPAVInHH',
       'HasParty', 'nAffInHH', 'isDEM', 'isREP', 'isNPP', 'nDEMInHH',
       'nREPInHH', 'nNPPInHH','BirthYear','nVScorePctInHH',
       'CityArea', 'mostAfflsInHH', 'mixedAfflsInHH',
       'allAffInHH', 'uniformAffInHH']
# cut down list for checking/developing code
#data_cols = ['Party', 'PartyMain','PAV','nVotersInHH', 'PAVCode','HasParty', 'nAffInHH', 'isDEM', 'isREP',]
e_cols = ['E6_110816', 'E5_060716','E4_110414', 'E3_060314', 'E2_110612', 'E1_060512']
gdt_cols = ['E1_GndTth', 'E2_GndTth', 'E3_GndTth', 'E4_GndTth','E5_GndTth', 'E6_GndTth']
nvp_cols = ['E6_nVotesPct','E5_nVotesPct','E4_nVotesPct','E3_nVotesPct','E2_nVotesPct','E1_nVotesPct']
nvphh_cols = ['E1_nVotesPctInHH', 'E2_nVotesPctInHH', 'E3_nVotesPctInHH',
              'E4_nVotesPctInHH', 'E5_nVotesPctInHH', 'E6_nVotesPctInHH']
cv_cols = ['E34_nVotesPct', 'E56_nVotesPct', 'E78_nVotesPct']
cvhh_cols = ['E34_nVotesPctInHH','E56_nVotesPctInHH','E78_nVotesPctInHH']

#Make all series start with E1 (turn around where required:)
e_cols = e_cols[::-1]
nvp_cols = nvp_cols[::-1]

In [5]:
voters.loc[:5,cv_cols+cvhh_cols]

Unnamed: 0,E34_nVotesPct,E56_nVotesPct,E78_nVotesPct,E34_nVotesPctInHH,E56_nVotesPctInHH,E78_nVotesPctInHH
0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,0.666667,0.8,1.0,0.5,0.6
2,1.0,0.75,0.833333,0.8,0.545455,0.705882
3,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,-1.0,-1.0,-1.0,0.5,0.25,0.333333


In [6]:
cols = e_cols+gdt_cols+nvp_cols+nvphh_cols+cv_cols+cvhh_cols
dfw = voters.loc[:,cols+data_cols]
dfw.iloc[:5,5:15]

Unnamed: 0,E6_110816,E1_GndTth,E2_GndTth,E3_GndTth,E4_GndTth,E5_GndTth,E6_GndTth,E1_nVotesPct,E2_nVotesPct,E3_nVotesPct
0,A,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,A,-1.0,1.0,0.0,1.0,1.0,1.0,-1.0,1.0,0.0
2,A,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
3,N,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0
4,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [7]:
# relevant question on unstack rather than stack
#https://stackoverflow.com/questions/33057946/unstack-or-pivot-only-some-columns


In [8]:
ele = ['E1','E2','E3','E4','E5','E6','E7','E8']
dfw_data = dfw.drop(columns=cols)
dfw_data.index.rename('vid', inplace=True)

dfw_elecs = dfw[e_cols]
dfw_elecs.index.rename('vid', inplace=True)
dfw_elecs.columns = ele[:6]
dfw_elecs = dfw_elecs.join(pd.DataFrame(np.nan, index=dfw_elecs.index, columns=['E7','E8']))

dfw_gdt = dfw[gdt_cols]
dfw_gdt.index.rename('vid', inplace=True)
dfw_gdt.columns = ele[:6]

dfw_nvp = dfw[nvp_cols]
dfw_nvp.index.rename('vid', inplace=True)
dfw_nvp.columns = ele[1:7]

dfw_nvphh = dfw[nvphh_cols]
dfw_nvphh.index.rename('vid', inplace=True)
dfw_nvphh.columns = ele[1:7]

dfw_cv = dfw[list(np.repeat(cv_cols,2))]
dfw_cv.index.rename('vid', inplace=True)
dfw_cv.columns = ele[2:8]

dfw_cvhh = dfw[list(np.repeat(cvhh_cols,2))]
dfw_cvhh.index.rename('vid', inplace=True)
dfw_cvhh.columns = ele[2:8]

#display(dfw_data)
#display(dfw_elecs)
#display(dfw_gdt)
#display(dfw_nvp)

In [9]:
#pd.DataFrame(dfw_elecs.unstack(level=1).reorder_levels([1,0]))
ustk_e = pd.DataFrame(dfw_elecs.unstack(level=1))
ustk_e.columns = ['vote']
ustk_g = pd.DataFrame(dfw_gdt.unstack(level=1))
ustk_g.columns = ['gndtth']
ustk_l = pd.DataFrame(dfw_nvp.unstack(level=1))
ustk_l.columns = ['lastelec']
ustk_lhh = pd.DataFrame(dfw_nvphh.unstack(level=1))
ustk_lhh.columns = ['lastelechh']
ustk_r = pd.DataFrame(dfw_cv.unstack(level=1))
ustk_r.columns = ['lastcycle']
ustk_rhh = pd.DataFrame(dfw_cvhh.unstack(level=1))
ustk_rhh.columns = ['lastcyclehh']
#display(ustk_r)
#display(ustk_g)

# combining the various reshaped election tables and the person data
data = ustk_e.join([ustk_g, ustk_l, ustk_lhh, ustk_r, ustk_rhh]).fillna(-1)\
            .join(dfw_data).reorder_levels([1,0]).sort_index(level=0)
data.index.levels[1].name = 'e'

In [10]:
print(data.shape)
data.iloc[:16,:14]

(106456, 49)


Unnamed: 0_level_0,Unnamed: 1_level_0,vote,gndtth,lastelec,lastelechh,lastcycle,lastcyclehh,Abbr,Precinct,PrecinctSub,Party,PartyMain,RegDate,PAV,RegDateOriginal
vid,e,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,E1,V,1.0,-1.0,-1.0,-1.0,-1.0,82,832400,0,DEM,DEM,1992-10-05,Y,1992-10-05
0,E2,V,1.0,1.0,1.0,-1.0,-1.0,82,832400,0,DEM,DEM,1992-10-05,Y,1992-10-05
0,E3,V,1.0,1.0,1.0,1.0,1.0,82,832400,0,DEM,DEM,1992-10-05,Y,1992-10-05
0,E4,V,1.0,1.0,1.0,1.0,1.0,82,832400,0,DEM,DEM,1992-10-05,Y,1992-10-05
0,E5,A,1.0,1.0,1.0,1.0,1.0,82,832400,0,DEM,DEM,1992-10-05,Y,1992-10-05
0,E6,A,1.0,1.0,1.0,1.0,1.0,82,832400,0,DEM,DEM,1992-10-05,Y,1992-10-05
0,E7,-1,-1.0,1.0,1.0,1.0,1.0,82,832400,0,DEM,DEM,1992-10-05,Y,1992-10-05
0,E8,-1,-1.0,-1.0,-1.0,1.0,1.0,82,832400,0,DEM,DEM,1992-10-05,Y,1992-10-05
1,E1,,-1.0,-1.0,-1.0,-1.0,-1.0,82,832910,0,DEM,DEM,2012-11-06,Y,2012-10-26
1,E2,V,1.0,-1.0,-1.0,-1.0,-1.0,82,832910,0,DEM,DEM,2012-11-06,Y,2012-10-26


In [11]:
elections.index = ele[::-1]
elections.index.name = 'e'
elections

Unnamed: 0_level_0,elections,dates,cycle,etype,president,us_senate_maj,us_repre_maj,ca_governor,ca_lt_govnor,ca_senate_maj,ca_assembly_maj
e,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
E8,E8_110618,2018-11-06,Cong,General,REP,4,42,DEM,DEM,-13,-28
E7,E7_060518,2018-06-05,Cong,Primary,REP,4,42,DEM,DEM,-13,-28
E6,E6_110816,2016-11-08,Pres,General,DEM,10,60,DEM,DEM,-13,-24
E5,E5_060716,2016-06-07,Pres,Primary,DEM,10,58,DEM,DEM,-13,-24
E4,E4_110414,2014-11-04,Cong,General,DEM,-8,34,DEM,DEM,-13,-31
E3,E3_060314,2014-06-03,Cong,Primary,DEM,-8,34,DEM,DEM,-13,-31
E2,E2_110612,2012-11-06,Pres,General,DEM,-3,50,DEM,DEM,-11,-25
E1,E1_060512,2012-06-05,Pres,Primary,DEM,-3,52,DEM,DEM,-11,-25


In [12]:
data = data.join(elections)
data = data.drop(columns=['elections','dates'])  # duplicated information

In [13]:
print(data.shape)
data.iloc[:16,:5+50:]

(106456, 58)


Unnamed: 0_level_0,Unnamed: 1_level_0,vote,gndtth,lastelec,lastelechh,lastcycle,lastcyclehh,Abbr,Precinct,PrecinctSub,Party,...,mostAfflsInHH,mixedAfflsInHH,allAffInHH,uniformAffInHH,cycle,etype,president,us_senate_maj,us_repre_maj,ca_governor
vid,e,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,E1,V,1.0,-1.0,-1.0,-1.0,-1.0,82,832400,0,DEM,...,DEM,False,False,False,Pres,Primary,DEM,-3,52,DEM
0,E2,V,1.0,1.0,1.0,-1.0,-1.0,82,832400,0,DEM,...,DEM,False,False,False,Pres,General,DEM,-3,50,DEM
0,E3,V,1.0,1.0,1.0,1.0,1.0,82,832400,0,DEM,...,DEM,False,False,False,Cong,Primary,DEM,-8,34,DEM
0,E4,V,1.0,1.0,1.0,1.0,1.0,82,832400,0,DEM,...,DEM,False,False,False,Cong,General,DEM,-8,34,DEM
0,E5,A,1.0,1.0,1.0,1.0,1.0,82,832400,0,DEM,...,DEM,False,False,False,Pres,Primary,DEM,10,58,DEM
0,E6,A,1.0,1.0,1.0,1.0,1.0,82,832400,0,DEM,...,DEM,False,False,False,Pres,General,DEM,10,60,DEM
0,E7,-1,-1.0,1.0,1.0,1.0,1.0,82,832400,0,DEM,...,DEM,False,False,False,Cong,Primary,REP,4,42,DEM
0,E8,-1,-1.0,-1.0,-1.0,1.0,1.0,82,832400,0,DEM,...,DEM,False,False,False,Cong,General,REP,4,42,DEM
1,E1,,-1.0,-1.0,-1.0,-1.0,-1.0,82,832910,0,DEM,...,DEM,False,False,False,Pres,Primary,DEM,-3,52,DEM
1,E2,V,1.0,-1.0,-1.0,-1.0,-1.0,82,832910,0,DEM,...,DEM,False,False,False,Pres,General,DEM,-3,50,DEM


In [14]:
data = data.reset_index()
#data.info()

In [15]:
obj = data.select_dtypes(include=object)
obj.describe()

Unnamed: 0,e,vote,PartyMain,BirthPlaceStateRegion,BirthPlaceCountryRegion,EmailProvider,CityArea,mostAfflsInHH,cycle,etype,president,ca_governor,ca_lt_govnor
count,106456,106456,106456,106456,106456,106456,106456,106456,106456,106456,106456,106456,106456
unique,8,5,5,5,7,202,3,18,2,2,2,1,1
top,E7,N,DEM,UNK,USA,UNK,Centerville,DEM,Pres,General,DEM,DEM,DEM
freq,13307,30329,53072,50912,56592,72072,73496,60544,53228,53228,79842,106456,106456


# Trying first Model

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

  from numpy.core.umath_tests import inner1d


In [28]:
df = data[data.e.isin(['E1','E2','E3','E4','E5','E6'])] #- accuracy dropped - even with imputed -1 for missing data
df = data[data.e.isin(['E3','E4','E5','E6'])]

In [29]:
# need variance check - then drop 'ca_governor', 'ca_lt_govnor' 
# as column is same across data therefore carries no information
df = df.drop(columns=['ca_governor', 'ca_lt_govnor'])

In [30]:
# Object and category columns for get_dummies
print(df.select_dtypes(exclude=["number","bool_", "datetime"]).columns)
# Datetime columns for removal
print(df.select_dtypes(exclude=["number","bool_", "object_", "category"]).columns)

Index(['e', 'vote', 'Party', 'PartyMain', 'PAV', 'BirthPlaceState',
       'BirthPlaceStateRegion', 'BirthPlaceCountry', 'BirthPlaceCountryRegion',
       'Gender', 'MailCountry', 'StreetType', 'EmailProvider', 'CityArea',
       'mostAfflsInHH', 'cycle', 'etype', 'president'],
      dtype='object')
Index(['RegDate', 'RegDateOriginal'], dtype='object')


In [31]:
df_d = pd.get_dummies(data=df, columns=['Party', 'PartyMain', 'PAV', 'BirthPlaceState',
       'BirthPlaceStateRegion', 'BirthPlaceCountry', 'BirthPlaceCountryRegion',
       'Gender', 'MailCountry', 'StreetType', 'EmailProvider', 'CityArea',
       'mostAfflsInHH', 'cycle', 'etype', 'president'])
df_d = df_d.drop(columns=['RegDate', 'RegDateOriginal'])

In [32]:
X = df_d.drop(columns=['vid','vote','e','gndtth'])
y = df_d.gndtth

In [33]:
# Split the data into a training and test set.
X, Xtest, y, ytest = train_test_split(X, y, shuffle=True, random_state=5)

In [34]:
model = RandomForestClassifier()
model = model.fit(X,y)

In [35]:
print(accuracy_score(ytest, model.predict(Xtest)))

0.8589750012524423


In [36]:
from sklearn.linear_model import LogisticRegression

In [37]:
model2 = LogisticRegression()
model2 = model2.fit(X,y)

In [38]:
print(accuracy_score(ytest, model2.predict(Xtest)))

0.5070888232052503


Check: number of votes cast - so total number of N's vs total number of A,V's - to identify what % prediction a nieve model would give.

Check/play with tuning parameters - vs Default, grid searching for best value in Random Forest, tree size and feature depth - revise options, for Log Regression different paramters

check for overfitting - how - increase randomness factor

which feature most important particularly Random Forest - validate no data leakage

Sort NaNs imputing to -1 for RF
consider binning for LR - one bin for less than 0, one for 0-.3333, 0.333-.6666, .666 -1
