# US Presidential Election: Hypothesis Test & Predictive Modeling

This project builds on the results of the previous one *US Presidential Election: Data Cleaning & EDA* to create 
a predictive model. First, correlation of various demographics information with party vote shares are evaluated. Then,  difference of vote shares between election cycles are statistically tested using approperiate test method. Finally,   

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('data_clean.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,State,Fips,County,Precincts,Votes,Less Than High School Diploma,At Least High School Diploma,At Least Bachelors's Degree,Graduate Degree,...,Teen.births,Children.in.single.parent.households,Adult.smoking,Adult.obesity,Diabetes,Sexually.transmitted.infections,Uninsured,Unemployment,Violent.crime,Injury.deaths
0,0,Arkansas,5043,"Drew County, Arkansas",17.0,6590.0,19.4,80.6,19.4,5.7,...,47.7,0.429,0.181,0.323,0.126,747.3,0.197,0.108,449.02,82.0
1,1,Arkansas,5087,"Madison County, Arkansas",20.0,6829.0,24.2,75.8,13.4,4.1,...,49.0,0.179,0.304,0.328,0.135,247.2,0.239,0.053,245.83,96.8
2,2,Georgia,13159,"Jasper County, Georgia",3.0,6016.0,21.7,78.3,13.3,5.6,...,54.1,0.381,0.21,0.298,0.118,324.1,0.239,0.096,205.6,71.6
3,3,Colorado,8037,"Eagle County, Colorado",30.0,22611.0,12.8,87.2,45.9,11.7,...,41.3,0.204,0.095,0.132,0.036,190.9,0.23,0.081,123.88,42.9
4,4,Georgia,13091,"Dodge County, Georgia",16.0,6995.0,23.1,76.9,13.8,4.8,...,64.0,0.453,0.189,0.358,0.153,497.9,0.208,0.115,477.48,79.4


In [3]:
vote = pd.read_csv('votes_clean.csv')
vote.head()

Unnamed: 0.1,Unnamed: 0,Democrats_12(Votes),Republicans_12(Votes),Democrats_16(Votes),Republicans_16(Votes),Democrats12_VoteShare,Republicans12_VoteShare,Democrats16_VoteShare,Republicans16_VoteShare
0,0,2630.0,3887.0,2364.0,3967.0,40.36,59.64,37.34,62.66
1,1,2099.0,4263.0,1587.0,4917.0,32.99,67.01,24.4,75.6
2,2,1845.0,4136.0,1544.0,4353.0,30.85,69.15,26.18,73.82
3,3,12792.0,9411.0,12652.0,8153.0,57.61,42.39,60.81,39.19
4,4,2442.0,5214.0,1836.0,5021.0,31.9,68.1,26.78,73.22


In [4]:
data_vote = pd.merge(data, vote, on = 'Unnamed: 0')
data_vote.head(5)

Unnamed: 0.1,Unnamed: 0,State,Fips,County,Precincts,Votes,Less Than High School Diploma,At Least High School Diploma,At Least Bachelors's Degree,Graduate Degree,...,Violent.crime,Injury.deaths,Democrats_12(Votes),Republicans_12(Votes),Democrats_16(Votes),Republicans_16(Votes),Democrats12_VoteShare,Republicans12_VoteShare,Democrats16_VoteShare,Republicans16_VoteShare
0,0,Arkansas,5043,"Drew County, Arkansas",17.0,6590.0,19.4,80.6,19.4,5.7,...,449.02,82.0,2630.0,3887.0,2364.0,3967.0,40.36,59.64,37.34,62.66
1,1,Arkansas,5087,"Madison County, Arkansas",20.0,6829.0,24.2,75.8,13.4,4.1,...,245.83,96.8,2099.0,4263.0,1587.0,4917.0,32.99,67.01,24.4,75.6
2,2,Georgia,13159,"Jasper County, Georgia",3.0,6016.0,21.7,78.3,13.3,5.6,...,205.6,71.6,1845.0,4136.0,1544.0,4353.0,30.85,69.15,26.18,73.82
3,3,Colorado,8037,"Eagle County, Colorado",30.0,22611.0,12.8,87.2,45.9,11.7,...,123.88,42.9,12792.0,9411.0,12652.0,8153.0,57.61,42.39,60.81,39.19
4,4,Georgia,13091,"Dodge County, Georgia",16.0,6995.0,23.1,76.9,13.8,4.8,...,477.48,79.4,2442.0,5214.0,1836.0,5021.0,31.9,68.1,26.78,73.22


In [5]:
data_vote['Republicans12_VoteShare']
data_vote.iloc[:,3]
#stats.pearsonr(data_vote.iloc[:,1], data_vote['Republicans12_VoteShare'])

0           Drew County, Arkansas
1        Madison County, Arkansas
2          Jasper County, Georgia
3          Eagle County, Colorado
4           Dodge County, Georgia
                  ...            
3136          Butler County, Ohio
3137           Logan County, Ohio
3138          Lorain County, Ohio
3139    Cherokee County, Oklahoma
3140    Cimarron County, Oklahoma
Name: County, Length: 3141, dtype: object

In [6]:
stats.pearsonr(data_vote.iloc[:,0], data_vote['Republicans12_VoteShare'])[0]

0.0053615713045454295

In [7]:
df_corr = pd.DataFrame(columns = ['Demographic info', '12 Correlation', '16 Correlation', '12 P-value', '16 P-value'])
display(df_corr)

Unnamed: 0,Demographic info,12 Correlation,16 Correlation,12 P-value,16 P-value


In [8]:
for i in range(4,data_vote.shape[1]):
    df_corr.loc[i-4]=[data_vote.columns[i], 
    stats.pearsonr(data_vote.iloc[:,i], data_vote['Republicans12_VoteShare'])[0], 
    stats.pearsonr(data_vote.iloc[:,i], data_vote['Republicans16_VoteShare'])[0],
    stats.pearsonr(data_vote.iloc[:,i], data_vote['Republicans12_VoteShare'])[1], 
    stats.pearsonr(data_vote.iloc[:,i], data_vote['Republicans16_VoteShare'])[1]]

display(df_corr)

Unnamed: 0,Demographic info,12 Correlation,16 Correlation,12 P-value,16 P-value
0,Precincts,-0.266277,-0.330111,3.963033e-52,9.712766e-81
1,Votes,-0.302003,-0.395448,3.047631e-67,4.407163e-118
2,Less Than High School Diploma,0.049446,0.058951,0.005575399,0.0009482711
3,At Least High School Diploma,-0.047965,-0.054349,0.007174521,0.002311376
4,At Least Bachelors's Degree,-0.303216,-0.464169,8.532572000000001e-68,1.233182e-167
5,Graduate Degree,-0.369278,-0.517054,4.517001e-102,2.50893e-214
6,School Enrollment,-0.132662,-0.167999,8.327733e-14,2.5642859999999998e-21
7,Median Earnings 2010,-0.116098,-0.192509,6.748051e-11,1.3326760000000001e-27
8,White (Not Latino) Population,0.437955,0.552828,2.284088e-147,6.294776e-251
9,African American Population,-0.392567,-0.461468,3.012613e-116,1.829538e-165


In [9]:
for i in range(4,data_vote.shape[1]):
    df_corr=df_corr.append([data_vote.columns[i], 
    stats.pearsonr(data_vote.iloc[:,i], data_vote['Republicans12_VoteShare'])[0], 
    stats.pearsonr(data_vote.iloc[:,i], data_vote['Republicans16_VoteShare'])[0],
    stats.pearsonr(data_vote.iloc[:,i], data_vote['Republicans12_VoteShare'])[1], 
    stats.pearsonr(data_vote.iloc[:,i], data_vote['Republicans16_VoteShare'])[1]], ignore_index = True)

display(df_corr)

Unnamed: 0,0,12 Correlation,12 P-value,16 Correlation,16 P-value,Demographic info
0,,-0.266277,3.963033e-52,-0.330111,9.712766e-81,Precincts
1,,-0.302003,3.047631e-67,-0.395448,4.407163e-118,Votes
2,,0.049446,5.575399e-03,0.058951,9.482711e-04,Less Than High School Diploma
3,,-0.047965,7.174521e-03,-0.054349,2.311376e-03,At Least High School Diploma
4,,-0.303216,8.532572e-68,-0.464169,1.233182e-167,At Least Bachelors's Degree
...,...,...,...,...,...,...
283,Republicans16_VoteShare,,,,,
284,0.943883,,,,,
285,1,,,,,
286,0,,,,,


In [10]:
data_vote.iloc[0,:]

Unnamed: 0                                                                        0
State                                                                      Arkansas
Fips                                                                           5043
County                                                        Drew County, Arkansas
Precincts                                                                        17
Votes                                                                          6590
Less Than High School Diploma                                                  19.4
At Least High School Diploma                                                   80.6
At Least Bachelors's Degree                                                    19.4
Graduate Degree                                                                 5.7
School Enrollment                                                              78.5
Median Earnings 2010                                                        

In [11]:
dadas
ffad

NameError: name 'dadas' is not defined

In [None]:
for i in range(3,data_vote.shape[1]):
    print(data_vote.columns[i], 
          stats.pearsonr(data_vote.iloc[:,i], data_vote['Republicans12_VoteShare']), 
          stats.pearsonr(data_vote.iloc[:,i], data_vote['Republicans16_VoteShare']))

In [None]:
stats.ttest_ind(data_vote['Republicans12_VoteShare'], data_vote['Republicans16_VoteShare'])

In [None]:
lm = sm.OLS(data_vote['Republicans16_VoteShare'], 
            sm.add_constant(data_vote.drop(['Unnamed: 0', 'State', 'Fips', 'County', 'Adults 65 and Older Living in Poverty',           
                                            'Democrats_12(Votes)', 'Republicans_12(Votes)', 
                                            'Democrats_16(Votes)', 'Republicans_16(Votes)',                                                                                  
                                            'Democrats12_VoteShare', 'Republicans12_VoteShare',                                                                                 
                                            'Democrats16_VoteShare', 'Republicans16_VoteShare'], axis = 1))).fit()
lm.summary()

In [None]:
plt.scatter(lm.fittedvalues, lm.resid)

In [None]:
X = data_vote
y = data_vote['Republicans12_VoteShare']

corr = np.corrcoef(X,y)[0,1]

tval = corr * np.sqrt((len(X)-2)/(1- corr **2))
print(tval)

p = stats.t.sf(tval, len(X)-2)*2 #two tailed!
print(p)

In [None]:
stats.pearsonr(data_vote, data_vote['Republicans12_VoteShare'])