In [83]:
import statsmodels
import pandas as pd
import statsmodels.stats.api as sms
from statsmodels.stats.proportion import proportion_confint
import statsmodels.formula.api as smf
import scipy.stats

In [None]:
# sample size needed for significance

In [26]:
# Load the anes96 dataset that ships with statsmodels via its pandas loader.
# `anes96_data` is the DataFrame the rest of the notebook works on; later
# cells add bucket columns to it in place, so re-run this cell to get back
# to the unmodified data.
anes96 = statsmodels.datasets.anes96.load_pandas()
anes96_data = anes96.data

In [85]:
anes96_data.head()

Unnamed: 0,popul,TVnews,selfLR,ClinLR,DoleLR,PID,age,educ,income,vote,logpopul,age_bucket
0,0,7,7,1,6,6,36,3,1,1,-2.302585,old
1,190,1,3,3,5,1,20,4,1,0,5.24755,young
2,31,7,2,2,6,1,24,6,1,0,3.437208,young
3,83,4,3,4,5,1,28,6,1,0,4.420045,young
4,640,7,5,6,4,0,68,6,1,0,6.461624,old


In [28]:
anes96_data.columns

Index([u'popul', u'TVnews', u'selfLR', u'ClinLR', u'DoleLR', u'PID', u'age',
       u'educ', u'income', u'vote', u'logpopul'],
      dtype='object')

In [35]:
# Hypothesis: young people were more likely to vote for Clinton (0) than for Dole (1).
# Claim to check at a 95% confidence level: the rate of young people voting for
# Clinton was greater than the rate of old people voting for Dole.

In [105]:
def make_my_age_bins(x):
    """Map an age in years to the binary young/old indicator.

    Parameters
    ----------
    x : int or float
        Age in years.

    Returns
    -------
    int
        0 ("young") when 18 <= x < 31, otherwise 1 ("old").
    """
    # Compare numerically rather than testing membership in range(18, 31):
    # membership only matches whole-number values, so a fractional age such
    # as 30.5 used to fall through to the "old" branch. Results for
    # whole-number ages are unchanged.
    if 18 <= x < 31:
        return 0
    return 1
    
def make_my_education_bins(x):
    """Collapse an education code into a binary indicator.

    Codes 1 through 4 are treated as "no college" and yield 0; every other
    value yields 1.
    """
    no_college_codes = (1, 2, 3, 4)
    return 0 if x in no_college_codes else 1
    
def make_PID_buckets(x):
    """Translate a 0-6 party-identification code into a party label.

    Parameters
    ----------
    x : int or float
        Party-identification code.

    Returns
    -------
    str
        "Democrat" for 0-1, "Independent" for 2-4, "Republican" for 5-6.

    Raises
    ------
    ValueError
        If ``x`` is not one of the codes 0 through 6. Previously any such
        value (including NaN) was silently labelled "Republican".
    """
    dem = (0, 1)
    ind = (2, 3, 4)
    rep = (5, 6)
    if x in dem:
        return "Democrat"
    if x in ind:
        return "Independent"
    if x in rep:
        return "Republican"
    # Fail loudly instead of inflating the Republican bucket with bad codes.
    raise ValueError("unexpected PID code: %r" % (x,))

In [106]:
# Add the derived bucket columns to anes96_data in place by mapping each
# binning helper over its source column.
anes96_data['age_bucket'] = anes96_data['age'].map(make_my_age_bins)
anes96_data['edu_bucket'] = anes96_data['educ'].map(make_my_education_bins)
anes96_data['PID_bucket'] = anes96_data['PID'].map(make_PID_buckets)

In [143]:
anes96_data.head()

Unnamed: 0,popul,TVnews,selfLR,ClinLR,DoleLR,PID,age,educ,income,vote,logpopul,age_bucket,edu_bucket,PID_bucket
0,0,7,7,1,6,6,36,3,1,1,-2.302585,1,0,Republican
1,190,1,3,3,5,1,20,4,1,0,5.24755,0,0,Democrat
2,31,7,2,2,6,1,24,6,1,0,3.437208,0,1,Democrat
3,83,4,3,4,5,1,28,6,1,0,4.420045,0,1,Democrat
4,640,7,5,6,4,0,68,6,1,0,6.461624,1,1,Democrat


In [144]:
# Expand the party label into one indicator column per label and append
# those columns to a new frame, leaving anes96_data itself untouched.
party_indicators = pd.get_dummies(anes96_data['PID_bucket'])
anes96_data_dummied = anes96_data.join(party_indicators)
anes96_data_dummied.head()

Unnamed: 0,popul,TVnews,selfLR,ClinLR,DoleLR,PID,age,educ,income,vote,logpopul,age_bucket,edu_bucket,PID_bucket,Democrat,Independent,Republican
0,0,7,7,1,6,6,36,3,1,1,-2.302585,1,0,Republican,0,0,1
1,190,1,3,3,5,1,20,4,1,0,5.24755,0,0,Democrat,1,0,0
2,31,7,2,2,6,1,24,6,1,0,3.437208,0,1,Democrat,1,0,0
3,83,4,3,4,5,1,28,6,1,0,4.420045,0,1,Democrat,1,0,0
4,640,7,5,6,4,0,68,6,1,0,6.461624,1,1,Democrat,1,0,0


In [141]:
# vote is coded 0 = Clinton, 1 = Dole, so this least-squares fit acts as a
# linear probability model. Only the Democrat and Republican indicators are
# included, which leaves Independent as the omitted reference category.
form = (
    'vote ~ age + popul + income'
    ' + edu_bucket + Democrat + Republican'
)
results = smf.ols(formula=form, data=anes96_data_dummied).fit()

In [142]:
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                   vote   R-squared:                       0.570
Model:                            OLS   Adj. R-squared:                  0.567
Method:                 Least Squares   F-statistic:                     207.2
Date:                Sun, 28 May 2017   Prob (F-statistic):          5.51e-168
Time:                        16:39:05   Log-Likelihood:                -273.16
No. Observations:                 944   AIC:                             560.3
Df Residuals:                     937   BIC:                             594.3
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      0.2341      0.050      4.691      0.0

In [78]:
anes96_data.age_bucket.value_counts()

old      798
young    146
Name: age_bucket, dtype: int64

In [79]:
gb = anes96_data.groupby("age_bucket")

In [80]:
gb.sum()

Unnamed: 0_level_0,popul,TVnews,selfLR,ClinLR,DoleLR,PID,age,educ,income,vote,logpopul
age_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
old,237162,3159,3492,2326,4308,2305,40657,3624,13332,347,1901.360171
young,52062,360,591,449,784,378,3752,686,2085,46,432.497355


In [81]:
# 347 Dole votes (sum of vote) among the 798 respondents in the "old" bucket;
# both counts are copied from the groupby / value_counts outputs above.
# alpha=0.05 is the function's default and gives the 95% interval.
old_dole = proportion_confint(count=347, nobs=798, alpha=0.05)
old_dole

(0.4004419780107909, 0.46923220745286826)

In [82]:
# 46 Dole votes (sum of vote) among the 146 respondents in the "young" bucket;
# both counts are copied from the groupby / value_counts outputs above.
# alpha=0.05 is the function's default and gives the 95% interval.
young_dole = proportion_confint(count=46, nobs=146, alpha=0.05)
young_dole

(0.23971598316838316, 0.39042100313298667)

In [None]:
# NOTE(review): this cell was left unfinished as a bare reference to
# proportion_confint, which would rebind old_dole to the function object and
# discard the interval computed earlier. It is completed here with the same
# counts (347 Dole votes of 798 "old" respondents) so a full re-run keeps
# old_dole as a (lower, upper) tuple. Confirm the intended arguments.
old_dole = proportion_confint(347, 798)

In [32]:
anes96_data.age.describe()

count    944.000000
mean      47.043432
std       16.423130
min       19.000000
25%       34.000000
50%       44.000000
75%       58.000000
max       91.000000
Name: age, dtype: float64