# Statsmodels odds 

In [1]:
import pandas as pd
import numpy as np

from collections import Counter 
import copy
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [2]:
df = sm.datasets.anes96.load_pandas().data
# insert missing values
np.random.seed(130)
rnd1 = np.random.choice(df.index,50,replace=False) 
rnd2 = np.random.choice(df.index,50,replace=False) 
df.loc[rnd1,"age"] = np.nan
df.loc[rnd2,"educ"] = np.nan

In [3]:
for c in df.columns:
    print(c)
    print(df[c].unique())
    print("="*50)

popul
[0.0e+00 1.9e+02 3.1e+01 8.3e+01 6.4e+02 1.1e+02 1.0e+02 1.8e+02 2.8e+03
 1.6e+03 3.3e+02 1.0e+03 1.3e+02 5.0e+00 3.3e+01 1.9e+01 7.4e+01 1.2e+01
 3.9e+02 4.0e+01 3.0e+00 4.5e+02 3.5e+02 6.4e+01 6.2e+01 1.1e+01 4.0e+00
 3.5e+01 2.7e+02 4.5e+01 6.0e+00 2.0e+00 3.5e+03 6.7e+01 3.0e+01 4.0e+02
 1.5e+01 2.2e+01 3.2e+01 5.9e+01 1.7e+02 1.0e+00 5.3e+01 1.6e+01 2.7e+01
 8.4e+01 2.0e+02 2.0e+01 7.3e+03 1.3e+01 9.0e+00 4.4e+01 5.1e+01 2.9e+01
 6.3e+02 5.0e+01 7.2e+02 2.4e+01 5.5e+01 6.3e+01 7.1e+01 9.0e+02 3.7e+02
 4.7e+02 4.7e+01 8.0e+00 2.9e+02 1.0e+01 9.3e+01 5.1e+02 5.6e+01 7.0e+00
 1.4e+02 1.4e+01 3.1e+02 3.7e+01 2.3e+01 3.6e+02 8.7e+01 4.2e+01 7.0e+01
 5.2e+02 5.4e+01 7.5e+01 3.4e+01 7.6e+01 4.3e+02 1.5e+02 2.5e+01 1.8e+01
 7.4e+02 8.1e+01 1.6e+02 2.2e+02 2.6e+01 8.8e+01 6.6e+01 5.7e+02 1.7e+01]
TVnews
[7. 1. 4. 3. 0. 5. 2. 6.]
selfLR
[7. 3. 2. 5. 4. 6. 1.]
ClinLR
[1. 3. 2. 4. 6. 7. 5.]
DoleLR
[6. 5. 4. 3. 7. 2. 1.]
PID
[6. 1. 0. 4. 3. 5. 2.]
age
[36. 20. 24. 28. 68. 21. 77. 31. nan

In [4]:
for c in df.columns:
    print(c)
    print(df[c].isnull().sum())
    print("="*50)

popul
0
TVnews
0
selfLR
0
ClinLR
0
DoleLR
0
PID
0
age
50
educ
50
income
0
vote
0
logpopul
0


In [17]:
df.isnull().value_counts()

popul  TVnews  selfLR  ClinLR  DoleLR  PID    age    educ   income  vote   logpopul
False  False   False   False   False   False  False  False  False   False  False       848
                                                     True   False   False  False        46
                                              True   False  False   False  False        46
                                                     True   False   False  False         4
dtype: int64

In [5]:
c = "TVnews"
dfM = df.copy()
dfM[c] = (dfM[c]
         .mask(df[c] <=7, "frequent")
         .mask(df[c] <=5, "moderate")
         .mask(df[c] <=3, "seldom")
        )

c = "income"
dfM[c] = (dfM[c]
          .mask(df[c] <= 24, "17-24")
          .mask(df[c] <= 16, "9-16")
          .mask(df[c] <= 8, "0-8")
         )

# include missing
c = "age"
dfM[c] = (dfM[c]
          .mask(df[c].isnull(),"missing")
          .mask(df[c] <=100, "81-100")
          .mask(df[c] <=80, "61-80")
          .mask(df[c] <=60, "41-60")
          .mask(df[c] <=40, "19-40")
         )

c = "logpopul"
dfM[c] = (dfM[c]
          .mask(df[c] <= 9, "high")
          .mask(df[c] <= 4.70, "moderate high")
          .mask(df[c] <= 3.09, "moderate low")
          .mask(df[c] < 0.09, "low")
         )

c = "PID"
cates= ["Strong Democrat","Weak Democrat","Independent-Democrat"
             ,"Independent-Indpendent", "Independent-Republican"
             ,"Weak Republican", "Strong Republican"]
rep_dic = {i:c for i,c in enumerate(cates)} 
dfM[c] = dfM[c].replace(rep_dic)

c = "selfLR"
cates = ["Strong Democrat","Weak Democrat","Independent-Democrat"
             ,"Independent-Indpendent", "Independent-Republican"
             ,"Weak Republican", "Strong Republican"]
rep_dic = {i:c for i,c in enumerate(cates)}
dfM[c] = dfM[c].replace(rep_dic)

# include missing
c = "educ"
cates = ["1-8 grades","Some high school", "High school graduate"
            ,"college", "College degree", "Master's degree", "PhD"]
rep_dic = {i:c for i,c in enumerate(cates)}
rep_dic = {**rep_dic, **{np.nan:"missing"}}
dfM[c] = dfM[c].replace(rep_dic)

In [6]:
for c in dfM.columns:
    print(c)
    print(dfM[c].isnull().sum())
    print("="*50)

popul
0
TVnews
0
selfLR
0
ClinLR
0
DoleLR
0
PID
0
age
0
educ
0
income
0
vote
0
logpopul
0


In [7]:
" + ".join(dfM.columns)

'popul + TVnews + selfLR + ClinLR + DoleLR + PID + age + educ + income + vote + logpopul'

In [8]:
dfM["age"].value_counts()

19-40      377
41-60      315
61-80      175
missing     50
81-100      27
Name: age, dtype: int64

In [9]:
formula = "vote ~ TVnews + selfLR + PID + C(age, Treatment('41-60')) + educ + income + logpopul"
res = smf.glm(formula=formula,data=dfM, family = sm.families.Binomial()).fit()

In [11]:
res.summary()

0,1,2,3
Dep. Variable:,vote,No. Observations:,944.0
Model:,GLM,Df Residuals:,913.0
Model Family:,Binomial,Df Model:,30.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-228.1
Date:,"Sun, 01 May 2022",Deviance:,456.2
Time:,11:38:59,Pearson chi2:,953.0
No. Iterations:,7,Pseudo R-squ. (CS):,0.5831
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.3737,0.951,-3.549,0.000,-5.237,-1.510
TVnews[T.moderate],0.2015,0.402,0.501,0.617,-0.587,0.990
TVnews[T.seldom],-0.3736,0.305,-1.224,0.221,-0.972,0.225
selfLR[T.Independent-Democrat],-2.2507,0.955,-2.358,0.018,-4.122,-0.380
selfLR[T.Independent-Indpendent],-1.8182,0.745,-2.440,0.015,-3.279,-0.358
selfLR[T.Independent-Republican],-0.7221,0.643,-1.122,0.262,-1.983,0.539
selfLR[T.Strong Republican],0.7330,0.666,1.101,0.271,-0.572,2.038
selfLR[T.Weak Democrat],-0.8294,1.262,-0.657,0.511,-3.303,1.644
selfLR[T.Weak Republican],-0.1586,0.655,-0.242,0.809,-1.442,1.125


In [12]:
res.summary2()

0,1,2,3
Model:,GLM,AIC:,518.2039
Link Function:,Logit,BIC:,-5797.9612
Dependent Variable:,vote,Log-Likelihood:,-228.1
Date:,2022-05-01 11:38,LL-Null:,-641.05
No. Observations:,944,Deviance:,456.2
Df Model:,30,Pearson chi2:,953.0
Df Residuals:,913,Scale:,1.0
Method:,IRLS,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-3.3737,0.9506,-3.5489,0.0004,-5.2369,-1.5105
TVnews[T.moderate],0.2015,0.4023,0.5007,0.6166,-0.5871,0.9900
TVnews[T.seldom],-0.3736,0.3052,-1.2240,0.2209,-0.9719,0.2246
selfLR[T.Independent-Democrat],-2.2507,0.9547,-2.3576,0.0184,-4.1219,-0.3796
selfLR[T.Independent-Indpendent],-1.8182,0.7451,-2.4403,0.0147,-3.2786,-0.3579
selfLR[T.Independent-Republican],-0.7221,0.6433,-1.1224,0.2617,-1.9830,0.5388
selfLR[T.Strong Republican],0.7330,0.6657,1.1011,0.2709,-0.5718,2.0379
selfLR[T.Weak Democrat],-0.8294,1.2618,-0.6573,0.5110,-3.3026,1.6437
selfLR[T.Weak Republican],-0.1586,0.6549,-0.2422,0.8086,-1.4422,1.1249


In [13]:
print(res.aic)
print(res.bic)
print(res.bic_llf)

518.2039485908164
-5797.961241100025
668.5578597413269




In [14]:
res.params

Intercept                               -3.373681
TVnews[T.moderate]                       0.201450
TVnews[T.seldom]                        -0.373617
selfLR[T.Independent-Democrat]          -2.250746
selfLR[T.Independent-Indpendent]        -1.818237
selfLR[T.Independent-Republican]        -0.722087
selfLR[T.Strong Republican]              0.733043
selfLR[T.Weak Democrat]                 -0.829419
selfLR[T.Weak Republican]               -0.158638
PID[T.Independent-Indpendent]            1.868662
PID[T.Independent-Republican]            3.514549
PID[T.Strong Democrat]                  -1.328870
PID[T.Strong Republican]                 5.098033
PID[T.Weak Democrat]                     0.250209
PID[T.Weak Republican]                   3.951231
C(age, Treatment('41-60'))[T.19-40]      0.172513
C(age, Treatment('41-60'))[T.61-80]      0.319518
C(age, Treatment('41-60'))[T.81-100]     1.243672
C(age, Treatment('41-60'))[T.missing]   -1.166781
educ[T.College degree]                  -0.292844


In [15]:
odds = np.exp(res.conf_int())
odds["odds"] = np.exp(res.params) 
odds = odds.T
odds.loc["odds_pretty"] = odds.apply(lambda x : f'{x["odds"]:.2f} ({x[0]:.2f}, {x[1]:.2f})')
odds = odds.T

In [16]:
odds

Unnamed: 0,0,1,odds,odds_pretty
Intercept,0.005317,0.220804,0.034263,"0.03 (0.01, 0.22)"
TVnews[T.moderate],0.55594,2.691223,1.223176,"1.22 (0.56, 2.69)"
TVnews[T.seldom],0.378376,1.251861,0.68824,"0.69 (0.38, 1.25)"
selfLR[T.Independent-Democrat],0.016214,0.684111,0.105321,"0.11 (0.02, 0.68)"
selfLR[T.Independent-Indpendent],0.037682,0.69915,0.162312,"0.16 (0.04, 0.70)"
selfLR[T.Independent-Republican],0.137659,1.713949,0.485738,"0.49 (0.14, 1.71)"
selfLR[T.Strong Republican],0.564522,7.674186,2.081405,"2.08 (0.56, 7.67)"
selfLR[T.Weak Democrat],0.036789,5.174422,0.436303,"0.44 (0.04, 5.17)"
selfLR[T.Weak Republican],0.236412,3.079924,0.853305,"0.85 (0.24, 3.08)"
PID[T.Independent-Indpendent],2.112784,19.872123,6.479622,"6.48 (2.11, 19.87)"
