In [1]:
# import library.
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import statsmodels.api as sm
from sklearn import metrics

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# load the polling data from CSV and print its column summary.
DATA_PATH = '../data/PollingData.csv'
polling = pd.read_csv(DATA_PATH)
polling.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   State       145 non-null    object 
 1   Year        145 non-null    int64  
 2   Rasmussen   99 non-null     float64
 3   SurveyUSA   74 non-null     float64
 4   DiffCount   145 non-null    int64  
 5   PropR       145 non-null    float64
 6   Republican  145 non-null    int64  
dtypes: float64(3), int64(3), object(1)
memory usage: 8.1+ KB


In [3]:
# number of rows available for each value of Year.
rows_per_year = polling['Year'].value_counts()
rows_per_year

2004    50
2008    50
2012    45
Name: Year, dtype: int64

In [4]:
# count of missing values in every column.
missing_per_column = polling.isna().sum()
missing_per_column

State          0
Year           0
Rasmussen     46
SurveyUSA     71
DiffCount      0
PropR          0
Republican     0
dtype: int64

In [5]:
# restrict to the numeric columns handed to the imputer.
features = ['Rasmussen', 'SurveyUSA', 'PropR', 'DiffCount']
# independent copy, so later changes to `polling` do not affect `simple`.
simple = polling.loc[:, features].copy()
simple.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Rasmussen  99 non-null     float64
 1   SurveyUSA  74 non-null     float64
 2   PropR      145 non-null    float64
 3   DiffCount  145 non-null    int64  
dtypes: float64(3), int64(1)
memory usage: 4.7 KB


In [6]:
# fill the missing values in `simple` with IterativeImputer (seeded for reproducibility).
# NOTE(review): the imputer is fit on every row of `simple`, which includes the
# 2012 rows later used as the test set -- confirm this is acceptable.
imputer = IterativeImputer(random_state=0)
imputed_values = imputer.fit_transform(simple).round(1)  # one decimal place
imputed = pd.DataFrame(imputed_values, columns=features)



In [7]:
# overwrite the two poll columns of the original df with their imputed versions.
poll_columns = ['Rasmussen', 'SurveyUSA']
polling[poll_columns] = imputed[poll_columns].copy()

In [8]:
# split by year: 2004 and 2008 rows train the models, 2012 rows are the test set.
in_training_years = polling['Year'].isin([2004, 2008])
in_test_year = polling['Year'] == 2012
train = polling[in_training_years].copy()
test = polling[in_test_year].copy()

In [9]:
# baseline: class balance of the Republican target in the training rows.
republican_counts = train['Republican'].value_counts()
republican_counts

1    53
0    47
Name: Republican, dtype: int64

In [10]:
# baseline: use only the sign of the Rasmussen value as the prediction.
def _rasmussen_sign(values):
    """Map each value to -1 (negative), 0 (exactly zero) or 1 (anything else).

    "Anything else" covers positive values and also NaN, because both
    comparisons below evaluate to False for NaN.
    """
    return np.where(values < 0, -1, np.where(values == 0, 0, 1))


train['RasmussenSign'] = _rasmussen_sign(train['Rasmussen'])
test['RasmussenSign'] = _rasmussen_sign(test['Rasmussen'])

# cross-tabulate the sign baseline against the actual outcome on the training rows.
sign_vs_outcome = train[['RasmussenSign', 'Republican']].value_counts()
sign_vs_outcome.sort_index()

RasmussenSign  Republican
-1             0             42
 0             0              1
               1              1
 1             0              4
               1             52
dtype: int64

In [11]:
# pairwise correlations between the numeric columns of the training rows.
# `train` still carries the non-numeric `State` column; select numeric columns
# explicitly so corr() does not depend on pandas silently dropping it
# (pandas >= 2.0 raises instead of dropping).
train.select_dtypes(include='number').corr().round(2)

Unnamed: 0,Year,Rasmussen,SurveyUSA,DiffCount,PropR,Republican,RasmussenSign
Year,1.0,-0.17,-0.19,-0.24,-0.19,-0.18,-0.14
Rasmussen,-0.17,1.0,0.96,0.54,0.82,0.78,0.8
SurveyUSA,-0.19,0.96,1.0,0.57,0.85,0.8,0.81
DiffCount,-0.24,0.54,0.57,1.0,0.83,0.81,0.76
PropR,-0.19,0.82,0.85,0.83,1.0,0.95,0.94
Republican,-0.18,0.78,0.8,0.81,0.95,1.0,0.91
RasmussenSign,-0.14,0.8,0.81,0.76,0.94,0.91,1.0


In [12]:
# model 1: logistic regression of Republican on PropR, with an added constant column.
X_train_1 = sm.add_constant(train[['PropR']])
model1 = sm.Logit(train['Republican'], X_train_1).fit()
print(model1.summary())

# predicted probabilities for the rows in `test`.
X_test_1 = sm.add_constant(test[['PropR']])
pred_1 = model1.predict(X_test_1)
# prediction table reported by the fitted results object.
print(model1.pred_table())

Optimization terminated successfully.
         Current function value: 0.078862
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:             Republican   No. Observations:                  100
Model:                          Logit   Df Residuals:                       98
Method:                           MLE   Df Model:                            1
Date:                Thu, 19 Aug 2021   Pseudo R-squ.:                  0.8859
Time:                        22:18:41   Log-Likelihood:                -7.8862
converged:                       True   LL-Null:                       -69.135
Covariance Type:            nonrobust   LLR p-value:                 1.797e-28
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -6.1462      1.977     -3.108      0.002     -10.022      -2.271
PropR         11.3904      3.

In [13]:
# model 2: logistic regression of Republican on SurveyUSA and DiffCount, with an added constant column.
model2_columns = ['SurveyUSA', 'DiffCount']
X_train_2 = sm.add_constant(train[model2_columns])
model2 = sm.Logit(train['Republican'], X_train_2).fit()
print(model2.summary())

# predicted probabilities for the rows in `test`.
X_test_2 = sm.add_constant(test[model2_columns])
pred_2 = model2.predict(X_test_2)
# prediction table reported by the fitted results object.
print(model2.pred_table())

Optimization terminated successfully.
         Current function value: 0.058798
         Iterations 11
                           Logit Regression Results                           
Dep. Variable:             Republican   No. Observations:                  100
Model:                          Logit   Df Residuals:                       97
Method:                           MLE   Df Model:                            2
Date:                Thu, 19 Aug 2021   Pseudo R-squ.:                  0.9150
Time:                        22:18:41   Log-Likelihood:                -5.8798
converged:                       True   LL-Null:                       -69.135
Covariance Type:            nonrobust   LLR p-value:                 3.379e-28
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.2978      1.343     -0.966      0.334      -3.931       1.335
SurveyUSA      0.3940      0

In [14]:
# turn predicted probabilities into 0/1 labels: 1 when the probability is at least `th`.
th = 0.5
test['pred_1'] = (pred_1 >= th).astype(int)
# model 2's labels must come from pred_2; thresholding pred_1 again would
# make this column an exact copy of test['pred_1'].
test['pred_2'] = (pred_2 >= th).astype(int)

In [15]:
# test rows: actual outcome vs. the Rasmussen-sign baseline.
baseline_table = test[['Republican', 'RasmussenSign']].value_counts()
baseline_table.sort_index()

Republican  RasmussenSign
0           -1               18
             0                2
             1                4
1            1               21
dtype: int64

In [16]:
# test rows: actual outcome vs. the thresholded predictions stored in pred_1.
pred_1_table = test[['Republican', 'pred_1']].value_counts()
pred_1_table.sort_index()

Republican  pred_1
0           0         23
            1          1
1           1         21
dtype: int64

In [17]:
# test rows: actual outcome vs. the thresholded predictions stored in pred_2.
pred_2_table = test[['Republican', 'pred_2']].value_counts()
pred_2_table.sort_index()

Republican  pred_2
0           0         23
            1          1
1           1         21
dtype: int64