## Data cleaning and feature engineering

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import matplotlib.pyplot as plt

In [2]:
# Load the voter survey CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/voter-survey-december16.csv')

In [3]:
# Default column-name pattern: "imiss_", then any single character, then "_2016".
pattern = r'(imiss_).{1}(_2016)'

def return_imiss_cols(df, col_pattern=None):
    """Return the names of the columns of ``df`` that match a regex.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are inspected; the data itself is not read.
    col_pattern : str, optional
        Regular expression to look for. When omitted, the module-level
        ``pattern`` is used (looked up at call time, as before).

    Returns
    -------
    list
        Matching column labels, in the order they appear in ``df.columns``.
        The match is unanchored (``re.search``), so the pattern may occur
        anywhere inside the label.
    """
    regex = re.compile(pattern if col_pattern is None else col_pattern)
    # Non-string labels (e.g. integer column names) cannot be regex-matched;
    # skip them instead of letting re raise TypeError.
    return [col for col in df.columns if isinstance(col, str) and regex.search(col)]

# Issue column list: every column of the survey frame whose name matches `pattern`.
imiss_col_list = return_imiss_cols(df)

In [4]:
# Target column list (a single column for now).
target_col_list = ['presvote16post_2016']
# Selecting with a list keeps df_target a one-column DataFrame rather than a Series.
df_target = df[target_col_list]

In [5]:
# Keep only the issue columns and replace missing answers with 0; the filled
# values show up as the 0.0 row in the counts below.
# NOTE(review): presumably 0 is not a real response code - verify against the codebook.
df_issue = df[imiss_col_list].fillna(0)
# Spot-check the value distribution of one issue column after the fill.
df_issue['imiss_j_2016'].value_counts()

1.0    5947
2.0    1688
3.0     181
0.0     127
4.0      57
Name: imiss_j_2016, dtype: int64

In [6]:
# Cast the issue answers to strings so they are treated as categories, then
# attach the target column (which keeps its original dtype because it is
# joined after the cast).
# The frame is rebuilt from `df` instead of from the previous `df_issue` so the
# cell is idempotent: re-running the old self-referential version raised a
# ValueError in `.join` because the target column was already present.
df_issue = df[imiss_col_list].fillna(0).astype(str).join(df_target)

In [7]:
# One-hot encode the issue answers. The issue columns were cast to str, so each
# one expands into one indicator column per distinct value; the target column,
# joined after the str cast, is left as a single column under its original name.
df_dummies = pd.get_dummies(df_issue)

In [8]:
# Rank every column by the absolute value of its correlation with the vote
# column, strongest first (the target itself comes out on top with 1.0).
# NOTE(review): .corr() defaults to Pearson - confirm that is appropriate if the
# vote column is a categorical code rather than an ordered quantity.
(
    df_dummies
    .corr()['presvote16post_2016']
    .abs()
    .sort_values(ascending=False)
)

presvote16post_2016    1.000000
imiss_l_2016_1.0       0.304989
imiss_d_2016_1.0       0.286253
imiss_y_2016_1.0       0.267083
imiss_u_2016_1.0       0.254038
imiss_x_2016_1.0       0.240264
imiss_g_2016_1.0       0.230016
imiss_p_2016_1.0       0.226241
imiss_l_2016_4.0       0.212959
imiss_u_2016_3.0       0.192896
imiss_q_2016_1.0       0.185467
imiss_g_2016_4.0       0.182592
imiss_d_2016_3.0       0.179266
imiss_l_2016_3.0       0.178131
imiss_y_2016_4.0       0.176428
imiss_p_2016_3.0       0.155080
imiss_y_2016_3.0       0.153542
imiss_j_2016_1.0       0.147418
imiss_x_2016_3.0       0.144682
imiss_i_2016_1.0       0.140380
imiss_f_2016_1.0       0.137812
imiss_s_2016_1.0       0.135825
imiss_i_2016_3.0       0.135604
imiss_k_2016_1.0       0.133861
imiss_r_2016_1.0       0.132862
imiss_h_2016_1.0       0.129626
imiss_x_2016_4.0       0.123174
imiss_q_2016_3.0       0.120913
imiss_i_2016_4.0       0.120097
imiss_e_2016_1.0       0.119690
                         ...   
imiss_r_

In [10]:
# (rows, columns) of the encoded frame, target column included.
df_dummies.shape

(8000, 116)

In [17]:
# Predictor frame: every encoded column except the vote target.
df_predictors = df_dummies.drop(columns=['presvote16post_2016'])

In [18]:
# Preview the first rows of the predictor frame.
df_predictors.head()

Unnamed: 0,imiss_a_2016_0.0,imiss_a_2016_1.0,imiss_a_2016_2.0,imiss_a_2016_3.0,imiss_a_2016_4.0,imiss_b_2016_0.0,imiss_b_2016_1.0,imiss_b_2016_2.0,imiss_b_2016_3.0,imiss_b_2016_4.0,...,imiss_x_2016_0.0,imiss_x_2016_1.0,imiss_x_2016_2.0,imiss_x_2016_3.0,imiss_x_2016_4.0,imiss_y_2016_0.0,imiss_y_2016_1.0,imiss_y_2016_2.0,imiss_y_2016_3.0,imiss_y_2016_4.0
0,0,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
4,0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
