# Processing

In [60]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import re
import seaborn as sns

In [61]:
import sys
import os
sys.path.append('../src/features')
from build_features import fill_nulls, filter_col_with_regex, get_dummies

In [62]:
# Load the raw voter study group survey and pass it straight through the
# project helper that fills null values.
raw_csv_path = '../data/raw/20161201_voter_study_group.csv'
df = fill_nulls(pd.read_csv(raw_csv_path))

In [63]:
# Regex used to pick out the predictor columns by name.
pattern = '(imiss_).{1}(_2016)'
# Predictor column names, as returned by the project helper.
col_list = filter_col_with_regex(df, pattern)
# Predictor subset of the dataframe, one-hot encoded in the same step.
df_issue = get_dummies(df[col_list])

# Target kept as a one-column dataframe so the predictors can be joined to it.
df_target = df['presvote16post_2016'].to_frame()

# Target column first, followed by the encoded predictors.
df_target_issue = df_target.join(df_issue)
# Ten largest absolute correlation coefficients with the target
# (the target's own self-correlation is part of the listing).
target_corr = df_target_issue.corr()['presvote16post_2016']
target_corr.abs().sort_values(ascending=False).head(10)

presvote16post_2016    1.000000
imiss_l_2016_1.0       0.271828
imiss_d_2016_1.0       0.260236
imiss_u_2016_1.0       0.242079
imiss_y_2016_1.0       0.237089
imiss_l_2016_4.0       0.212023
imiss_x_2016_1.0       0.211060
imiss_p_2016_1.0       0.208277
imiss_g_2016_1.0       0.200187
imiss_u_2016_3.0       0.177785
Name: presvote16post_2016, dtype: float64

In [64]:
# Collapse the target into 3 categories: every code listed below is mapped
# to 0, any other code is left unchanged. A new Series is returned; the
# column inside df_target is not modified.
codes_to_zero = [0, 3, 4, 5, 6, 7]
df_3_cats = df_target['presvote16post_2016'].replace(codes_to_zero, 0)

In [65]:
# Predictors: the one-hot encoded issue columns (same object as df_issue, not a copy).
X = df_issue
# Target flattened to a 1-D numpy array.
y = np.asarray(df_3_cats).ravel()

In [67]:
# Pickle X and y into the processed-data directory.
# open() resolves relative file names against the current working directory;
# sys.path only influences module imports. The destination directory therefore
# has to be part of the file path itself, otherwise the pickles end up next to
# the notebook instead of in ../data/processed/.
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)  # no-op when the directory already exists
with open(os.path.join(processed_dir, 'predictor.pickle'), 'wb') as file:
    pickle.dump(X, file)
with open(os.path.join(processed_dir, 'target.pickle'), 'wb') as file:
    pickle.dump(y, file)