# Processing

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import re
import seaborn as sns

In [2]:
import sys
import os
sys.path.append('../src/features')
from build_features import fill_nulls, filter_col_with_regex, get_dummies

In [3]:
# read in data 
df = pd.read_csv('../data/raw/20161201_voter_study_group.csv')
# fill null values 
df = fill_nulls(df)
weight = df.weight

In [16]:
# set regex pattern to find predictor column names
pattern = '(imiss_).{1}(_2016)'
# return list of predictor column names
col_list = filter_col_with_regex(df, pattern)
# save predictor subset of dataframe 
df_issue = df[col_list]
# save target subset of dataframe
df_target = pd.DataFrame(df['presvote16post_2016'])

# one hot encoding categorical variables 
df_issue_dummies = get_dummies(df_issue)
# dummy predictors and target
df_target_issue_dummies = df_target.join(df_issue_dummies)
# correlation coefficients 
df_target_issue_dummies.corr()['presvote16post_2016'].sort_values(ascending=False).tail(50)

    

imiss_i_2016_8.0    0.026377
imiss_e_2016_8.0    0.021660
imiss_t_2016_3.0    0.021117
imiss_t_2016_4.0    0.021060
imiss_b_2016_8.0    0.020605
imiss_c_2016_8.0    0.020605
imiss_f_2016_4.0    0.018626
imiss_c_2016_4.0    0.017311
imiss_j_2016_8.0    0.016875
imiss_o_2016_8.0    0.015868
imiss_u_2016_8.0    0.015509
imiss_k_2016_8.0    0.012888
imiss_f_2016_8.0    0.012554
imiss_q_2016_8.0    0.012503
imiss_a_2016_4.0    0.011710
imiss_t_2016_8.0    0.010219
imiss_g_2016_8.0    0.009219
imiss_l_2016_8.0    0.009035
imiss_m_2016_8.0    0.008992
imiss_y_2016_8.0    0.008992
imiss_s_2016_8.0    0.007443
imiss_o_2016_3.0    0.007269
imiss_r_2016_4.0    0.004385
imiss_d_2016_8.0    0.003177
imiss_x_2016_8.0    0.002776
imiss_e_2016_2.0    0.002531
imiss_p_2016_8.0    0.002056
imiss_b_2016_3.0   -0.000001
imiss_a_2016_8.0   -0.002177
imiss_u_2016_2.0   -0.004779
imiss_t_2016_2.0   -0.011815
imiss_b_2016_2.0   -0.013278
imiss_i_2016_2.0   -0.025044
imiss_o_2016_2.0   -0.025101
imiss_p_2016_4

In [5]:
df_target_issue = df_issue.join(df_target)
with open('../data/processed/df_target_issue.pickle', 'wb') as file:
    pickle.dump(df_target_issue, file)
with open('../data/processed/df_target_issue_dummies.pickle', 'wb') as file:
    pickle.dump(df_target_issue_dummies, file)

In [10]:
# convert target to 3 categories 
df_3cat = df_target['presvote16post_2016'].replace(
    to_replace=[8, 3, 4, 5, 6, 7],
    value=3,
    inplace=False
)  
# categorical issues, 3 target categories, weight 
df_3cat_issue_weight = df_issue.join(df_3cat).join(df['weight'])

with open('../data/processed/df_3cat_issue_weight.pickle', 'wb') as file:
    pickle.dump(df_3cat_issue_weight, file)
    
df_3cat_dummies = df_issue_dummies.join(df_3cat)

In [7]:
# three categories for target, issues, and weight 
df_3cat_dummies_weight = df_issue_dummies.join(df_3cat).join(df['weight'])
with open('../data/processed/df_3cat_dummies_weight.pickle', 'wb') as file:
    pickle.dump(df_3cat_dummies_weight, file)

In [8]:
# predictors 
X = df_issue
# target as array 
y = np.ravel(df_3cat)

In [9]:
# pickle X and y 
with open('../data/processed/predictor.pickle', 'wb') as file:
    pickle.dump(X, file)
with open('../data/processed/target.pickle', 'wb') as file:
    pickle.dump(y, file)
with open ('../data/processed/df_3cat_dummies_weight.pickle', 'wb') as file:
    pickle.dump(df_3cat_dummies_weight, file)

## Check counts for each

In [86]:
new = df_issue.join(df_3cat)
clinton = new.loc[new.presvote16post_2016 == 1]
trump = new.loc[new.presvote16post_2016 == 2]
other = new.loc[new.presvote16post_2016 == 3]
votes = [clinton, trump, other]

Unnamed: 0,imiss_a_2016,imiss_b_2016,imiss_c_2016,imiss_d_2016,imiss_e_2016,imiss_f_2016,imiss_g_2016,imiss_h_2016,imiss_i_2016,imiss_j_2016,...,imiss_o_2016,imiss_p_2016,imiss_q_2016,imiss_r_2016,imiss_s_2016,imiss_t_2016,imiss_u_2016,imiss_x_2016,imiss_y_2016,presvote16post_2016
1,1.0,1.0,2.0,2.0,1.0,1.0,4.0,2.0,2.0,1.0,...,1.0,1.0,1.0,1.0,2.0,1.0,2.0,3.0,4.0,2.0
4,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,2.0,1.0,...,1.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0
7,1.0,2.0,1.0,2.0,2.0,3.0,4.0,2.0,3.0,1.0,...,2.0,1.0,4.0,2.0,1.0,4.0,1.0,4.0,3.0,2.0
12,1.0,1.0,1.0,3.0,2.0,1.0,4.0,2.0,2.0,1.0,...,2.0,1.0,3.0,1.0,3.0,3.0,2.0,2.0,3.0,2.0
17,2.0,1.0,1.0,3.0,1.0,2.0,4.0,3.0,3.0,2.0,...,1.0,1.0,3.0,1.0,3.0,3.0,1.0,3.0,2.0,2.0


In [89]:
clinton.reset_index(drop=True, inplace=True)
other.reset_index(drop=True, inplace=True)
trump.reset_index(drop=True, inplace=True)

In [90]:
for i in votes:
    i.drop('presvote16post_2016', inplace=True, axis=1);

In [101]:
# clinton counts
one_count = 0
two_count = 0
three_count = 0 
four_count = 0
null_count = 0
print(clinton.shape)
for col in clinton: 
    for i in range(0, 3545):
        if df[col][i] == 1:
            one_count +=1
            continue
        if df[col][i] == 2:
            two_count +=1
            continue
        if df[col][i] == 3:
            three_count +=1
            continue
        if df[col][i] == 4:
            four_count +=1
            continue
        if df[col][i] == 8:
            null_count +=1
            
print('CLINTON','\n'
      'very:',one_count,'\n'
      'somewhat:',two_count, '\n'
      'not very:', three_count,'\n'
      'unimportant:', four_count, '\n'
      'no response:', null_count)

(3545, 23)
CLINTON 
very: 40969 
somewhat: 25157 
not very: 9679 
unimportant: 4653 
no response: 1077


In [102]:
one_count + two_count +  three_count + four_count + null_count

81535

In [103]:
3545 * 23

81535

In [106]:
# trump counts 
one_count = 0
two_count = 0
three_count = 0 
four_count = 0
null_count = 0
print(trump.shape)
for col in trump: 
    for i in range(0, 3479):
        if df[col][i] == 1:
            one_count +=1
            continue
        if df[col][i] == 2:
            two_count +=1
            continue
        if df[col][i] == 3:
            three_count +=1
            continue
        if df[col][i] == 4:
            four_count +=1
            continue
        if df[col][i] == 8:
            null_count +=1
            
print('TRUMP','\n'
      'very:',one_count,'\n'
      'somewhat:',two_count, '\n'
      'not very:', three_count,'\n'
      'unimportant:', four_count, '\n'
      'no response:', null_count)

(3479, 23)
TRUMP 
very: 40305 
somewhat: 24586 
not very: 9480 
unimportant: 4573 
no response: 1073


In [107]:
one_count +two_count+three_count+four_count+null_count

80017

In [108]:
3479*23

80017