# Processing

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import re
import weightedcalcs as wc

Import helper functions from the project's `build_features` module (in `../src/features`)

In [2]:
import sys
import os
# Put the local feature-engineering directory on the import path so that
# build_features (a project module, not an installed package) can be imported.
sys.path.append('../src/features')
from build_features import fill_nulls, filter_col_with_regex, get_dummies

Read in data from Voter Study Group CSV

In [3]:
# Load the raw Voter Study Group CSV into a DataFrame.
raw_csv_path = '../data/raw/20161201_voter_study_group.csv'
df = pd.read_csv(raw_csv_path)

Check number of observations, number of features, and a few example column names

In [9]:
# Report the frame's dimensions and the first five column names.
n_rows, n_cols = df.shape
print('num observations:', n_rows,
      '\nnum features:', n_cols,
      '\nexample columns:', list(df.columns[:5]))

num observations: 8000 
num features: 668 
example columns: ['case_identifier', 'weight', 'PARTY_AGENDAS_rand_2016', 'pp_primary16_2016', 'pp_demprim16_2016']


Check number of null values per feature

In [10]:
# Number of missing values per column; display only the first ten columns.
null_counts = df.isnull().sum()
null_counts.iloc[:10]

case_identifier               0
weight                        0
PARTY_AGENDAS_rand_2016       0
pp_primary16_2016             0
pp_demprim16_2016          5026
pp_repprim16_2016          5094
inputstate_2016               0
izip_2016                     0
votereg2_2016                 0
votereg_f_2016               90
dtype: int64

Fill null values

In [11]:
# Apply the project helper fill_nulls (imported from build_features) to the frame.
# NOTE(review): the fill strategy is defined in build_features and is not visible
# here. This cell rebinds `df`, so the un-filled frame is no longer available
# afterwards without re-reading the CSV.
df = fill_nulls(df)

Pickle dataframe

In [12]:
# Serialize the processed frame to the processed-data directory.
with open('../data/processed/df.pickle', 'wb') as out_file:
    pickle.dump(df, out_file)

In [None]:
# weight = df['weight']
# calc = wc.Calculator('weight')

In [None]:
# regex used to pick out the predictor column names
pattern = '(imiss_).{1}(_2016)'
# names of the columns selected by the pattern
col_list = filter_col_with_regex(df, pattern)
# predictor subset of the dataframe
df_issue = df[col_list]
# target subset of the dataframe, kept as a one-column DataFrame
df_target = df['presvote16post_2016'].to_frame()

# one-hot encode the categorical predictors
df_issue_dummies = get_dummies(df_issue)
# target joined with the dummy predictors
df_target_issue_dummies = df_target.join(df_issue_dummies)
# ten largest correlation coefficients against the target column
target_corr = df_target_issue_dummies.corr()['presvote16post_2016']
target_corr.sort_values(ascending=False).head(10)

In [None]:
# Categorical predictors joined with the target, then both joined frames are pickled.
df_target_issue = df_issue.join(df_target)
pickle_jobs = (
    (df_target_issue, '../data/processed/df_target_issue.pickle'),
    (df_target_issue_dummies, '../data/processed/df_target_issue_dummies.pickle'),
)
for frame_to_save, pickle_path in pickle_jobs:
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(frame_to_save, out_file)

In [None]:
# collapse the target into 3 categories: codes 3 through 8 all become 3
codes_to_merge = [3, 4, 5, 6, 7, 8]
df_3cat = df_target['presvote16post_2016'].replace(to_replace=codes_to_merge, value=3)

# categorical issues, 3 target categories, weight
weight_column = df['weight']
df_3cat_issue_weight = df_issue.join(df_3cat).join(weight_column)

with open('../data/processed/df_3cat_issue_weight.pickle', 'wb') as out_file:
    pickle.dump(df_3cat_issue_weight, out_file)

# dummy-encoded issues with the 3-category target
df_3cat_dummies = df_issue_dummies.join(df_3cat)

In [None]:
# dummy-encoded issues + 3-category target + survey weight, then pickled
issues_with_target = df_issue_dummies.join(df_3cat)
df_3cat_dummies_weight = issues_with_target.join(df['weight'])
with open('../data/processed/df_3cat_dummies_weight.pickle', 'wb') as out_file:
    pickle.dump(df_3cat_dummies_weight, out_file)

In [None]:
# predictors: the one-hot encoded issue columns
X = df_issue_dummies
# target flattened to a 1-D array
y = np.asarray(df_3cat).ravel()

In [None]:
# pickle X and y
# df_3cat_dummies_weight is not dumped again here: the cell that builds it
# already writes ../data/processed/df_3cat_dummies_weight.pickle, so a second
# dump of the same object to the same path was redundant.
with open('../data/processed/predictor.pickle', 'wb') as file:
    pickle.dump(X, file)
with open('../data/processed/target.pickle', 'wb') as file:
    pickle.dump(y, file)

## Check response counts for each group of voters

In [None]:
# Join the categorical issue columns with the 3-category vote, then split the
# rows by vote code (1, 2, 3).
# Each subset is an explicit .copy(): the following cells call reset_index and
# drop with inplace=True on these frames, and mutating a boolean-mask selection
# in place can raise SettingWithCopyWarning.
new = df_issue.join(df_3cat)
clinton = new.loc[new.presvote16post_2016 == 1].copy()
trump = new.loc[new.presvote16post_2016 == 2].copy()
other = new.loc[new.presvote16post_2016 == 3].copy()
# the list holds references to the three frames, not copies of them
votes = [clinton, trump, other]

In [None]:
# Renumber each subset from 0. This is done in place on purpose: `votes` holds
# references to these same objects, so rebinding the names would leave the list
# pointing at the old frames.
for vote_frame in (clinton, other, trump):
    vote_frame.reset_index(drop=True, inplace=True)

In [None]:
# Remove the vote column from every subset so only the issue columns remain.
for vote_frame in votes:
    vote_frame.drop(columns='presvote16post_2016', inplace=True)

In [None]:
# clinton counts
# Tally each response code across every column of `clinton`.
# The values are read from `clinton` itself. The earlier loop indexed the full
# `df` by position over a hard-coded range(0, 3545), so it counted the first
# 3545 rows of the whole survey instead of the rows in this subset.
print(clinton.shape)
clinton_responses = clinton.to_numpy()
one_count = int((clinton_responses == 1).sum())
two_count = int((clinton_responses == 2).sum())
three_count = int((clinton_responses == 3).sum())
four_count = int((clinton_responses == 4).sum())
# code 8 is reported as "no response" below
null_count = int((clinton_responses == 8).sum())

print('CLINTON','\n'
      'very:',one_count,'\n'
      'somewhat:',two_count, '\n'
      'not very:', three_count,'\n'
      'unimportant:', four_count, '\n'
      'no response:', null_count)

In [None]:
# total number of tallied responses
sum((one_count, two_count, three_count, four_count, null_count))

In [None]:
# expected number of responses: rows x columns of the clinton subset
# (computed from the frame instead of the hard-coded 3545 * 23)
clinton.shape[0] * clinton.shape[1]

In [None]:
# TODO: replace the per-row counting loops with a mapped row function, roughly:
#   item = [map(row_function, x) for x in cols]
# (the original line was pseudocode and raised a SyntaxError when the cell ran)

In [None]:
# trump counts
# Tally each response code across every column of `trump`.
# The values are read from `trump` itself. The earlier loop indexed the full
# `df` by position over a hard-coded range(0, 3479), so it counted the first
# 3479 rows of the whole survey instead of the rows in this subset.
print(trump.shape)
trump_responses = trump.to_numpy()
one_count = int((trump_responses == 1).sum())
two_count = int((trump_responses == 2).sum())
three_count = int((trump_responses == 3).sum())
four_count = int((trump_responses == 4).sum())
# code 8 is reported as "no response" below
null_count = int((trump_responses == 8).sum())

print('TRUMP','\n'
      'very:',one_count,'\n'
      'somewhat:',two_count, '\n'
      'not very:', three_count,'\n'
      'unimportant:', four_count, '\n'
      'no response:', null_count)

In [None]:
# total number of tallied responses
sum((one_count, two_count, three_count, four_count, null_count))

In [None]:
# expected number of responses: rows x columns of the trump subset
# (computed from the frame instead of the hard-coded 3479 * 23)
trump.shape[0] * trump.shape[1]