# Processing

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import re
import weightedcalcs as wc
import random

Set random seed

In [2]:
random.seed(0)

Import functions from py files 

In [3]:
import sys
import os
sys.path.append('../src/features')
from build_features import fill_nulls, filter_col_with_regex, get_dummies

Read in data from Voter Study Group CSV

In [4]:
df = pd.read_csv('../data/raw/20161201_voter_study_group.csv')

Fill missing values with a number that represents the category "no response"

In [7]:
df = fill_nulls(df)

Add column that groups the <b>target variable</b> into three categories

In [26]:
df['vote_3cat'] = df_target['presvote16post_2016'].replace(
    to_replace=[8, 3, 4, 5, 6, 7], value=3, inplace=False)

Pickle clean dataframe with all features, weights, and target columns

In [24]:
with open('../data/processed/df.pickle', 'wb') as file:
    pickle.dump(df, file)

Create a list of features to represent <b>community engagement</b>

Set regular expressions pattern to match columns with issue questions

In [27]:
pattern = '(imiss_).{1}(_2016)'

Call function to return list

In [None]:
col_list = filter_col_with_regex(df, pattern)
df_issue = df[col_list]
# save target subset of dataframe
df_target = pd.DataFrame(df['presvote16post_2016'])

# one hot encoding categorical variables 
df_issue_dummies = get_dummies(df_issue)
# dummy predictors and target
df_target_issue_dummies = df_target.join(df_issue_dummies)
# correlation coefficients 
df_target_issue_dummies.corr()['presvote16post_2016'].sort_values(ascending=False).head(10)

In [10]:
# convert target to 3 categories 
df_3cat = df_target['presvote16post_2016'].replace(
    to_replace=[8, 3, 4, 5, 6, 7],
    value=3,
    inplace=False
)  
# categorical issues, 3 target categories, weight 
df_3cat_issue_weight = df_issue.join(df_3cat).join(df['weight'])

with open('../data/processed/df_3cat_issue_weight.pickle', 'wb') as file:
    pickle.dump(df_3cat_issue_weight, file)
    
df_3cat_dummies = df_issue_dummies.join(df_3cat)

In [11]:
# three categories for target, issues, and weight 
df_3cat_dummies_weight = df_issue_dummies.join(df_3cat).join(df['weight'])
with open('../data/processed/df_3cat_dummies_weight.pickle', 'wb') as file:
    pickle.dump(df_3cat_dummies_weight, file)

In [12]:
# predictors 
X = df_issue_dummies
# target as array 
y = np.ravel(df_3cat)

In [13]:
# pickle X and y 
with open('../data/processed/predictor.pickle', 'wb') as file:
    pickle.dump(X, file)
with open('../data/processed/target.pickle', 'wb') as file:
    pickle.dump(y, file)
with open ('../data/processed/df_3cat_dummies_weight.pickle', 'wb') as file:
    pickle.dump(df_3cat_dummies_weight, file)

## Check counts for each

In [14]:
new = df_issue.join(df_3cat)
clinton = new.loc[new.presvote16post_2016 == 1]
trump = new.loc[new.presvote16post_2016 == 2]
other = new.loc[new.presvote16post_2016 == 3]
votes = [clinton, trump, other]

In [15]:
clinton.reset_index(drop=True, inplace=True)
other.reset_index(drop=True, inplace=True)
trump.reset_index(drop=True, inplace=True)

In [16]:
for i in votes:
    i.drop('presvote16post_2016', inplace=True, axis=1);

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [17]:
# clinton counts
one_count = 0
two_count = 0
three_count = 0 
four_count = 0
null_count = 0
print(clinton.shape)
for col in clinton: 
    for i in range(0, 3545):
        if df[col][i] == 1:
            one_count +=1
            continue
        if df[col][i] == 2:
            two_count +=1
            continue
        if df[col][i] == 3:
            three_count +=1
            continue
        if df[col][i] == 4:
            four_count +=1
            continue
        if df[col][i] == 8:
            null_count +=1
            
print('CLINTON','\n'
      'very:',one_count,'\n'
      'somewhat:',two_count, '\n'
      'not very:', three_count,'\n'
      'unimportant:', four_count, '\n'
      'no response:', null_count)

(3545, 23)
CLINTON 
very: 40969 
somewhat: 25157 
not very: 9679 
unimportant: 4653 
no response: 1077


In [18]:
one_count + two_count +  three_count + four_count + null_count

81535

In [19]:
3545 * 23

81535

In [20]:
item = [map(row function, x), x in cols]

SyntaxError: invalid syntax (<ipython-input-20-62cb0965f841>, line 1)

In [None]:
# trump counts 
one_count = 0
two_count = 0
three_count = 0 
four_count = 0
null_count = 0
print(trump.shape)
for col in trump: 
    for i in range(0, 3479):
        if df[col][i] == 1:
            one_count +=1
            continue
        if df[col][i] == 2:
            two_count +=1
            continue
        if df[col][i] == 3:
            three_count +=1
            continue
        if df[col][i] == 4:
            four_count +=1
            continue
        if df[col][i] == 8:
            null_count +=1
            
print('TRUMP','\n'
      'very:',one_count,'\n'
      'somewhat:',two_count, '\n'
      'not very:', three_count,'\n'
      'unimportant:', four_count, '\n'
      'no response:', null_count)

In [21]:
one_count +two_count+three_count+four_count+null_count

81535

In [22]:
3479*23

80017