In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Objective

In order to accurately interpret the coefficient values and whether they're statistically significant, I need to remove collinearity among the features.

In [13]:
import os, sys
# Make the ../preprocessing directory importable for the helper imports below.
# os.path.dirname('.') is always '', so the original os.path.join call reduced
# to exactly this relative path.  The membership check keeps re-runs of this
# cell from stacking duplicate entries on sys.path.
# NOTE(review): the path is resolved relative to the kernel's working directory.
preprocessing_dir = "../preprocessing"
if preprocessing_dir not in sys.path:
    sys.path.append(preprocessing_dir)

from helper_functions import dummify_cols_and_baselines, make_alphas, adjusted_r2, transform_school, get_vifs
from transform_for_num_issues_pred import main as transform_dataset

In [34]:
from __future__ import division
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
import json

# Hide DeprecationWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Plot styling: seaborn "whitegrid" style with the "poster" context preset.
sns.set_style("whitegrid")
sns.set_context("poster")
# Default matplotlib figure size, given as (width, height).
rcParams['figure.figsize'] = 20,5

In [4]:
# Load the pickled DataFrame (presumably written by the project's
# "remove_from_dataset" step -- confirm against that step's output path).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that you produced yourself or otherwise trust.
df_orig = pd.read_pickle('../data/data_from_remove_from_dataset.pkl')
# Display (rows, columns) as a quick sanity check on the load.
df_orig.shape

(516406, 40)

In [7]:
# Apply the project's transform_school helper and rebind df_orig to the result.
# NOTE(review): because the input name is reused, re-running this cell feeds
# already-transformed data back into the helper; re-run the load cell first.
df_orig = transform_school(df_orig)
# Display (rows, columns) after the transform.
df_orig.shape

  df.school = df.school.str.extract(r'(\d\d?)').astype(int)


(516406, 40)

In [14]:
# Run main() from transform_for_num_issues_pred (imported as transform_dataset)
# on the school-transformed frame.
df_transformed = transform_dataset(df_orig)
# Display (rows, columns) of the transformed frame.
df_transformed.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_subset['NUM_ISSUES'] = 1


(534, 28)

## Choosing columns

In [10]:
# Column taken from the original dataset.
cols_orig_dataset = ['NUM_ISSUES_PER_1000_POP']

# Census columns, grouped by theme.  The groups are concatenated in this order,
# so the resulting list keeps the same column order as before.
_race_cols = [
    'race_white', 'race_black', 'race_asian', 'race_hispanic', 'race_other',
]
_poverty_income_cols = [
    'poverty_pop_below_poverty_level',
    'earned_income_per_capita',
    'poverty_pop_w_public_assistance',
    'poverty_pop_w_food_stamps',
    'poverty_pop_w_ssi',
]
# Each measure is immediately followed by its *_std_dev companion column.
_measure_cols = [
    'school', 'school_std_dev',
    'housing', 'housing_std_dev',
    'bedroom', 'bedroom_std_dev',
    'value', 'value_std_dev',
    'rent', 'rent_std_dev',
    'income', 'income_std_dev',
]
cols_census = _race_cols + _poverty_income_cols + _measure_cols

In [15]:
# Keep only the original-dataset column plus the census columns, in that order.
selected_cols = cols_orig_dataset + cols_census
df_col_subset = df_transformed[selected_cols]
# Display (rows, columns) of the subset.
df_col_subset.shape

(534, 23)

## Dummify

In [18]:
# Collect the names of the object-dtype columns: these are the ones that still
# need to be turned into dummy variables.
column_dtypes = df_col_subset.dtypes
is_object_col = column_dtypes == object
cols_to_dummify = column_dtypes[is_object_col].index
cols_to_dummify

Index([u'housing'], dtype='object')

In [29]:
# Dummy-encode the object-dtype columns with the project helper.
# NOTE(review): presumably the second return value lists the baseline
# (reference) level chosen for each dummified column -- confirm in
# helper_functions.
df_dummified, baseline_cols = dummify_cols_and_baselines(df_col_subset, cols_to_dummify)
# Display (rows, columns) after dummification.
df_dummified.shape

rent is baseline 0 1


(534, 23)

In [23]:
# Show the first row transposed, so each column name appears on its own line.
df_dummified.head(1).T

Unnamed: 0,0
NUM_ISSUES_PER_1000_POP,625.844595
race_white,0.242399
race_black,0.514358
race_asian,0.035473
race_hispanic,0.067568
poverty_pop_below_poverty_level,0.0
earned_income_per_capita,34340.0
poverty_pop_w_public_assistance,0.059748
poverty_pop_w_food_stamps,0.138365
poverty_pop_w_ssi,0.059748


## Let's check the Variance Inflation Factors (VIFs)

In [32]:
# Compute VIFs with the project helper on the full dummified frame.
# NOTE(review): the second argument appears to name the response column to
# leave out -- the output lists every column except it; confirm in get_vifs.
get_vifs(df_dummified, 'NUM_ISSUES_PER_1000_POP')

race_white                              inf
race_other                              inf
race_hispanic                           inf
race_black                              inf
race_asian                              inf
poverty_pop_w_food_stamps          4.751462
bedroom_std_dev                    4.313277
value_std_dev                      4.151609
rent_std_dev                       4.076989
income_std_dev                     4.040532
school_std_dev                     3.750693
poverty_pop_below_poverty_level    3.493020
earned_income_per_capita           3.222267
housing_std_dev                    2.847687
income                             2.740324
housing_own                        2.470539
poverty_pop_w_ssi                  2.214317
school                             1.903531
poverty_pop_w_public_assistance    1.622089
bedroom                            1.543466
rent                               1.542634
value                              1.323880
dtype: float64

In [33]:
# All five race_* columns came back with an infinite VIF in the full run, so
# drop race_white and recompute the VIFs for the remaining columns.
get_vifs(df_dummified.drop(['race_white'], axis=1), 'NUM_ISSUES_PER_1000_POP')

poverty_pop_w_food_stamps          4.751462
bedroom_std_dev                    4.313277
value_std_dev                      4.151609
rent_std_dev                       4.076989
income_std_dev                     4.040532
school_std_dev                     3.750693
poverty_pop_below_poverty_level    3.493020
earned_income_per_capita           3.222267
housing_std_dev                    2.847687
income                             2.740324
housing_own                        2.470539
race_black                         2.442737
poverty_pop_w_ssi                  2.214317
school                             1.903531
race_hispanic                      1.822507
poverty_pop_w_public_assistance    1.622089
race_asian                         1.557507
bedroom                            1.543466
rent                               1.542634
value                              1.323880
race_other                         1.097649
dtype: float64

In [36]:
# Also drop the two columns with the highest VIFs in the previous run
# (poverty_pop_w_food_stamps and bedroom_std_dev) and recompute.
# NOTE(review): presumably a sensitivity check -- the previous run was already
# below 5 for every remaining column.
get_vifs(df_dummified.drop(['race_white', 'bedroom_std_dev', 'poverty_pop_w_food_stamps'], axis=1), 'NUM_ISSUES_PER_1000_POP')

value_std_dev                      3.844698
income_std_dev                     3.691467
school_std_dev                     3.624831
rent_std_dev                       3.443195
poverty_pop_below_poverty_level    3.301714
earned_income_per_capita           3.193815
income                             2.738276
housing_std_dev                    2.661114
housing_own                        2.430098
race_black                         2.198097
school                             1.870666
poverty_pop_w_ssi                  1.785700
race_hispanic                      1.732169
race_asian                         1.525378
rent                               1.505189
bedroom                            1.494732
poverty_pop_w_public_assistance    1.450564
value                              1.318547
race_other                         1.095070
dtype: float64

## Conclusion

Once we drop `race_white` so that it serves as the baseline (reference) category for the race features, we don't have a feature that has a Variance Inflation Factor of 5 or more, which is a rule of thumb for a problematic amount of collinearity, according to ISLR.