In [1]:
# Reload edited project modules automatically before each cell executes, so
# changes to the local .py helpers are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Objective

I will try a LassoCV model with all the socioeconomic features I have from the Census. Then, I will remove the multicollinear features and run the model again. I know offhand that a couple of my features (like poverty) are multicollinear.

Also, I _might_ want to remove 0107021 Newbury St.

In [2]:
import os, sys

# Make the sibling ``preprocessing`` directory importable.  The path is
# resolved to an absolute location when the cell runs, so the import keeps
# working if the working directory changes later, and it is appended only once
# even when this cell is re-executed.
PREPROCESSING_DIR = os.path.abspath(os.path.join(os.pardir, "preprocessing"))
if PREPROCESSING_DIR not in sys.path:
    sys.path.append(PREPROCESSING_DIR)

from transform_for_num_issues_pred import main as transform_dataset

In [3]:
# The __future__ import has to stay the first statement of the cell; it makes
# ``/`` perform true division (relevant on Python 2).
from __future__ import division
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Keep DeprecationWarning messages out of the cell outputs.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Plot styling: white-grid background, "poster" context, default figure size 20 x 5.
sns.set_style("whitegrid")
sns.set_context("poster")
rcParams['figure.figsize'] = 20, 5

# Project-local helpers.
# NOTE(review): presumably found through the ../preprocessing entry added to sys.path earlier — confirm.
from helper_functions import dummify_cols_and_baselines, make_alphas

In [4]:
# Load the pickled source DataFrame from ../data and show its (rows, columns).
# NOTE(review): unpickling can execute arbitrary code — only load pickles that
# come from a trusted source.
df_orig = pd.read_pickle('../data/data_from_remove_from_dataset.pkl')
df_orig.shape

(516406, 40)

In [None]:
# Show the first raw record transposed, so the columns read top to bottom.
df_orig.head(1).T

In [12]:
# Run the project's transform on the raw frame and show the resulting shape.
# The displayed shapes go from 516,406 rows to 534.
# NOTE(review): presumably one row per census block group — confirm in
# transform_for_num_issues_pred.main.
# NOTE(review): the SettingWithCopyWarning in this cell's output points at a
# line inside the transform module, not at notebook code.
df_transformed = transform_dataset(df_orig)
df_transformed.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_subset1 = df_subset.groupby('tract_and_block_group').sum().reset_index()


(534, 28)

In [15]:
# Show the first transformed record transposed, to eyeball the feature columns.
df_transformed.head(1).T

Unnamed: 0,0
tract_and_block_group,1004002
race_white,0.242399
race_black,0.514358
race_asian,0.035473
race_hispanic,0.0675676
race_other,0.140203
poverty_pop_below_poverty_level,0
earned_income_per_capita,34340
poverty_pop_w_public_assistance,0.0597484
poverty_pop_w_food_stamps,0.138365


In [13]:
# Summary statistics of the regression target.
df_transformed['NUM_ISSUES_PER_1000_POP'].describe()

count     534.000000
mean      816.807870
std       397.501518
min        36.706869
25%       569.713779
50%       765.916847
75%      1029.902627
max      2939.597315
Name: NUM_ISSUES_PER_1000_POP, dtype: float64

In [14]:
# Ten block groups with the highest issue rate, largest first.
issues_by_block_group = df_transformed.set_index('tract_and_block_group')['NUM_ISSUES_PER_1000_POP']
issues_by_block_group.sort_values(ascending=False).head(10)

tract_and_block_group
0107021    2939.597315
0404012    2247.452693
0201011    2224.089636
0701011    2094.106464
0703002    2070.548712
0403005    2040.268456
0806013    2038.740920
0703001    1924.330617
0002012    1901.960784
0910013    1847.953216
Name: NUM_ISSUES_PER_1000_POP, dtype: float64

## Dropping more outliers

Let's see if dropping more outliers improves the model. If it does, the next question is: when do we stop removing outliers?

In [69]:
# Block groups to exclude before modelling, identified by their
# ``tract_and_block_group`` code.
more_outliers = [
    '0107021',  # highest issue rate in the top-10 listing
    '0701011',  # fourth highest in the same listing
]

In [70]:
# Keep only the rows whose block group is not in the outlier list.
is_outlier = df_transformed['tract_and_block_group'].isin(more_outliers)
df_transformed = df_transformed.loc[~is_outlier]
df_transformed.shape

(532, 28)

In [68]:
# Re-check the first record after the outlier rows have been filtered out.
df_transformed.head(1).T

Unnamed: 0,0
tract_and_block_group,1004002
race_white,0.242399
race_black,0.514358
race_asian,0.035473
race_hispanic,0.0675676
race_other,0.140203
poverty_pop_below_poverty_level,0
earned_income_per_capita,34340
poverty_pop_w_public_assistance,0.0597484
poverty_pop_w_food_stamps,0.138365


## Choosing columns

In [19]:
# Target column taken from the original dataset.
cols_orig_dataset = ['NUM_ISSUES_PER_1000_POP']

# Census features, assembled group by group; the final order is race shares,
# poverty/income measures, then each paired measure followed by its *_std_dev.
_race_cols = ['race_white', 'race_black', 'race_asian', 'race_hispanic', 'race_other']
_poverty_cols = [
    'poverty_pop_below_poverty_level',
    'earned_income_per_capita',
    'poverty_pop_w_public_assistance',
    'poverty_pop_w_food_stamps',
    'poverty_pop_w_ssi',
]
_paired_measures = ['school', 'housing', 'bedroom', 'value', 'rent', 'income']
_paired_cols = [
    col
    for measure in _paired_measures
    for col in (measure, measure + '_std_dev')
]

cols_census = _race_cols + _poverty_cols + _paired_cols

In [71]:
# Restrict the frame to the target plus the selected census features.
selected_cols = cols_orig_dataset + cols_census
df_col_subset = df_transformed[selected_cols]
df_col_subset.shape

(532, 23)

## Dummify

In [21]:
# Columns stored with object dtype are the ones that need dummy encoding.
col_dtypes = df_col_subset.dtypes
cols_to_dummify = col_dtypes[col_dtypes == object].index
cols_to_dummify

Index([u'school', u'housing'], dtype='object')

In [72]:
# Encode the object-dtype columns with the project helper, which returns the
# encoded frame and a second value bound to ``baseline_cols``.
# NOTE(review): the printed "... is baseline" lines suggest one level per
# column is held out as the reference category — confirm in helper_functions.
df_dummified, baseline_cols = dummify_cols_and_baselines(df_col_subset, cols_to_dummify)

8_6th_grade is baseline 0 2
rent is baseline 1 2


In [73]:
# Sanity-check the frame size after dummy encoding (rows, columns).
df_dummified.shape

(532, 32)

In [None]:
# Disabled experiment: drop one of the race shares.  In the displayed sample
# row the race_* columns add up to ~1, so they are linearly dependent as a group.
# TODO(review): either enable this line or delete the cell.
# df_dummified = df_dummified.drop('race_other', axis=1)

## Running model

In [24]:
# All splitting/search utilities come from ``sklearn.model_selection``.  The
# legacy ``sklearn.cross_validation`` module was deprecated in scikit-learn
# 0.18 and removed in 0.20, and its splitter objects are not interchangeable
# with the ``model_selection`` GridSearchCV used in this notebook.
# NOTE(review): ``model_selection.ShuffleSplit`` takes ``n_splits`` rather than
# the old ``(n, n_iter)`` arguments — adjust any call site outside this view.
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, mean_squared_error, r2_score




In [81]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
target_col = 'NUM_ISSUES_PER_1000_POP'
features = df_dummified.drop(target_col, axis=1)
target = df_dummified[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=500
)

In [82]:
# Standardise the features, then fit a cross-validated Lasso.  make_pipeline
# names each step after its lower-cased class, which is why the search grid
# addresses the regressor as 'lassocv'.
scaler_step = StandardScaler()
lasso_step = LassoCV()
pipe = make_pipeline(scaler_step, lasso_step)

In [62]:
# Scorer wrapping mean_squared_error; greater_is_better=False makes sklearn
# negate the value so that "higher is better" still holds during model selection.
# NOTE(review): no visible cell passes ``mse`` as ``scoring=`` — the grid
# search runs with its default scorer instead.
mse = make_scorer(mean_squared_error, greater_is_better=False)

In [83]:
# Candidate regularisation strengths from the project helper.  The displayed
# results show each candidate is a one-element list ([1.0], [3.0], ...), so
# LassoCV has a single alpha per fit and GridSearchCV does the actual
# selection with 10-fold cross-validation on all available cores.
# Manual alternative grid: [[0.1], [0.3], [1]]
alpha_candidates = make_alphas(0, 3)
params = {'lassocv__alphas': alpha_candidates}

model = GridSearchCV(estimator=pipe, param_grid=params, cv=10, n_jobs=-1)
model.fit(X_train, y_train);

In [84]:
# One column per alpha candidate; show only the first five result rows.
cv_results = pd.DataFrame(model.cv_results_)
cv_results.T.head(5)

Unnamed: 0,0,1,2,3,4,5,6
mean_fit_time,0.0447977,0.035259,0.0415884,0.0335755,0.039107,0.0325193,0.0252504
mean_score_time,0.00128438,0.000935555,0.000984383,0.000955367,0.000944304,0.000912857,0.000726581
mean_test_score,0.25014,0.257191,0.272283,0.26075,0.118736,-0.0365899,-0.0365899
mean_train_score,0.397262,0.39421,0.379463,0.33671,0.163429,0,0
param_lassocv__alphas,[1.0],[3.0],[10.0],[30.0],[100.0],[300.0],[1000.0]


In [85]:
model.score(X_test, y_test)

0.32226117361262685

In [46]:
model.best_params_

{'lassocv__alphas': [10.0]}

### How many coefs at 0?

In [None]:
# Count the features whose coefficient the Lasso shrank to exactly zero.
fitted_lasso = model.best_estimator_.named_steps['lassocv']
(fitted_lasso.coef_ == 0).sum()

In [None]:
# Total number of feature columns, for comparison with the zero-coefficient count.
X_train.shape[1]

14 out of 31 went to 0.

### Interesting  coef values?

In [None]:
# Pair every feature name with its coefficient from the best pipeline's final
# (Lasso) step.  Because the features were standardised first, coefficient
# magnitudes are comparable across features.
coef_values = pd.DataFrame({
    'name': X_train.columns,
    'coef': model.best_estimator_.steps[-1][-1].coef_
})

# Series.abs() replaces the deprecated ``pd.np`` alias, which newer pandas
# releases no longer provide.
coef_values['abs_coef'] = coef_values['coef'].abs()

In [None]:
# Surviving (non-zero) coefficients, strongest effect first.
nonzero_coefs = coef_values[coef_values['coef'] != 0]
nonzero_coefs.sort_values('abs_coef', ascending=False)

## Conclusion

Our baseline model's performance seems pretty bad: $R^2 \approx 0.32$ on the held-out test set (best mean cross-validated $R^2 \approx 0.27$).