In [1]:
%load_ext autoreload
%autoreload 2

## Objective

I will try a LassoCV model with all the socioeconomic features I have from the Census. Then, I will remove the multicollinear features and run the model again. I know offhand that a couple of my features (like poverty) are multicollinear.

In [2]:
import os, sys
sys.path.append(os.path.join(os.path.dirname('.'), "../preprocessing"))

from transform_for_num_issues_pred import main as transform_dataset

In [3]:
from __future__ import division
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline

# Hide DeprecationWarning messages so cell outputs stay readable.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Seaborn styling: white grid background, large "poster"-scale fonts.
sns.set_style("whitegrid")
sns.set_context("poster")
# Default (width, height) for every matplotlib figure in this notebook.
rcParams['figure.figsize'] = 20, 5

from helper_functions import dummify_cols_and_baselines, make_alphas, adjusted_r2, transform_school

In [4]:
# Load the pickled request-level dataset and show its (rows, columns).
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df_orig = pd.read_pickle('../data/data_from_remove_from_dataset.pkl')
df_orig.shape

(516406, 40)

In [5]:
# Clean the `school` column with helper_functions.transform_school. The warning
# echoed in this cell's output suggests the helper extracts a 1-2 digit number
# from `school` and casts it to int.
# NOTE(review): df_orig is rebound to its own transformed version, so re-running
# this cell feeds already-transformed data back in — confirm the helper tolerates that.
df_orig = transform_school(df_orig)
df_orig.shape

  df.school = df.school.str.extract(r'(\d\d?)').astype(int)


(516406, 40)

In [6]:
# Transpose the first row so every column can be read vertically.
df_orig.head(1).T

Unnamed: 0,905425
CASE_ENQUIRY_ID,101001983786
OPEN_DT,2017-01-07 10:51:37
CLOSED_DT,2017-01-07 11:46:43
TYPE,Request for Snow Plowing
SubmittedPhoto,True
LOCATION_ZIPCODE,2124
Property_Type,Address
LATITUDE,42.2809
LONGITUDE,-71.068
Source,Citizens Connect App


In [7]:
# Run the preprocessing entry point (`main` from transform_for_num_issues_pred,
# imported above as transform_dataset). The shape shown below is far smaller than
# the raw frame, so the step aggregates rows.
# NOTE(review): presumably one row per census tract/block group — confirm in the module.
df_transformed = transform_dataset(df_orig)
df_transformed.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_subset['NUM_ISSUES'] = 1


(534, 28)

## Dropping more outliers

The goal is to see whether that improves the model. If it does, the next question is: when do we stop removing outliers?

In [9]:
# Extra tract_and_block_group IDs to treat as outliers. They are strings, so the
# leading zeros are significant when matching.
more_outliers = ['0107021', '0701011', '0806013'] # last one is Dudley

In [10]:
# Flag the rows whose block group is on the outlier list, then keep the rest.
is_outlier_row = df_transformed.tract_and_block_group.isin(more_outliers)
df_transformed = df_transformed[~is_outlier_row]
df_transformed.shape

(531, 28)

In [11]:
# Transpose the first aggregated row so every feature can be read vertically.
df_transformed.head(1).T

Unnamed: 0,0
tract_and_block_group,1004002
race_white,0.242399
race_black,0.514358
race_asian,0.035473
race_hispanic,0.0675676
race_other,0.140203
poverty_pop_below_poverty_level,0
earned_income_per_capita,34340
poverty_pop_w_public_assistance,0.0597484
poverty_pop_w_food_stamps,0.138365


## Choosing columns

In [51]:
# Target column taken from the transformed dataset.
cols_orig_dataset = ['NUM_ISSUES_PER_1000_POP']

# Full candidate set of Census features. Kept under its own name for reference;
# it used to be assigned to `cols_census` and then immediately overwritten.
cols_census_all = [
    'race_white',
    'race_black',
    'race_asian',
    'race_hispanic',
    'race_other',
    'poverty_pop_below_poverty_level',
    'school',
    'school_std_dev',
    'housing',
    'housing_std_dev',
    'income',
    'income_std_dev',
]

# Hand-picked subset the model actually uses, chosen to reduce multicollinearity.
cols_census = [
    'race_black',
    'race_asian',
    'race_hispanic',
    'poverty_pop_below_poverty_level',
    'housing',
    'bedroom',
    'rent',
    'income',
    'income_std_dev',
]

In [52]:
# Keep only the target plus the chosen Census features.
selected_columns = cols_orig_dataset + cols_census
df_col_subset = df_transformed[selected_columns]
df_col_subset.shape

(531, 10)

In [53]:
# Per-column flag: True when the column holds at least one missing value.
df_col_subset.isnull().any()

NUM_ISSUES_PER_1000_POP            False
race_black                         False
race_asian                         False
race_hispanic                      False
poverty_pop_below_poverty_level    False
housing                            False
bedroom                            False
rent                               False
income                             False
income_std_dev                      True
dtype: bool

In [54]:
# Temporary band-aid (not a real fix): discard the rows where income_std_dev is
# missing instead of imputing or investigating them.
has_income_std = df_col_subset['income_std_dev'].notnull()
df_col_subset = df_col_subset[has_income_std]
df_col_subset.shape

(530, 10)

## Dummify

In [55]:
# Columns stored with object dtype are the categorical ones to one-hot encode.
column_dtypes = df_col_subset.dtypes
is_object_column = column_dtypes == object
cols_to_dummify = column_dtypes[is_object_column].index
cols_to_dummify

Index([u'housing'], dtype='object')

In [56]:
# One-hot encode the object-typed columns with the project helper, which returns
# the encoded frame plus a second value bound to baseline_cols.
# NOTE(review): baseline_cols presumably names the reference category dropped for
# each encoded column — confirm in helper_functions.
df_dummified, baseline_cols = dummify_cols_and_baselines(df_col_subset, cols_to_dummify)

rent is baseline 0 1


In [57]:
# Shape after encoding, to check the column count against the pre-encoding frame.
df_dummified.shape

(530, 10)

In [58]:
# df_dummified = df_dummified.drop('race_other', axis=1)

## Running model

In [59]:
# train_test_split and ShuffleSplit are imported from sklearn.model_selection:
# the old sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20. ShuffleSplit is not used in the cells shown here; note that the
# model_selection version takes n_splits rather than the old n / n_iter arguments.
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import make_scorer


In [61]:
# Separate the predictors from the target, then hold out 20% of the rows for
# testing. The fixed random_state keeps the split reproducible.
feature_frame = df_dummified.drop(['NUM_ISSUES_PER_1000_POP'], axis=1)
target_series = df_dummified.NUM_ISSUES_PER_1000_POP
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.2,
    random_state=500,
)

In [62]:
# Standardise the features, then fit a cross-validated Lasso. make_pipeline names
# each step after its lowercased class name, which is why the parameter grid
# addresses the regressor as 'lassocv__alphas'.
pipe = make_pipeline(StandardScaler(), LassoCV())

In [63]:
# MSE scorer; greater_is_better=False makes the scorer negate the value so that
# "higher is better" still holds. It is not referenced again in the cells shown.
mse = make_scorer(mean_squared_error, greater_is_better=False)
# Adjusted R^2 scorer; make_scorer forwards num_features (the feature count) to
# the project's adjusted_r2 helper on every call.
adjusted_r2_scorer = make_scorer(adjusted_r2, num_features=X_test.shape[1])

In [64]:
# Candidate regularisation strengths from the project helper. The cv_results_
# table printed further down shows each candidate is a one-element list, running
# from 0.001 to 1000.
params = {'lassocv__alphas': make_alphas(-3, 3)}
# params = {'lassocv__alphas': [[10]]}
# 5-fold grid search scored by adjusted R^2, using all available cores. Since each
# candidate gives LassoCV a single alpha, LassoCV's internal cross-validation has
# nothing to choose between; the outer grid search does the selection.
model = GridSearchCV(pipe, param_grid=params, n_jobs=-1, cv=5, scoring=adjusted_r2_scorer)
# The trailing semicolon suppresses the fitted-estimator repr in the cell output.
model.fit(X_train, y_train);

In [67]:
# Select the summary rows by name rather than by position: the row order of the
# transposed cv_results_ table depends on which keys the installed scikit-learn
# emits, so a positional slice can silently show the wrong rows.
# filter(items=...) keeps the listed order and skips any key that is absent.
cv_summary_rows = ['mean_test_score', 'mean_train_score', 'param_lassocv__alphas']
pd.DataFrame(model.cv_results_).filter(items=cv_summary_rows).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
mean_test_score,0.139262,0.139264,0.139271,0.139291,0.139361,0.139595,0.140365,0.142222,0.144565,0.131982,0.0268871,-0.127112,-0.127112
mean_train_score,0.251226,0.251226,0.251226,0.251226,0.251225,0.251219,0.251153,0.250625,0.246111,0.227441,0.118093,-0.027339,-0.027339
param_lassocv__alphas,[0.001],[0.003],[0.01],[0.03],[0.1],[0.3],[1.0],[3.0],[10.0],[30.0],[100.0],[300.0],[1000.0]


In [68]:
# Best mean cross-validated score (adjusted R^2) and the alpha that achieved it.
# The parenthesised form prints the same thing on Python 2 and is valid Python 3.
print(model.best_score_)
print(model.best_params_)

0.14456484278
{'lassocv__alphas': [10.0]}


In [69]:
# GridSearchCV.score applies the `scoring` it was constructed with, so this is
# adjusted R^2 on the held-out set, not plain R^2.
model.score(X_test, y_test)

0.14642548408453829

In [27]:
# Predictions for the held-out rows from the grid search's best estimator.
y_pred = model.predict(X_test)

In [28]:
# Plain (unadjusted) R^2 on the held-out set.
r2_score(y_test, y_pred)

0.29196577228949505

In [48]:
# RMSE: square root of the held-out MSE, expressed in the target's own units.
mean_squared_error(y_test, y_pred)**0.5

320.39135017791932

### How many coefs at 0?

In [70]:
# The last pipeline step is a (name, estimator) pair; take the fitted Lasso and
# count the coefficients it shrank to exactly zero.
final_step = model.best_estimator_.steps[-1]
fitted_lasso = final_step[-1]
(fitted_lasso.coef_ == 0).sum()

3

In [71]:
# Total number of features given to the model, for comparison with the zero count.
X_train.shape[1]

9

3 out of 9 went to 0.

### Interesting  coef values?

In [72]:
# Pair every feature name with its fitted Lasso coefficient. The regressor is
# the estimator in the last pipeline step.
coef_values = pd.DataFrame({
    'name': X_train.columns,
    'coef': model.best_estimator_.steps[-1][-1].coef_
})

# Magnitude column for ranking. Series.abs() replaces the `pd.np` alias, which
# is deprecated and was removed in pandas 2.0.
coef_values['abs_coef'] = coef_values['coef'].abs()

In [73]:
# Surviving (non-zero) coefficients, largest magnitude first.
is_nonzero = coef_values.coef != 0
surviving_coefs = coef_values[is_nonzero]
surviving_coefs.sort_values('abs_coef', ascending=False)

Unnamed: 0,coef,name,abs_coef
3,-105.924478,poverty_pop_below_poverty_level,105.924478
2,-78.656195,race_hispanic,78.656195
1,-30.362473,race_asian,30.362473
6,27.640083,income,27.640083
8,16.977329,housing_own,16.977329
0,-8.152135,race_black,8.152135


Which ones went to 0?

In [74]:
# Features whose coefficient the Lasso shrank to exactly zero.
is_zeroed = coef_values.coef == 0
coef_values[is_zeroed]

Unnamed: 0,coef,name,abs_coef
4,0.0,bedroom,0.0
5,-0.0,rent,0.0
7,0.0,income_std_dev,0.0


## How biased of an estimator?

In [34]:
# Distribution of the true held-out target values.
y_test.describe()

count     107.000000
mean      818.061750
std       382.553881
min        41.771094
25%       572.609348
50%       766.844401
75%      1008.393285
max      2038.740920
Name: NUM_ISSUES_PER_1000_POP, dtype: float64

In [35]:
# Distribution of the predictions, for comparison with the true values.
pd.Series(y_pred).describe()

count     107.000000
mean      793.414245
std       194.341198
min       201.394721
25%       684.880469
50%       794.940390
75%       955.569588
max      1110.146731
dtype: float64

In [47]:
# Prediction error (predicted minus actual). y_test still carries the row labels
# from the original frame, so it is re-indexed from 0 to line up with the
# positionally indexed predictions before subtracting.
predicted_values = pd.Series(y_pred)
actual_values = y_test.reset_index()['NUM_ISSUES_PER_1000_POP']
prediction_error = predicted_values - actual_values
prediction_error.describe()

count     107.000000
mean      -24.647505
std       320.945149
min     -1459.549921
25%      -151.296245
50%        27.950099
75%       140.638699
max       737.131993
dtype: float64

## Conclusion

Using my custom-chosen subset of features, designed to reduce multicollinearity, gives me better performance.

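The variance in the data my model explains is low, but it's a fairly unbiased estimator. That is to say, its errors roughly cancel out on average (a mean error of about -25 against a mean target of about 818), even though individual predictions can be far off (an RMSE of about 320). There is just a lot of variance in the data.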