In [1]:
%load_ext autoreload
%autoreload 2

## Objective

While I could reuse the feature subset from the previous LassoCV run, its adjusted $R^2$ of ~0.2 is low enough that I should be suspicious of the features it shrank to 0. I do want to avoid multicollinearity so that the confidence interval (CI) for each feature is interpretable, so I will remove some columns based on my EDA.

Let's see (1) how much I overfit, and (2) whether any features are statistically significant.

In [2]:
import os, sys
# os.path.dirname('.') is the empty string, so this appends the relative path
# "../preprocessing"; it is resolved against the kernel's working directory.
sys.path.append(os.path.join(os.path.dirname('.'), "../preprocessing"))

# The module's `main` entry point is used in this notebook as transform_dataset.
from transform_for_num_issues_pred import main as transform_dataset

In [37]:
from __future__ import division
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
import string
from StringIO import StringIO

warnings.filterwarnings("ignore", category=DeprecationWarning)
sns.set_style("whitegrid")
sns.set_context("poster")
rcParams['figure.figsize'] = 20, 5

from helper_functions import dummify_cols_and_baselines, make_alphas, adjusted_r2, transform_school

In [4]:
# Load the pickled DataFrame that serves as this notebook's input.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df_orig = pd.read_pickle('../data/data_from_remove_from_dataset.pkl')
df_orig.shape

(516406, 40)

In [5]:
# Apply the project helper transform_school (imported from helper_functions).
# NOTE(review): the result is bound back to df_orig, so re-running this cell
# feeds already-transformed data into the helper — confirm that is safe.
df_orig = transform_school(df_orig)
df_orig.shape

  df.school = df.school.str.extract(r'(\d\d?)').astype(int)


(516406, 40)

In [7]:
# Peek at a single row, transposed so that each column reads as one line.
df_orig.head(1).T

Unnamed: 0,905425
CASE_ENQUIRY_ID,101001983786
OPEN_DT,2017-01-07 10:51:37
CLOSED_DT,2017-01-07 11:46:43
TYPE,Request for Snow Plowing
SubmittedPhoto,True
LOCATION_ZIPCODE,2124
Property_Type,Address
LATITUDE,42.2809
LONGITUDE,-71.068
Source,Citizens Connect App


In [6]:
# Run the preprocessing entry point (`main` of transform_for_num_issues_pred).
# The recorded shapes drop from ~516k rows to 534 here, so the helper presumably
# aggregates to one row per census block group — TODO confirm.
df_transformed = transform_dataset(df_orig)
df_transformed.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_subset['NUM_ISSUES'] = 1


(534, 28)

In [7]:
# Summary statistics of NUM_ISSUES_PER_1000_POP, the regression target.
df_transformed.NUM_ISSUES_PER_1000_POP.describe()

count     534.000000
mean      816.807870
std       397.501518
min        36.706869
25%       569.713779
50%       765.916847
75%      1029.902627
max      2939.597315
Name: NUM_ISSUES_PER_1000_POP, dtype: float64

## Dropping more outliers

Let's see whether dropping a few more outliers improves the model. If it does, the next question is: when do we stop removing outliers?

In [9]:
# Identifiers of the additional rows to exclude as outliers.
more_outliers = [
    '0107021',
    '0701011',
]

In [10]:
# Flag rows whose tract_and_block_group is listed in more_outliers, then keep
# only the rows that are not flagged.
is_outlier = df_transformed['tract_and_block_group'].isin(more_outliers)
df_transformed = df_transformed[~is_outlier]
df_transformed.shape

(532, 28)

In [11]:
# Peek at one transformed row, transposed so that each column reads as one line.
df_transformed.head(1).T

Unnamed: 0,0
tract_and_block_group,1004002
race_white,0.242399
race_black,0.514358
race_asian,0.035473
race_hispanic,0.0675676
race_other,0.140203
poverty_pop_below_poverty_level,0
earned_income_per_capita,34340
poverty_pop_w_public_assistance,0.0597484
poverty_pop_w_food_stamps,0.138365


## Choosing columns

In [11]:
# Regression target.
cols_orig_dataset = ['NUM_ISSUES_PER_1000_POP']

# Candidate predictors, assembled group by group. The concatenation order
# determines the column order of the modelling frame.
_race_cols = ['race_black', 'race_asian', 'race_hispanic', 'race_other']
_economic_cols = [
    'poverty_pop_below_poverty_level',
    'earned_income_per_capita',
    'poverty_pop_w_public_assistance',
    'poverty_pop_w_ssi',
]
_school_cols = ['school', 'school_std_dev']
_housing_cols = ['housing', 'housing_std_dev', 'bedroom']
_value_cols = ['value', 'value_std_dev']
_rent_cols = ['rent', 'rent_std_dev']
_income_cols = ['income', 'income_std_dev']

cols_census = (
    _race_cols
    + _economic_cols
    + _school_cols
    + _housing_cols
    + _value_cols
    + _rent_cols
    + _income_cols
)

In [12]:
# Keep the target column first, followed by the chosen census predictors.
selected_cols = cols_orig_dataset + cols_census
df_col_subset = df_transformed[selected_cols]
df_col_subset.shape

(532, 20)

## Dummify

In [13]:
# Columns stored with object dtype are the ones that need dummy encoding.
col_dtypes = df_col_subset.dtypes
is_object_col = col_dtypes == object
cols_to_dummify = col_dtypes[is_object_col].index
cols_to_dummify

Index([u'housing'], dtype='object')

In [28]:
# Dummy-encode the object-dtype columns via the project helper. It returns the
# encoded frame plus a second value that, judging by its name, lists the
# baseline categories — TODO confirm against helper_functions.
df_dummified, baseline_cols = dummify_cols_and_baselines(df_col_subset, cols_to_dummify)

rent is baseline 0 1


In [15]:
# Check the frame's dimensions after dummy encoding.
df_dummified.shape

(532, 20)

## Running model

In [17]:
# sklearn.cross_validation is deprecated (and removed in scikit-learn 0.20);
# its utilities live in sklearn.model_selection, which this cell already uses
# for GridSearchCV.
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LassoCV
# ShuffleSplit is not used in this notebook. Note that the model_selection
# version is configured with n_splits rather than with the dataset size.
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import make_scorer




In [18]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [29]:
# Strip punctuation and spaces from every column label except the target.
# The two-argument str.translate form used here is Python 2 only.
cleaned_cols = []
for col in df_dummified.columns:
    if col == 'NUM_ISSUES_PER_1000_POP':
        cleaned_cols.append(col)
    else:
        without_punct = col.translate(None, string.punctuation)
        cleaned_cols.append(without_punct.replace(' ', ''))
df_dummified.columns = cleaned_cols

In [31]:
# Separate the predictors from the target, then hold out 20% of the rows as a
# test split. The fixed random_state keeps the split reproducible.
features = df_dummified.drop(['NUM_ISSUES_PER_1000_POP'], axis=1)
target = df_dummified.NUM_ISSUES_PER_1000_POP
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=500)

In [32]:
# Build the right-hand side of the formula from every non-target column.
predictors = df_dummified.drop('NUM_ISSUES_PER_1000_POP', axis=1)
col_list = ' + '.join(predictors.columns)
formula = 'NUM_ISSUES_PER_1000_POP ~ {}'.format(col_list)

# The formula API needs predictors and target in one frame, so re-join the
# training split before fitting ordinary least squares.
train_frame = pd.concat([X_train, y_train], axis=1)
est = smf.ols(formula, train_frame).fit()
est.summary()

0,1,2,3
Dep. Variable:,NUM_ISSUES_PER_1000_POP,R-squared:,0.354
Model:,OLS,Adj. R-squared:,0.324
Method:,Least Squares,F-statistic:,11.68
Date:,"Tue, 21 Feb 2017",Prob (F-statistic):,2.43e-28
Time:,06:02:28,Log-Likelihood:,-3038.8
No. Observations:,425,AIC:,6118.0
Df Residuals:,405,BIC:,6199.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,1431.0976,184.599,7.752,0.000,1068.205 1793.990
raceblack,-168.5367,83.300,-2.023,0.044,-332.290 -4.783
raceasian,-319.4962,157.391,-2.030,0.043,-628.902 -10.090
racehispanic,-525.6293,119.619,-4.394,0.000,-760.781 -290.477
raceother,-470.5648,314.457,-1.496,0.135,-1088.736 147.607
povertypopbelowpovertylevel,-641.0737,179.220,-3.577,0.000,-993.391 -288.757
earnedincomepercapita,0.0028,0.001,2.251,0.025,0.000 0.005
povertypopwpublicassistance,-42.5890,292.088,-0.146,0.884,-616.788 531.610
povertypopwssi,327.7093,217.298,1.508,0.132,-99.464 754.883

0,1,2,3
Omnibus:,23.412,Durbin-Watson:,1.962
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27.865
Skew:,0.506,Prob(JB):,8.89e-07
Kurtosis:,3.74,Cond. No.,12000000.0


### Getting adjusted $R^2$ on test set

In [33]:
# Predict the held-out split with the fitted OLS result.
y_pred = est.predict(X_test)

In [34]:
# Adjusted R^2 on the test split via the project helper; num_features is the
# number of predictor columns in X_test.
adjusted_r2(y_test, y_pred, num_features=X_test.shape[1])

0.19111322859224694

In [35]:
# Square root of the test MSE, i.e. the RMSE in the target's own units.
mean_squared_error(y_test, y_pred)**0.5

310.24458951581659

## Interpreting model

Which features are nearly statistically significant?

In [38]:
# Parse the coefficient table of the OLS summary into a DataFrame by going
# through its CSV export.
df = pd.read_csv(StringIO(est.summary().tables[1].as_csv()), index_col=0).reset_index()
# Header cells may carry surrounding whitespace; strip it so the columns can be
# referenced by their plain names. The first column holds the coefficient names.
df.columns = ['coef_name'] + [i.strip() for i in df.columns][1:]
# Series.abs() replaces pd.np.abs: pd.np was an undocumented alias that newer
# pandas releases deprecated and then removed.
df['coef_abs'] = df['coef'].abs()
df.head()

Unnamed: 0,coef_name,coef,std err,t,P>|t|,[95.0% Conf. Int.],coef_abs
0,Intercept,1431.0976,184.599,7.752,0.0,1068.205 1793.990,1431.0976
1,raceblack,-168.5367,83.3,-2.023,0.044,-332.290 -4.783,168.5367
2,raceasian,-319.4962,157.391,-2.03,0.043,-628.902 -10.090,319.4962
3,racehispanic,-525.6293,119.619,-4.394,0.0,-760.781 -290.477,525.6293
4,raceother,-470.5648,314.457,-1.496,0.135,-1088.736 147.607,470.5648


In [41]:
# Ten rows with the smallest p-values (the intercept row is included).
df.sort_values('P>|t|').head(10)

Unnamed: 0,coef_name,coef,std err,t,P>|t|,[95.0% Conf. Int.],coef_abs
0,Intercept,1431.0976,184.599,7.752,0.0,1068.205 1793.990,1431.0976
3,racehispanic,-525.6293,119.619,-4.394,0.0,-760.781 -290.477,525.6293
5,povertypopbelowpovertylevel,-641.0737,179.22,-3.577,0.0,-993.391 -288.757,641.0737
6,earnedincomepercapita,0.0028,0.001,2.251,0.025,0.000 0.005,0.0028
11,housingstddev,-0.4199,0.187,-2.242,0.026,-0.788 -0.052,0.4199
2,raceasian,-319.4962,157.391,-2.03,0.043,-628.902 -10.090,319.4962
1,raceblack,-168.5367,83.3,-2.023,0.044,-332.290 -4.783,168.5367
10,schoolstddev,-1.8126,1.002,-1.809,0.071,-3.783 0.158,1.8126
13,value,9.4e-05,5.7e-05,1.649,0.1,-1.8e-05 0.000,9.4e-05
8,povertypopwssi,327.7093,217.298,1.508,0.132,-99.464 754.883,327.7093


In [20]:
# Standardize the features, then fit a cross-validated Lasso. make_pipeline
# names steps after the lowercased class, hence the 'lassocv' parameter prefix.
pipe = make_pipeline(StandardScaler(), LassoCV())

In [21]:
# With greater_is_better=False the scorer negates the metric, so a larger
# score still means a better model.
mse = make_scorer(mean_squared_error, greater_is_better=False)
# Extra keyword arguments are forwarded to adjusted_r2 on every scoring call;
# num_features is fixed here to the predictor count of X_test.
adjusted_r2_scorer = make_scorer(adjusted_r2, num_features=X_test.shape[1])

In [22]:
# Each grid entry is the complete `alphas` list handed to LassoCV. The single
# candidate [10] pins alpha at 10, which is the setting the recorded results
# were produced with. A make_alphas(0, 3) grid used to be assigned to `params`
# first but was overwritten on the very next line, so it never took effect and
# has been removed.
params = {'lassocv__alphas': [[10]]}
model = GridSearchCV(pipe, param_grid=params, n_jobs=-1, cv=10, scoring=adjusted_r2_scorer)
# The trailing semicolon suppresses the estimator repr in the cell output.
model.fit(X_train, y_train);

In [23]:
# First five entries of the cross-validation results, transposed for reading.
pd.DataFrame(model.cv_results_).T.iloc[:5]

Unnamed: 0,0
mean_fit_time,0.0432394
mean_score_time,0.00155792
mean_test_score,-0.526669
mean_train_score,0.304795
param_lassocv__alphas,[10]


In [24]:
# Best mean cross-validated score and the parameter setting that produced it.
print(model.best_score_)
print(model.best_params_)

-0.526668696659
{'lassocv__alphas': [10]}


In [25]:
# GridSearchCV.score applies the scorer passed as `scoring`, so this is the
# adjusted R^2 on the test split rather than the plain R^2.
model.score(X_test, y_test)

0.1798477789087484

In [26]:
# Test-split predictions from the grid search's refit pipeline. This rebinds
# y_pred, replacing the OLS predictions computed earlier.
y_pred = model.predict(X_test)

In [27]:
# Plain (unadjusted) R^2 on the test split.
r2_score(y_test, y_pred)

0.34233076610607183

In [28]:
# Square root of the test MSE, i.e. the RMSE in the target's own units.
mean_squared_error(y_test, y_pred)**0.5

308.78587019099342

### How many coefs at 0?

In [57]:
# The final pipeline step is the fitted Lasso; count the coefficients that it
# set to exactly zero.
lasso_step = model.best_estimator_.steps[-1][-1]
zero_coef_mask = lasso_step.coef_ == 0
zero_coef_mask.sum()

7

In [58]:
# Number of predictor columns in the training split.
X_train.shape[1]

21

7 out of 21 went to 0.

### Interesting coef values?

In [59]:
# Pair every predictor name with the coefficient of the final pipeline step
# (the fitted Lasso). The scaler runs first, so these coefficients are on the
# standardized feature scale.
coef_values = pd.DataFrame({
    'name': X_train.columns,
    'coef': model.best_estimator_.steps[-1][-1].coef_
})

# Series.abs() replaces pd.np.abs: pd.np was an undocumented alias that newer
# pandas releases deprecated and then removed.
coef_values['abs_coef'] = coef_values['coef'].abs()

In [60]:
# Non-zero coefficients, largest magnitude first.
coef_values[coef_values.coef != 0].sort_values('abs_coef', ascending=False)

Unnamed: 0,coef,name,abs_coef
4,-80.914518,poverty_pop_below_poverty_level,80.914518
5,61.445334,earned_income_per_capita,61.445334
2,-60.632383,race_hispanic,60.632383
11,-46.289554,housing_std_dev,46.289554
13,-36.020511,bedroom_std_dev,36.020511
17,-22.911767,rent_std_dev,22.911767
14,21.90984,value,21.90984
10,-20.494391,school_std_dev,20.494391
1,-14.321219,race_asian,14.321219
3,-8.981952,race_other,8.981952


Which ones went to 0?

In [None]:
# Hand-maintained list of column names; nothing else in this notebook reads it.
cols_blacklist = [
    'poverty_pop_w_food_stamps',
    'school',
    'value_std_dev',
    'rent',
    'income',
    'income_std_dev',
    'housing_own',
]

In [61]:
# Features whose coefficient is exactly zero.
coef_values[coef_values.coef == 0]

Unnamed: 0,coef,name,abs_coef
7,0.0,poverty_pop_w_food_stamps,0.0
9,-0.0,school,0.0
15,0.0,value_std_dev,0.0
16,0.0,rent,0.0
18,-0.0,income,0.0
19,-0.0,income_std_dev,0.0
20,-0.0,housing_own,0.0


## How biased of an estimator?

In [141]:
# Distribution of the actual target values in the test split.
y_test.describe()

count     107.000000
mean      818.061750
std       382.553881
min        41.771094
25%       572.609348
50%       766.844401
75%      1008.393285
max      2038.740920
Name: NUM_ISSUES_PER_1000_POP, dtype: float64

In [143]:
# Distribution of the predictions, for comparison with the actual values.
pd.Series(y_pred).describe()

count     107.000000
mean      792.720383
std       213.013250
min       274.656059
25%       675.070863
50%       785.537522
75%       917.797577
max      1671.371023
dtype: float64

In [63]:
# Put predictions and actuals on the same 0..n-1 index so the subtraction
# pairs them by position, then summarize the errors (predicted minus actual).
predicted = pd.Series(y_pred)
actual = y_test.reset_index(drop=True)
prediction_error = predicted - actual
prediction_error.describe()

count     107.000000
mean      -22.501372
std       309.414191
min     -1399.155635
25%      -153.374526
50%         1.809791
75%       168.027274
max       630.611823
dtype: float64

## Conclusion

Our baseline model's performance seems pretty bad: on the test set it reaches $R^2 \approx 0.34$, and the adjusted $R^2$ is only about $0.18$–$0.19$.