In [2]:
import pickle

In [9]:
# Load the per-category sub-model results (a dict; see the loop that builds `r`).
# Pickles are binary data, so the file is opened with 'rb'.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# written by this project.
with open('../data/q2_submodel_results.pkl', 'rb') as infile:
    results = pickle.load(infile)

In [6]:
# Load the pickled sub-model predictions.
# Pickles are binary data, so the file is opened with 'rb'.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# written by this project.
with open('../data/q2_submodel_y_preds.pkl', 'rb') as infile:
    y_preds = pickle.load(infile)

In [7]:
# Load the pickled held-out targets that pair with `y_preds`.
# Pickles are binary data, so the file is opened with 'rb'.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# written by this project.
with open('../data/q2_submodel_y_tests.pkl', 'rb') as infile:
    y_tests = pickle.load(infile)

In [12]:
# Peek at one arbitrary entry to see the structure of a single category's result.
next(results.itervalues())

{'best_estimator': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 'best_params': {},
 'best_score': 0.17094545837951813,
 'result':                              0
 mean_fit_time         0.113809
 mean_score_time     0.00805807
 mean_test_score       0.170945
 mean_train_score      0.202891
 params                      {}
 rank_test_score              1
 split0_test_score     0.170945
 split0_train_score    0.202891
 std_fit_time                 0
 std_score_time               0
 std_test_score               0
 std_train_score              0,
 'rmse': 0.78215492084115734}

In [14]:
# Reduce every category's grid-search record to three headline metrics:
# the best CV test score, the hold-out RMSE, and the best mean CV train score.
r = {}
for categ, result in results.iteritems():
    cv_table = result['result']  # table indexed by metric name (e.g. 'mean_train_score')
    r[categ] = dict(
        CV_test=result['best_score'],
        rmse=result['rmse'],
        CV_train=cv_table.loc['mean_train_score'].max(),
    )

In [52]:
# pandas is imported here because this cell sits above the notebook's main
# import cell; without it a fresh "Restart & Run All" fails with NameError.
import pandas as pd

# One row per category, one column per metric (CV_test, CV_train, rmse).
df = pd.DataFrame(r).T
df.head()

Unnamed: 0,CV_test,CV_train,rmse
Abandoned Bicycle,0.05915372,0.149311,0.595545
Abandoned Vehicles,0.05945412,0.082168,0.346471
Animal Generic Request,0.04533257,0.171777,0.721914
Bed Bugs,-1.460536e+22,0.081804,0.83062
Building Inspection Request,0.1438496,0.158578,0.639544


In [53]:
# Categories with the smallest hold-out RMSE first.
df.sort_values(by='rmse').head()

Unnamed: 0,CV_test,CV_train,rmse
Schedule a Bulk Item Pickup SS,0.083546,0.084689,0.294722
Schedule a Bulk Item Pickup,0.10005,0.102385,0.327968
Notification,0.313788,0.393493,0.329748
OCR Front Desk Interactions,0.059783,0.064095,0.335492
Unsanitary Conditions - Establishment,-0.028027,0.077545,0.3441


In [23]:
# Categories with the highest cross-validated test score first.
df.sort_values(by='CV_test', ascending=False).head()

Unnamed: 0,CV_test,CV_train,rmse
Recycling Cart Return,0.445682,0.456628,1.084334
Request for Snow Plowing,0.371975,0.373201,0.563758
Notification,0.313788,0.393493,0.329748
Electrical,0.233042,0.358054,0.619165
Needle Pickup,0.207811,0.324483,0.907098


## Looking at coefs for Recycling Cart Return

Does this model have statistically significant coefficients, assuming homoscedasticity and a linear predictor-response relationship?

## >>>

In [25]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# (e.g. the project helpers) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [26]:
from __future__ import division
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
from tqdm import tqdm

# Notebook-wide display defaults: hide DeprecationWarnings, use seaborn's
# whitegrid style at "poster" scale, and make figures wide (20 x 5 inches).
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
sns.set_style(style="whitegrid")
sns.set_context(context="poster")
rcParams['figure.figsize'] = (20, 5)

import os, sys

# Make the sibling ``preprocessing`` directory importable.
# The path is resolved to an absolute one (the old ``os.path.dirname('.')`` is
# just an empty string) so a later change of working directory cannot break
# imports, and it is appended only once so re-running this cell does not keep
# growing ``sys.path``.
PREPROCESSING_DIR = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, "preprocessing")
)
if PREPROCESSING_DIR not in sys.path:
    sys.path.append(PREPROCESSING_DIR)
from helper_functions import dummify_cols_and_baselines, make_alphas, remove_outliers_by_type, adjusted_r2

In [27]:
# Load the full dataset from its pickle and show (rows, columns).
# NOTE(review): read_pickle unpickles arbitrary objects -- only use it on
# files this project produced.
df_orig = pd.read_pickle('../data/data_from_remove_from_dataset.pkl')
df_orig.shape

(516406, 40)

## Removing outliers

A standard procedure is to remove values further than 3 standard deviations from the mean. Since I have so many low values and some very high values, I anecdotally think that the low values are very likely to be true, and the high values not so much.

So, I will remove values further than 3 SDs from the median, by type.

Ideally, I would take into account the time dimension. I would like to do so given more time.

In [28]:
# Apply the project helper that removes outliers in the log10 completion-hours
# column, per issue type (see the markdown above for the 3-SD-from-median rule).
# The SettingWithCopyWarning printed below originates inside the helper
# (the warning quotes its `group[...] = pd.np.nan` line), not in this cell.
df_outliers_removed = remove_outliers_by_type(df_orig, y_col='COMPLETION_HOURS_LOG_10')
df_outliers_removed.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  group[pd.np.abs(group - group.median()) > stds * group.std()] = pd.np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.where(-key, value, inplace=True)


(508653, 40)

I'm removing ~1.5% of my rows.

## Choosing columns

In [29]:
# Target plus raw fields taken straight from the original dataset.
cols_orig_dataset = [
    'COMPLETION_HOURS_LOG_10',
    'TYPE',
    'SubmittedPhoto',
    'Property_Type',
    'Source',
    'neighborhood_from_zip',
]

# Census-derived fields, grouped by theme.
cols_census = [
    # race
    'race_white',
    'race_black',
    'race_asian',
    'race_hispanic',
    'race_other',
    # poverty / income
    'poverty_pop_below_poverty_level',
    'earned_income_per_capita',
    'poverty_pop_w_public_assistance',
    'poverty_pop_w_food_stamps',
    'poverty_pop_w_ssi',
    # remaining fields, each paired with its *_std_dev column
    'school',
    'school_std_dev',
    'housing',
    'housing_std_dev',
    'bedroom',
    'bedroom_std_dev',
    'value',
    'value_std_dev',
    'rent',
    'rent_std_dev',
    'income',
    'income_std_dev',
]

# Features engineered earlier in the project.
cols_engineered = [
    'queue_wk',
    'queue_wk_open',
    'is_description',
]

In [32]:
# Restrict the outlier-free data to the modelling columns only.
model_cols = cols_orig_dataset + cols_census + cols_engineered
df = df_outliers_removed[model_cols]
df.shape

(508653, 31)

## Subsetting by Recycling Cart Return

In [33]:
# Keep only the rows whose issue TYPE is the category under study.
is_cart_return = df['TYPE'] == 'Recycling Cart Return'
df = df[is_cart_return]
df.shape

(1986, 31)

## Dummify

In [34]:
# Every object-dtype column gets dummified, except TYPE (constant after the subset above).
object_cols = df.dtypes[df.dtypes == object].index
cols_to_dummify = [col for col in object_cols if col != 'TYPE']
cols_to_dummify

['Property_Type', 'Source', 'neighborhood_from_zip', 'school', 'housing']

In [35]:
# Dummy-encode the categorical columns with the project helper, which returns
# the encoded frame and the baseline columns. The lines it prints appear to
# report the level chosen as baseline for each column -- confirm in helper_functions.
df_dummified, baseline_cols = dummify_cols_and_baselines(df, cols_to_dummify)

Intersection is baseline 0 5
Constituent Call is baseline 1 5
West Roxbury is baseline 2 5
8_6th_grade is baseline 3 5
rent is baseline 4 5


In [36]:
# (rows, columns) after dummy encoding.
df_dummified.shape

(1986, 54)

## Running model

In [48]:
# train_test_split is taken from sklearn.model_selection (the module this cell
# already uses for GridSearchCV) rather than the deprecated
# sklearn.cross_validation.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, LassoCV
# NOTE(review): ShuffleSplit stays on the legacy module because its constructor
# signature differs in sklearn.model_selection; migrate it together with any
# call sites.
from sklearn.cross_validation import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import string
from StringIO import StringIO


In [38]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [42]:
# Strip punctuation and spaces from every feature name (presumably so the names
# can be used as terms in the statsmodels formula below); the target column
# keeps its original name.
# Filtering character by character works for both byte and unicode strings,
# whereas str.translate(None, ...) only accepts Python 2 byte strings.
_CHARS_TO_DROP = set(string.punctuation + ' ')

df_dummified.columns = [
    col if col == 'COMPLETION_HOURS_LOG_10'
    else ''.join(ch for ch in col if ch not in _CHARS_TO_DROP)
    for col in df_dummified.columns
]

In [44]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
features = df_dummified.drop('COMPLETION_HOURS_LOG_10', axis=1)
target = df_dummified['COMPLETION_HOURS_LOG_10']

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=300
)

In [46]:
# Build an "y ~ x1 + x2 + ..." formula over every predictor column and fit
# ordinary least squares on the training rows only.
predictors = df_dummified.drop('COMPLETION_HOURS_LOG_10', axis=1)
col_list = ' + '.join(predictors.columns)
formula = 'COMPLETION_HOURS_LOG_10 ~ {}'.format(col_list)

train_frame = pd.concat([X_train, y_train], axis=1)
est = smf.ols(formula, train_frame).fit()
est.summary()

0,1,2,3
Dep. Variable:,COMPLETION_HOURS_LOG_10,R-squared:,0.458
Model:,OLS,Adj. R-squared:,0.441
Method:,Least Squares,F-statistic:,26.52
Date:,"Wed, 22 Feb 2017",Prob (F-statistic):,2.49e-168
Time:,07:36:43,Log-Likelihood:,-2456.4
No. Observations:,1588,AIC:,5013.0
Df Residuals:,1538,BIC:,5281.0
Df Model:,49,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,1.5704,0.664,2.366,0.018,0.269 2.872
SubmittedPhoto[T.True],5.199e-10,2.47e-10,2.105,0.035,3.54e-11 1e-09
isdescription[T.True],-6.839e-11,6.11e-11,-1.120,0.263,-1.88e-10 5.14e-11
racewhite,0.1647,0.241,0.685,0.494,-0.307 0.637
raceblack,0.2899,0.238,1.216,0.224,-0.178 0.757
raceasian,-0.2347,0.317,-0.740,0.460,-0.857 0.388
racehispanic,0.6039,0.305,1.981,0.048,0.006 1.202
raceother,0.7467,0.476,1.567,0.117,-0.188 1.681
povertypopbelowpovertylevel,-0.3773,0.463,-0.814,0.416,-1.286 0.531

0,1,2,3
Omnibus:,2.784,Durbin-Watson:,2.046
Prob(Omnibus):,0.249,Jarque-Bera (JB):,2.675
Skew:,-0.092,Prob(JB):,0.262
Kurtosis:,3.081,Cond. No.,1.11e+16


## Interpreting model

Which features are most associated with completion time?

In [49]:
# Turn the coefficient table of the OLS summary into a DataFrame by
# round-tripping it through CSV.
coef_csv = est.summary().tables[1].as_csv()
df = pd.read_csv(StringIO(coef_csv), index_col=0).reset_index()

# The CSV export pads headers and coefficient names with whitespace; strip both
# so columns and rows can be looked up by their plain names.
df.columns = ['coef_name'] + [name.strip() for name in df.columns[1:]]
df['coef_name'] = df['coef_name'].str.strip()

# Absolute coefficient size, via Series.abs() instead of the deprecated pd.np alias.
df['coef_abs'] = df['coef'].abs()
df.head()

Unnamed: 0,coef_name,coef,std err,t,P>|t|,[95.0% Conf. Int.],coef_abs
0,Intercept ...,1.5704,0.664,2.366,0.018,0.269 2.872,1.5704
1,SubmittedPhoto[T.True] ...,5.199e-10,2.47e-10,2.105,0.035,3.54e-11 1e-09,5.199e-10
2,isdescription[T.True] ...,-6.839e-11,6.11e-11,-1.12,0.263,-1.88e-10 5.14e-11,6.839e-11
3,racewhite ...,0.1647,0.241,0.685,0.494,-0.307 0.637,0.1647
4,raceblack ...,0.2899,0.238,1.216,0.224,-0.178 0.757,0.2899


In [50]:
# Most statistically significant coefficients (smallest p-value) first.
df.sort_values(by='P>|t|')

Unnamed: 0,coef_name,coef,std err,t,P>|t|,[95.0% Conf. Int.],coef_abs
33,neighborhoodfromzipEastBoston ...,-1.335,0.281,-4.757,0.0,-1.885 -0.785,1.335
31,neighborhoodfromzipDorchester ...,-1.0256,0.204,-5.032,0.0,-1.425 -0.626,1.0256
24,queuewkopen ...,-0.0072,0.0,-18.067,0.0,-0.008 -0.006,0.0072
23,queuewk ...,0.0004,2e-05,18.334,0.0,0.000 0.000,0.0004
12,povertypopwssi ...,2.1198,0.542,3.913,0.0,1.057 3.182,2.1198
37,neighborhoodfromzipMattapan ...,-0.7032,0.236,-2.979,0.003,-1.166 -0.240,0.7032
34,neighborhoodfromzipFenwayKenmoreAudubonCircleL...,-1.352,0.468,-2.891,0.004,-2.269 -0.435,1.352
26,neighborhoodfromzipAllstonBrighton ...,-0.6326,0.247,-2.559,0.011,-1.117 -0.148,0.6326
35,neighborhoodfromzipHydePark ...,-0.6176,0.246,-2.514,0.012,-1.099 -0.136,0.6176
41,neighborhoodfromzipRoxbury ...,-0.8029,0.338,-2.376,0.018,-1.466 -0.140,0.8029


## Conclusion

Making sub-models improves the R2 and RMSE _for these chosen categories_, which are the ones above the 70th percentile in terms of number of issues, minus a couple (stated above) that distorted the R2.

The R2 and RMSE for the sub-models are 0.59 and 0.69. The ones for the main model are 0.55 and 0.73.

There are probably more statistically significant coefficients here as well.

## Next Steps

If I had more time,
- check how the R2 was distorted for those couple of categories. Probably because the R2 for that model was low and my predictions were way off. What to do about those categories then? Either the mean or the prediction from the big model would work.
- group the categories below the 70th percentile to give them enough sample size, then run the model on them. Use trial-and-error plus domain knowledge to decide which categories would work and which wouldn't.