In [2]:
from __future__ import division
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
# from tqdm import tqdm

# Default (width, height) for every figure drawn in this notebook.
rcParams['figure.figsize'] = 20, 5
# Hide DeprecationWarning messages in cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Seaborn appearance: white background with grid lines, "poster" scaling context.
sns.set_style("whitegrid")
sns.set_context("poster")

from scipy.sparse import csr_matrix
from zipcode_mapping import zipcode_mapping

In [60]:
# Load the pre-processed frame from a pickle; the file name lists the cleaning
# steps that were applied before it was saved.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles
# that come from a trusted source.
df = pd.read_pickle('../data/data_w_transformed_census_and_removed_invalid_rows_and_cols_and_fixed_zips_and_descs_and_dropped_latlongs_and_more_dropped_cols_and_rows.pkl')
# Display (n_rows, n_columns).
df.shape

(520860, 26)

In [61]:
# Show the first row transposed so each column name/value pair is listed vertically.
df.head(1).T

Unnamed: 0,905425
OPEN_DT,2017-01-07 10:51:37
CLOSED_DT,2017-01-07 11:46:43
TYPE,Request for Snow Plowing
SubmittedPhoto,True
Property_Type,Address
LATITUDE,42.2809
LONGITUDE,-71.068
Source,Citizens Connect App
race_white,0.242399
race_black,0.514358


## Feature Engineering

## Dummifying

In [36]:
def dummify_cols_and_baselines(df, cols):
    """One-hot encode ``cols`` of ``df``, leaving out one baseline level per column.

    Each column in ``cols`` is replaced by indicator columns named
    ``<column>_<level>``.  The last level, in the order ``pd.get_dummies``
    emits its columns, is the baseline: its indicator is omitted so the
    remaining indicators are not perfectly collinear.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.  It is not modified; a new frame is returned.
    cols : iterable of column labels
        Columns of ``df`` to encode.

    Returns
    -------
    (pandas.DataFrame, list)
        The frame with ``cols`` removed and the indicator columns appended
        (grouped per column, in the order given by ``cols``), and the list of
        baseline levels, one per entry of ``cols``.  A column without any
        non-missing value contributes no indicators and a baseline of ``None``.
    """
    baseline_cols = []
    dummy_frames = []

    for i, column in enumerate(cols):
        dummy = pd.get_dummies(df[column])

        if dummy.shape[1] == 0:
            # Nothing but missing values: there is no level to use as baseline.
            baseline = None
        else:
            # Take the baseline from the indicator columns themselves instead
            # of from ``df[column].unique()``: ``unique()`` counts NaN as a
            # level while ``get_dummies`` does not, so with a missing value
            # present the old slice kept every indicator (no baseline dropped).
            baseline = dummy.columns[-1]
            dummy = dummy.iloc[:, :-1]

        # A single pre-formatted string prints the same text on Python 2 and 3.
        print('%s is baseline %s %s' % (baseline, i, len(cols)))
        baseline_cols.append(baseline)
        dummy_frames.append(
            dummy.rename(columns=lambda level, prefix=column: prefix + '_' + str(level))
        )

    # Drop the encoded columns and concatenate once, rather than rebuilding
    # the whole frame on every loop iteration.
    encoded = pd.concat([df.drop(list(cols), axis=1)] + dummy_frames, axis=1)
    return encoded, baseline_cols

In [56]:
# Every column stored with object dtype is treated as categorical and gets dummified.
is_object_col = (df.dtypes == object).values
cols_to_dummify = df.columns[is_object_col]
cols_to_dummify

Index([u'TYPE', u'Property_Type', u'Source', u'school', u'housing',
       u'neighborhood_from_zip'],
      dtype='object')

In [62]:
# Encode every object-dtype column; the function prints each column's baseline level as it runs.
df_dummified, baseline_cols = dummify_cols_and_baselines(df, cols_to_dummify)

Zoning is baseline 0 6
other is baseline 1 6
Twitter is baseline 2 6
8_6th_grade is baseline 3 6
rent is baseline 4 6
West Roxbury is baseline 5 6


In [63]:
# Coordinates and open/close timestamps are removed before modelling.
cols_to_drop = ['LATITUDE', 'LONGITUDE', 'OPEN_DT', 'CLOSED_DT']
df_dummified = df_dummified.drop(cols_to_drop, axis=1)

## Run model

In [10]:
# NOTE(review): `sklearn.cross_validation` is the legacy module (deprecated in
# scikit-learn 0.18, removed in 0.20) and is mixed here with its replacement
# `sklearn.model_selection`.  The two `ShuffleSplit` classes take different
# constructor arguments, so switching the import also means updating the calls.
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LassoCV
from sklearn.cross_validation import ShuffleSplit
# StandardScaler, mean_squared_error and r2_score are not used in the cells visible in this notebook.
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV


In [14]:
# Hold out 20% of the rows for testing; the target is COMPLETION_TIME and
# every other remaining column is a feature.
target_col = 'COMPLETION_TIME'
X_train, X_test, y_train, y_test = train_test_split(
    df_dummified.drop(target_col, axis=1),
    df_dummified[target_col],
    test_size=0.2,
    random_state=300,
)

In [16]:
# Single-step pipeline; make_pipeline names the step 'lassocv', which is the
# prefix used for the grid-search parameter key.
# NOTE(review): StandardScaler is imported but is not part of this pipeline,
# so the L1 penalty acts on features at their raw scales -- confirm this is intended.
pipe = make_pipeline(LassoCV(verbose=100))
# One shuffled split of the training rows with 20% held out for validation
# (legacy cross_validation signature: number of samples, then n_iter).
cv = ShuffleSplit(X_train.shape[0], n_iter=1, test_size=0.2, random_state=300)

In [20]:
# Two candidates, each pinning LassoCV to a single alpha, so the grid search
# itself is what compares alpha=0.67 against alpha=0.4.
params = {'lassocv__alphas': [[0.67], [0.4]]}
# n_jobs=-1 runs the fits in parallel on all cores; cv is the single shuffled split.
model = GridSearchCV(pipe, param_grid=params, n_jobs=-1, cv=cv, verbose=100)
# The trailing ';' suppresses the fitted estimator's repr in the cell output.
model.fit(X_train, y_train);

Fitting 1 folds for each of 2 candidates, totalling 2 fits
Pickling array (shape=(244,), dtype=object).
Memmaping (shape=(416688,), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_4501_140519259577744/4501-140519148787856-2abbbf547ef4d07b72d4a55d5f7e591d.pkl
Pickling array (shape=(1, 416688), dtype=bool).
Memmaping (shape=(11, 416688), dtype=float64) to new file /dev/shm/joblib_memmaping_pool_4501_140519259577744/4501-140519148787856-e7ffab2ce7874c7396d35f2a40d54f04.pkl
Memmaping (shape=(3, 416688), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_4501_140519259577744/4501-140519148787856-d7332f3df9a1eea3788c985697e8ddd9.pkl
[CV] lassocv__alphas=[0.67] ..........................................
Memmaping (shape=(229, 416688), dtype=uint8) to new file /dev/shm/joblib_memmaping_pool_4501_140519259577744/4501-140519148787856-f4b585c567fe6d424ac7129831bb9377.pkl
Pickling array (shape=(1,), dtype=object).
Pickling array (shape=(11,), dtype=object).
Pickling array (shape=(3,),

In [21]:
# Alpha chosen by the final pipeline step (the LassoCV) of the best estimator.
model.best_estimator_.steps[-1][-1].alpha_

0.40000000000000002

In [22]:
# Score of the best estimator on the held-out test set
# (no `scoring` was passed, so this is the estimator's own default score).
model.score(X_test, y_test)

0.23362293719786098

In [31]:
# Summary statistics of the fitted Lasso coefficients (one per feature column).
pd.Series(model.best_estimator_.steps[-1][-1].coef_).describe()

count     244.000000
mean      127.556128
std       902.363007
min      -854.001379
25%        -0.000000
50%         0.000000
75%         0.000000
max      7854.795419
dtype: float64

Which features have non-zero coefficients?

In [33]:
# Names of the features whose Lasso coefficient was not shrunk to exactly zero.
fitted_lasso = model.best_estimator_.named_steps['lassocv']
has_nonzero_coef = fitted_lasso.coef_ != 0
list(X_train.columns[has_nonzero_coef])

['SubmittedPhoto',
 'race_white',
 'race_black',
 'poverty_pop_below_poverty_level',
 'bedroom',
 'value',
 'rent',
 'income',
 'zipcode',
 'TYPE_Abandoned Building',
 'TYPE_Abandoned Vehicles',
 'TYPE_Animal Generic Request',
 'TYPE_Bed Bugs',
 'TYPE_Breathe Easy',
 'TYPE_Building Inspection Request',
 'TYPE_Call Log',
 'TYPE_Chronic Dampness/Mold',
 'TYPE_Contractor Complaints',
 'TYPE_Contractors Complaint',
 'TYPE_Cross Metering - Sub-Metering',
 'TYPE_Electrical',
 'TYPE_Empty Litter Basket',
 'TYPE_Equipment Repair',
 'TYPE_Exceeding Terms of Permit',
 'TYPE_Fire Hydrant',
 'TYPE_Food Alert - Confirmed',
 'TYPE_Food Alert - Unconfirmed',
 'TYPE_General Comments For An Employee',
 'TYPE_General Comments For a Program or Policy',
 'TYPE_General Lighting Request',
 'TYPE_Graffiti Removal',
 'TYPE_Ground Maintenance',
 'TYPE_Heat - Excessive  Insufficient',
 'TYPE_Highway Maintenance',
 'TYPE_Illegal Auto Body Shop',
 'TYPE_Illegal Dumping',
 'TYPE_Illegal Occupancy',
 'TYPE_Illegal 

## Conclusion

This model performs worse than the last one. The differences are (1) this one contains neighborhoods, and (2) the other one contains internal categories.

## Scratch: stop at June 2016

In [64]:
# Keep only requests opened before June 2016.  The mask is built from `df`
# because OPEN_DT has already been dropped from `df_dummified`.
opened_before_june_2016 = df.OPEN_DT < '2016-06-01'
df_before_june_2016 = df_dummified[opened_before_june_2016]
X_train, X_test, y_train, y_test = train_test_split(
    df_before_june_2016.drop('COMPLETION_TIME', axis=1),
    df_before_june_2016.COMPLETION_TIME,
    test_size=0.2,
    random_state=300,
)

In [65]:
# Same single-step LassoCV pipeline as in the main run, rebuilt for the date-restricted data.
# NOTE(review): no scaling step precedes LassoCV here -- confirm this is intended.
pipe = make_pipeline(LassoCV(verbose=100))
# One shuffled split of the (restricted) training rows with 20% held out
# (legacy cross_validation signature: number of samples, then n_iter).
cv = ShuffleSplit(X_train.shape[0], n_iter=1, test_size=0.2, random_state=300)

In [77]:
# Empty grid: GridSearchCV fits the pipeline once with its default settings,
# so LassoCV selects alpha itself instead of being pinned to 0.67 or 0.4.
# (The earlier `params = {'lassocv__alphas': ...}` assignment was a dead store,
# immediately overwritten, and has been removed.)
params = {}
model = GridSearchCV(pipe, param_grid=params, n_jobs=-1, cv=cv, verbose=100)
# The trailing ';' suppresses the fitted estimator's repr in the cell output.
model.fit(X_train, y_train);

Fitting 1 folds for each of 1 candidates, totalling 1 fits
Pickling array (shape=(244,), dtype=object).
Memmaping (shape=(362106,), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_4501_140519266583120/4501-140519224026000-2c6b8c23248712b34318e3bb22a18a73.pkl
Pickling array (shape=(1, 362106), dtype=bool).
Memmaping (shape=(11, 362106), dtype=float64) to new file /dev/shm/joblib_memmaping_pool_4501_140519266583120/4501-140519224026000-610402eada8b8a9f36ebe9e2ef914c8c.pkl
Memmaping (shape=(3, 362106), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_4501_140519266583120/4501-140519224026000-e447b9944b9cfe7037362bef6c18833a.pkl
[CV]  ................................................................
Memmaping (shape=(229, 362106), dtype=uint8) to new file /dev/shm/joblib_memmaping_pool_4501_140519266583120/4501-140519224026000-be12daa1e0f7ef33d84fa73ef47534cd.pkl
Pickling array (shape=(1,), dtype=object).
Pickling array (shape=(11,), dtype=object).
Pickling array (shape=(3,),

In [75]:
# Grid of alphas that the fitted LassoCV step evaluated.
model.best_estimator_.steps[-1][-1].alphas_

array([  8.72173481e+06,   8.13391907e+06,   7.58572015e+06,
         7.07446800e+06,   6.59767254e+06,   6.15301149e+06,
         5.73831911e+06,   5.35157561e+06,   4.99089732e+06,
         4.65452755e+06,   4.34082797e+06,   4.04827069e+06,
         3.77543079e+06,   3.52097940e+06,   3.28367717e+06,
         3.06236832e+06,   2.85597494e+06,   2.66349179e+06,
         2.48398136e+06,   2.31656933e+06,   2.16044031e+06,
         2.01483386e+06,   1.87904080e+06,   1.75239974e+06,
         1.63429386e+06,   1.52414793e+06,   1.42142546e+06,
         1.32562614e+06,   1.23628337e+06,   1.15296201e+06,
         1.07525623e+06,   1.00278756e+06,   9.35203036e+05,
         8.72173481e+05,   8.13391907e+05,   7.58572015e+05,
         7.07446800e+05,   6.59767254e+05,   6.15301149e+05,
         5.73831911e+05,   5.35157561e+05,   4.99089732e+05,
         4.65452755e+05,   4.34082797e+05,   4.04827069e+05,
         3.77543079e+05,   3.52097940e+05,   3.28367717e+05,
         3.06236832e+05,

In [76]:
# Alpha that the LassoCV step selected from that grid.
model.best_estimator_.steps[-1][-1].alpha_

615301.14888623345

In [73]:
# Summary statistics of the fitted Lasso coefficients (one per feature column).
pd.Series(model.best_estimator_.steps[-1][-1].coef_).describe()

count    2.440000e+02
mean     2.340394e-07
std      3.655813e-06
min     -0.000000e+00
25%      0.000000e+00
50%     -0.000000e+00
75%      0.000000e+00
max      5.710562e-05
dtype: float64

In [74]:
# Score of the fitted estimator on the held-out test rows of the date-restricted data.
model.score(X_test, y_test)

0.00019269648628938985

## Conclusion

Restricting rows to those opened before June 2016 actually hurt performance, probably because rows with a missing target (y) value are dropped anyway.