In [1]:
from __future__ import division
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
# from tqdm import tqdm

# Notebook-wide display defaults: wide 20x5 figures, seaborn "whitegrid" style
# and the larger "poster" context.
rcParams['figure.figsize'] = 20, 5
# Suppress DeprecationWarning messages so they do not clutter cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)
sns.set_style("whitegrid")
sns.set_context("poster")

from scipy.sparse import csr_matrix
from zipcode_mapping import zipcode_mapping

In [2]:
# Load the first candidate dataset and display its (rows, columns) shape.
# NOTE: unpickling can execute arbitrary code; only load files you produced or trust.
df = pd.read_pickle('../data/data_from_remove_from_dataset.pkl')
df.shape

(516406, 44)

In [3]:
# Load the second candidate dataset (the more heavily cleaned file, judging by
# its name) and display its (rows, columns) shape for comparison with `df`.
# NOTE: unpickling can execute arbitrary code; only load files you produced or trust.
df1 = pd.read_pickle('../data/data_w_transformed_census_and_removed_invalid_rows_and_cols_and_fixed_zips_and_descs_and_dropped_latlongs_and_more_dropped_cols_and_rows.pkl')
df1.shape

(520860, 27)

In [4]:
# Shape after removing fully duplicated rows; compare with df1.shape to see
# how many exact duplicates the file contains. df1 itself is not modified.
df1.drop_duplicates().shape

(520793, 27)

In [5]:
# Number of missing values in each column of df.
df.isnull().sum()

CASE_ENQUIRY_ID                         0
OPEN_DT                                 0
CLOSED_DT                               0
TYPE                                    0
SubmittedPhoto                          0
ClosedPhoto                        473735
LOCATION_ZIPCODE                    92455
Property_Type                           0
LATITUDE                                0
LONGITUDE                               0
Source                                  0
Geocoded_Location                       0
case_enquiry_id                    328376
description                        353430
specific_location                  506210
title                              334494
COMPLETION_TIME                         0
tract_and_block_group                   0
queue_wk                           328376
race_white                              0
race_black                              0
race_asian                              0
race_hispanic                           0
race_other                        

In [6]:
# Number of missing values in each column of df1.
df1.isnull().sum()

CASE_ENQUIRY_ID                    0
OPEN_DT                            0
CLOSED_DT                          0
TYPE                               0
SubmittedPhoto                     0
Property_Type                      0
LATITUDE                           0
LONGITUDE                          0
Source                             0
race_white                         0
race_black                         0
race_asian                         0
race_hispanic                      0
race_other                         0
poverty_pop_below_poverty_level    0
poverty_pop_w_public_assistance    0
poverty_pop_w_food_stamps          0
poverty_pop_w_ssi                  0
COMPLETION_TIME                    0
school                             0
housing                            0
bedroom                            0
value                              0
rent                               0
income                             0
zipcode                            0
neighborhood_from_zip              0
d

In [61]:
# Show the first row transposed so each column name/value pair is on its own line.
# NOTE(review): this cell's execution count (61) is out of sequence with its
# neighbours, so `df` may not be the frame loaded at the top - confirm on a fresh run.
df.head(1).T

Unnamed: 0,905425
OPEN_DT,2017-01-07 10:51:37
CLOSED_DT,2017-01-07 11:46:43
TYPE,Request for Snow Plowing
SubmittedPhoto,True
Property_Type,Address
LATITUDE,42.2809
LONGITUDE,-71.068
Source,Citizens Connect App
race_white,0.242399
race_black,0.514358


## Feature Engineering

## Dummifying

In [7]:
def dummify_cols_and_baselines(df, cols):
    """One-hot encode the given columns, omitting one baseline level per column.

    For each column in ``cols`` the largest non-null value (in sorted order) is
    chosen as the baseline level. The column is replaced by indicator columns
    named ``<column>_<level>`` for every other non-null level; the baseline's
    indicator is left out. Rows whose value is missing get 0 in every
    indicator of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    cols : sequence of column labels
        Columns of ``df`` to encode.

    Returns
    -------
    (pandas.DataFrame, list)
        The encoded frame (original columns minus ``cols``, followed by the
        indicator columns) and the list of baseline levels, in ``cols`` order.

    Raises
    ------
    IndexError
        If a column has no non-null values.
    """
    baseline_cols = []

    for i, column in enumerate(cols):
        # get_dummies never creates an indicator for NaN, so NaN must not be
        # counted as a level (nor sorted against strings) when picking the
        # baseline - otherwise no indicator would actually be left out.
        levels = sorted(df[column].dropna().unique())
        baseline = levels[-1]
        # Single-argument form prints the same text under Python 2 and 3.
        print('%s is baseline %s %s' % (baseline, i, len(cols)))
        baseline_cols.append(baseline)

        # Drop the baseline by label rather than by position so the column
        # removed is always the one reported above.
        dummy = pd.get_dummies(df[column]).drop(baseline, axis=1)
        dummy.columns = [column + '_' + str(level) for level in dummy.columns]

        # Re-bind instead of dropping in place so the caller's frame is untouched.
        df = df.drop(column, axis=1)
        df = pd.concat([df, dummy], axis=1)

    return df, baseline_cols

In [8]:
# Treat every object-dtype column of df as a categorical to be dummified.
cols_to_dummify = df.columns[(df.dtypes == object).values]
cols_to_dummify

Index([                    u'TYPE',              u'ClosedPhoto',
                  u'Property_Type',                   u'Source',
              u'Geocoded_Location',              u'description',
              u'specific_location',                    u'title',
          u'tract_and_block_group', u'earned_income_per_capita',
          u'neighborhood_from_zip',                   u'school',
                        u'housing'],
      dtype='object')

In [10]:
# Spot-check a single value of earned_income_per_capita: it is displayed as a
# quoted string, i.e. this numeric-looking column is stored as text (which is
# why it shows up among the object-dtype columns to dummify).
df.earned_income_per_capita.head(2).iloc[0]

'34340'

In [62]:
# One-hot encode the categorical columns; baseline_cols records the level left
# out for each of them (needed to interpret the coefficients later).
# NOTE(review): the recorded log lists 6 encoded columns while the displayed
# cols_to_dummify has 13, and the later training split has 416688 rows
# (= 0.8 * 520860, the df1 row count). `df`/`cols_to_dummify` were presumably
# re-bound to the df1 data in a cell that is no longer in the notebook -
# TODO confirm and make that step explicit.
df_dummified, baseline_cols = dummify_cols_and_baselines(df, cols_to_dummify)

Zoning is baseline 0 6
other is baseline 1 6
Twitter is baseline 2 6
8_6th_grade is baseline 3 6
rent is baseline 4 6
West Roxbury is baseline 5 6


In [63]:
# Remove raw coordinates and open/close timestamps from the model matrix
# (presumably because COMPLETION_TIME is derived from the timestamps - TODO confirm).
# OPEN_DT stays available on `df`, which the date filter in the scratch section relies on.
# NOTE: not idempotent - re-running this cell fails because the columns are already gone.
df_dummified = df_dummified.drop(['LATITUDE', 'LONGITUDE', 'OPEN_DT', 'CLOSED_DT'], axis=1)

## Run model

In [10]:
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LassoCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV


In [14]:
# 80/20 train/test split with a fixed seed; COMPLETION_TIME is the target and
# every remaining column of df_dummified is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df_dummified.drop('COMPLETION_TIME', axis=1),
    df_dummified['COMPLETION_TIME'],
    random_state=300,
    test_size=0.2,
)

In [16]:
# Validation scheme: one shuffled 80/20 hold-out split of the training rows.
cv = ShuffleSplit(
    len(X_train),
    n_iter=1,
    test_size=0.2,
    random_state=300,
)
# One-step pipeline; make_pipeline names the step 'lassocv', which is the
# prefix used by the grid-search parameter names.
pipe = make_pipeline(LassoCV(verbose=100))

In [20]:
# Two candidate penalties, each passed to LassoCV as a one-element `alphas` list,
# so the outer grid search is what actually chooses between them.
params = dict(lassocv__alphas=[[0.67], [0.4]])
model = GridSearchCV(
    pipe,
    param_grid=params,
    cv=cv,
    n_jobs=-1,
    verbose=100,
)
# Trailing ';' keeps the fitted-estimator repr out of the cell output.
model.fit(X_train, y_train);

Fitting 1 folds for each of 2 candidates, totalling 2 fits
Pickling array (shape=(244,), dtype=object).
Memmaping (shape=(416688,), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_4501_140519259577744/4501-140519148787856-2abbbf547ef4d07b72d4a55d5f7e591d.pkl
Pickling array (shape=(1, 416688), dtype=bool).
Memmaping (shape=(11, 416688), dtype=float64) to new file /dev/shm/joblib_memmaping_pool_4501_140519259577744/4501-140519148787856-e7ffab2ce7874c7396d35f2a40d54f04.pkl
Memmaping (shape=(3, 416688), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_4501_140519259577744/4501-140519148787856-d7332f3df9a1eea3788c985697e8ddd9.pkl
[CV] lassocv__alphas=[0.67] ..........................................
Memmaping (shape=(229, 416688), dtype=uint8) to new file /dev/shm/joblib_memmaping_pool_4501_140519259577744/4501-140519148787856-f4b585c567fe6d424ac7129831bb9377.pkl
Pickling array (shape=(1,), dtype=object).
Pickling array (shape=(11,), dtype=object).
Pickling array (shape=(3,),

In [21]:
# Penalty strength of the refit best pipeline's Lasso step.
model.best_estimator_.named_steps['lassocv'].alpha_

0.40000000000000002

In [22]:
# Score the selected model on the held-out 20% test split
# (presumably R^2, the default score for a regressor - confirm).
model.score(X_test, y_test)

0.23362293719786098

In [31]:
# Summary statistics of the fitted Lasso coefficients (one per feature column).
pd.Series(model.best_estimator_.named_steps['lassocv'].coef_).describe()

count     244.000000
mean      127.556128
std       902.363007
min      -854.001379
25%        -0.000000
50%         0.000000
75%         0.000000
max      7854.795419
dtype: float64

Which features have non-zero coefficients?

In [33]:
# Names of the features whose Lasso coefficient was not shrunk to exactly zero.
X_train.columns[model.best_estimator_.named_steps['lassocv'].coef_ != 0].tolist()

['SubmittedPhoto',
 'race_white',
 'race_black',
 'poverty_pop_below_poverty_level',
 'bedroom',
 'value',
 'rent',
 'income',
 'zipcode',
 'TYPE_Abandoned Building',
 'TYPE_Abandoned Vehicles',
 'TYPE_Animal Generic Request',
 'TYPE_Bed Bugs',
 'TYPE_Breathe Easy',
 'TYPE_Building Inspection Request',
 'TYPE_Call Log',
 'TYPE_Chronic Dampness/Mold',
 'TYPE_Contractor Complaints',
 'TYPE_Contractors Complaint',
 'TYPE_Cross Metering - Sub-Metering',
 'TYPE_Electrical',
 'TYPE_Empty Litter Basket',
 'TYPE_Equipment Repair',
 'TYPE_Exceeding Terms of Permit',
 'TYPE_Fire Hydrant',
 'TYPE_Food Alert - Confirmed',
 'TYPE_Food Alert - Unconfirmed',
 'TYPE_General Comments For An Employee',
 'TYPE_General Comments For a Program or Policy',
 'TYPE_General Lighting Request',
 'TYPE_Graffiti Removal',
 'TYPE_Ground Maintenance',
 'TYPE_Heat - Excessive  Insufficient',
 'TYPE_Highway Maintenance',
 'TYPE_Illegal Auto Body Shop',
 'TYPE_Illegal Dumping',
 'TYPE_Illegal Occupancy',
 'TYPE_Illegal 

## Conclusion

This model performs worse than the previous one. The differences are (1) this one includes neighborhoods, and (2) the other one includes internal categories.

## Scratch: stop at June 2016

In [64]:
# Keep only requests opened before 2016-06-01, then redo the 80/20 split.
# The mask is built from `df` because OPEN_DT was dropped from df_dummified.
# NOTE(review): the recorded fit log for this section still reports 416688
# training rows - the same as the unfiltered split - and this cell's execution
# count (64) is later than the fit's (37); re-run top to bottom to confirm the
# results below actually reflect this filter.
opened_before_cutoff = df.OPEN_DT < '2016-06-01'
df_before_cutoff = df_dummified[opened_before_cutoff]
X_train, X_test, y_train, y_test = train_test_split(
    df_before_cutoff.drop('COMPLETION_TIME', axis=1),
    df_before_cutoff.COMPLETION_TIME,
    random_state=300,
    test_size=0.2,
)

In [36]:
# Same set-up as the main run: one shuffled 80/20 hold-out split of the
# (re-split) training rows and a one-step LassoCV pipeline.
cv = ShuffleSplit(
    len(X_train),
    n_iter=1,
    test_size=0.2,
    random_state=300,
)
pipe = make_pipeline(LassoCV(verbose=100))

In [37]:
# Much smaller penalties than in the main run, again one alpha per candidate.
params = {'lassocv__alphas': [[1e-4], [1e-5], [1e-6]]}
# `model` is re-bound here, replacing the grid search fitted in the main section.
model = GridSearchCV(pipe, param_grid=params, n_jobs=-1, cv=cv, verbose=100)
# Trailing ';' keeps the fitted-estimator repr out of the cell output.
model.fit(X_train, y_train);

Fitting 1 folds for each of 3 candidates, totalling 3 fits
Pickling array (shape=(245,), dtype=object).
Memmaping (shape=(416688,), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_23445_139737817055888/23445-139737708814800-2abbbf547ef4d07b72d4a55d5f7e591d.pkl
Memmaping (shape=(4, 416688), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_23445_139737817055888/23445-139737708814800-6ab2b0e0291f77e4272bf0c5c2200cbf.pkl
Pickling array (shape=(1, 416688), dtype=bool).
Memmaping (shape=(11, 416688), dtype=float64) to new file /dev/shm/joblib_memmaping_pool_23445_139737817055888/23445-139737708814800-e7ffab2ce7874c7396d35f2a40d54f04.pkl
[CV] lassocv__alphas=[0.0001] ........................................
Memmaping (shape=(229, 416688), dtype=uint8) to new file /dev/shm/joblib_memmaping_pool_23445_139737817055888/23445-139737708814800-f4b585c567fe6d424ac7129831bb9377.pkl
Pickling array (shape=(4,), dtype=object).
Pickling array (shape=(1,), dtype=object).
Pickling array (shap

In [38]:
# Penalty strength of the refit best pipeline's Lasso step.
model.best_estimator_.named_steps['lassocv'].alpha_

0.0001

In [39]:
# Grid-search results, transposed so each candidate alpha is one column.
pd.DataFrame(model.cv_results_).T

Unnamed: 0,0,1,2
mean_fit_time,126.606,124.548,124.887
mean_score_time,1.08935,1.38144,1.16691
mean_test_score,0.229975,0.229973,0.229973
mean_train_score,0.243109,0.243111,0.243112
param_lassocv__alphas,[0.0001],[1e-05],[1e-06]
params,{u'lassocv__alphas': [0.0001]},{u'lassocv__alphas': [1e-05]},{u'lassocv__alphas': [1e-06]}
rank_test_score,1,2,3
split0_test_score,0.229975,0.229973,0.229973
split0_train_score,0.243109,0.243111,0.243112
std_fit_time,0,0,0


In [40]:
# Summary statistics of the fitted Lasso coefficients (one per feature column).
pd.Series(model.best_estimator_.named_steps['lassocv'].coef_).describe()

count      245.000000
mean      -603.756043
std       1718.562702
min      -2051.048287
25%      -1567.550488
50%       -866.222972
75%         -0.004439
max      15058.077451
dtype: float64

In [41]:
# Score the selected model on the current held-out test split
# (presumably R^2, the default score for a regressor - confirm).
model.score(X_test, y_test)

0.24384815500145818

## Conclusion

Restricting rows to those opened before June 2016 actually hurt performance, probably because we drop rows with a missing y value anyway.