In [2]:
from __future__ import division
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
# from tqdm import tqdm

# Notebook-wide settings: silence DeprecationWarning noise, then configure
# default figure size and the seaborn look used by any plots.
warnings.filterwarnings("ignore", category=DeprecationWarning)
rcParams.update({'figure.figsize': (20, 5)})
sns.set_style("whitegrid")
sns.set_context("poster")

from scipy.sparse import csr_matrix
from zipcode_mapping import zipcode_mapping

In [3]:
# Load the pre-processed dataset from a pickle file and show its (rows, cols) shape.
# NOTE(review): read_pickle can execute arbitrary code while unpickling --
# only load files that come from a trusted source.
df = pd.read_pickle('../data/data_w_transformed_census_and_removed_invalid_rows_and_cols_and_fixed_zips_and_descs_and_dropped_latlongs_and_more_dropped_cols_and_rows.pkl')
df.shape

(520860, 24)

In [11]:
# Show the first row transposed, so each column appears as one line of output.
df.head(1).T

Unnamed: 0,905425
TYPE,Request for Snow Plowing
SubmittedPhoto,True
Property_Type,Address
LATITUDE,42.2809
LONGITUDE,-71.068
Source,Citizens Connect App
race_white,0.242399
race_black,0.514358
race_asian,0.035473
race_hispanic,0.0675676


## Feature Engineering

## Dummifying

In [4]:
def dummify_cols_and_baselines(df, cols):
    """One-hot encode ``cols`` of ``df``, leaving one baseline level out per column.

    For each column the baseline is the largest non-null value (by sort
    order). Its indicator column is dropped *by label*, so the level that is
    reported is always the level that is actually omitted. Rows holding a
    missing value get all-zero indicators for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    cols : iterable of column labels
        Columns to replace by indicator columns named ``<column>_<level>``.

    Returns
    -------
    (pandas.DataFrame, list)
        The frame with ``cols`` removed and the indicator columns appended
        (in the order of ``cols``), and the list of baseline levels, one per
        entry of ``cols``. A column with no non-null values contributes no
        indicator columns and a baseline of ``None``.
    """
    cols = list(cols)
    baseline_cols = []
    dummy_frames = []

    for i, column in enumerate(cols):
        # NaN is excluded here because get_dummies creates no column for it;
        # counting it would make the dropped column and the reported
        # baseline disagree.
        levels = df[column].dropna().unique()
        dummy = pd.get_dummies(df[column])

        if len(levels) == 0:
            baseline = None
        else:
            baseline = max(levels)
            dummy = dummy.drop(baseline, axis=1)

        # Single pre-formatted argument: prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print('%s is baseline %d %d' % (baseline, i, len(cols)))
        baseline_cols.append(baseline)

        # %-formatting (rather than str()) keeps non-ASCII unicode levels
        # from raising under Python 2.
        dummy_frames.append(
            dummy.rename(columns=lambda level, prefix=column: '%s_%s' % (prefix, level))
        )

    # Concatenate once instead of rebuilding the whole frame per column.
    encoded = pd.concat([df.drop(cols, axis=1)] + dummy_frames, axis=1)
    return encoded, baseline_cols

In [6]:
# Every column stored with object dtype is treated as categorical and will
# be one-hot encoded.
cols_to_dummify = df.columns[(df.dtypes == object).values]
cols_to_dummify

Index([u'TYPE', u'Property_Type', u'Source', u'school', u'housing',
       u'neighborhood_from_zip'],
      dtype='object')

In [7]:
# One-hot encode the object-dtype columns; baseline_cols holds the level
# reported as baseline for each encoded column, in the same order.
df_dummified, baseline_cols = dummify_cols_and_baselines(df, cols_to_dummify)

Zoning is baseline 0 6
other is baseline 1 6
Twitter is baseline 2 6
8_6th_grade is baseline 3 6
rent is baseline 4 6
West Roxbury is baseline 5 6


In [12]:
# Remove the raw coordinates from the feature frame.
# NOTE(review): this cell reassigns df_dummified from itself, so running it a
# second time raises KeyError because the two columns are already gone.
df_dummified = df_dummified.drop(['LATITUDE', 'LONGITUDE'], axis=1)

## Run model

In [10]:
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20; its replacements live in sklearn.model_selection.
# The ShuffleSplit imported here is the legacy class, whose constructor takes
# the sample count and n_iter -- the model_selection version has a different
# signature, so switching the import also requires updating the call site.
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LassoCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV


In [14]:
# Hold out 20% of the rows as a test set; COMPLETION_TIME is the target and
# every other column is a feature. The seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df_dummified.drop('COMPLETION_TIME', axis=1),
    df_dummified['COMPLETION_TIME'],
    random_state=300,
    test_size=0.2,
)

In [16]:
# Validation scheme for the grid search: one shuffled split of the training
# rows with 20% held out, seeded for reproducibility.
cv = ShuffleSplit(
    len(X_train),
    n_iter=1,
    test_size=0.2,
    random_state=300,
)

# Single-step pipeline wrapping a verbose LassoCV estimator.
pipe = make_pipeline(LassoCV(verbose=100))

In [20]:
# Two grid candidates, each handing LassoCV a one-element list of alphas.
params = {
    'lassocv__alphas': [
        [0.67],
        [0.4],
    ],
}

model = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=cv,
    n_jobs=-1,
    verbose=100,
)
model.fit(X_train, y_train);

Fitting 1 folds for each of 2 candidates, totalling 2 fits
Pickling array (shape=(244,), dtype=object).
Memmaping (shape=(416688,), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_4501_140519259577744/4501-140519148787856-2abbbf547ef4d07b72d4a55d5f7e591d.pkl
Pickling array (shape=(1, 416688), dtype=bool).
Memmaping (shape=(11, 416688), dtype=float64) to new file /dev/shm/joblib_memmaping_pool_4501_140519259577744/4501-140519148787856-e7ffab2ce7874c7396d35f2a40d54f04.pkl
Memmaping (shape=(3, 416688), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_4501_140519259577744/4501-140519148787856-d7332f3df9a1eea3788c985697e8ddd9.pkl
[CV] lassocv__alphas=[0.67] ..........................................
Memmaping (shape=(229, 416688), dtype=uint8) to new file /dev/shm/joblib_memmaping_pool_4501_140519259577744/4501-140519148787856-f4b585c567fe6d424ac7129831bb9377.pkl
Pickling array (shape=(1,), dtype=object).
Pickling array (shape=(11,), dtype=object).
Pickling array (shape=(3,),

In [21]:
# Regularisation strength selected by the LassoCV step of the best pipeline.
model.best_estimator_.named_steps['lassocv'].alpha_

0.40000000000000002

In [22]:
# Score the fitted grid search on the held-out test split.
# NOTE(review): no `scoring` was passed to GridSearchCV, so this is presumably
# the estimator's default score (R^2 for a regressor) -- confirm.
model.score(X_test, y_test)

0.23362293719786098

## Conclusion

This model performs worse than the last one. The differences are (1) this one contains neighborhoods, and (2) the other one contains internal categories.