The goal is to beat the baseline score of $R^2 = 0.24$ and $\mathrm{RMSE} = 1{,}881$.

In [1]:
from __future__ import division
import pandas as pd

In [2]:
import warnings
import seaborn as sns

warnings.filterwarnings("ignore", category=DeprecationWarning)
sns.set_style("whitegrid")
sns.set_context("poster")

from pylab import rcParams
rcParams['figure.figsize'] = 20, 5

import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
from zipcode_mapping import zipcode_mapping

In [4]:
from scipy.sparse import csc_matrix, csr_matrix, coo_matrix

In [5]:
# Load the preprocessed dataset. Per the file name, earlier steps transformed the
# census features, removed invalid rows/columns, fixed zips and descriptions, and
# dropped lat/longs.
# NOTE(review): unpickling executes arbitrary code — only load pickles produced by
# this project's own pipeline.
df = pd.read_pickle('../data/data_w_transformed_census_and_removed_invalid_rows_and_cols_and_fixed_zips_and_descs_and_dropped_latlongs.pkl')
df.shape

(744372, 37)

In [6]:
df.head(1).T

Unnamed: 0,905223
CASE_ENQUIRY_ID,101001983804
OPEN_DT,2017-01-08 12:00:16
TARGET_DT,2017-01-10 08:30:00
CLOSED_DT,NaT
CASE_TITLE,Parking Enforcement
SUBJECT,Transportation - Traffic Division
REASON,Enforcement & Abandoned Vehicles
TYPE,Parking Enforcement
Department,BTDT
SubmittedPhoto,False


In [7]:
# Treat a missing property type as its own 'other' category.
df['Property_Type'] = df['Property_Type'].fillna('other')

In [8]:
df[['Property_Type', 'neighborhood_from_zip']].isnull().sum()

Property_Type            0
neighborhood_from_zip    0
dtype: int64

## Let's do other features in the original dataset first

In [9]:
# Keep an independent snapshot of the full frame before columns are dropped below.
old_df = df.copy(deep=True)

In [10]:
df.head(1)#.loc[:, 'SubmittedPhoto':]

Unnamed: 0,CASE_ENQUIRY_ID,OPEN_DT,TARGET_DT,CLOSED_DT,CASE_TITLE,SUBJECT,REASON,TYPE,Department,SubmittedPhoto,...,COMPLETION_TIME,school,housing,bedroom,value,rent,income,is_issue_unresolved,zipcode,neighborhood_from_zip
905223,101001983804,2017-01-08 12:00:16,2017-01-10 08:30:00,NaT,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Parking Enforcement,BTDT,False,...,,20_bachelors,own,3,350000.0,2250,87500,True,2131.0,Roslindale


In [11]:
# Keep only the predictor columns used by this model plus COMPLETION_TIME,
# which is the regression target further down.
df = df[[
    'TYPE',
    'REASON',
    'Department',
    'SubmittedPhoto',
    'Property_Type',
    'Source',
    'neighborhood_from_zip',
    'zipcode',
    'COMPLETION_TIME',
]]

In [12]:
df.isnull().sum()

TYPE                         0
REASON                       0
Department                   0
SubmittedPhoto               0
Property_Type                0
Source                       0
neighborhood_from_zip        0
zipcode                      0
COMPLETION_TIME          65765
dtype: int64

In [13]:
# Only COMPLETION_TIME has nulls at this point (65,765 rows), so this removes
# exactly the rows without a completion time.
df = df.dropna(axis=0, how='any')

In [14]:
df.isnull().sum()

TYPE                     0
REASON                   0
Department               0
SubmittedPhoto           0
Property_Type            0
Source                   0
neighborhood_from_zip    0
zipcode                  0
COMPLETION_TIME          0
dtype: int64

In [15]:
df.shape

(678607, 9)

## Dummifying

In [None]:
def dummify(df, column):
    """One-hot encode `column`, dropping one level to serve as the baseline.

    Adapted from Darren's linear regression slides.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `column`. It is not modified.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame without `column`, followed by one indicator column per
        retained level, named ``<column>_<level>``.

    Notes
    -----
    The dropped (baseline) level is the last dummy column produced by
    ``pd.get_dummies``, and that same level is the one printed. Missing values
    get no dummy column, so they never count towards the number of levels;
    counting them would leave every level in place and report the wrong
    baseline.
    """
    dummies = pd.get_dummies(df[column])

    # Read the baseline from the dummy columns themselves so that the printed
    # level is guaranteed to be the one that is actually dropped.
    baseline = dummies.columns[-1] if dummies.shape[1] else None
    # Single-argument call form works as both a Python 2 statement and a
    # Python 3 function call.
    print('{} is your baseline'.format(baseline))

    dummies = dummies.iloc[:, :-1].rename(columns=lambda level: column + '_' + str(level))

    # drop() without inplace returns a new frame, so the caller's df is untouched.
    return pd.concat([df.drop(column, axis=1), dummies], axis=1)

In [17]:
# One-hot encode each categorical feature in turn. Rebinding a single name means
# no intermediate frame stays referenced once the next one has been built, so
# there is no need to null out numbered temporaries by hand.
df7 = df
for categorical_column in ['TYPE', 'REASON', 'Department', 'Property_Type',
                           'Source', 'neighborhood_from_zip', 'zipcode']:
    df7 = dummify(df7, categorical_column)

Zoning is your baseline
Weights and Measures is your baseline
PWDx is your baseline
other is your baseline
Twitter is your baseline
West Roxbury is your baseline
2467.0 is your baseline


In [18]:
df7.shape

(678607, 344)

In [19]:
df7.head()

Unnamed: 0,SubmittedPhoto,COMPLETION_TIME,TYPE_ADA,TYPE_Abandoned Bicycle,TYPE_Abandoned Building,TYPE_Abandoned Vehicles,TYPE_Alert Boston,TYPE_Animal Found,TYPE_Animal Generic Request,TYPE_Animal Lost,...,zipcode_2136.0,zipcode_2146.0,zipcode_2163.0,zipcode_2167.0,zipcode_2199.0,zipcode_2203.0,zipcode_2210.0,zipcode_2215.0,zipcode_2222.0,zipcode_2446.0
905425,True,0.918333,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
905235,False,2.325,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
905379,False,2.573333,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
905264,False,1.03,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
905447,False,1.681944,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Let's run the model

In [20]:
# Import all splitting utilities from sklearn.model_selection, which this cell
# already used for GridSearchCV; sklearn.cross_validation is deprecated.
# NOTE: model_selection.ShuffleSplit takes `n_splits` and no longer accepts the
# sample count or `n_iter` that the old cross_validation.ShuffleSplit required.
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error




Splitting train/test 80/20:

In [21]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df7.drop('COMPLETION_TIME', axis=1),
    df7['COMPLETION_TIME'],
    random_state=300,
    test_size=0.2
)

In [22]:
# Earlier pipeline variants, kept for reference (scaling without centering,
# followed by plain linear regression or LassoCV):
# pipe = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
# pipe = make_pipeline(StandardScaler(with_mean=False), LassoCV(alphas=10e4))
# NOTE(review): LassoCV's `alphas` presumably expects a sequence, so the disabled
# variant would need `alphas=[10e4]` — confirm before re-enabling.
# Active configuration: a single-step pipeline holding an unscaled LassoCV with
# very verbose logging.
pipe = make_pipeline(LassoCV(verbose=100))

In [23]:
# cv = ShuffleSplit(X_train.shape[0], n_iter=1, test_size=0.2, random_state=300)

In [None]:
# aa = csr_matrix(X_train).tocsr()

In [None]:
# An empty grid yields a single candidate, so GridSearchCV here only runs
# 3-fold cross-validation of the pipeline across all available cores.
params = dict()
model = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=100
)
model.fit(X_train, y_train);

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Pickling array (shape=(343,), dtype=object).
Memmaping (shape=(542885,), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_36379_139891876182288/36379-139892094990864-4bdc25708fc0932945710f596ef51e3b.pkl
Pickling array (shape=(1, 542885), dtype=bool).
[CV]  ................................................................
Memmaping (shape=(342, 542885), dtype=uint8) to new file /dev/shm/joblib_memmaping_pool_36379_139891876182288/36379-139892094990864-1854da98d7dcf25da4d948328e863489.pkl
Pickling array (shape=(1,), dtype=object).
Pickling array (shape=(342,), dtype=object).
Memmaping (shape=(542885,), dtype=int64) to old file /dev/shm/joblib_memmaping_pool_36379_139891876182288/36379-139892094990864-4bdc25708fc0932945710f596ef51e3b.pkl
Memmaping (shape=(542885,), dtype=float64) to new file /dev/shm/joblib_memmaping_pool_36379_139891876182288/36379-139892094990864-9e5ba7efb1ba38bea046d91adef12000.pkl
Memmaping (shape=(54288

In [None]:
# n_iter_ is an attribute of the fitted LassoCV, not of the GridSearchCV wrapper;
# reach it through the last step of the refitted pipeline.
model.best_estimator_.steps[-1][-1].n_iter_

In [None]:
# alpha_ is an attribute of the fitted LassoCV, not of the GridSearchCV wrapper;
# reach it through the last step of the refitted pipeline.
model.best_estimator_.steps[-1][-1].alpha_

Are we in a high bias or high variance situation?

In [None]:
model.get_params()

In [None]:
pd.DataFrame(model.cv_results_).T

Let's look at the coefficients.

In [None]:
# GridSearchCV itself exposes no coef_; the coefficients live on the final
# estimator of the refitted pipeline.
pd.Series(model.best_estimator_.steps[-1][-1].coef_).describe()

In [None]:
# Summarise the coefficients of the refitted pipeline's final estimator.
final_step_name, final_estimator = model.best_estimator_.steps[-1]
pd.Series(final_estimator.coef_).describe()

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim — candidate for removal.
pd.Series(model.best_estimator_.steps[-1][-1].coef_).describe()

In [None]:
# Index the final step with -1 rather than 1: the active pipeline has a single
# step, so steps[1] raises IndexError, while -1 also works for the two-step
# (scaler + regressor) variants.
pd.Series(model.best_estimator_.steps[-1][-1].coef_).describe()

In [None]:
# Score on the training split. No `scoring` was passed to GridSearchCV, so this is
# presumably the estimator's default score (R^2 for regressors) — confirm.
model.score(X_train, y_train)

In [None]:
# Score on the held-out 20% split. A large gap to the training score would point
# to high variance; two similarly low scores would point to high bias.
model.score(X_test, y_test)