In [1]:
from __future__ import division
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline

# Notebook-wide display configuration.
# Default matplotlib figure size: 20 wide by 5 high.
rcParams['figure.figsize'] = 20, 5
# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# seaborn look: white background with grid lines, "poster" scaling context.
sns.set_style("whitegrid")
sns.set_context("poster")

from scipy.sparse import csr_matrix
from zipcode_mapping import zipcode_mapping

In [2]:
# Load the pre-processed dataset from a pickle stored relative to this notebook.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
df = pd.read_pickle('../data/data_w_transformed_census_and_removed_invalid_rows_and_cols_and_fixed_zips_and_descs_and_dropped_latlongs.pkl')
# Display (rows, columns) of the loaded frame.
df.shape

(744372, 37)

## Preprocessing

In [3]:
# Replace missing property descriptors with the explicit category 'other'.
df['Property_Type'] = df['Property_Type'].fillna('other')
df['Property_ID'] = df['Property_ID'].fillna('other')

In [None]:
# Show the first row transposed, so each column name and its value appear on one line.
df.head(1).T

In [None]:
# Keep an independent copy of the frame before columns are narrowed down.
# NOTE(review): this cell is marked `In [None]`, i.e. it was not executed in the
# saved run, and old_df is only used for a shape check in the visible cells.
old_df = df.copy()

In [None]:
# Display (rows, columns) of the copy taken before column selection.
old_df.shape

In [4]:
# Keep only the modelling columns: request attributes, the COMPLETION_TIME
# target, and the census-derived features. Brackets already allow the list to
# span several lines, so no line-continuation characters are needed.
df = df[[
    'TYPE',
    'SubmittedPhoto',
    'Property_Type',
    'Source',
    'zipcode',
    'COMPLETION_TIME',
    'race_white',
    'race_black',
    'race_asian',
    'race_hispanic',
    'race_other',
    'poverty_pop_below_poverty_level',
    'poverty_pop_w_public_assistance',
    'poverty_pop_w_food_stamps',
    'poverty_pop_w_ssi',
    'school',
    'housing',
    'bedroom',
    'value',
    'rent',
    'income',
]]
df.shape

(744372, 21)

In [5]:
# Count missing values per column.
# NOTE(review): the saved output below also lists REASON, Department and
# neighborhood_from_zip, which the column-selection cell does not keep, so the
# output appears to come from an earlier run; re-run this cell to refresh it.
df.isnull().sum()

TYPE                                   0
REASON                                 0
Department                             0
SubmittedPhoto                         0
Property_Type                          0
Source                                 0
neighborhood_from_zip                  0
zipcode                                0
COMPLETION_TIME                    65765
race_white                             0
race_black                             0
race_asian                             0
race_hispanic                          0
race_other                             0
poverty_pop_below_poverty_level        0
poverty_pop_w_public_assistance        0
poverty_pop_w_food_stamps              0
poverty_pop_w_ssi                      0
school                                 0
housing                                0
bedroom                                0
value                                  0
rent                                   0
income                                 0
dtype: int64

In [5]:
# Drop every row that has a missing value in any column.
# NOTE(review): in the null counts shown above only COMPLETION_TIME (the target)
# has missing values, so this removes the rows without a completion time.
df = df.dropna()

## Dummifying

In [6]:
def dummify_cols_and_baselines(df, cols):
    """One-hot encode ``cols`` of ``df``, leaving one baseline category out of each.

    For every column in ``cols`` the non-null categories are sorted and the
    last one is taken as the baseline. An indicator column named
    ``<column>_<category>`` is created for every other category; the baseline
    gets none, so it is represented by all of that column's indicators being 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    cols : iterable of str
        Names of the columns to replace by indicator columns.

    Returns
    -------
    (pandas.DataFrame, list)
        The frame with ``cols`` removed and the indicator columns appended
        (grouped in the order of ``cols``), and the list of baseline
        categories, one per entry of ``cols`` (``None`` for a column that has
        no non-null value).
    """
    cols = list(cols)
    baseline_cols = []
    dummy_frames = []

    for i, column in enumerate(cols):
        # get_dummies creates no indicator for missing values, so they must not
        # take part in choosing the baseline (and NaN/None cannot be sorted
        # together with strings on Python 3 anyway).
        categories = sorted(df[column].dropna().unique())
        baseline = categories[-1] if categories else None
        # A single pre-formatted string prints identically on Python 2 and 3.
        print('%s is baseline %s %s' % (baseline, i, len(cols)))
        baseline_cols.append(baseline)

        dummy = pd.get_dummies(df[column])
        if categories:
            # Drop the baseline by label, not by position, so the column that is
            # removed is always the one reported above.
            dummy = dummy.drop(baseline, axis=1)
        dummy = dummy.rename(columns=lambda level, prefix=column: prefix + '_' + str(level))
        dummy_frames.append(dummy)

    # Build the result once instead of re-concatenating the frame per column;
    # drop() returns a new frame, so the caller's df is left untouched.
    encoded = pd.concat([df.drop(cols, axis=1)] + dummy_frames, axis=1)
    return encoded, baseline_cols

In [7]:
# Columns stored with object dtype are the ones that get one-hot encoded below.
cols_to_dummify = df.dtypes[df.dtypes == object].index
cols_to_dummify

Index([u'TYPE', u'Property_Type', u'Source', u'school', u'housing'], dtype='object')

In [8]:
# Display (rows, columns) right before one-hot encoding.
df.shape

(678607, 21)

In [9]:
# One-hot encode the object-dtype columns. baseline_cols receives, for each
# encoded column, the category that was left without an indicator column.
df_dummified, baseline_cols = dummify_cols_and_baselines(df, cols_to_dummify)

Zoning is baseline 0 5
other is baseline 1 5
Twitter is baseline 2 5
8_6th_grade is baseline 3 5
rent is baseline 4 5


In [10]:
# Display (rows, columns) after one-hot encoding; the row count should be unchanged.
df_dummified.shape

(678607, 237)

## Running model

Let's try without a GridSearchCV parameter grid (i.e. an empty grid, letting LassoCV pick alpha on its own); is it faster?

In [11]:
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18 and
# removed in 0.20. train_test_split and ShuffleSplit are also provided by
# sklearn.model_selection, where ShuffleSplit takes n_splits instead of the
# sample count and n_iter.
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LassoCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV



Splitting train/test 80/20:

In [13]:
# Features are every column except COMPLETION_TIME; COMPLETION_TIME is the target.
# 20% of the rows are held out as the test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df_dummified.drop('COMPLETION_TIME', axis=1), 
    df_dummified.COMPLETION_TIME, 
    test_size=0.2, 
    random_state=300
)

In [14]:
# Standardize the features, then fit a Lasso whose alpha is chosen by LassoCV's
# own internal cross-validation. verbose=100 is what produces the long
# coefficient-array dumps in the fit output.
pipe = make_pipeline(StandardScaler(), LassoCV(verbose=100))

In [15]:
# A single random 80/20 split of the training rows, used as the only
# validation fold by the grid search.
# Uses the sklearn.model_selection splitter (the same module GridSearchCV comes
# from) instead of the deprecated sklearn.cross_validation class: it takes
# n_splits and learns the number of samples from the data passed to split().
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=300)

In [16]:
# Empty grid: GridSearchCV evaluates exactly one candidate (the pipeline as
# built), so LassoCV chooses alpha itself. The wrapper is kept so that the
# train/validation scores on the `cv` split are available in cv_results_.
# TODO: if this is fast enough, search over explicit alpha lists here. Each
# candidate must itself be a list, e.g. {'lassocv__alphas': [[0.06, 0.1, 1]]}.
params = {}
model = GridSearchCV(pipe, param_grid=params, n_jobs=-1, cv=cv, verbose=True)
# Trailing semicolon suppresses the repr of the fitted estimator.
model.fit(X_train, y_train);

Fitting 1 folds for each of 1 candidates, totalling 1 fits
(array([ -0.        ,  -0.        ,   0.        ,  -0.        ,
         0.        ,  -0.        ,  -0.        ,   0.        ,
        -0.        ,  -0.        ,  -0.        ,  -0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,  -0.        ,  -0.        ,
        -0.        ,  -0.        ,  -0.        ,  -0.        ,
         0.        ,  -0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
        -0.        ,  -0.        ,  -0.        ,   0.        ,
         0.        ,  -0.        ,   0.        ,  -0.        ,
        -0.        ,  -0.        ,   0.        ,   0.        ,
         0.        ,  -0.        ,  -0.        ,   0.        ,
         0.        ,  -0.        ,   0.        ,   0.        ,
        -0.        ,  -0.        ,  -0.        ,  -0.     

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   55.7s finished


(array([-0.        , -0.        ,  0.        , -0.        ,  0.        ,
       -0.        , -0.        ,  0.        , -0.        , -0.        ,
       -0.        , -0.        ,  0.        ,  0.        ,  0.        ,
        0.        , -0.        ,  0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        , -0.        , -0.        , -0.        ,  0.        ,
        0.        , -0.        ,  0.        , -0.        , -0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.        ,  0.        ,  0.        , -0.        ,  0.        ,
        0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        ,  0.        ,
        0.        , -0.        , -0.        , -0.        ,  0. 

## Model results

In [17]:
# Show the grid-search results as a table, transposed so each metric is a row.
pd.DataFrame(model.cv_results_).T

Unnamed: 0,0
mean_fit_time,47.8926
mean_score_time,1.45549
mean_test_score,0.266772
mean_train_score,0.281355
params,{}
rank_test_score,1
split0_test_score,0.266772
split0_train_score,0.281355
std_fit_time,0
std_score_time,0
