In [36]:
from __future__ import division
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
from tqdm import tqdm

rcParams['figure.figsize'] = 20, 5
warnings.filterwarnings("ignore", category=DeprecationWarning)
sns.set_style("whitegrid")
sns.set_context("poster")

from helper_functions import dummify_cols_and_baselines, make_alphas

In [3]:
df_orig = pd.read_pickle('../data/data_from_remove_from_dataset.pkl')
df_orig.shape

(516406, 40)

## Removing extra columns

In [5]:
df = df_orig[['TYPE', 'COMPLETION_HOURS_LOG_10']]

## Dummify

In [6]:
cols_to_dummify = df.dtypes[df.dtypes == object].index
cols_to_dummify

Index([u'TYPE'], dtype='object')

In [7]:
df_dummified, baseline_cols = dummify_cols_and_baselines(df, cols_to_dummify)

Zoning is baseline 0 1


In [8]:
df_dummified.shape

(516406, 192)

### Running model

In [9]:
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error




In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    df_dummified.drop('COMPLETION_HOURS_LOG_10', axis=1), 
    df_dummified.COMPLETION_HOURS_LOG_10, 
    test_size=0.2, 
    random_state=300
)

In [11]:
pipe = make_pipeline(LinearRegression())

In [46]:
cv = ShuffleSplit(X_train.shape[0], n_iter=5, test_size=0.4, random_state=300)

In [47]:
params = {}
model = GridSearchCV(pipe, param_grid=params, n_jobs=-1, cv=cv, verbose=True)
model.fit(X_train, y_train);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   11.8s finished


In [49]:
model.score(X_test, y_test)

0.48053034246809379

In [48]:
pd.DataFrame(model.cv_results_).T

Unnamed: 0,0
mean_fit_time,2.48673
mean_score_time,0.186477
mean_test_score,-8.09924e+20
mean_train_score,0.486185
params,{}
rank_test_score,1
split0_test_score,-2.98164e+20
split0_train_score,0.486389
split1_test_score,-2.62978e+18
split1_train_score,0.486558


## Scratch: just testing on non-log

To see if I get $R^2$ around 0.24.

In [67]:
params = {'lassocv__n_alphas': [3]}
model2 = GridSearchCV(pipe, param_grid=params, n_jobs=-1, cv=cv, verbose=True)
model2.fit(X_train, 10**y_train);

Fitting 1 folds for each of 1 candidates, totalling 1 fits
(array([ -0.        ,   0.        ,   0.        ,  -0.        ,
        -0.        ,  -0.        ,  -0.        ,  -0.        ,
        -0.        ,  -0.        ,  -0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,  -0.        ,
        -0.        ,  -0.        ,   0.        ,   0.        ,
        -0.        ,   0.        ,  -0.        ,  -0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,  -0.        ,   0.        ,   0.        ,
        -0.        ,   0.        ,   0.        ,  -0.        ,
        -0.        ,  -0.        ,   0.        ,  -0.        ,
        -0.        ,  -0.        ,  -0.        ,   0.        ,
         0.        ,   0.        ,  -0.        ,  -0.        ,
         0.        ,   0.        ,  -0.        ,   0.        ,
        -0.        ,  -0.        ,   0.        ,  -0.     

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   21.9s finished


(array([-0.,  0.,  0., -0., -0., -0., -0., -0., -0., -0.,  0., -0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0., -0., -0.,  0.,  0.,  0., -0.,  0.,
       -0., -0.,  0.,  0.,  0.,  0.,  0., -0.,  0.,  0., -0.,  0.,  0.,
       -0., -0., -0., -0., -0., -0., -0., -0.,  0.,  0.,  0., -0., -0.,
        0.,  0.,  0.,  0., -0., -0.,  0., -0.,  0., -0.,  0.,  0., -0.,
       -0.,  0., -0.,  0.,  0., -0.,  0.,  0.,  0., -0.,  0.,  0., -0.,
       -0., -0.,  0.,  0., -0.,  0.,  0.,  0.,  0., -0., -0., -0., -0.,
       -0.,  0.,  0.,  0., -0., -0., -0., -0.,  0., -0.,  0.,  0., -0.,
        0.,  0., -0., -0., -0.,  0.,  0., -0.,  0.,  0.,  0., -0.,  0.,
        0.,  0., -0.,  0.,  0., -0.,  0., -0.,  0.,  0.,  0., -0.,  0.,
       -0., -0., -0., -0., -0., -0.,  0., -0.,  0.,  0., -0., -0., -0.,
       -0., -0., -0.,  0., -0.,  0.,  0.,  0., -0.,  0., -0., -0., -0.,
        0., -0.,  0.,  0., -0.,  0., -0., -0.,  0.,  0.,  0., -0.,  0.,
        0., -0., -0.,  0., -0., -0., -0.,  0.,  0., -0.,  0., -

In [68]:
pd.DataFrame(model2.cv_results_).T

Unnamed: 0,0
mean_fit_time,19.7006
mean_score_time,0.097306
mean_test_score,0.231869
mean_train_score,0.23788
param_lassocv__n_alphas,3
params,{u'lassocv__n_alphas': 3}
rank_test_score,1
split0_test_score,0.231869
split0_train_score,0.23788
std_fit_time,0


Yep, I do.