In [1]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
from datetime import datetime

from sklearn.cross_validation import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import PolynomialFeatures




## Preprocessing

In [63]:
def dummify(df, column):
    # https://github.com/zipfian/DSI_Lectures/blob/master/linear-regression/darren_reger/LR%20Notebook.ipynb
    if type(df) == pd.Series:
        df = pd.DataFrame(df)
    
    print '{} is your baseline'.format(sorted(df[column].unique())[-1])
    dummy = pd.get_dummies(df[column]).rename(columns=lambda x: column+'_'+str(x)).iloc[:,0:len(df[column].unique())-1]
    df = df.drop(column,axis=1) #Why not inplace? because if we do inplace, it will affect the df directly
    return pd.concat([df,dummy],axis=1)

In [2]:
df = pd.read_hdf('../data/data_till_june_2016_w_descs_and_census.h5')

In [3]:
df = df.dropna(subset=['CLOSED_DT'])

In [4]:
df.shape

(718936, 155)

## Setting aside 20% for the test set

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('COMPLETION_TIME', axis=1), 
    df.COMPLETION_TIME, 
    test_size=0.2, 
    random_state=300
)

## Setting aside another 20% for the CV set

In [None]:
X_CV_train, X_CV_test, y_CV_train, y_CV_test = train_test_split(
    X_train, 
    y_train, 
    test_size=0.2, 
    random_state=300
)

## Fitting basic linear model

In [97]:
def get_rmse(est):
    return (est.ssr / (est.nobs - 2))**0.5

In [21]:
est = smf.ols(
    'COMPLETION_TIME ~ C(TYPE)', 
    pd.concat([X_CV_train, y_CV_train], axis=1)).fit()
est.summary()

0,1,2,3
Dep. Variable:,COMPLETION_TIME,R-squared:,0.254
Model:,OLS,Adj. R-squared:,0.254
Method:,Least Squares,F-statistic:,500.2
Date:,"Mon, 30 Jan 2017",Prob (F-statistic):,0.0
Time:,04:19:29,Log-Likelihood:,-2644900.0
No. Observations:,295228,AIC:,5290000.0
Df Residuals:,295026,BIC:,5292000.0
Df Model:,201,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,6010.8016,665.455,9.033,0.000,4706.528 7315.075
C(TYPE)[T.Abandoned Bicycle],-5415.7685,670.440,-8.078,0.000,-6729.812 -4101.725
C(TYPE)[T.Abandoned Building],-4382.0527,676.233,-6.480,0.000,-5707.450 -3056.656
C(TYPE)[T.Abandoned Vehicles],-5730.9712,666.069,-8.604,0.000,-7036.448 -4425.494
C(TYPE)[T.Alert Boston],-5904.8983,782.003,-7.551,0.000,-7437.601 -4372.195
C(TYPE)[T.Animal Found],-5538.8280,717.725,-7.717,0.000,-6945.549 -4132.107
C(TYPE)[T.Animal Generic Request],-5915.5075,674.448,-8.771,0.000,-7237.407 -4593.608
C(TYPE)[T.Animal Lost],-5514.7986,721.003,-7.649,0.000,-6927.943 -4101.654
C(TYPE)[T.Animal Noise Disturbances],-5592.5116,799.778,-6.993,0.000,-7160.053 -4024.970

0,1,2,3
Omnibus:,347583.669,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,55168693.001
Skew:,6.177,Prob(JB):,0.0
Kurtosis:,68.82,Cond. No.,1.05e+17


In [98]:
get_rmse(est)

1881.5539142475611

## Getting CV R^2 score, but for ea fold

Just using $R^2$ for now since it's the default for `sklearn`. Will want to try out adjusted $R^2$ and RMSE as well.

In [7]:
vectorizer = CountVectorizer()
vectorizer.fit_transform(df['TYPE']); # I need to have all the columns be all the possible categories from _both_ train and test data

In [95]:
ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=300)

In [93]:
pipe = make_pipeline(LinearRegression())
# pipe = make_pipeline(StandardScaler(), LinearRegression()) # StdScaler doesn't work for some reason
# pipe = make_pipeline(LabelEncoder(), OneHotEncoder(), LinearRegression()) 

In [100]:
training_scores = []
test_scores = []

for CV_train_index, CV_test_index in ss.split(y_train):
    X_CV_train = vectorizer.transform(X_train.iloc[CV_train_index]['TYPE'])
    X_CV_test = vectorizer.transform(X_train.iloc[CV_test_index]['TYPE'])
    y_CV_train = y_train.iloc[CV_train_index]
    y_CV_test = y_train.iloc[CV_test_index]
    
    pipe.fit(X_CV_train, y_CV_train)
    training_score = pipe.score(X_CV_train, y_CV_train)
    test_score = pipe.score(X_CV_test, y_CV_test)
    training_scores += [training_score]
    test_scores += [test_score]
    print "Training: {}. Test: {}".format(training_score, test_score)

Training: 0.251190728986. Test: 0.244443794726
Training: 0.25098114034. Test: 0.245562044148
Training: 0.250661088013. Test: 0.246838181334
Training: 0.250889657331. Test: 0.245313621299
Training: 0.250496518478. Test: 0.247290981624


## Let's check the performance of our baseline model on the test data

I predict that it will be pretty similar to the CV performances: ~24% $R^2$.

In [102]:
pipe.score(vectorizer.transform(X_test['TYPE']), y_test)

0.24201144067182326

And indeed it is 0.24.

## Playing around w a couple more options
- StdScaler
- `Reason` instead of `TYPE`
- higher-order polynomials

### First: StdScaler

In [129]:
pipe = make_pipeline(StandardScaler(with_mean=False), LinearRegression()) # StdScaler doesn't work for some reason
ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=300)

In [130]:
training_scores = []
test_scores = []

for CV_train_index, CV_test_index in ss.split(y_train):
    X_CV_train = vectorizer.transform(X_train.iloc[CV_train_index]['TYPE'])
    X_CV_test = vectorizer.transform(X_train.iloc[CV_test_index]['TYPE'])
    y_CV_train = y_train.iloc[CV_train_index]
    y_CV_test = y_train.iloc[CV_test_index]
    
    pipe.fit(X_CV_train, y_CV_train)
    training_score = pipe.score(X_CV_train, y_CV_train)
    test_score = pipe.score(X_CV_test, y_CV_test)
    training_scores += [training_score]
    test_scores += [test_score]
    print "Training: {}. Test: {}".format(training_score, test_score)

Training: 0.251190729522. Test: 0.244376787574
Training: 0.250981140706. Test: 0.245561964383
Training: 0.250661088981. Test: 0.2468307821
Training: 0.250889657881. Test: 0.245296219324
Training: 0.250496519272. Test: 0.247254285959


I would expect higher $R^2$ scores than w/o StdScaler.

Instead, they're about the same. I guess this means ShuffleSplit really shuffled the rows well?

Update: Now that I think about it, since I'm only using one feature, scaling doesn't come into play. So it makes sense that I'd get similar CV $R^2$ values.

### Second: Trying `Reason`, w/o StdScaler

In [131]:
pipe = make_pipeline(LinearRegression()) # StdScaler doesn't work for some reason
ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=300)

In [134]:
training_scores = []
test_scores = []

for CV_train_index, CV_test_index in ss.split(y_train):
    X_CV_train = vectorizer.transform(X_train.iloc[CV_train_index]['REASON'])
    X_CV_test = vectorizer.transform(X_train.iloc[CV_test_index]['REASON'])
    y_CV_train = y_train.iloc[CV_train_index]
    y_CV_test = y_train.iloc[CV_test_index]
    
    pipe.fit(X_CV_train, y_CV_train)
    training_score = pipe.score(X_CV_train, y_CV_train)
    test_score = pipe.score(X_CV_test, y_CV_test)
    training_scores += [training_score]
    test_scores += [test_score]
    print "Training: {}. Test: {}".format(training_score, test_score)

Training: 0.050364299692. Test: 0.0528208754389
Training: 0.0511371522887. Test: 0.0498099354244
Training: 0.0512894654362. Test: 0.0492466901145
Training: 0.0505269903012. Test: 0.0522024353821
Training: 0.0505914157316. Test: 0.0519740845286


Looks like `Reason` has even less signal than `TYPE`. This makes sense, since `Reason` is more broad than `TYPE`.

## Third: higher-order polynomials

Doesn't make sense to use higher-order polynomials when the values are only ever 1 or 0. This will just add more columns without adding more signal.