In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from math import pi
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

In [2]:
data = pd.read_csv('./historic_data.csv')

train_cut = int(len(data) * 0.8)
validate_cut = int(len(data) * 0.9)

train, validate, test = data.iloc[0:train_cut, :], data.iloc[train_cut:validate_cut, :], data.iloc[validate_cut:, :]

In [3]:
X_train, y_train = train.iloc[:, 0:10], train.iloc[:, 10]
X_validate, y_validate = validate.iloc[:, 0:10], validate.iloc[:, 10]
X_train.head()

Unnamed: 0,date,hour,month,day_of_week,sunrise,icon,precip_prob,temperature,humidity,wind_speed
0,2013-06-01,0,6,7,0,clear,0.01,77.65,0.61,2.06
1,2013-06-01,1,6,7,0,clear,0.01,75.62,0.67,1.93
2,2013-06-01,2,6,7,0,clear,0.01,74.72,0.7,2.31
3,2013-06-01,3,6,7,0,clear,0.01,73.32,0.76,2.16
4,2013-06-01,4,6,7,0,clear,0.01,72.42,0.79,1.93


<h1>Dummy Predictor</h1>

It is helpful to start with a heuristic predictor. The type of predictor that can be used without any statistical or machine learning knowledge. We can compare our ML work against this predictor to determine what type of improvement machine learning has added. 

In this heuristic I am simply making predictions by going back in time 1 year and finding the closest day of the week and resuing that number. That is, I am using the 3rd Wednesday in June 2015 to predict the 3rd Wednesday in June 2016. In the event that that information is not available, I simply take the last available data from the same hour of the day.

In [4]:
class YearAgoRegressor(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        self.X = X.loc[:, ['date', 'hour']]
        self.X['date'] = self.X['date'].map(lambda x: pd.to_datetime(x))
        self.X['rides'] = y
        return self
    
    def _meaning(self, x):
        prev_year = pd.to_datetime(x.date) - pd.DateOffset(years=1)
        day_delta = int(prev_year.strftime('%w')) + 1 - x.day_of_week
        prev_year = prev_year - pd.Timedelta(days=day_delta)
        if not self.X[(self.X.date == prev_year) & (self.X.hour == x.hour)].empty:
            return self.X[(self.X.date == prev_year) & (self.X.hour == x.hour)].iloc[0, :].rides
        return self.X[(self.X.hour == x.hour)].iloc[-1, :].rides
    
    def predict(self, X, y=None):
        return(X.apply(self._meaning, axis=1))

In [5]:
params = {}
pipeline = make_pipeline(
    YearAgoRegressor()
)
clf = GridSearchCV(pipeline, params, cv=TimeSeriesSplit(3), scoring='neg_mean_squared_error', n_jobs=-1, verbose=5)
clf.fit(train, train.rides)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.4min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None, steps=[('yearagoregressor', YearAgoRegressor())]),
       fit_params=None, iid='warn', n_jobs=-1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=5)

In [6]:
clf.score(X_validate, y_validate)

-777558.4309392265

A MSE of ~777558 will serve as my baseline

<h1>SVM</h1>

In [7]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, LabelEncoder
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import TransformerMixin

In [8]:
class ColumnSelector(BaseEstimator, TransformerMixin):

    def __init__(self, columns=[]):
        self.columns = columns

    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        return X.loc[:, self.columns]

class CustomTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, func):
        self.func = func
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return self.func(X)

In [9]:
pipeline = Pipeline([
    ('union', FeatureUnion([
        ('cat', Pipeline([
            ('cat_selector', ColumnSelector(['sunrise', 'icon'])),
            ('cat_ohe', OneHotEncoder(sparse=False))
        ])),
        ('year', Pipeline([
            ('year_selector', ColumnSelector(['date'])),
            ('year_extractor', CustomTransformer(lambda y: y.applymap(lambda x: float(pd.to_datetime(x).year - 2013)))),
            ('year_scaler', StandardScaler())
        ])),
        ('int', Pipeline([
            ('int_selector', ColumnSelector(['precip_prob', 'temperature', 'humidity', 'wind_speed'])),
            ('int_scaler', StandardScaler())
        ])),
        ('rad', Pipeline([
            ('rad_selector', ColumnSelector(['hour', 'day_of_week', 'month'])),
            ('rad_cos_sin', FeatureUnion([
                ('rad_cos', CustomTransformer(lambda y: y.apply(lambda x: np.round(np.cos(x * pi * 2/ x.nunique()), 5)))),
                ('rad_sin', CustomTransformer(lambda y: y.apply(lambda x: np.round(np.sin(x * pi * 2/ x.nunique()), 5))))
            ]))  
        ]))
    ])),
    ('svr', SVR())
])

In [None]:
from sklearn.externals.joblib import parallel_backend

params = {'svr__C': [900],
          'svr__gamma': [0.1]}
clf = GridSearchCV(pipeline, params, cv=TimeSeriesSplit(2), scoring='neg_mean_squared_error', n_jobs=-1, verbose=5)
clf.fit(X_train, y_train)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.1min finished


In [None]:
clf.best_params_

In [None]:
print("HLLO")

In [32]:
clf.score(X_validate, y_validate)

-589907.1375426432

In [381]:
DataFrame(dict(pred=clf.predict(X_validate), val=y_validate))

  Xt = transform.transform(Xt)


Unnamed: 0,pred,val
31853,154.895315,45
31854,252.679496,282
31855,416.924642,973
31856,574.092884,2063
31857,1283.154557,3812
31858,1306.899159,2698
31859,1331.603307,1355
31860,1459.155920,1359
31861,1518.848957,1516
31862,1586.666969,1619
