In [1]:
import numpy as np
import pandas as pd
from math import pi
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, MinMaxScaler, StandardScaler
import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from keras.models import Model, load_model
from keras.layers import GRU, Dense, Dropout, concatenate, Input
from keras.optimizers import RMSprop
from random import shuffle
import warnings
import holidays
from datetime import timedelta
# Print arrays in full instead of summarising them with "...".
# NumPy rejects NaN as a threshold; sys.maxsize is the documented way to disable truncation.
import sys
np.set_printoptions(threshold=sys.maxsize)

Using TensorFlow backend.


In [2]:
# Load the records from the CSV that sits next to the notebook.
data = pd.read_csv('./actuals.csv')

# Row positions used as split boundaries: 80% and 90% of the rows, and the end of the data.
train_cut, validate_cut = (int(len(data) * share) for share in (0.8, 0.9))
test_cut = len(data)

In [3]:
# X is the full frame (it still contains the 'rides' column); y is the 'rides' column alone.
X = data
y = data['rides']
X.head()

Unnamed: 0,date,sunrise,icon,precip_prob,temperature,humidity,wind_speed,rides
0,2013-06-01 00:00:00,0,clear,0.01,77.65,0.61,2.06,152
1,2013-06-01 01:00:00,0,clear,0.01,75.62,0.67,1.93,102
2,2013-06-01 02:00:00,0,clear,0.01,74.72,0.7,2.31,67
3,2013-06-01 03:00:00,0,clear,0.01,73.32,0.76,2.16,41
4,2013-06-01 04:00:00,0,clear,0.01,72.42,0.79,1.93,16


In [4]:
# Preview the first rows of the target series.
y.head()

0    152
1    102
2     67
3     41
4     16
Name: rides, dtype: int64

<h2>Define Custom Transformers and Pipelines</h2>

In [5]:
class HolidaySelector(BaseEstimator, TransformerMixin):
    """Flag rows whose 'date' falls on a selected US holiday or on the day before one.

    The lookup table is built once, at construction time, from ``holidays.US``
    for the years 2013-2018. A holiday is selected when its name starts with
    one of the listed prefixes; the day before each selected date is added too.
    """

    def __init__(self):
        # Prefix match on the holiday name.
        # "Independence Day" was previously misspelled "Independence Date" and never matched.
        wanted = ("New Year's Day", "Washington's Birthday", "Memorial Day", "Independence Day",
                  "Labor Day", "Thanksgiving", "Christmas Day")
        hd = [date for date, name in holidays.US(years=[2013, 2014, 2015, 2016, 2017, 2018]).items()
              if name.startswith(wanted)]
        hd_eve = [day - timedelta(days=1) for day in hd]
        hd.extend(hd_eve)
        # Stored as 'YYYY-MM-DD' strings so transform can compare formatted dates directly.
        self.h = [str(date) for date in hd]

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a one-column frame ('date') holding 1 for holiday/eve rows and 0 otherwise.

        X must provide a 'date' column that ``pd.to_datetime`` can parse.
        """
        # Parse and format the whole column at once rather than cell by cell.
        days = pd.to_datetime(X['date']).dt.strftime('%Y-%m-%d')
        return days.isin(self.h).astype('int64').to_frame()
    
class DateTimeExtractor(BaseEstimator, TransformerMixin):
    """Extract one datetime attribute (e.g. 'hour', 'month') from the 'date' column as floats.

    Parameters
    ----------
    extract : str
        Name of the attribute read from each parsed timestamp via ``getattr``.
    """

    def __init__(self, extract):
        self.extract = extract

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a one-column frame ('date') with ``float(timestamp.<extract>)`` per row.

        X must provide a 'date' column that ``pd.to_datetime`` can parse.
        """
        # Parse the column once; the original re-parsed every cell inside applymap,
        # which is both slow and deprecated in recent pandas.
        stamps = pd.to_datetime(X['date'])
        return stamps.map(lambda ts: float(getattr(ts, self.extract))).to_frame()

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the configured columns of the input frame."""

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X.loc[:, self.columns]``."""
        wanted = self.columns
        return X.loc[:, wanted]


In [6]:
def _build_feature_pipeline(float_columns, category_columns=None):
    """Assemble the shared feature pipeline: a one-hot branch and a standard-scaled branch.

    float_columns    -- columns passed (together with the extracted year) to StandardScaler.
    category_columns -- optional raw columns added to the one-hot branch; when None the
                        'categories' step is left out entirely.
    """
    onehot_parts = [
        ('hour', DateTimeExtractor('hour')),
        ('month', DateTimeExtractor('month')),
        ('dayofweek', DateTimeExtractor('dayofweek')),
    ]
    if category_columns is not None:
        onehot_parts.append(('categories', ColumnSelector(category_columns)))
    onehot_parts.append(('holiday', HolidaySelector()))

    onehot_pipeline = Pipeline([
        ('onehot_union', FeatureUnion(onehot_parts)),
        ('onehot_encoder', OneHotEncoder(sparse=False)),
    ])
    float_pipeline = Pipeline([
        ('float_union', FeatureUnion([
            ('year', DateTimeExtractor('year')),
            ('floats', ColumnSelector(float_columns)),
        ])),
        ('scaler', StandardScaler()),
    ])
    return Pipeline([
        ('union', FeatureUnion([
            ('onehot_pipeline', onehot_pipeline),
            ('float_pipeline', float_pipeline),
        ])),
    ])


# Features for the dense input: calendar one-hots plus 'sunrise'/'icon', and scaled weather floats.
all_pipeline = _build_feature_pipeline(
    float_columns=['precip_prob', 'temperature', 'humidity', 'wind_speed'],
    category_columns=['sunrise', 'icon'],
)

# Features for the recurrent input: calendar one-hots, and scaled weather floats plus 'rides'.
time_pipeline = _build_feature_pipeline(
    float_columns=['precip_prob', 'temperature', 'humidity', 'wind_speed', 'rides'],
)


In [7]:
with warnings.catch_warnings():
    # Suppress every warning raised while the pipelines are fitted and applied.
    warnings.simplefilter("ignore")

    # Fit on the training rows only, then transform all rows with the fitted pipeline.
    all_X = all_pipeline.fit(X.iloc[:train_cut]).transform(X)
    time_X = time_pipeline.fit(X.iloc[:train_cut]).transform(X)

In [8]:
# joblib is a standalone package; recent scikit-learn releases no longer ship
# sklearn.externals.joblib, so prefer the direct import and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist both fitted pipelines next to the notebook.
joblib.dump(all_pipeline, 'all_pipeline.pkl', compress=1)
joblib.dump(time_pipeline, 'time_pipeline.pkl', compress=1)

['time_pipeline.pkl']

In [9]:
def create_datasets(time_X, all_X, start, stop, lookback_days=28, lag_days=2, steps_per_day=24):
    """Build the two model inputs for every row index in ``[start, stop)``.

    For row ``i`` the temporal input holds ``lookback_days`` rows of ``time_X``,
    one every ``steps_per_day`` rows, covering
    ``time_X[i - lookback - lag : i - lag : steps_per_day]``
    where ``lookback = lookback_days * steps_per_day`` and
    ``lag = lag_days * steps_per_day``. The second input is simply ``all_X[i]``.

    Parameters
    ----------
    time_X, all_X : 2-D array-like (rows x features)
    start, stop : int
        Half-open row range. ``start`` is raised to ``lookback + lag`` when
        smaller, so that every window lies inside the data.
    lookback_days, lag_days, steps_per_day : int
        Window geometry; the defaults reproduce the original hard-coded
        28-day lookback, 2-day lag and 24 rows per day.

    Returns
    -------
    (input_time_X, input_all_X) : float64 arrays of shape
        ``(n, lookback_days, time_features)`` and ``(n, all_features)``,
        with ``n = max(stop - start, 0)`` after clamping ``start``.

    Raises
    ------
    IndexError
        If a requested row lies beyond the end of the input arrays.
    """
    time_X = np.asarray(time_X)
    all_X = np.asarray(all_X)

    lookback = lookback_days * steps_per_day
    lag = lag_days * steps_per_day

    # Rows earlier than this do not have a full history window behind them.
    start = max(start, lookback + lag)

    # An empty range (stop <= start) yields empty outputs instead of a negative-dimension error.
    rows = np.arange(start, stop)
    # Offsets, relative to the target row, of the rows that make up its window.
    offsets = np.arange(-(lookback + lag), -lag, steps_per_day)

    # Broadcasting (n, 1) + (lookback_days,) gathers every window in one indexing operation.
    input_time_X = time_X[rows[:, None] + offsets].astype(np.float64)
    input_all_X = all_X[rows].astype(np.float64)

    return input_time_X, input_all_X

# Training windows (create_datasets raises start=0 to the first row with a full history)
# followed by the validation windows.
input_time_X, input_all_X = create_datasets(time_X=time_X, all_X=all_X, start=0, stop=train_cut)
v_input_time_X, v_input_all_X = create_datasets(time_X=time_X, all_X=all_X, start=train_cut, stop=validate_cut)

In [10]:
# Recurrent branch: one sample is a (timesteps, features) slice of input_time_X.
temporal = Input(shape=input_time_X[0].shape, name='temporal')
temporal1 = GRU(32, dropout=0.5, recurrent_dropout=0.5, name='temporal_gru_1')(temporal)
temporal1 = Dense(64, name='temporal_dense_1')(temporal1)
temporal1 = Dropout(0.5, name='temporal_dropout_1')(temporal1)

# Dense branch: one sample is a single feature row of input_all_X.
weather = Input(shape=input_all_X[0].shape, name='all')
weather1 = Dense(64, activation='relu', name='all_dense_1')(weather)
weather1 = Dropout(0.5, name='all_dropout_1')(weather1)

# Merge both branches and regress a single value.
concat = concatenate([temporal1, weather1], name='concat')
concat1 = Dense(128, activation='relu', name='concat_dense_1')(concat)
concat1 = Dropout(0.5, name='concat_dropout_1')(concat1)
concat1 = Dense(128, activation='relu', name='concat_dense_2')(concat1)
concat1 = Dropout(0.5, name='concat_dropout_2')(concat1)
output = Dense(1, name='output')(concat1)

model = Model(inputs=[temporal, weather], outputs=output)
model.compile(loss='mse', optimizer='adam')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
temporal (InputLayer)           (None, 28, 51)       0                                            
__________________________________________________________________________________________________
temporal_gru_1 (GRU)            (None, 32)           8064        temporal[0][0]                   
__________________________________________________________________________________________________
all (InputLayer)                (None, 60)           0                                            
__________________________________________________________________________________________________
temporal_dense_1 (Dense)        (None, 64)           2112        temporal_gru_1[0][0]             
__________________________________________________________________________________________________
all_dense_

In [11]:
from keras.utils.vis_utils import plot_model

# Write a diagram of the layer graph, annotated with shapes and layer names, to model.png.
plot_model(model, show_layer_names=True, show_shapes=True, to_file='model.png')

In [12]:
callbacks_list = [
    # Stop training once val_loss has gone 20 epochs without improving.
    EarlyStopping(
        monitor='val_loss',
        patience=20
    ),
    # Multiply the learning rate by 0.1 after 10 epochs without val_loss improvement.
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=10
    ),
    # Keep only the best model (by val_loss) on disk.
    ModelCheckpoint(
        filepath='new_model_5.h5',
        monitor='val_loss',
        save_best_only=True
    )
]

# Training targets start at row 30*24 = 720: create_datasets drops the first
# lookback + lag (28*24 + 2*24) rows, so the targets must skip them as well.
# The validation target is converted with .to_numpy() like every other target
# in the notebook, rather than being passed as an index-carrying pandas Series.
model.fit([input_time_X, input_all_X], y[30*24:train_cut].to_numpy(), epochs=100, batch_size=64,
          validation_data=([v_input_time_X, v_input_all_X], y[train_cut:validate_cut].to_numpy()),
          callbacks=callbacks_list)

Train on 32333 samples, validate on 4132 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100


Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1c004184390>

In [13]:
# Reload the model from the file the ModelCheckpoint callback writes to.
model = load_model('new_model_5.h5')

# Loss on the training windows, then on the validation windows.
print(model.evaluate(x=[input_time_X, input_all_X], y=y[30*24:train_cut].to_numpy()))
print(model.evaluate(x=[v_input_time_X, v_input_all_X], y=y[train_cut:validate_cut].to_numpy()))

51777.24979298675
188623.02609964908
