# 8. Evaluation

In this notebook, we compare the prediction results on the test set from the Uber model to predictions from Facebook's Prophet Bayesian time series forecasting package.

In [1]:
# allows for import from `src` directory
import sys 
sys.path.append('../')

from src import data
from src import utils
from src import inference
from src import evaluation

from models.predict import *

import numpy as np
import pandas as pd

params = utils.read_json_params('../parameters.json')
df, dataloaders = data.full_pipeline(params)

Data already downloaded

43910 train rows from 2012-10-02 09:00:00 to 2017-10-05 23:00:00
4320 valid rows from 2017-10-05 23:00:00 to 2018-04-03 23:00:00
4321 test rows from 2018-04-03 23:00:00 to 2018-09-30 23:00:00

17341 samples of 48 input steps and 4 output steps in train
3622 samples of 48 input steps and 4 output steps in valid
4060 samples of 48 input steps and 4 output steps in test



## 7.1 Get Uber results

Run the Uber inference algorithm and rescale the results back to the original scale.

In [2]:
def equal(a, b, tol=1):
    a,b = int(a),int(b)
    return np.allclose(a,b,rtol=1)

def reinflate(df, inference_results):
    Y, Y_hat, Y_hat_2upper, Y_hat_2lower, Y_hat_upper, Y_hat_lower = inference_results
    
    X_orig = df['traffic_volume'].values
    Y_test = Y
    steps = 5
    windows = np.lib.stride_tricks.as_strided(X_orig, shape=[len(X_orig)-steps+1, steps], strides=(8,8))
    test_start_idx = np.where((Y_test[:steps] ==  windows).all(axis=1))[0][0]

    results = []
    idx = 0
    for r,row in enumerate(df[['traffic_volume']].iloc[test_start_idx:].itertuples()):
        try:
            if np.isnan(row.traffic_volume):
                continue
            else:
                if equal(row.traffic_volume, Y_test[idx]):
                    results.append([row.Index, Y_test[idx], Y_hat[idx], Y_hat_2upper[idx], Y_hat_2lower[idx], Y_hat_upper[idx], Y_hat_lower[idx]])
                    idx +=1
        except:
            pass

    df_inference = pd.DataFrame(np.vstack(results))
    full_dates = pd.DataFrame(pd.date_range(results[0][0], results[-1][0], freq='H'))
    df_inference = df_inference.merge(full_dates, left_on=0, right_on=0, how='right')
    cols = ['datetime','Y', 'Y_hat', 'Y_hat_2upper', 'Y_hat_2lower', 'Y_hat_upper', 'Y_hat_lower']
    df_inference = df_inference.rename({i:cols[i] for i in np.arange(df_inference.shape[-1])}, axis=1).set_index(cols[0])
    df_inference = df_inference.astype({col: np.float64 for col in df_inference.columns})
    
    return df_inference

In [3]:
results = inference.run(params, dataloaders)
Y, Y_hat, Y_hat_2upper, Y_hat_2lower, Y_hat_upper, Y_hat_lower = results

100%|██████████| 200/200 [02:01<00:00,  1.65it/s]


In [4]:
df_uber = reinflate(df, results)

## 7.2 Facebook Prophet

For comparision to the model we have implemented, we will train Faceook's Prophet with the training and validation dataset, then use it for predictions on the test set. We will then evaluate and compare performance with a range of metrics.

In [5]:
import fbprophet

# copy the data up to the end of validation set from df to a new prophet friendly dataframe
test_start = '2018-04-05 23:00:00'
test_end = '2018-09-28 20:00:00'

df_for_prophet = (df.reset_index()[['date_time','traffic_volume']]
                .rename({'date_time': 'ds', 'traffic_volume': 'y'}, axis=1))
df_for_prophet = df_for_prophet[df_for_prophet['ds'] < test_start]
prophet = fbprophet.Prophet()
_ = prophet.fit(df_for_prophet) # 23:22:40

#### Create a range of hourly datetime values over the test dataset date range

In [6]:
n_test_hours = int((np.datetime64(test_end) - np.datetime64(test_start)) /  
                   np.timedelta64(60*60,'s')) + 1
future = prophet.make_future_dataframe(periods=n_test_hours,freq='H')

#### Make predictions

In [7]:
forecast = prophet.predict(future)
df_prophet = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
df_prophet = df_prophet[df_prophet['ds'] >= test_start].set_index('ds')

## 7.3 Calculate evaluation metrics

In [8]:
y = df_uber.Y.values

### Mean absolute error

In [9]:
print('\t | MAE')
for label,predictions in zip(['Uber', 'Prophet'], [df_uber.Y_hat, df_prophet.yhat]):
    print(f'{label:8s} | {evaluation.mae(y,predictions.values):.4f}')

	 | MAE
Uber     | 280.4710
Prophet  | 680.9752


### Root mean squared error

In [10]:
print('\t | RMSE')
for label,predictions in zip(['Uber', 'Prophet'], [df_uber.Y_hat, df_prophet.yhat]):
    print(f'{label:8s} | {evaluation.rmse(y,predictions.values):.4f}')

	 | RMSE
Uber     | 490.9205
Prophet  | 955.8478


### Mean absolute percentage error

In [11]:
print('\t | MAPE')
for label,predictions in zip(['Uber', 'Prophet'], [df_uber.Y_hat, df_prophet.yhat]):
    print(f'{label:8s} | {evaluation.mape(y,predictions.values):.4f}')

	 | MAPE
Uber     | 0.1299
Prophet  | 0.4125


### Symmetric mean absolute percentage error

In [12]:
print('\t | SMAPE')
for label,predictions in zip(['Uber', 'Prophet'], [df_uber.Y_hat, df_prophet.yhat]):
    print(f'{label:8s} | {evaluation.smape(y,predictions.values):.4f}')

	 | SMAPE
Uber     | 0.0289
Prophet  | 0.0243
