In [None]:
import numpy as np
import pandas as pd
import pickle
from tqdm.notebook import tqdm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import sys
sys.path.append('../utils')
from utils import load_processed_data, cv, get_test_metrics

## Baseline Models
- Selects a few stations from the preprocessed Fwy 405 N data
- Uses very basic prediction methods (e.g. using mean values)
- Considers MAE and RMSE on the test data
- Allows us to see baseline accuracy metrics

Note: must run the notebook `/preprocessing/preprocess.ipynb` to get the required data, or import processed data from Google Drive explained on main README.

In [None]:
station_meta = pd.read_csv('../data/processed/fwy_405_n_ds/meta.csv')
station_meta.head()

In [None]:
_, _, station_data = load_processed_data('../data/processed/rdp_ds')
station_data.head()

In [None]:
with open('./env.dat', 'rb') as f:
    ENV = pickle.load(f)

### Construct Model

In [None]:
class baseline_mod:
    def __init__(self, pred_method='mean'):
        self.train_data = None
        if pred_method in ['mean', 'mean_tod']: # tod=time of day
            self.pred_method = pred_method
        else:
            raise Exception("Prediction method must be 'mean' or 'mean_tod'!")
    
    def fit(self, train_data):
        if type(train_data) != pd.Series: 
            raise Exception('Train data must be of type pd.Series!')
        self.train_data = train_data
    
    def predict(self, test_data):
        if self.train_data is None: 
            raise Exception('You must fit before predicting!')
        if type(test_data) != pd.Series:
            raise Exception('Test data must be of type pd.Series!')
        
        if self.pred_method == 'mean_tod':
            df = self.train_data.to_frame().reset_index().rename(columns={'index': 'Time', self.train_data.name: 'Speed'})
            df['H'] = [x.hour for x in df['Time']]
            df['M'] = [x.minute for x in df['Time']]
            dm = df.groupby(['H', 'M']).mean()['Speed']

            preds = [dm[x.hour][x.minute] for x in test_data.index] 
            return np.array(preds)
        else:
            mean = np.mean(train_data.values)
            return np.array([mean for x in range(test_data.shape[0])])

### Train Test Split

In [None]:
# good example stations to try
#   717711  (difficult)
#   716659  (many random outliers)
#   761455  (periodic)
#   772455  (very periodic)

# select station randomly chosen from env var
STATION = ENV['station_id']

# use may, june as train data and july as test data
df = station_data.loc[station_data.index.month.isin([5, 6, 7]), STATION]
train_data = df[(df.index.month == 5) | (df.index.month == 6)]
test_data = df[df.index.month == 7]

In [None]:
train_data.plot()

### Mean Fill
Simply predict the mean value

In [None]:
baseline = baseline_mod(pred_method='mean')
baseline.fit(train_data)
train_preds = baseline.predict(train_data)
test_preds = baseline.predict(test_data)

In [None]:
# with open('./trained/baseline/baseline_mean.dat', 'wb') as f:
#     pickle.dump(baseline, f)

In [None]:
import plotly.graph_objects as go

In [34]:
# results
print('MAE:  %.3f' % mean_absolute_error(test_data.values, test_preds))
print('RMSE: %.3f' % mean_squared_error(test_data.values, test_preds, squared=False))

fig = go.Figure()
fig.add_trace(go.Line(x=df.index, y=df, name='True Values'))
fig.add_trace(go.Line(x=train_data.index, y=train_preds, name='Predicted Values (Train)'))
fig.add_trace(go.Line(x=test_data.index, y=test_preds.flatten(), name='Predicted Values (Test)'))
fig.update_layout(
    title="Baseline (Mean) Forecast Results",
    xaxis_title="Time",
    yaxis_title="Forecast")

MAE:  4.541
RMSE: 6.317



plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [None]:
# fig.write_html('../plots/baseline_mean.html')

Evaluate:

In [None]:
cv_metrics = cv(baseline, train_data)
cv_metrics

In [None]:
test_metrics = get_test_metrics(test_data, test_preds)
test_metrics

In [None]:
# metrics = {'cv': cv_metrics, 'test': test_metrics}

# with open('./trained/baseline/metrics_baseline_mean.dat', 'wb') as f:
#     pickle.dump(metrics, f)

### Daily Mean
Predict using the mean value based on the time of day

In [None]:
baseline = baseline_mod(pred_method='mean_tod')
baseline.fit(train_data)
train_preds = baseline.predict(train_data)
test_preds = baseline.predict(test_data)

In [None]:
# with open('./trained/baseline/baseline_tod.dat', 'wb') as f:
#     pickle.dump(baseline, f)

In [None]:
# results
print('MAE:  %.3f' % mean_absolute_error(test_data.values, test_preds))
print('RMSE: %.3f' % mean_squared_error(test_data.values, test_preds, squared=False))

fig = go.Figure()
fig.add_trace(go.Line(x=df.index, y=df, name='True Values'))
fig.add_trace(go.Line(x=train_data.index, y=train_preds, name='Predicted Values (Train)'))
fig.add_trace(go.Line(x=test_data.index, y=test_preds.flatten(), name='Predicted Values (Test)'))
fig.update_layout(
    title="Baseline (Mean TOD) Forecast Results",
    xaxis_title="Time",
    yaxis_title="Forecast")

In [None]:
# fig.write_html('../plots/baseline_tod.html')

Evaluate:

In [None]:
cv_metrics = cv(baseline, train_data, metrics=['mse', 'mae', 'rmse', 'r2'])
cv_metrics

In [None]:
test_metrics = get_test_metrics(test_data, test_preds)
test_metrics

In [None]:
# metrics = {'cv': cv_metrics, 'test': test_metrics}

# with open('./trained/baseline/metrics_baseline.dat', 'wb') as f:
#     pickle.dump(metrics, f)