# Load Data

In [None]:
import pickle
import yaml
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import tensorflow as tf
import sys
sys.path.append('../utils')
from utils import load_processed_data, cv, get_test_metrics

In [None]:
# Load the preprocessed dataset; the loader's return value is unpacked into three objects.
# NOTE(review): judging by the names these are an adjacency matrix, an
# index-to-station mapping and a per-station speed table - confirm against
# utils.load_processed_data.
adj_mat, ind_station_mapper, speed_df = load_processed_data('../data/processed/rdp_ds')

In [None]:
# Read the experiment settings (station id, lag count, epoch counts, ...) into ENV.
# NOTE(review): yaml.FullLoader is not meant for untrusted input; if env.yaml only
# holds plain scalars/lists/mappings, yaml.safe_load would be the stricter choice - confirm.
with open('../models/env.yaml') as f:
    ENV = yaml.load(f, Loader=yaml.FullLoader)

**Choose Station**

In [None]:
# Take the configured station's series and keep only the May-July observations.
months_to_keep = [5, 6, 7]
station_speed = speed_df[ENV['station_id']]
in_selected_months = station_speed.index.month.isin(months_to_keep)
station_speed = station_speed[in_selected_months]
station_speed

In [None]:
# Line plot of the selected station's speed over time; the axis updates return
# the figure itself, so the chained expression still renders it as the cell output.
fig = px.line(x=station_speed.index, y=station_speed, title='Time Series Plot')
fig.update_xaxes(title='Time').update_yaxes(title='Speed (mph)')

# Prepare Data for Model

In [None]:
import numpy as np

In [None]:
# Positional split: the number of May/June rows is the cutoff, so everything up to
# it is training data and the rest is test data (assumes rows are in time order).
in_train_months = station_speed.index.month.isin([5, 6])
cutoff = int(in_train_months.sum())
train, test = station_speed.iloc[:cutoff], station_speed.iloc[cutoff:]

# Standardise both splits using statistics taken from the training split only,
# so no information from the test period enters the scaling.
train_mean, train_std = np.mean(train), np.std(train)
train = (train - train_mean) / train_std
test = (test - train_mean) / train_std

In [None]:
class WindowGenerator:
    """Sliding-window helper that turns the train/test series into windowed datasets.

    One window covers ``input_width + shift`` consecutive positions. The first
    ``input_width`` positions are the model inputs; the last ``label_width``
    positions are the labels.

    Parameters
    ----------
    input_width : number of leading positions used as inputs.
    label_width : number of trailing positions used as labels.
    shift : how many positions the window extends beyond the inputs.
    train, test : raw splits; each must provide ``to_frame()`` (it is called
        by ``get_train`` / ``get_test``).
    """

    def __init__(self, input_width, label_width, shift, train, test):
        # Raw splits, kept untouched until a dataset is requested.
        self.train, self.test = train, test

        # Window geometry.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift
        self.total_window_size = input_width + shift

        # Inputs occupy the head of the window.
        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        # Labels occupy the tail of the window.
        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def split_window(self, features):
        """Cut a batch of full windows into ``(inputs, labels)`` along the second axis."""
        inputs = features[:, self.input_slice, :]
        labels = features[:, self.labels_slice, :]

        # Slicing drops the static shape, so pin the known window lengths again;
        # this keeps the resulting `tf.data.Dataset` easy to inspect.
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])

        return inputs, labels

    def make_dataset(self, data):
        """Return a dataset of ``(inputs, labels)`` pairs, one window per batch, in order."""
        values = np.array(data, dtype=np.float32)
        windows = tf.keras.utils.timeseries_dataset_from_array(
            data=values,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=False,
            batch_size=1,
        )
        return windows.map(self.split_window)

    def get_train(self):
        """Windowed dataset built from the training split."""
        return self.make_dataset(self.train.to_frame())

    def get_test(self):
        """Windowed dataset built from the test split."""
        return self.make_dataset(self.test.to_frame())

    def __repr__(self):
        summary = [
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
        ]
        return '\n'.join(summary)

In [None]:
# Lag count taken from the ENV settings; passed on as the window's input width.
num_lags = ENV['num_lags']

In [None]:
def get_train_test_for_mod(num_lags):
    """Build model-ready arrays from the module-level ``train`` / ``test`` splits.

    Each sample uses ``num_lags`` consecutive observations as input and the
    observation immediately after them as the target.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy arrays. The ``X`` arrays stack one
        input window per sample; the ``y`` arrays are reshaped to ``(n_samples, 1)``.
    """
    # sliding window: `num_lags` inputs followed by a single one-step-ahead label
    window = WindowGenerator(input_width=num_lags, label_width=1, shift=1,
                             train=train, test=test)
    mod_train = window.get_train()
    mod_test = window.get_test()

    def stack_windows(dataset):
        # Every dataset element is a batch holding exactly one window, so take
        # element 0 of each batch and stack the windows into plain arrays.
        inputs, targets = [], []
        for X, y in dataset:
            inputs.append(X[0, :, :])
            targets.append(y[0, :, :])
        inputs = np.array(inputs)
        targets = np.array(targets)
        return inputs, targets.reshape(targets.shape[0], 1)

    X_train, y_train = stack_windows(mod_train)
    X_test, y_test = stack_windows(mod_test)
    return X_train, y_train, X_test, y_test

In [None]:
# Windowed train/test arrays for the configured lag count.
X_train, y_train, X_test, y_test = get_train_test_for_mod(num_lags)

# Build Model

In [None]:
from keras.models import Sequential
from keras.layers import Conv1D, Dense
import keras
from time import time
import tensorflow as tf

In [None]:
# Baseline CNN: a single convolution whose kernel spans `num_lags` steps,
# followed by a small dense head that outputs one value.
model = Sequential([
    Conv1D(filters=32, kernel_size=(num_lags,), activation='relu'),
    Dense(units=32, activation='relu'),
    Dense(units=1),
])
model.compile(optimizer='adam', loss='mean_squared_error')

# wall-clock timestamps taken right before and after the fit
start = time()
model.fit(X_train, y_train, epochs=ENV['dl_train_epochs'], batch_size=1)
end = time()

In [None]:
# model.save('./trained/CNN')

Tune:

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Grid search over the lag window. For every candidate lag count the model is
# re-fit `n_repeats` times on the same training windows and scored (RMSE, in the
# original units) on the test windows. The `split{i}_test_score` column names are
# kept for downstream cells, but these are repeated fits, not disjoint CV folds.
# NOTE(review): scoring on the test split means the selected lag count has seen
# the test data; consider a separate validation split.
n_repeats = 5
score_cols = [f'split{i}_test_score' for i in range(n_repeats)]
cv_rows = []
for lags in [1, 3, 5, 10]:  # separate name so the global `num_lags` is left untouched
    row = {'mean_fit_time': 0, 'param_num_lags': lags}
    X_train_cv, y_train_cv, X_test_cv, y_test_cv = get_train_test_for_mod(lags)

    # The labels are normalised like the inputs; bring them back to the original
    # scale once so they are comparable with the inverted predictions below.
    y_test_cv_unscaled = np.asarray(y_test_cv).reshape(-1) * train_std + train_mean

    for score_col in score_cols:
        model = Sequential()
        model.add(Conv1D(filters=32, kernel_size=(lags,), activation='relu'))
        model.add(Dense(units=32, activation='relu'))
        model.add(Dense(units=1))

        model.compile(loss='mean_squared_error', optimizer='adam')
        start = time()
        model.fit(X_train_cv, y_train_cv, epochs=ENV['dl_train_epochs'], batch_size=1)
        end = time()

        # invert predictions; flatten because the network output may carry extra
        # singleton axes that the metric function does not accept
        test_cv_preds = np.asarray(model.predict(X_test_cv)).reshape(-1)
        test_cv_preds = test_cv_preds * train_std + train_mean

        row['mean_fit_time'] += (end - start) / n_repeats
        # RMSE via sqrt(MSE): works regardless of whether the installed
        # scikit-learn still accepts the `squared=False` argument
        row[score_col] = float(np.sqrt(mean_squared_error(y_test_cv_unscaled, test_cv_preds)))
    cv_rows.append(row)

# Build the frame once from the collected rows (DataFrame.append no longer exists
# in pandas >= 2.0); column order matches the original layout.
cv_results = pd.DataFrame(cv_rows, columns=['mean_fit_time', 'param_num_lags'] + score_cols)

In [None]:
# Pick the row with the lowest mean score across the split columns.
split_cols = [col for col in cv_results.columns if 'split' in col]
mean_scores = cv_results[split_cols].astype(float).mean(axis=1)
# idxmin returns an index label, so look the row up by label
best_results_row = cv_results.loc[mean_scores.idxmin()]

# Expose the tuned values under their bare names: later cells read
# best_params['num_lags'], whereas the result columns are prefixed with 'param_'.
# The only tuned parameter is a lag count, so cast to int for use as a kernel size.
best_params = {
    col[len('param_'):]: int(best_results_row[col])
    for col in cv_results.columns
    if col.startswith('param_')
}

In [None]:
# with open('./trained/CNN/grid_search_CNN.dat', 'wb') as f:
#     pickle.dump({'best_params': best_params, 'results': cv_results}, f)

# Best Model

In [None]:
# Rebuild the network with the lag count selected during tuning and fit it on
# training windows of that length.
model = Sequential([
    Conv1D(filters=32, kernel_size=(best_params['num_lags'],), activation='relu'),
    Dense(units=32, activation='relu'),
    Dense(units=1),
])
model.compile(optimizer='adam', loss='mean_squared_error')

X_train, y_train, X_test, y_test = get_train_test_for_mod(best_params['num_lags'])
model.fit(X_train, y_train, epochs=ENV['dl_train_epochs'], batch_size=1)

In [None]:
# model.save('./trained/CNN')

# Evaluate

In [None]:
# Predict on both splits and undo the (x - train_mean) / train_std normalisation
# so the predictions are back on the scale of the raw series.
train_preds = model.predict(X_train) * train_std + train_mean
test_preds = model.predict(X_test) * train_std + train_mean

In [None]:
import plotly.graph_objects as go

In [None]:
# Overlay the de-normalised train/test predictions on the observed series.
# Each prediction belongs to the step right after its input window, so the first
# `lags` timestamps of each split have no prediction. The window length is read
# from X_train (the arrays the current model was fit on) rather than the global
# `num_lags`, which need not match the lag count chosen by tuning.
lags = X_train.shape[1]
timestamps = station_speed.index

fig = go.Figure()
# go.Scatter(mode='lines') is the supported replacement for the deprecated go.Line
fig.add_trace(go.Scatter(x=timestamps, y=station_speed, mode='lines', name='True Values'))
fig.add_trace(go.Scatter(x=timestamps[lags:cutoff], y=train_preds.flatten(),
                         mode='lines', name='Predicted Values (Train)'))
fig.add_trace(go.Scatter(x=timestamps[(cutoff + lags):], y=test_preds.flatten(),
                         mode='lines', name='Predicted Values (Test)'))
fig.update_layout(
    title="CNN Forecast Results",
    xaxis_title="Time",
    yaxis_title="Forecast")

In [None]:
# fig.write_html('../plots/CNN.html')

In [None]:
# Cross-validated metrics for the final model, computed by the project's `cv` helper
# on the training arrays.
cv_options = dict(
    metrics=['mse', 'mae', 'rmse', 'r2'],
    epochs=ENV['dl_cv_epochs'],
    verbose=True,
    folds=ENV['cv_folds'],
)
cv_metrics = cv(model, [X_train, y_train], **cv_options)

# Put the test labels back on the original scale so they match `test_preds`.
# Note that `y_test` itself is overwritten, so re-running this cell rescales it again.
y_test = y_test * train_std + train_mean
test_metrics = get_test_metrics(y_test.flatten(), test_preds.flatten())

metrics = {'cv': cv_metrics, 'test': test_metrics}

In [None]:
# with open('./trained/CNN/metrics_CNN.dat', 'wb') as f:
#     pickle.dump(metrics, f)