# Load Data

In [None]:
import pickle
import yaml
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from random import choice
import sys
sys.path.append('../utils')
from utils import load_processed_data, cv, get_test_metrics

In [None]:
# Load the processed dataset; the loader's three return values are unpacked
# into adj_mat, ind_station_mapper and speed_df (speed_df is the table the
# rest of the notebook works from).
processed_dir = '../data/processed/rdp_ds'
adj_mat, ind_station_mapper, speed_df = load_processed_data(processed_dir)

In [None]:
# Read the shared run configuration; later cells look up ENV['station_id'].
# NOTE(review): FullLoader can construct more than plain scalars/lists/maps.
# If env.yaml only holds plain data, yaml.safe_load would be the safer
# loader -- confirm the file's contents before switching.
with open('../models/env.yaml') as env_file:
    ENV = yaml.load(env_file, Loader=yaml.FullLoader)

**Choose Station**

In [None]:
# Select the configured station's column, then keep only the observations
# whose timestamp falls in May, June or July.
station_column = speed_df[ENV['station_id']]
is_may_to_july = station_column.index.month.isin([5, 6, 7])
station_speed = station_column[is_may_to_july]

In [None]:
# Line plot of the selected station's speed over the May-July window.
fig = px.line(
    x=station_speed.index,
    y=station_speed,
    title='Time Series Plot',
)
# The chained call returns the figure, so it is rendered as the cell output.
fig.update_xaxes(title_text='Time').update_yaxes(title_text='Speed (mph)')

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Decompose with a one-day seasonal period. The notebook treats lag 288 as the
# daily lag (12 samples per hour * 24 hours, i.e. 5-minute sampling), so the
# period is 12 * 24 = 288. The earlier expression 5*12*24 evaluates to 1440
# (five days), which does not match the daily lag used for differencing.
fig = seasonal_decompose(station_speed, period=12 * 24).plot()
fig.set_size_inches(13, 8)

We clearly see a seasonal pattern that recurs at the same time each day, during the night. We will remove it by differencing the series at lag 288 (the daily lag).

In [None]:
import numpy as np

In [None]:
# Seasonal differencing: subtract the value 288 rows earlier (the daily lag)
# and drop the rows that have no such predecessor.
station_speed_no_seasonal = station_speed.diff(288).dropna()
fig = px.line(
    x=station_speed_no_seasonal.index,
    y=station_speed_no_seasonal,
    title='Time Series Plot w/ Seasonal Component Removed',
)
# The chained call returns the figure, so it is rendered as the cell output.
fig.update_xaxes(title_text='Time').update_yaxes(title_text='')

In [None]:
# First difference of the seasonally differenced series (each value minus the
# one immediately before it); the first row has no predecessor and is dropped.
station_speed_no_seasonal_stationary = station_speed_no_seasonal.diff().dropna()
fig = px.line(
    x=station_speed_no_seasonal_stationary.index,
    y=station_speed_no_seasonal_stationary,
    title='Time Series Plot w/ Seasonal Component Removed and First Difference',
)
# The chained call returns the figure, so it is rendered as the cell output.
fig.update_xaxes(title_text='Time').update_yaxes(title_text='')

In [None]:
# Re-run the decomposition on the seasonally differenced series to check what
# daily structure is left. The period is one day: 12 * 24 = 288 rows, the same
# daily lag used for the differencing (5*12*24 would be 1440, i.e. five days).
fig = seasonal_decompose(station_speed_no_seasonal, period=12 * 24).plot()
fig.set_size_inches(13, 8)

**Test if Time Series is Stationary**

$H_0:$ The time series is non-stationary.
<br>
$H_1:$ The time series is stationary.

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# Augmented Dickey-Fuller test on the twice-differenced series. Only the first
# two entries of the result (test statistic and p-value) are kept.
adf_result = adfuller(station_speed_no_seasonal_stationary.dropna())
test_stat, p = adf_result[:2]
p

Because the p-value is below the usual 0.05 significance level, we reject $H_0$ and conclude that the differenced time series is stationary.

**Build Auto ARIMA Model**

In [None]:
# The series to model is the seasonal (lag-288) difference followed by a first
# difference:
#   (y_t - y_{t-288}) - (y_{t-1} - y_{t-289})
#     = y_t - (y_{t-288} + y_{t-1} - y_{t-289})
# `differencing` is the bracketed offset, so `station_speed - differencing` is
# exactly the series that was tested for stationarity, and adding
# `differencing` back to a differenced value restores the speed scale.
# Leaving out the lag-289 term would instead model y_t - y_{t-1} - y_{t-288}.
# NOTE(review): any ARIMA order hard-coded elsewhere in the notebook should be
# re-checked against the auto-ARIMA result obtained on this series.
differencing = (
    station_speed.shift(288)
    + station_speed.shift(1)
    - station_speed.shift(289)
)
train_raw = station_speed - differencing
# Train on May-June; rows whose lags fall before the start of the data are NaN
# in train_raw and are removed in `train`.
train_raw = train_raw[train_raw.index.month.isin([5, 6])]
train = train_raw.dropna()
# Hold out July, kept on the original speed scale.
test = station_speed[station_speed.index.month == 7]

In [None]:
import pmdarima as pmd

In [None]:
def arimamodel(timeseriesarray):
    """Run pmdarima's automatic ARIMA search on a series and return the result.

    Parameters
    ----------
    timeseriesarray
        The series handed to ``pmd.auto_arima`` as its first argument.

    Returns
    -------
    Whatever ``pmd.auto_arima`` returns for the given series.
    """
    # Search settings forwarded verbatim: starting p and q of 1, "adf" as the
    # `test` option, and trace output switched on.
    search_options = dict(start_p=1, start_q=1, test="adf", trace=True)
    return pmd.auto_arima(timeseriesarray, **search_options)

# Run the automatic order search on the differenced training series.
arima_model = arimamodel(train)
# Last expression of the cell, so the fitted model's summary is displayed.
arima_model.summary()

In [None]:
# with open('./trained/ARIMA/arima(1,0,5).dat', 'wb') as f:
#     pickle.dump(arima_model, f)

Evaluate:

In [None]:
# # use to load model for evaluation instead of training above
# import pickle
# with open('./trained/ARIMA/arima(1,0,5).dat', 'rb') as f:
#     arima_model = pickle.load(f)

In [None]:
import statsmodels.api as sm

# need to create ARIMA model class as we need to feed in a model that can be trained to perform CV. auto arima can't be retrained.
class custom_arima_mod:
    """Re-fittable ARIMA wrapper exposing ``fit`` / ``predict`` for cross-validation.

    Attributes are plain instance fields:
    ``train`` holds the series passed to the last ``fit`` call (``None`` before),
    ``mod`` holds the fitted statsmodels results object (``None`` before).
    """

    def __init__(self):
        self.train = None
        self.mod = None

    def fit(self, train, order=(1, 0, 5)):  # use same order as order determined from auto arima
        """Fit ``sm.tsa.arima.ARIMA`` with the given ``order`` on ``train``.

        Parameters
        ----------
        train
            Training series passed to the statsmodels ARIMA constructor.
        order
            ``(p, d, q)`` tuple forwarded as the ARIMA ``order``.
        """
        self.train = train
        # Only store the fitted results; if fitting raises, `self.mod` keeps
        # its previous value instead of being left as an unfitted model.
        self.mod = sm.tsa.arima.ARIMA(self.train, order=order).fit()

    def predict(self, test):
        """Forecast as many steps ahead as ``test`` has elements.

        Only the length of ``test`` is used; its values are not read.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.mod is None:
            raise RuntimeError("custom_arima_mod.predict() called before fit()")
        return self.mod.forecast(steps=len(test))
        
# Cross-validate a re-fittable ARIMA on the differenced training series.
m = custom_arima_mod()
cv_metrics = cv(m, train, metrics=['mse', 'mae', 'rmse', 'r2'])

# The model was fitted on the differenced series, so its forecasts are on that
# scale too. Score them against the July target on the same scale (speed minus
# its differencing offset) rather than against the raw speeds in `test`; this
# also keeps the test metrics comparable with the CV metrics above.
test_target = test - differencing.loc[test.index]
test_preds = arima_model.predict(test.shape[0])
test_metrics = get_test_metrics(test_target, test_preds)

# metrics = {'cv': cv_metrics, 'test': test_metrics}

# with open('./trained/ARIMA/metrics_ARIMA(1,0,5).dat', 'wb') as f:
#     pickle.dump(metrics, f)

In [None]:
# Predictions and confidence bounds come back on the differenced scale; adding
# the matching `differencing` offset restores the original speed scale.
# Offsets are looked up on the exact index the model was fitted on / forecasts
# for, and added by position, so the result does not depend on whether the
# model returns NumPy arrays or pandas objects with their own index.

# --- training period (in-sample) ---
train_offset = differencing.loc[train.index].to_numpy()
train_preds, train_conf = arima_model.predict_in_sample(return_conf_int=True, alpha=0.05)
# train_conf stays an (n, 2) array: column 0 = lower bound, column 1 = upper.
train_conf = np.asarray(train_conf) + train_offset[:, None]
train_preds = pd.Series(np.asarray(train_preds) + train_offset, index=train.index)

# --- test period (forecast) ---
test_offset = differencing.loc[test.index].to_numpy()
test_preds, test_conf = arima_model.predict(test.shape[0], return_conf_int=True, alpha=0.05)
# test_conf stays an (n, 2) array: column 0 = lower bound, column 1 = upper.
test_conf = np.asarray(test_conf) + test_offset[:, None]
test_preds = pd.Series(np.asarray(test_preds) + test_offset, index=test.index)

In [None]:
def _add_ci_band(figure, x, conf, name, fillcolor):
    """Add a shaded band between conf[:, 0] (lower) and conf[:, 1] (upper)."""
    invisible = 'rgba(0,0,0,0)'
    figure.add_traces([
        # Upper bound first: the lower bound then fills up to it ('tonexty').
        go.Scatter(x=x, y=conf[:, 1],
                   mode='lines', line_color=invisible,
                   showlegend=False),
        go.Scatter(x=x, y=conf[:, 0],
                   mode='lines', line_color=invisible,
                   name=name,
                   fill='tonexty', fillcolor=fillcolor),
    ])


fig = go.Figure()
# go.Scatter(mode='lines') replaces the deprecated go.Line.
fig.add_trace(go.Scatter(x=station_speed.index, y=station_speed,
                         mode='lines', name='True Values'))
fig.add_trace(go.Scatter(x=train.index, y=train_preds,
                         mode='lines', name='Predicted Values (Train)',
                         line=dict(color='rgb(255, 0, 0)')))
fig.add_trace(go.Scatter(x=test.index, y=test_preds,
                         mode='lines', name='Predicted Values (Test)',
                         line=dict(color='rgb(44, 160, 44)')))
fig.update_layout(
    title="ARIMA (1, 0, 5) Forecast Results",
    xaxis_title="Time",
    yaxis_title="Forecast")

# The train band is drawn over train.index: train_conf has one row per fitted
# observation, whereas train_raw.index also contains the leading NaN rows and
# would shift the band in time.
_add_ci_band(fig, train.index, train_conf, '95% CI (Train)', 'rgba(255, 0, 0, 0.2)')
_add_ci_band(fig, test.index, test_conf, '95% CI (Test)', 'rgba(44, 160, 44, 0.2)')

# Last expression of the cell, so the figure is rendered.
fig


In [None]:
# fig.write_html('../plots/ARIMA(1,0,5).html')

**Analyze PACF and ACF**

In [None]:
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [None]:
# Two stacked panels: ACF on top, PACF below, both for the twice-differenced
# series and both up to lag 50.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(18, 12))
fig = plot_acf(station_speed_no_seasonal_stationary, lags=50, ax=ax1)
fig = plot_pacf(station_speed_no_seasonal_stationary, lags=50, ax=ax2)

The PACF suggests 1 significant lag for the AR part of the model, and the ACF suggests 1 significant lag for the MA part. This supports the conclusion that the auto ARIMA method produced a reasonable model.