In [None]:
import config
import os
import random
import utils

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn import preprocessing

import pmdarima as pm
from pmdarima.arima import auto_arima
from pmdarima.arima import CHTest
from pmdarima.arima import ADFTest, KPSSTest
from pmdarima.arima.utils import ndiffs, nsdiffs
from pmdarima.utils import tsdisplay
from pmdarima.preprocessing import BoxCoxEndogTransformer

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Seed every random number generator the notebook touches so reruns match.
seed_value = 42
np.random.seed(seed_value)
random.seed(seed_value)
# Exported as a string because environment variables must be text.
os.environ['PYTHONHASHSEED'] = f"{seed_value}"

Load Data

In [None]:
# Read the weekly dataset from the CSV one directory above the notebook,
# then display the resulting frame.
saltlake_week = pd.read_csv('../saltlake_week.csv')
saltlake_week

In [None]:
# Target series: the date and VMT columns from row 53 onward.
# .copy() detaches the slice from saltlake_week so the column assignment
# below writes to an independent frame instead of a slice of the original
# (avoids pandas' SettingWithCopyWarning / chained-assignment pitfalls).
y = saltlake_week[['Day', 'VMT (Veh-Miles)']].iloc[53:, :].copy()
y.columns = ['Week', 'VMT']
# Parse the date strings so the column can later serve as a datetime index.
y['Week'] = pd.to_datetime(y['Week'])
y.head()

In [None]:
# Show the last rows of y to see where the series ends.
y.tail()

In [None]:
# Promote the Week column to the index so y is indexed by date.
y = y.set_index('Week')
y.head()

In [None]:
from pmdarima.utils import pacf

# Compute the partial autocorrelation values of y first, then draw them.
vmt_pacf = pacf(y)
plt.plot(vmt_pacf)

In [None]:
from pandas.plotting import autocorrelation_plot

# Draw pandas' autocorrelation plot for y.
autocorrelation_plot(y)

In [None]:
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf

# Autocorrelation plot of the VMT column.
plot_acf(y['VMT'])

In [None]:
# Partial autocorrelation plot of the VMT column, using the 'ywm' estimation method.
plot_pacf(y['VMT'], method='ywm')

In [None]:
# Exogenous regressors, listed once so the selection and the final column
# names cannot drift apart.
exog_columns = ['Cases', 'News Sentiment', 'Unemployment Rate', 'PRCP', 'SNWD',
                'Percent_Fully_Vaccinated_5&Older', 'TAVG',
                'Stay at Home', 'Mask', 'School Opening', 'Health Emergency', 'Holiday']

# Rows 53..208 of the date column plus the regressors. .copy() makes X an
# independent frame, so later column assignments on X do not write into a
# slice of saltlake_week (avoids SettingWithCopyWarning).
X = saltlake_week[['Day'] + exog_columns].iloc[53:209, :].copy()
# Only the date column is renamed ('Day' -> 'Week'); the rest keep their names.
X.columns = ['Week'] + exog_columns
X.head()

In [None]:
# Parse the Week column into datetimes, then list the dtypes of every column.
X['Week'] = pd.to_datetime(X['Week'])
X.dtypes

In [None]:
# Promote the Week column to the index so X is indexed by date.
X = X.set_index('Week')
X.head()

In [None]:
# One autocorrelation plot per regressor, titled with the regressor's name.
for feature in X.columns:
    plot_acf(X[feature], title=feature)

Test X for Stationarity

In [None]:
def adf_test(series, a=0.05, name=''):
    """Run an Augmented Dickey-Fuller check on ``series`` and print the verdict.

    Parameters
    ----------
    series : array-like
        Observations handed to ``ADFTest.should_diff``.
    a : float, default 0.05
        Significance level. It configures the test itself and is also the
        threshold the p-value is compared against.
    name : str, default ''
        Label printed in front of the result.

    Returns
    -------
    None
        The outcome is printed, not returned.
    """
    # Pass the caller's significance level through: the test used to be built
    # with a fixed 0.05 no matter what `a` was, so the test and the threshold
    # below could disagree.
    tester = ADFTest(alpha=a)
    # should_diff returns a pair; its first item is the p-value.
    p_value = tester.should_diff(series)[0]

    verdict = "Stationary. " if p_value <= a else "Non-stationary."
    print(f" {name} : P-Value = {p_value} => {verdict}")

# Report the ADF verdict for each regressor at its original level.
for feature in X.columns:
    adf_test(X[feature], name=feature)

If any series is non-stationary, difference the data until every series is stationary.

In [None]:
# First difference of every regressor; rows containing NaN (at least the
# first row produced by diff) are dropped before re-testing.
X_diff = X.diff().dropna()
for feature in X_diff.columns:
    adf_test(X_diff[feature], name=feature)

In [None]:
# Difference the already-differenced regressors once more and re-run the
# ADF check on each resulting column.
X_diff2 = X_diff.diff().dropna()
for feature in X_diff2.columns:
    adf_test(X_diff2[feature], name=feature)

Visualize stationary data

In [None]:
# One separate figure per twice-differenced regressor, titled with its name.
for feature in X_diff2.columns:
    plt.figure()
    plt.plot(X_diff2[feature])
    plt.title(feature)

In [None]:
# Rows and columns left after the two rounds of differencing.
X_diff2.shape

Stationarity test on y

In [None]:
# ADF check on the undifferenced target.
adf_test(y, name='VMT')

Find best value of d

In [None]:
# Difference the target once and re-run the ADF check on the result.
y_diff = y.diff().dropna()
adf_test(y_diff, name='VMT') # d=1

Find best value of D (seasonal differencing order)

In [None]:
# Estimate the seasonal differencing order for a 52-period season with the
# 'ch' test, searching up to max_D=12.
nsdiffs(y, m=52, max_D=12, test='ch')

In [None]:
# Scale the target with the project helper; it returns the scaler and the scaled values.
# NOTE(review): the whole series is passed in here, including the rows later
# held out for testing. If utils.scale fits the scaler on its input, test data
# leaks into the scaling; confirm, and consider fitting on training rows only.
y_scaler, y_scaled = utils.scale(y)

In [None]:
# Wrap the scaled values in a DataFrame and display it.
y_scaled = pd.DataFrame(y_scaled)
y_scaled

In [None]:
# Restore the column name and reuse y's index so the scaled frame lines up with y.
y_scaled.columns = ['VMT']
y_scaled.index = y.index
y_scaled

Separate data into training and testing sets (the last 4 weeks are held out for testing)

In [None]:
# X_diff2 was differenced twice and had NaN rows dropped each time, so the
# target is offset by two rows to line up with it.
n_dropped = 2
n_train = 150

# The first n_train aligned rows are for training; everything after is test.
trainX_diff2 = X_diff2.iloc[:n_train, :]
testX_diff2 = X_diff2.iloc[n_train:, :]
trainY = y_scaled.iloc[n_dropped:n_dropped + n_train, :]
testY = y_scaled.iloc[n_dropped + n_train:, :]

In [None]:
# Print the shape of each split, in the order: train X, train y, test X.
for split in (trainX_diff2, trainY, testX_diff2):
    print(split.shape)

Visualize training and testing sets

In [None]:
# Draw the training and test portions of the scaled target on the same axes.
plt.plot(trainY)
plt.plot(testY)

Create and fit ARIMA model

In [None]:
# Stepwise search for a seasonal ARIMA (52-period season) on the scaled
# target, with the twice-differenced regressors as exogenous inputs.
# d and D are pinned to 1; the remaining orders are searched within the
# max_* bounds.
arima_model = auto_arima(
    trainY, trainX_diff2,
    d=1, D=1, m=52, seasonal=True,
    start_P=0, start_Q=0,
    max_p=10, max_d=5, max_q=5,
    max_P=5, max_D=5, max_Q=5,
    stepwise=True, n_fits=50, random_state=seed_value,
    trace=True, error_action='warn',
    # Fixed spelling: this was `supress_warnings`, which auto_arima does not
    # recognise as its warning switch; the misspelled keyword was treated as
    # an extra fit argument instead.
    suppress_warnings=True,
)

In [None]:
# Show the fitted model's summary.
arima_model.summary()

In [None]:
# Show the parameters the selected model was configured with.
arima_model.get_params()

In [None]:
# Forecast one step per held-out exogenous row, so the horizon always matches
# the rows supplied in X instead of relying on a hard-coded count.
y_pred = arima_model.predict(n_periods=len(testX_diff2), X=testX_diff2)
# Depending on the pmdarima version, predict may return a pandas Series
# (which has no .reshape) or an ndarray; np.asarray handles both before the
# values are reshaped into a single column and mapped back to original units.
y_pred_inv = y_scaler.inverse_transform(np.asarray(y_pred).reshape((-1, 1)))

In [None]:
# Wrap the unscaled forecasts in a DataFrame indexed like the test regressors.
prediction = pd.DataFrame(y_pred_inv,index=testX_diff2.index, columns=['VMT'])
prediction

In [None]:
def _to_original_units(scaled_frame):
    """Undo the target scaling, keeping the frame's index and the VMT column name."""
    restored = y_scaler.inverse_transform(scaled_frame)
    return pd.DataFrame(restored, index=scaled_frame.index, columns=['VMT'])


trainY_inv = _to_original_units(trainY)
testY_inv = _to_original_units(testY)

In [None]:
# Training data, held-out data and forecasts on one figure, in original units.
plt.figure(figsize=(8, 5))
for frame, label in (
    (trainY_inv, "Training"),
    (testY_inv, "Test"),
    (prediction, "Predicted"),
):
    plt.plot(frame, label=label)
plt.legend(loc='upper left')
plt.show()

In [None]:
from sklearn.metrics import r2_score
# Put the forecasts next to the observed values so the metrics below can
# read both from one frame.
testY_inv = pd.DataFrame(testY_inv)
testY_inv['Predicted_VMT'] = prediction
testY_inv

In [None]:
# Close-up of the held-out weeks: observed VMT against the forecasts.
plt.figure(figsize=(8, 5))
for values, label in ((testY_inv['VMT'], "Test"), (prediction, "Predicted")):
    plt.plot(values, label=label)
plt.legend(loc='upper left')
plt.show()

In [None]:
# R^2 of the forecasts against the observed test values.
r2_score(testY_inv['VMT'], testY_inv['Predicted_VMT'])

In [None]:
# Mean absolute percentage error of the forecasts.
mean_absolute_percentage_error(testY_inv['VMT'], testY_inv['Predicted_VMT'])

In [None]:
# Mean squared error of the forecasts.
mean_squared_error(testY_inv['VMT'], testY_inv['Predicted_VMT'])

In [None]:
# Root mean squared error of the forecasts. The `squared=False` keyword is
# deprecated and has been removed from recent scikit-learn releases, so the
# root is taken explicitly, which works on every version.
np.sqrt(mean_squared_error(testY_inv['VMT'], testY_inv['Predicted_VMT']))

In [None]:
import pickle

# Persist the fitted model next to the notebook so it can be reloaded later.
with open('arima.pkl', 'wb') as model_file:
    pickle.dump(arima_model, model_file)