# ARIMA Model

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import torch

# Put the directory three levels above the working directory on the import
# path; the `src.*` imports that follow rely on it.
src_path = os.path.abspath('../../..')
already_importable = src_path in sys.path
if not already_importable:
    sys.path.append(src_path)

from src.models.arima import MultiARIMA
from src.utils.datasets import HSMD

## Load the Huge Stock Market Dataset

As a proof of concept, load only the `Close` column of the `NFLX` stock. Load the data in 1-day windows (ARIMA considers the series one data point at a time).

In [None]:
# Restrict the dataset to the NFLX `Close` column, one data point per window.
dataset = HSMD(
    '../../../data/HSMD',
    stocks=['nflx'],
    columns=['Close'],
    window_size=1,
)

## Split the data into train and test sets

Use 80% of the data for the training set, and 20% of the data for the test set. Seed the PRNG to get the same split every time, for the purpose of effective cross-validation.

In [None]:
TRAIN_PCT = 0.8  # fraction of the samples assigned to the train set
n_samples = len(dataset)
train_size = int(n_samples * TRAIN_PCT)  # int() truncates, so the test set absorbs the remainder
test_size = n_samples - train_size

In [None]:
# ARIMA models the dependence between consecutive observations, so the series
# must stay in order. A shuffled `random_split` scrambles the series and puts
# future prices in the train set, so split chronologically instead: the first
# `train_size` windows form the train set, the remaining ones the test set.
# The split is deterministic, so no PRNG seed is required.
# NOTE(review): assumes `dataset` is indexed in chronological order; confirm
# against the HSMD implementation.
train_set = torch.utils.data.Subset(dataset, range(train_size))
test_set = torch.utils.data.Subset(dataset, range(train_size, train_size + test_size))

In [None]:
# ARIMA is fed a plain list of observations: cast every training window to
# float and flatten it to one dimension.
history = [window.astype(float).flatten() for window in train_set]

## Train an ARIMA model and summarize the fit

In [None]:
# order=(p, d, q): 5 autoregressive lags, first-order differencing, no
# moving-average terms.
model = ARIMA(endog=history, order=(5, 1, 0))
fit = model.fit()
summary = fit.summary()
print(summary)

## Plot the residuals of the fit. 

The residual error of the model ranges from about -100 to +150.

The distribution of the residuals seems to be centered below 0. Since a residual is the observed value minus the fitted value, this indicates the model typically overestimates the stock price.

In [None]:
# Wrap the residuals of the fit in a DataFrame so they can be plotted directly.
residuals = pd.DataFrame(fit.resid)
# First the residuals themselves as a line plot (the pandas default kind),
# then a density plot of their distribution, each in its own figure.
for plot_kind in ('line', 'kde'):
    residuals.plot(kind=plot_kind)
    plt.show()

## Evaluate the model on the test set

For each data point in the test set, we fit the model on the preceding `history` (the train set plus every test point seen so far) and create a forecast for the next value from this fit.

In [None]:
# For every test window: refit ARIMA on everything in `history`, forecast the
# next value, record it with the true value, then add the true value to
# `history` so the next refit sees it.
preds, gt = [], []  # predictions, ground truth
# Lazy conversion: each window is cast and flattened only when the loop reaches it.
windows = (sample.astype(float).flatten() for sample in test_set)
for window in windows:
    model = ARIMA(endog=history, order=(5, 1, 0))
    fit = model.fit()
    preds.append(fit.forecast())
    gt.append(window)
    history.append(window)

## Visualize model predictions

In [None]:
# Overlay the forecasts on the true values, one point per test window.
plt.figure(figsize=(15, 10))
time = np.arange(0, len(gt))
plt.plot(time, gt, 'g', label='Ground Truths')
plt.plot(time, preds, 'b', label='Predictions')
plt.xlabel('Time')
plt.ylabel('Stock Price')
# The `label=` arguments above are only rendered once a legend is requested.
plt.legend()
plt.show()