In [1]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt 
from statsmodels.tsa.arima_model import ARIMA 
from sklearn.metrics import mean_squared_error

TICKER_DIR = "C:\\Users\\anton\\Documents\\antoniouaa\\msc_thesis\\data\\tickers\\ticker_data"

dataset = pd.read_csv(os.path.join(TICKER_DIR, "AAPL.csv"))
dataset = dataset.dropna(how="any")
dataset

Unnamed: 0,date,close,high,low,open,volume,adjClose,adjHigh,adjLow,adjOpen,adjVolume,divCash,splitFactor
0,2016-01-04T00:00:00.000Z,105.35,105.368,102.0000,102.61,67649387,97.940352,97.957086,94.825970,95.393066,67649387,0.0,1.0
1,2016-01-05T00:00:00.000Z,102.71,105.850,102.4100,105.75,55790992,95.486033,98.405185,95.207133,98.312219,55790992,0.0,1.0
2,2016-01-06T00:00:00.000Z,100.70,102.370,99.8700,100.56,68457388,93.617403,95.169946,92.845780,93.487250,68457388,0.0,1.0
3,2016-01-07T00:00:00.000Z,96.45,100.130,96.4300,98.68,81094428,89.666321,93.087494,89.647728,91.739477,81094428,0.0,1.0
4,2016-01-08T00:00:00.000Z,96.96,99.110,96.7600,98.55,70798016,90.140451,92.139234,89.954518,91.618621,70798016,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,2019-12-24T00:00:00.000Z,284.27,284.890,282.9197,284.69,12119714,282.839838,283.456719,281.496331,283.257725,12119714,0.0,1.0
1002,2019-12-26T00:00:00.000Z,289.91,289.980,284.7000,284.82,23334004,288.451463,288.521111,283.267674,283.387071,23334004,0.0,1.0
1003,2019-12-27T00:00:00.000Z,289.80,293.970,288.1200,291.12,36592936,288.342016,292.491037,286.670468,289.655375,36592936,0.0,1.0
1004,2019-12-30T00:00:00.000Z,291.52,292.690,285.2200,289.46,36059614,290.053363,291.217477,283.785058,288.003727,36059614,0.0,1.0


In [3]:
def evaluate_arima(X, order):
    size = int(len(X) * 0.6)
    train, test = X[:size], X[size:]
    history = [x for x in train]
    predictions = []
    # perform walk-forward validation
    for t in range(len(test)):
        # fit arima model to the history of values
        model = ARIMA(history, order=order)
        model_fit = model.fit(disp=-1)
        # predict the next value
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        history.append(test[t])

    error = mean_squared_error(test, predictions)
    return error

In [4]:
def evaluate_models(X, p_values, d_values, q_values):
    scores = []
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    mse = evaluate_arima(X, order)
                    scores.append((mse, order))
                    print(f"ARIMA ORDER: {order}\tMSE: {mse}")
                except:
                    continue
    return scores

In [5]:
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning
warnings.simplefilter("ignore", ConvergenceWarning)
warnings.simplefilter("ignore", HessianInversionWarning)

X = dataset["close"].values
scores = evaluate_models(X, [0, 1, 2, 4, 6], range(3), range(3))
best_score = sorted(scores)[0]
print(f"BEST SCORE: {best_score[1]}\tORDER: {best_score[0]}")

ARIMA ORDER: (0, 0, 0)	MSE: 3821.8814001182213
ARIMA ORDER: (0, 0, 1)	MSE: 1033.7350948344583
ARIMA ORDER: (0, 1, 0)	MSE: 11.734324374072088
ARIMA ORDER: (0, 1, 1)	MSE: 11.815273785838126
ARIMA ORDER: (0, 1, 2)	MSE: 11.798110453292773
ARIMA ORDER: (0, 2, 0)	MSE: 23.82513037931285
ARIMA ORDER: (0, 2, 1)	MSE: 11.796213936505604
ARIMA ORDER: (0, 2, 2)	MSE: 11.888266339347307
ARIMA ORDER: (1, 0, 0)	MSE: 11.817900605795554
ARIMA ORDER: (1, 1, 0)	MSE: 11.810913253059034
ARIMA ORDER: (1, 1, 1)	MSE: 11.892296187819516
ARIMA ORDER: (1, 2, 0)	MSE: 18.653292474651664
ARIMA ORDER: (1, 2, 1)	MSE: 11.880698118481483
ARIMA ORDER: (1, 2, 2)	MSE: 11.841949775738277
ARIMA ORDER: (2, 0, 0)	MSE: 11.943224546804979
ARIMA ORDER: (2, 1, 0)	MSE: 11.79766354031549
ARIMA ORDER: (2, 1, 1)	MSE: 11.791101446322925
ARIMA ORDER: (2, 1, 2)	MSE: 12.041865524239054
ARIMA ORDER: (2, 2, 0)	MSE: 16.561400234927344
ARIMA ORDER: (2, 2, 1)	MSE: 11.861757302275702
ARIMA ORDER: (2, 2, 2)	MSE: 11.943216025427253
ARIMA ORDER: (4

In [None]:
# Try a prediction with the optimal ARIMA parameters
X = dataset["close"].values
model = ARIMA(X, (4, 1, 2))
model_ = model.fit(disp=False)
yhat = model_.forecast()[0]
print(f"Current Value: {X[-1]}\nForecast: {yhat}")

In [None]:
# two step forecast
forecasting = model_.forecast(steps=5)
forecast_vals = forecasting[0]
# reshape the arrays to have the same length
preds = np.zeros(len(X)+len(forecast_vals))
preds[-len(forecast_vals):] = forecast_vals
preds[preds==0] = np.NaN
# plot them on top of each other
plt.plot(X)
plt.plot(preds)

In [None]:
test_set = pd.read_csv(os.path.join(TICKER_DIR, "MA_BAP.csv"))
test_set = test_set.dropna(how="any")
test_X = test_set["close"].values
scores = evaluate_models(test_X, [0, 1, 2, 4, 6], range(3), range(3))
best_score = sorted(scores)[0]
print(f"BEST SCORE: {best_score[1]}\tORDER: {best_score[0]}")

In [3]:
# Unify all data and normalize them to [0, 1]
# Then perform grid search to find the optimal ARIMA hyperparameters

import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

TICKER_DIR = "C:\\Users\\anton\\Documents\\antoniouaa\\msc_thesis\\data\\tickers\\ticker_data\\_Rolling"
tickers = []
os.getcwd()
cols = ["close"]
for tick in os.listdir(TICKER_DIR):
    path = os.path.join(TICKER_DIR, tick)
    tick_df = pd.read_csv(path, header=0, usecols=cols, squeeze=True)
    tickers.append(tick_df)

df = pd.concat(tickers)
X = df.values.reshape(-1, 1)
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(X)
scaled_df

array([[0.41735308],
       [0.42277325],
       [0.42311468],
       ...,
       [0.06845632],
       [0.06773078],
       [0.06794418]])

In [4]:
from statsmodels.tsa.arima_model import ARIMA 
from sklearn.metrics import mean_squared_error
from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning
import warnings

warnings.simplefilter("ignore", ConvergenceWarning)
warnings.simplefilter("ignore", HessianInversionWarning)

possible_orders = ((0, 1, 2, 4, 6), (0, 1, 2), (0, 1, 2))
scores = evaluate_models(scaled_df, [0, 1, 2, 4, 6], range(3), range(3))
# evaluate_arima(scaled_df, (0, 1, 0))

ARIMA ORDER: (0, 0, 0)	MSE: 0.026282427501433427
ARIMA ORDER: (0, 0, 1)	MSE: 0.0076512836084998296
ARIMA ORDER: (0, 1, 0)	MSE: 0.0003019037547837042
ARIMA ORDER: (0, 1, 1)	MSE: 0.00030193748483150297
ARIMA ORDER: (0, 1, 2)	MSE: 0.0003019847439414797
ARIMA ORDER: (0, 2, 0)	MSE: 0.0006024538250254549
ARIMA ORDER: (0, 2, 1)	MSE: 0.00030210259710299677
ARIMA ORDER: (0, 2, 2)	MSE: 0.0003024305351752679
ARIMA ORDER: (1, 0, 0)	MSE: 0.00030115859104032697
ARIMA ORDER: (1, 0, 1)	MSE: 0.0003011833907391868
ARIMA ORDER: (1, 0, 2)	MSE: 0.00030123177428806086
ARIMA ORDER: (1, 1, 0)	MSE: 0.000301939417601608
ARIMA ORDER: (1, 2, 0)	MSE: 0.0004606290157417548
ARIMA ORDER: (1, 2, 1)	MSE: 0.000302133480945515
ARIMA ORDER: (1, 2, 2)	MSE: 0.00030236192926837765
ARIMA ORDER: (2, 0, 0)	MSE: 0.00030118579173162333
ARIMA ORDER: (2, 1, 0)	MSE: 0.0003019846697362073
ARIMA ORDER: (2, 2, 0)	MSE: 0.00041057501490515637
ARIMA ORDER: (2, 2, 1)	MSE: 0.00030260409230600945
ARIMA ORDER: (2, 2, 2)	MSE: 0.000302346406419

In [6]:
a_order = (2, 0, 0)
model = ARIMA(scaled_df, a_order)
model_ = model.fit(disp=False)
yhat = model_.forecast()[0]
print(f"Current Value: {X[-1]}\nForecast: {yhat}")

NameError: name 'scaled_df' is not defined