# Setup

In [1]:
cd /home/mitchellkwong/onedrive/workbins/BT4013/BT4013-toofdoctor

/home/mitchellkwong/onedrive/workbins/BT4013/BT4013-toofdoctor


In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
import pmdarima as pm
from tqdm import tqdm

import utils

# Models

In [3]:
from models.numeric.arima import ArimaWrapper

# Verify persistence is working

In [4]:
import os
import pickle

def variable_scheme(y):
    """Select the 'CLOSE' values dated before 2021-01-01 and keep the final 10.

    `y` must be indexed by dates comparable to a `datetime` and have a
    'CLOSE' column. Fewer than 10 values are returned when fewer rows
    precede the cutoff.
    """
    cutoff = datetime(2021, 1, 1)
    in_train = y.index < cutoff
    closes = y.loc[in_train, 'CLOSE']
    return closes.tail(10)

data = utils.load_futures_data()
models = {}

# Fit all models
for future in tqdm(data):
    models[future] = ArimaWrapper(y=variable_scheme)
    models[future].fit(data, future, error_action='ignore')

try:
    # Save all models right after fitting, before any predict call is made
    for future in tqdm(data):
        with open(f'{future}.p', 'wb') as f:
            pickle.dump(models[future], f)

    # Prediction for t+1
    pred1 = [models[future].predict(data, future) for future in tqdm(data)]

    # Prediction for t+2 (a second predict call on the same models is expected to move one step on)
    pred2 = [models[future].predict(data, future) for future in tqdm(data)]

    # Reload all models from the snapshots taken before the predict calls
    for future in tqdm(data):
        with open(f'{future}.p', 'rb') as f:
            # pickle.load can execute arbitrary code; these files were written by this cell just above
            models[future] = pickle.load(f)

    # Prediction for t+1 again
    pred3 = [models[future].predict(data, future) for future in tqdm(data)]
finally:
    # Remove temp files even when a step above raised, so no stray pickles stay in the working directory
    for future in tqdm(data):
        if os.path.exists(f'{future}.p'):
            os.remove(f'{future}.p')

# np.any / np.all reduce over the whole array, so the checks also hold up when
# predict returns more than one value per future (builtin any/all would not).
assert np.any(np.array(pred1) != np.array(pred2)), 'predictions for t+1 and t+2 should differ'
assert np.all(np.array(pred1) == np.array(pred3)), 'predictions for t+1 should match before and after reload'

100%|██████████| 88/88 [00:30<00:00,  2.86it/s]
100%|██████████| 88/88 [00:00<00:00, 638.35it/s]
100%|██████████| 88/88 [00:01<00:00, 70.11it/s]
100%|██████████| 88/88 [00:01<00:00, 67.57it/s]
100%|██████████| 88/88 [00:00<00:00, 752.32it/s]
100%|██████████| 88/88 [00:01<00:00, 69.88it/s]
100%|██████████| 88/88 [00:00<00:00, 51978.42it/s]


# Model Tuning

In [5]:
class ArimaRaw(ArimaWrapper):
    """ArimaWrapper variant whose target is the untransformed close price."""

    # Directory used by the notebook to store and look up pickled instances
    SAVED_DIR = 'saved_models/numeric/arima'

    def __init__(self):
        # Hand the bound method to the wrapper as its `y` argument
        super().__init__(y=self.transform_predictor)

    def transform_predictor(self, data):
        """Return every 'CLOSE' value dated before 2021-01-01 (the train split).

        No row limit is applied: all rows before the cutoff are kept.
        """
        in_train = data.index < datetime(2021, 1, 1)
        return data.loc[in_train, 'CLOSE']

class ArimaLinear(ArimaWrapper):
    """ArimaWrapper variant whose target is the natural log of the close price."""

    # Directory used by the notebook to store and look up pickled instances
    SAVED_DIR = 'saved_models/numeric/arimalinear'

    def __init__(self):
        # Hand the bound method to the wrapper as its `y` argument
        super().__init__(y=self.transform_predictor)

    def transform_predictor(self, data):
        """Return log('CLOSE') for rows dated before 2021-01-01."""
        in_train = data.index < datetime(2021, 1, 1)
        closes = data.loc[in_train, 'CLOSE']
        # np.log gives NaN for negative closes and -inf for zero closes
        return np.log(closes)

class ArimaNoTrend(ArimaWrapper):
    """ArimaWrapper variant whose target is the first difference of the close price."""

    # Directory used by the notebook to store and look up pickled instances
    SAVED_DIR = 'saved_models/numeric/arimanotrend'

    def __init__(self):
        # Hand the bound method to the wrapper as its `y` argument
        super().__init__(y=self.transform_predictor)

    def transform_predictor(self, data):
        """Return consecutive differences of 'CLOSE' for rows dated before 2021-01-01.

        The result is a NumPy array one element shorter than the selected rows.
        """
        in_train = data.index < datetime(2021, 1, 1)
        closes = data.loc[in_train, 'CLOSE']
        return np.diff(closes)

class ArimaLinearNoTrend(ArimaWrapper):
    """ArimaWrapper variant whose target is the first difference of the log close price."""

    # Directory used by the notebook to store and look up pickled instances
    SAVED_DIR = 'saved_models/numeric/arimalinearnotrend'

    def __init__(self):
        # Hand the bound method to the wrapper as its `y` argument
        super().__init__(y=self.transform_predictor)

    def transform_predictor(self, data):
        """Return consecutive differences of log('CLOSE') for rows dated before 2021-01-01.

        The result is a NumPy array one element shorter than the selected rows.
        """
        in_train = data.index < datetime(2021, 1, 1)
        log_closes = np.log(data.loc[in_train, 'CLOSE'])
        return np.diff(log_closes)

In [6]:
# Candidate model classes keyed by the directory their fitted instances are saved in
candidates = {
    model.SAVED_DIR: model
    for model in (
        ArimaRaw,
        ArimaLinear,
        ArimaNoTrend,
        ArimaLinearNoTrend,
    )
}

In [7]:
# Load all models (fit and save if no usable saved model exists)
data = utils.load_futures_data()
models = {}
for name, model in candidates.items():
    # Make sure the save directory exists so the first run can write its pickles
    os.makedirs(model.SAVED_DIR, exist_ok=True)
    for future in tqdm(data):
        path = f'{model.SAVED_DIR}/{future}.p'
        try:
            # NOTE: pickle.load can execute arbitrary code; only point this at files this notebook wrote
            with open(path, 'rb') as f:
                models[name, future] = pickle.load(f)
        except (FileNotFoundError, EOFError, pickle.UnpicklingError):
            # Missing or truncated/corrupt pickle: refit and overwrite it.
            # Anything else (e.g. KeyboardInterrupt, or a pickle that references a class
            # that is no longer defined) propagates instead of silently triggering a refit.
            models[name, future] = model()
            models[name, future].fit(data, future, out_of_sample_size=1500, error_action='ignore')
            with open(path, 'wb') as f:
                pickle.dump(models[name, future], f)

# One row per (save_dir, future) pair, with the fitted wrapper in the 'arima' column
models = pd.DataFrame.from_dict(models, orient='index')
models.index = pd.MultiIndex.from_tuples(models.index, names = ['save_dir', 'future'])
models.columns = ['arima']
models.head()

100%|██████████| 88/88 [00:02<00:00, 30.75it/s]
100%|██████████| 88/88 [00:01<00:00, 46.43it/s]
100%|██████████| 88/88 [00:00<00:00, 108.41it/s]
100%|██████████| 88/88 [00:01<00:00, 72.61it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,arima
save_dir,future,Unnamed: 2_level_1
saved_models/numeric/arima,F_AD,<__main__.ArimaRaw object at 0x7f7d9eb266d0>
saved_models/numeric/arima,F_BO,<__main__.ArimaRaw object at 0x7f7d9eb20110>
saved_models/numeric/arima,F_BP,<__main__.ArimaRaw object at 0x7f7d9cd462d0>
saved_models/numeric/arima,F_C,<__main__.ArimaRaw object at 0x7f7d9cd56490>
saved_models/numeric/arima,F_CC,<__main__.ArimaRaw object at 0x7f7d9cd5c4d0>


In [12]:
# Describe each fitted model and pull its out-of-sample score, then export both columns
models['name'] = models['arima'].apply(lambda wrapper: str(wrapper.model))
models['MSE'] = models['arima'].apply(lambda wrapper: wrapper.model.oob_)
models.to_csv('model_metrics/arima_summary.csv', columns=['name', 'MSE'])

# Model Summary

In [3]:
# Read the exported metrics back; no index_col is given, so 'save_dir' and 'future' come back as ordinary columns
summary = pd.read_csv('model_metrics/arima_summary.csv')

In [7]:
# Square root of the mean MSE per save_dir, i.e. an RMSE-style figure (the column keeps its 'MSE' label).
# Only the numeric 'MSE' column is averaged: pandas >= 2.0 raises on string columns instead of dropping them.
# NOTE(review): each candidate is presumably scored on its own transformed series (raw, log, differenced),
# so these values may not be directly comparable across candidates -- TODO confirm.
np.sqrt(summary.groupby('save_dir')[['MSE']].mean())

Unnamed: 0_level_0,MSE
save_dir,Unnamed: 1_level_1
saved_models/numeric/arima,146559.598031
saved_models/numeric/arimalinear,0.185371
saved_models/numeric/arimalinearnotrend,0.01515
saved_models/numeric/arimanotrend,9424.600275
