In [1]:
import os
import pandas as pd
import numpy as np
import datetime as dt
import pathlib
import re
import json
import math
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp
from statsmodels.tsa.ar_model import AutoReg, ar_select_order

# JSON file holding the list of symbols to process (presumably Binance BTC
# pairs, going by the file name -- TODO confirm), parsed once up front.
symbols_list_file_name = "binance_BTC_from_2019_05_01_to_2022_04_30.json"
symbols_list = json.loads(pathlib.Path(symbols_list_file_name).read_text())

# Folder that holds the per-symbol return files read later on.
input_folder_name = "full_returns_winsorised"

In [2]:
# Note: we never have to save anything (there is no need, and it takes too much space);
# this code is simply re-run whenever we run the models.

In [None]:
# TODO: consider shifting the data labels back by one day, so that the label
# 2019-05-01 0:00 denotes the vol from 2019-05-01 0:00 to 2019-05-02 0:00.

In [182]:
# Load one symbol's returns and turn them into an hourly-stepped, trailing
# 24-hour sum of squared returns for the training window.
symbol = 'adabtc'
file_name = symbol + '.csv.gz'
path = os.path.join(os.getcwd(), input_folder_name, file_name)
df = pd.read_csv(path, compression='gzip')

# The row position is used as a minute counter starting at 2019-05-01 00:00
# (assumes the rows are contiguous one-minute observations -- TODO confirm).
nums = df.index
first_day_first_minute = dt.datetime(2019, 5, 1)
# Explicit naive Unix epoch: the same value datetime.utcfromtimestamp(0)
# returned, without relying on a call deprecated since Python 3.12.
initial_time = dt.datetime(1970, 1, 1)
# Whole minutes from the epoch to the first row; timedelta floor-division
# gives an exact int without going through a float.
theoretical_first_minute = (first_day_first_minute - initial_time) // dt.timedelta(minutes=1)
dt_index = pd.to_datetime(theoretical_first_minute + nums, unit="m")
df = df.set_index(dt_index)

# Sum of squared returns per clock hour. The lowercase "h" alias is used
# because the uppercase "H" alias is deprecated in recent pandas releases.
squared_returns = df.pow(2)
volas = squared_returns.groupby(pd.Grouper(freq="h")).sum()

# 731 days of hourly rows: 2019-05-01 up to (not including) 2021-05-01.
train_size = 731 * 24  # probably add 184 to this
volas_train = volas.iloc[:train_size]
# Trailing 24-hour sum; the first 23 rows have an incomplete window and are dropped.
volas_train_rolled = volas_train.rolling(24).sum().dropna()
# Move each label to the END of its window, so label t covers [t - 24h, t).
volas_train_rolled.index += pd.DateOffset(hours=1)  # need to do this on test as well

In [203]:
# Candidate AR orders, expressed in days of hourly lags.
lag_days_list = [1, 2, 3, 7, 14, 30]

# One unfitted AR model per candidate, each on its own default estimation sample.
AR_mods = dict()
for lag_days in lag_days_list:
    AR_lookback = lag_days * 24
    mod = AutoReg(volas_train_rolled, AR_lookback, old_names=False)
    AR_mods[lag_days] = mod

# AIC is only comparable between models estimated on the SAME observations.
# By default each AutoReg drops its own number of initial rows (its lag count),
# so the candidates would be scored on different samples. Holding back the
# longest lookback for every candidate puts them on a common sample.
max_lookback = max(lag_days_list) * 24
AICs = [
    AutoReg(
        volas_train_rolled, candidate_days * 24, hold_back=max_lookback, old_names=False
    ).fit().aic
    for candidate_days in lag_days_list
]
aic_by_lag_days = dict(zip(lag_days_list, AICs))
print("AIC by lag days (lower is better):", aic_by_lag_days)

# Pick the lag with the smallest AIC instead of hard-coding the winner.
lag_days = min(aic_by_lag_days, key=aic_by_lag_days.get)
mod = AR_mods[lag_days]

# Final fit uses the model without the extra hold-back, i.e. its full default sample.
res = mod.fit()

In [212]:
# TODO: manually write a selection function based on this.

In [213]:
# In-sample predictions of the fitted AR model, placed next to the observed series.
predictions = res.predict()
df_res = volas_train_rolled.copy(deep=True)
df_res["pred"] = predictions  # aligned on the datetime index

# Sum of squared errors between observed and predicted values. Rows where
# either side is NaN are skipped by pandas' default NaN handling in sum().
# The column is looked up via `symbol` so the cell follows whichever symbol was loaded.
sse = ((df_res[symbol] - df_res["pred"]) ** 2).sum()
print(f"In-sample SSE for {symbol}: {sse}")

df_res.plot()

In [222]:
# TODO: get rid of the useless intraday stuff, then re-measure.

