In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from datetime import datetime
import json
import os
from typing import Dict, List, Tuple, Union
from abc import ABC, abstractmethod

from lib.utils import get_93

## Funcs

In [21]:
def initial_preprocess(data: pd.DataFrame, windsorize=True):
    """
    Fill gaps, index the frame by US/Eastern timestamp, suffix the column names
    with '_logvol' and optionally winsorize every column.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a 'datetime' column that ``pd.to_datetime(..., utc=True)``
        can parse; every other column is treated as one series. The caller's
        frame is left untouched (all steps work on new frames).
    windsorize : bool, default True
        When True, clip each column to its own 0.1th / 99.9th percentile.
        The (misspelled) parameter name is kept for backward compatibility.

    Returns
    -------
    data : pd.DataFrame
        Processed frame, indexed by 'datetime', columns named '<name>_logvol'.
    date : pd.DatetimeIndex
        The index of the returned frame.
    namelist : list
        Column names as they were before the '_logvol' suffix was added.

    Raises
    ------
    AssertionError
        If missing values remain after the forward/backward fill (e.g. a
        column that is entirely NaN).
    KeyError
        If ``data`` has no 'datetime' column.
    """
    # Forward fill first, then back fill whatever is still missing at the head.
    # NOTE(review): the back fill copies later values into earlier rows; confirm
    # that is acceptable for the downstream forecasting use.
    data = data.ffill().bfill()
    assert data.isna().sum().sum() == 0

    # Parse as UTC, convert to exchange-local time, and move it into the index.
    # set_index already removes 'datetime' from the columns, so no further
    # filtering of the column list is required.
    eastern = pd.to_datetime(data['datetime'], utc=True).dt.tz_convert('US/Eastern')
    data = data.assign(datetime=eastern).set_index('datetime')

    namelist = data.columns.tolist()  # names before renaming
    data.columns = [col + '_logvol' for col in namelist]
    date = data.index

    if windsorize:
        # Per-column bounds computed in one pass, then a single vectorised clip.
        lower, upper = np.percentile(data.to_numpy(), [0.1, 99.9], axis=0)
        data = data.clip(
            lower=pd.Series(lower, index=data.columns),
            upper=pd.Series(upper, index=data.columns),
            axis=1,
        )

    return data, date, namelist

## Main

In [88]:
# Lag offsets fed to the model: 0 (current bar) through 14.
back_day = list(range(15))
# Number of observations predicted per window.
# presumably 6 intraday bars x 250 trading days -- TODO confirm
window_length = 6 * 250
# Observations placed before the first test window for training.
train_size = 1000  #! MODIFIED for smaller dataset
# How many rows ahead the target is shifted (in rows of the data, not calendar days).
forward_day = 10

In [24]:
# Path of the input CSV. The default is the original author's local path; set the
# RV_DATA_PATH environment variable to run the notebook on another machine
# without editing this cell.
RV_DATA_PATH = os.environ.get(
    'RV_DATA_PATH',
    '/Users/beneverman/Documents/Coding/QuantHive/IDVF-Oxford-v1/data/processed-5yr-93-minute/65min_rv.csv',
)

# Keep the raw frame under its own name so re-running this cell is idempotent.
rv_raw = pd.read_csv(RV_DATA_PATH, index_col=0)
rv_data, date, namelist = initial_preprocess(rv_raw, windsorize=False)

In [63]:
# Bare expression: lets the notebook render the preprocessed frame
# (pandas truncates the display to the first and last rows).
rv_data

Unnamed: 0_level_0,TMO_logvol,ABT_logvol,HD_logvol,MCD_logvol,PG_logvol,CAT_logvol,DIS_logvol,CCI_logvol,JNJ_logvol,KO_logvol,...,COST_logvol,IBM_logvol,ADP_logvol,AVGO_logvol,WMT_logvol,AMGN_logvol,INTU_logvol,AXP_logvol,MMC_logvol,CB_logvol
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-10-11 10:35:00-04:00,-9.302132,-9.308770,-8.861162,-9.032787,-9.181782,-8.267610,-8.949561,-9.245858,-9.123982,-9.574766,...,-9.758462,-9.029469,-9.164025,-7.821013,-9.906052,-8.309061,-7.986252,-8.662172,-9.824389,-9.117123
2018-10-11 11:40:00-04:00,-9.226515,-9.660382,-9.765163,-10.027525,-10.502906,-8.801236,-10.125150,-10.421172,-10.023504,-10.618561,...,-10.126290,-9.864239,-9.209012,-8.637280,-10.469171,-9.083652,-8.634823,-9.688864,-10.128361,-9.916038
2018-10-11 12:45:00-04:00,-9.818327,-10.029960,-10.518059,-10.743978,-10.994644,-9.591604,-10.272350,-10.747697,-10.491507,-11.048911,...,-10.732626,-10.336389,-9.891691,-9.291962,-10.898001,-9.966830,-9.518611,-10.129218,-10.798520,-10.359858
2018-10-11 13:50:00-04:00,-10.207846,-10.672297,-10.974280,-11.049742,-11.168651,-9.828537,-10.841925,-11.778167,-10.952831,-11.755173,...,-10.943079,-10.542780,-10.012840,-9.699978,-11.319553,-10.194840,-9.536182,-10.620143,-11.741473,-10.880489
2018-10-11 14:55:00-04:00,-8.921766,-8.968561,-8.872568,-9.690731,-9.740425,-8.682480,-9.503440,-10.269008,-9.185082,-10.092992,...,-9.973373,-9.243132,-9.060507,-8.967603,-9.951001,-9.123352,-8.827453,-9.273452,-9.856659,-9.745645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-09 11:40:00-04:00,-10.733918,-11.561414,-11.459266,-11.630080,-11.549440,-10.420834,-10.661230,-11.052822,-11.640935,-10.917620,...,-10.752899,-11.384324,-11.297519,-10.206596,-11.178907,-10.680779,-10.645832,-10.627294,-11.829896,-11.312079
2023-10-09 12:45:00-04:00,-11.575774,-11.660044,-11.389878,-11.992928,-11.837725,-10.662594,-11.225769,-9.910547,-12.022989,-10.924749,...,-10.925418,-12.035542,-12.237446,-11.091774,-11.439434,-11.202431,-11.457626,-11.228341,-11.642755,-11.378411
2023-10-09 13:50:00-04:00,-11.869460,-11.628670,-11.684992,-11.961455,-11.665920,-10.952734,-11.278099,-10.838116,-12.250040,-11.185100,...,-11.705529,-12.087832,-11.997735,-10.649890,-11.542805,-11.382209,-11.012721,-11.832090,-11.737761,-11.501302
2023-10-09 14:55:00-04:00,-11.277812,-11.982515,-11.641741,-12.104332,-12.100556,-11.159378,-11.756749,-11.311528,-12.244460,-11.504707,...,-11.609444,-12.312214,-12.193028,-10.515182,-11.398901,-11.671222,-11.120266,-11.961500,-12.197529,-11.755987


In [107]:
class Preprocess:
    """
    Turn one or more aligned series into lagged feature windows plus a shifted target.

    After construction the instance exposes:

    * ``x``   -- ndarray of shape (rows kept, len(back_day), len(input_dfs)); along
      axis 1 the lags appear in the reverse of ``back_day`` order (for
      ``back_day = [0, 1, 2]`` that is t-2, t-1, t).
    * ``y``   -- 2-D ndarray, the target ``forward_day`` rows ahead of each kept row.
    * ``idx`` -- index labels (taken from ``target.index``) of the kept rows.
    * ``final_mask`` -- boolean mask over the original rows marking the kept ones.
    * ``eop`` -- number of rows expected to survive the masking.

    Parameters
    ----------
    input_dfs : list
        Feature inputs. Each entry must contribute exactly one column per lag
        (a Series or a single-column frame), otherwise the shape assertion fails.
    target : pd.Series or pd.DataFrame
        Series to forecast; a Series is converted to a one-column DataFrame.
    back_day : list of int
        Lag offsets passed to ``shift``. They need not be contiguous.
    forward_day : int
        Number of rows the target is shifted ahead.

    Raises
    ------
    AssertionError
        If the array shapes differ from what is expected, e.g. because the
        inputs still contain missing values.
    """

    def __init__(self, input_dfs: List[pd.DataFrame], target, back_day: List, forward_day: int):
        self.input_dfs = input_dfs
        self.target = target
        self.back_day = back_day
        self.forward_day = forward_day

        self.obs_unprocessed = self.input_dfs[0].shape[0]  # rows before masking
        # Shifting by the largest lag empties that many leading rows and shifting the
        # target empties `forward_day` trailing rows. Using max(back_day) rather than
        # len(back_day) gives the same number for lags 0..k-1 and stays correct for
        # non-contiguous lag sets such as [0, 4, 21].
        self.eop = self.obs_unprocessed - max(self.back_day) - self.forward_day

        self._generate_shifted_sequences()
        self._generate_target_mask_and_date()

    def _generate_shifted_sequences(self):
        """Build ``self.x``: one (rows, lags) slab per input, stacked on a third axis."""
        slabs = []
        for df in self.input_dfs:
            lagged = pd.concat([df.shift(n) for n in self.back_day], axis=1)
            # Drop the index and reverse the column order so the lag listed last
            # in back_day comes first.
            lagged = lagged.reset_index(drop=True).iloc[:, ::-1]
            slabs.append(np.expand_dims(lagged.to_numpy(), axis=2))

        self.x = np.concatenate(slabs, axis=2)

        # No rows are dropped yet: (num observations, num lags, num inputs).
        expected = (self.input_dfs[0].shape[0], len(self.back_day), len(self.input_dfs))
        assert self.x.shape == expected,\
            f"Input shape is incorrect, expected {expected} but got {self.x.shape}"

    def _generate_target_mask_and_date(self):
        """Shift the target, drop rows with any NaN in x or y, and record their labels."""
        # True where a row's whole lag window is NaN-free.
        non_na_mask = ~np.isnan(self.x).any(axis=(1, 2))

        if isinstance(self.target, pd.Series):
            # .all(axis=1) below needs a 2-D object.
            self.target = pd.DataFrame(self.target)

        self.y = self.target.shift(-self.forward_day).reset_index(drop=True)
        valid_target_mask = self.y.notna().all(axis=1)

        self.final_mask = np.logical_and(non_na_mask, valid_target_mask)
        keep = np.asarray(self.final_mask, dtype=bool)

        self.x = self.x[keep]
        self.y = self.y.to_numpy()[keep]
        self.idx = self.target.index[keep]

        # Rows lost: max(back_day) at the head plus forward_day at the tail.
        expected = (self.eop, len(self.back_day), len(self.input_dfs))
        assert self.x.shape == expected,\
            f"Input shape is incorrect, should be {self.eop} x {len(self.back_day)}, {len(self.input_dfs)}, is {self.x.shape}"

class Model(ABC):
    """Abstract base declaring the two methods a model class must implement."""

    @abstractmethod
    def train(self, x, y):
        """Fit on features ``x`` and targets ``y``; concrete subclasses must override."""

    @abstractmethod
    def predict(self, x):
        """Return predictions for features ``x``; concrete subclasses must override."""

class LinearRegression:
    """
    Ordinary least squares fitted with ``np.linalg.lstsq``.

    Parameters
    ----------
    fit_intercept : bool, default True
        When True a column of ones is prepended to the design matrix, so the
        first row of ``coefficients`` is the intercept.

    Attributes
    ----------
    coefficients : ndarray or None
        ``None`` until ``train`` has been called.
    """

    def __init__(self, fit_intercept=True):
        self.coefficients = None
        self.fit_intercept = fit_intercept

    def train(self, X, y):
        """
        Fit the coefficients.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` have a different number of rows.
        """
        if X.shape[0] != y.shape[0]:
            raise ValueError("Mismatched dimensions: X has {} rows but y has {} rows".format(X.shape[0], y.shape[0]))

        if self.fit_intercept:
            X = self._add_bias_term(X)

        # lstsq solves the least-squares problem directly instead of inverting
        # X'X. For a full-rank design it gives the same coefficients as the normal
        # equations; for a rank-deficient design (collinear columns) it returns
        # the minimum-norm solution where the explicit inverse would raise.
        self.coefficients, *_ = np.linalg.lstsq(X, y, rcond=None)

    def predict(self, X):
        """
        Return ``X @ coefficients`` (with the bias column added when enabled).

        Raises
        ------
        ValueError
            If the model has not been trained.
        """
        if self.coefficients is None:
            raise ValueError("Model has not been trained yet.")

        if self.fit_intercept:
            X = self._add_bias_term(X)

        return X @ self.coefficients

    def _add_bias_term(self, X):
        """Prepend a column of ones to the 2-D array ``X``."""
        return np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)

class RollingPredict:
    """
    Pool every ticker's lagged samples and fit/predict window by window.

    On construction each ``<ticker>_logvol`` column of ``data`` is run through
    ``Preprocess`` and the resulting ``x`` / ``y`` / ``idx`` arrays are stacked
    ticker after ticker on ``self.preprocess_obj``.

    Parameters
    ----------
    back_day : list of int
        Lag offsets used as features.
    data : pd.DataFrame
        Frame holding one ``<ticker>_logvol`` column per ticker. Its index must be
        unique; it is used to label the results.
    namelist : list of str
        Ticker names (without the ``_logvol`` suffix).
    window_length : int
        Stored on the instance; ``run`` takes its own ``window_length`` argument.
    forward_day : int, default 1
        Number of rows the target lies ahead of the last feature row.

    Raises
    ------
    AssertionError
        If the pooled arrays do not have the expected number of rows.
    """

    def __init__(self, back_day: List, data: pd.DataFrame, namelist: List, window_length: int, forward_day: int = 1):
        self.back_day = back_day
        self.data = data
        self.namelist = namelist
        self.window_length = window_length
        self.forward_day = forward_day

        self.obs_unprocessed = self.data.shape[0]
        # Rows surviving per ticker: max(back_day) are lost at the head (lags) and
        # forward_day at the tail (target). Identical to
        # obs - len(back_day) - (forward_day - 1) for contiguous lags 0..k-1.
        self.expected_obs_processed = self.obs_unprocessed - max(self.back_day) - self.forward_day

        self._rolling_preprocess()

        eop = self.expected_obs_processed * len(self.namelist)  # expected rows over all tickers
        x_len = self.preprocess_obj.x.shape[0]
        y_len = self.preprocess_obj.y.shape[0]
        idx_len = self.preprocess_obj.idx.shape[0]

        assert x_len == eop, f"X shape is incorrect: expected {eop} but got {x_len}"
        assert y_len == eop, f"Y shape is incorrect: expected {eop} but got {y_len}"
        assert idx_len == eop, f"IDX shape is incorrect expected {eop} but got {idx_len}"

    def _rolling_preprocess(self):
        """Preprocess every ticker and stack the results on ``self.preprocess_obj``."""
        first_series = self.data[self.namelist[0] + '_logvol']
        self.preprocess_obj = Preprocess([first_series], first_series, self.back_day, self.forward_day)

        assert self.preprocess_obj.x.shape[0] == self.expected_obs_processed, "Initial preprocess failed - debug in the preprocess class"

        parts = [self.preprocess_obj]
        for ticker in self.namelist[1:]:
            series = self.data[ticker + '_logvol']
            parts.append(Preprocess([series], series, self.back_day, self.forward_day))

        # Concatenate once rather than growing the arrays ticker by ticker, which
        # re-copied everything accumulated so far on each iteration. With a
        # single ticker nothing is concatenated and the attributes keep their types.
        if len(parts) > 1:
            self.preprocess_obj.x = np.concatenate([p.x for p in parts], axis=0)
            self.preprocess_obj.y = np.concatenate([p.y for p in parts], axis=0)
            self.preprocess_obj.idx = np.concatenate([p.idx for p in parts], axis=0)

    def _concatenate_preprocessed_data(self, temp_preprocess_obj):
        """Append another Preprocess result to ``self.preprocess_obj`` (kept for compatibility)."""
        self.preprocess_obj.x = np.concatenate([self.preprocess_obj.x, temp_preprocess_obj.x], axis=0)
        self.preprocess_obj.y = np.concatenate([self.preprocess_obj.y, temp_preprocess_obj.y], axis=0)
        self.preprocess_obj.idx = np.concatenate([self.preprocess_obj.idx, temp_preprocess_obj.idx], axis=0)

    def _make_model(self):
        """Return a fresh, untrained model exposing ``train(X, y)`` and ``predict(X)``."""
        return LinearRegression()

    def _window_rows(self, window_index):
        """
        Row positions in the pooled arrays for one time window, for every ticker.

        The first timestamp of the window occurs once per ticker block; from each
        occurrence ``len(window_index)`` consecutive rows are taken. The result
        is ordered ticker by ticker.
        """
        starts = np.where(self.preprocess_obj.idx == window_index[0])[0]
        return (starts[:, None] + np.arange(len(window_index))).ravel()

    def _target_timestamps(self, predict_index):
        """
        Timestamps of the values being forecast for each origin in ``predict_index``.

        The target of a sample at origin t is the observation ``forward_day`` rows
        later, so the label is looked up in the raw data index. Slicing the pooled
        ``idx`` array instead runs off the end of the first ticker's block on the
        last window (labelling the final row with a timestamp from the next
        block) and is only aligned with the target when ``forward_day == 1``.
        """
        raw_index = self.data.index
        origin_pos = raw_index.get_indexer(pd.Index(predict_index))
        if (origin_pos < 0).any():
            raise ValueError("predict_index contains timestamps that are not in the data index")
        return raw_index[origin_pos + self.forward_day]

    def train(self, train_index, predict_index):
        """
        Fit one pooled model on ``train_index`` and predict over ``predict_index``.

        Parameters
        ----------
        train_index, predict_index : sequence of timestamps
            Consecutive timestamps taken from ``self.preprocess_obj.idx``.

        Returns
        -------
        pd.DataFrame
            One row per prediction origin, indexed by the timestamp of the value
            being forecast, with ``<ticker>_predicted`` columns followed by
            ``<ticker>_actual`` columns.
        """
        train_rows = self._window_rows(train_index)
        predict_rows = self._window_rows(predict_index)

        train_x = self.preprocess_obj.x[train_rows]
        train_y = self.preprocess_obj.y[train_rows]
        test_x = self.preprocess_obj.x[predict_rows]
        test_y = self.preprocess_obj.y[predict_rows]

        # Flatten (lags, inputs) into a single feature axis.
        train_x = train_x.reshape(train_x.shape[0], -1)
        test_x = test_x.reshape(test_x.shape[0], -1)

        model = self._make_model()
        model.train(train_x, train_y)
        predictions = model.predict(test_x)

        # Rows are ordered ticker by ticker, so a column-major reshape yields one
        # column per ticker.
        n_tickers = len(self.namelist)
        predictions = np.reshape(predictions, (-1, n_tickers), order='F')
        test_y = np.reshape(test_y, (-1, n_tickers), order='F')

        results_df = pd.DataFrame(np.concatenate((predictions, test_y), axis=1))
        results_df.index = self._target_timestamps(predict_index)

        predicted_columns = [x + '_predicted' for x in self.namelist]
        actual_columns = [x + '_actual' for x in self.namelist]
        results_df.columns = predicted_columns + actual_columns

        return results_df

    def run(self, window_length, train_size, Epoch_num=0, lr=None, test_start='2020-06-30 10:35'):
        """
        Walk forward from ``test_start`` in steps of ``window_length`` and collect results.

        The training slice always begins ``train_size`` rows before the test start
        and ends right before the current window, so it grows with every window.

        Parameters
        ----------
        window_length : int
            Number of origins predicted per window (the last window may be shorter).
        train_size : int
            Number of rows before ``test_start`` used for the first fit.
        Epoch_num, lr : optional
            Forwarded to ``train`` only when set (non-zero / not None). The ``train``
            defined on this class does not accept them.
            NOTE(review): presumably meant for a neural-network subclass -- confirm.
        test_start : str or pd.Timestamp, default '2020-06-30 10:35'
            First prediction origin. Timezone-naive values are interpreted as US/Eastern.

        Returns
        -------
        list of pd.DataFrame
            One ``train`` result per window.

        Raises
        ------
        AssertionError
            If ``test_start`` is not among the processed timestamps.
        ValueError
            If fewer than ``train_size`` rows precede ``test_start``.
        """
        # Rows per ticker in the pooled arrays.
        T = int(self.preprocess_obj.x.shape[0] / len(self.namelist))
        result_list = []

        test_start_dt = pd.Timestamp(test_start)
        if test_start_dt.tzinfo is None:
            test_start_dt = test_start_dt.tz_localize('US/Eastern')

        assert test_start_dt in self.preprocess_obj.idx, "Test start date is not in the index"
        start_index = np.where(self.preprocess_obj.idx == test_start_dt)[0][0]

        if start_index < train_size:
            # A negative slice start would silently wrap around to the end of idx.
            raise ValueError(
                f"Not enough history: {start_index} rows precede the test start but train_size is {train_size}"
            )

        # NOTE(review): the stop value T - 1 means a trailing window that would hold
        # only the very last row is never predicted -- confirm this is intended.
        # NOTE(review): when forward_day > 1 the targets of the last training rows
        # fall inside the prediction period -- confirm this overlap is acceptable.
        for start in range(start_index, T - 1, window_length):
            print(self.preprocess_obj.idx[start]) #? Debugging

            train_idx_start = self.preprocess_obj.idx[start_index - train_size:start]

            # min() shortens the last window; T (not T - 1) because slices are exclusive.
            predict_idx_range = self.preprocess_obj.idx[start:min(start + window_length, T)]

            kwargs = {
                "train_index": train_idx_start,
                "predict_index": predict_idx_range,
            }

            if Epoch_num != 0:
                kwargs["Epoch_num"] = Epoch_num
            if lr is not None:
                kwargs["lr"] = lr

            result_list.append(self.train(**kwargs))

        return result_list

In [108]:
# Positional argument list kept for reference / reuse.
rp_args = [back_day, rv_data, namelist, window_length, forward_day]

# Build the pooled dataset (one Preprocess pass per ticker).
q = RollingPredict(
    back_day=back_day,
    data=rv_data,
    namelist=namelist,
    window_length=window_length,
    forward_day=forward_day,
)

# One results frame per prediction window; run() prints each window's start.
result = q.run(window_length=window_length, train_size=train_size)

2020-06-30 10:35:00-04:00
2021-06-28 14:55:00-04:00
2022-06-27 10:35:00-04:00
2023-06-26 12:45:00-04:00
