In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import json
import os

## Load the 65-minute data

In [4]:
# Location of the 65-minute bar CSV. Set the IDVF_DATA_PATH environment variable to
# read the file from somewhere else; the original local path remains the default.
DATA_PATH = os.environ.get(
    "IDVF_DATA_PATH",
    "/Users/beneverman/Documents/Coding/QuantHive/IDVF-Oxford-v1/data/processed-5yr-93-minute/65min.csv",
)
data = pd.read_csv(DATA_PATH, index_col=0)

# assuming 6 65-minute periods per day
PERIODS_PER_DAY = 6
back_day = PERIODS_PER_DAY * 20  # 20 days
window_length = PERIODS_PER_DAY * 250  # 250 days
train_size = PERIODS_PER_DAY * 1000  # 1000 days

In [10]:
# Carry the last known value forward, then back-fill whatever is still missing
# before the first observation of each column.
data.ffill(inplace=True)
data.bfill(inplace=True)
# Sanity check: no missing value may survive the two passes.
assert not data.isna().any(axis=None)

In [14]:
# Parse the timestamp column as timezone-aware (UTC) datetimes and make it the row index.
utc_timestamps = pd.to_datetime(data['datetime'], utc=True)
data['datetime'] = utc_timestamps
data.set_index('datetime', inplace=True)

In [15]:
# Snapshot of the column names as they are at this point in the notebook.
namelist = list(data.columns)

## More Preprocessing

In [20]:
def rv(series: pd.Series, window: int) -> pd.Series:
    r"""
    Log realized volatility of ``series`` over a trailing window of log returns.

    Realized volatility is defined in [Volatility Forecasting with Machine Learning
    and Intraday Commonality](https://arxiv.org/pdf/2202.08962.pdf) as:

    $$RV_{i,t}(h)=\log(\sum_{s=t-h+1}^{t}r^2_{i,s})$$

    Parameters
    ----------
    series : pd.Series
        Values whose log differences are used as returns.
    window : int
        Number of consecutive returns (h) summed in each observation; must be > 0.

    Returns
    -------
    pd.Series
        Same index as ``series``. The first ``min(window, len(series))`` entries are
        NaN: one from differencing plus ``window - 1`` from the rolling warm-up.

    Raises
    ------
    AssertionError
        If ``window`` is not positive, or if the number of NaNs in the result is not
        the expected warm-up length (e.g. the input itself contains gaps).
    """
    assert window > 0, "Window must be greater than 0"
    fuzz = 1e-16  # keeps log() finite when every return in a window is exactly zero
    log_returns = np.log(series).diff() # log returns
    sum_of_squares = log_returns.rolling(window=window).apply(
        lambda x: np.sum(x**2), raw=True
    )
    log_rv = np.log(sum_of_squares + fuzz)
    # A series shorter than the window is entirely warm-up, so cap the expected
    # NaN count at the series length instead of failing on a legitimate result.
    expected_nans = min(window, len(series))
    assert log_rv.isna().sum() == expected_nans, "RV should have NaNs at the beginning"
    return log_rv

# Add one "<column>_logvol" column per original column, computed over window_length bars.
for ind in namelist:
    data[f"{ind}_logvol"] = rv(data[ind], window=window_length)

In [21]:
# Alias for the row index of `data`.
date = data.index

## Time to reverse engineer the preprocess code

In [22]:
class preprocess():
    """
    Build lagged predictor windows and forward-shifted targets from aligned series.

    Attributes set by ``__init__``:
        x    -- array of shape (kept rows, lagged columns, len(input)); each input
                contributes one slice along the last axis, with the largest lag first
        y    -- array with the target shifted ``forward_day`` rows ahead, one row per row of ``x``
        idx  -- labels of ``target.index`` for the kept rows
        idx1 -- boolean array over all rows, True where no lagged input value is NaN
        idx2 -- boolean Series over all rows, True where no shifted target value is NaN
    """

    def __init__(self, input, target, back_day = list(range(0,15)), forward_day = 1):
        """
        Parameters
        ----------
        input : list of Series/DataFrames, for example [price, volatility], with the
            same index as ``target``. All entries must yield the same number of
            lagged columns so they can be stacked.
        target : Series or DataFrame holding the variable to predict.
        back_day : iterable of row offsets passed to ``shift``; one lagged copy of
            each input is made per offset. The default list is never mutated.
        forward_day : number of rows the target is shifted into the future.

        Raises
        ------
        ValueError
            If ``input`` is empty.
        """
        channels = []
        for df in input:
            # One shifted copy per lag, side by side; reversing the columns puts the
            # largest lag first. The index is dropped so rows are purely positional.
            lagged = pd.concat([df.shift(n) for n in back_day], axis=1)
            lagged = lagged.reset_index(drop=True).iloc[:, ::-1]
            # Trailing axis of length 1 so the inputs can be stacked as channels.
            channels.append(np.expand_dims(np.array(lagged), axis=2))

        if not channels:
            raise ValueError("input must contain at least one Series/DataFrame")
        features = np.concatenate(channels, axis=2)

        # Rows whose lag window is complete in every channel.
        self.idx1 = ~np.isnan(features).any(axis=(1, 2))

        # Target aligned to the predictors: row i holds the value forward_day rows later.
        shifted_target = pd.DataFrame(target.shift(-forward_day)).reset_index(drop=True)
        self.idx2 = shifted_target.notna().all(axis=1)

        keep = self.idx1 & self.idx2.to_numpy()

        # Filter x and y with the combined mask.
        self.x = features[keep]
        self.y = shifted_target.to_numpy()[keep]

        # Labels of the kept rows, taken from the target's own index rather than a
        # notebook-level global, so the class works for any aligned input.
        self.idx = target.index[keep]

In [57]:
class Preprocess:
    """
    Work-in-progress, annotated re-implementation of ``preprocess``.

    Only the first stage exists so far: building ``self.x``, the stacked array of
    lagged inputs. ``target`` and ``forward_day`` are accepted but not used yet.
    """

    def __init__(self, input, target, back_day = list(range(0,15)), forward_day = 1):
        # back_day = list(range(0,15)) and forward_day = 1 seem to correspond to the lookback window and the forecast horizon
        # ! input is a list of dataframes, for example [price,volatility] with index as the same as target.
        # list of dfs holds all the results, target is the actual "input" column

        # ! Section 1 - make incrementally shifted sequences for each df in the input list
        self.x = []
        for df in input:
            # Shift the dataframe by each value in back_day and concatenate along columns
            # ! detailed explanation below
            shifted_df = pd.concat(
                [df.shift(n) for n in back_day], axis=1
            ).reset_index(drop=True).iloc[:, ::-1]
            self.x.append(np.expand_dims(np.array(shifted_df), axis=2)) # Expand dimensions to make it compatible for future concatenation

        # Concatenate all processed input data along the last axis. This must happen
        # exactly once: repeating it on the resulting 3-D array would iterate over its
        # 2-D rows and fail because they have no axis 2.
        self.x = np.concatenate(tuple(self.x), axis=2)

        # ! X shape = (7516, 15, 1), rows / columns / channels
        # ! X shape = (number of agg bars) / (back day list len) / (#dfs in input list)

```python
shifted_df = pd.concat(
    list(map(lambda n: df.shift(n), back_day)), axis=1
).reset_index(drop=True).loc[:, ::-1]
```

Okay, so. df:

```
   value
0      1
1      2
2      3
```

Each value `n` in the `back_day` list (default `list(range(0, 15))`) produces a copy of df shifted down by `n` rows.

example with list [0, 1]

```
   value       value
0      1           NaN
1      2           1
2      3           2
```

The shifted copies are then concatenated column-wise into one df. The index is reset (dropped) and the column order is reversed, so the most lagged column comes first and the unshifted values come last.

In [27]:
# One log-volatility Series per original column, in namelist order.
test_dfs = [data[f"{name}_logvol"] for name in namelist]

In [29]:
# Build the 15-lag table for every series. Each pass overwrites `shifted`, so
# after the loop it holds the table for the last entry of test_dfs only.
for df in test_dfs:
    lagged_copies = [df.shift(n) for n in range(0, 15)]
    shifted = pd.concat(lagged_copies, axis=1).reset_index(drop=True).loc[:, ::-1]

In [44]:
# Rows from position 6*250 onward (positional slice on the reset index).
shifted.iloc[6*250:]

Unnamed: 0,TMO_logvol,TMO_logvol.1,TMO_logvol.2,TMO_logvol.3,TMO_logvol.4,TMO_logvol.5,TMO_logvol.6,TMO_logvol.7,TMO_logvol.8,TMO_logvol.9,TMO_logvol.10,TMO_logvol.11,TMO_logvol.12,TMO_logvol.13,TMO_logvol.14
1500,,,,,,,,,,,,,,,-2.746800
1501,,,,,,,,,,,,,,-2.746800,-2.749263
1502,,,,,,,,,,,,,-2.746800,-2.749263,-2.750394
1503,,,,,,,,,,,,-2.746800,-2.749263,-2.750394,-2.752106
1504,,,,,,,,,,,-2.746800,-2.749263,-2.750394,-2.752106,-2.753366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7511,-2.610986,-2.610055,-2.609934,-2.609838,-2.609731,-2.610711,-2.611303,-2.611142,-2.611172,-2.612094,-2.612153,-2.616261,-2.615733,-2.613411,-2.613493
7512,-2.610055,-2.609934,-2.609838,-2.609731,-2.610711,-2.611303,-2.611142,-2.611172,-2.612094,-2.612153,-2.616261,-2.615733,-2.613411,-2.613493,-2.614106
7513,-2.609934,-2.609838,-2.609731,-2.610711,-2.611303,-2.611142,-2.611172,-2.612094,-2.612153,-2.616261,-2.615733,-2.613411,-2.613493,-2.614106,-2.613911
7514,-2.609838,-2.609731,-2.610711,-2.611303,-2.611142,-2.611172,-2.612094,-2.612153,-2.616261,-2.615733,-2.613411,-2.613493,-2.614106,-2.613911,-2.618732


In [50]:
df_list = []
# Convert the lag table to a matrix and append a trailing axis of length 1:
# (rows, lags) -> (rows, lags, 1)
new = np.array(shifted)[:, :, np.newaxis]
print(new.shape)

(7516, 15, 1)


In [56]:
# First row, second lag column, only channel.
new[0, 1, 0]

nan