In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import json
import os

## Load the 65-minute data

In [4]:
# Directory holding the processed bar files. Set the IDVF_DATA_DIR environment
# variable to run the notebook on another machine; the fallback is the path
# that was previously hard-coded here.
DATA_DIR = os.environ.get(
    "IDVF_DATA_DIR",
    "/Users/beneverman/Documents/Coding/QuantHive/IDVF-Oxford-v1/data/processed-5yr-93-minute",
)
data = pd.read_csv(os.path.join(DATA_DIR, "65min.csv"), index_col=0)

# assuming 6 65-minute periods per day
BARS_PER_DAY = 6
back_day = BARS_PER_DAY * 20  # 20 days
window_length = BARS_PER_DAY * 250  # 250 days
train_size = BARS_PER_DAY * 1000  # 1000 days

In [10]:
# Carry the last observed value forward first; the back-fill then only covers
# leading rows that have no earlier observation to carry forward.
data = data.ffill().bfill()
assert not data.isna().to_numpy().any()

In [14]:
# Parse the 'datetime' column as timezone-aware UTC timestamps and promote it
# to the frame's index (the column itself is consumed by set_index).
data = (
    data
    .assign(datetime=lambda frame: pd.to_datetime(frame["datetime"], utc=True))
    .set_index("datetime")
)

In [15]:
# Snapshot of the column labels at this point; the "_logvol" columns are added
# to `data` later, so they are not part of this list.
namelist = list(data.columns)

## More Preprocessing

In [20]:
def rv(series: pd.Series, window: int) -> pd.Series:
    r"""
    Rolling log realized volatility of a price series.

    Realized volatility is defined in [Volatility Forecasting with Machine Learning
    and Intraday Commonality](https://arxiv.org/pdf/2202.08962.pdf) as:

    $$RV_{i,t}(h)=\log(\sum_{s=t-h+1}^{t}r^2_{i,s})$$

    where $r$ are the log returns of `series` and $h$ is `window`.

    Parameters
    ----------
    series : pd.Series
        Price levels. The natural log is taken, so missing values produce
        additional NaNs and trip the final assertion.
    window : int
        Number of consecutive log returns summed per observation; must be > 0.

    Returns
    -------
    pd.Series
        Same index as `series`. The first `window` entries are NaN (one from
        differencing plus `window - 1` from the rolling warm-up); every later
        entry is `log(sum of squared log returns + 1e-16)`.

    Raises
    ------
    AssertionError
        If `window` is not positive, or if the result does not contain exactly
        `window` NaNs (e.g. the input has gaps or is shorter than `window`).
    """
    assert window > 0, "Window must be greater than 0"
    fuzz = 1e-16  # keeps the log finite when every return in a window is zero
    log_returns = np.log(series).diff() # log returns
    # Built-in rolling sum instead of rolling().apply(lambda ...): same values up to
    # float rounding, without a Python call per window. The clip guards against a
    # rounding-level negative sum, which would turn into NaN under the log.
    sum_of_squares = log_returns.pow(2).rolling(window=window).sum().clip(lower=0)
    # Named log_rv (not rv) so the local does not shadow this function's own name.
    log_rv = np.log(sum_of_squares + fuzz)
    assert log_rv.isna().sum() == window, "RV should have NaNs at the beginning" # one NaN from diff + (window - 1) from rolling = window
    return log_rv

# Append one "<column>_logvol" series to `data` for every original column,
# using the rolling window configured in `window_length`.
for ind in namelist:
    logvol_label = ind + "_logvol"
    data[logvol_label] = rv(series=data[ind], window=window_length)

In [21]:
# Alias for the frame's index (the UTC 'datetime' column promoted to the index earlier).
date = data.index

## Time to reverse engineer the preprocess code

In [22]:
class preprocess():
    """
    Build lagged feature windows and a forward-shifted target from aligned series.

    Parameters
    ----------
    input : list of pd.Series or pd.DataFrame
        Predictor series, for example [price, volatility], each sharing the
        index of `target`.
    target : pd.Series or pd.DataFrame
        Series to predict; it is shifted `forward_day` rows into the future.
    back_day : iterable of int
        Lags (in rows) applied to every element of `input`.
    forward_day : int
        Number of rows ahead at which the target is read.

    Attributes
    ----------
    x : np.ndarray
        Shape (kept rows, lag columns, len(input)). The lag columns are
        reversed, so with an ascending `back_day` the largest lag comes first.
    y : np.ndarray
        Target values for the kept rows, one row per sample.
    idx1 : np.ndarray of bool
        Per-row flag over all original rows: True where no lagged input is NaN.
    idx2 : pd.Series of bool
        Per-row flag over all original rows: True where the shifted target has no NaN.
    idx : pd.Index
        Labels from `target.index` of the rows kept in `x` and `y`.
    """
    def __init__(self, input, target, back_day = list(range(0,15)), forward_day = 1):
        lagged_blocks = []
        for df in input:
            # One shifted copy per lag, side by side; the index is reset so rows
            # line up positionally with the target, and the column order is flipped.
            shifted_df = pd.concat(
                [df.shift(n) for n in back_day], axis=1
            ).reset_index(drop=True).iloc[:, ::-1]
            # Trailing axis lets the inputs be stacked as channels below.
            lagged_blocks.append(np.expand_dims(np.array(shifted_df), axis=2))

        self.x = np.concatenate(tuple(lagged_blocks), axis=2)

        # A row is usable only if every lag of every input is present.
        self.idx1 = ~np.isnan(self.x).any(axis=(1, 2))

        # Shift the target forward and reset its index to align with self.x.
        self.y = pd.DataFrame(target.shift(-forward_day)).reset_index(drop=True)
        self.idx2 = self.y.notna().all(axis=1)

        keep = self.idx1 & self.idx2.to_numpy()

        # Filter x and y down to the rows where both predictors and target exist.
        self.x = self.x[keep]
        self.y = np.array(self.y[keep].reset_index(drop=True))

        # Labels of the kept rows, taken from the target itself rather than from
        # a notebook-level `data` frame, so the class has no hidden global dependency.
        self.idx = target.index[keep]

In [107]:
class Preprocess:
    """
    Turn aligned series into lagged feature windows plus a forward-shifted target.

    Parameters
    ----------
    input : list of pd.Series or pd.DataFrame
        Predictor series, for example [price, volatility], indexed like `target`.
    target : pd.Series or pd.DataFrame
        Series to forecast.
    back_day : iterable of int
        Lookback lags (in rows) applied to each element of `input`.
    forward_day : int
        Forecast horizon: the target is read this many rows ahead.

    Attributes
    ----------
    x : np.ndarray
        Shape (kept rows, lag columns, number of inputs).
    y : np.ndarray
        Shifted target values for the kept rows.
    idx2 : pd.Series of bool
        Mask over all original rows: True where the shifted target has no NaN.
    idx : pd.Index
        Labels from `target.index` for the kept rows (not a mask).
    """
    def __init__(self, input, target, back_day = list(range(0,15)), forward_day = 1):
        # back_day is the lookback window and forward_day is the forecast horizon.

        # Section 1 - incrementally shifted sequences for each element of `input`.
        channels = []
        for df in input:
            # df.shift(n) for each lag n, concatenated column-wise; the index is
            # dropped to align positionally with the target, and the columns are
            # reversed (for an ascending back_day the largest lag ends up first).
            lagged = [df.shift(n) for n in back_day]
            shifted_df = pd.concat(lagged, axis=1).reset_index(drop=True).iloc[:, ::-1]
            # Add a trailing axis so the inputs can be stacked as channels.
            channels.append(np.array(shifted_df)[:, :, np.newaxis])

        # x has shape (rows, lag columns, number of inputs).
        self.x = np.concatenate(channels, axis=2)

        # Section 2 - target and validity masks, one flag per row.
        # True where the whole (lags x inputs) slice of a row is free of NaNs.
        idx1 = ~np.isnan(self.x).any(axis=(1, 2))
        self.y = pd.DataFrame(target.shift(-forward_day)).reset_index(drop=True)
        # True where the shifted target is fully present (notna, not isna).
        self.idx2 = self.y.notna().all(axis=1)
        mask = idx1 & self.idx2.to_numpy()

        self.x = self.x[mask]
        self.y = np.array(self.y[mask].reset_index(drop=True))

        # Section 3 - labels of the kept rows. They come from `target.index`
        # instead of the notebook-level `data` frame, so the class works on any
        # aligned inputs. Note that self.idx holds index labels, not a mask.
        self.idx = target.index[mask]

```python
shifted_df = pd.concat(
    list(map(lambda n: df.shift(n), back_day)), axis=1
).reset_index(drop=True).loc[:, ::-1]
```

Okay, so. Take this example `df`:

```
   value
0      1
1      2
2      3
```

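Each value `n` in `back_day` (by default `list(range(0, 15))`) yields a copy of `df` shifted down by `n` rows via `df.shift(n)`.

Example with `back_day = [0, 1]`, before the columns are reversed: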

```
   value       value
0      1           NaN
1      2           1
2      3           2
```

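The shifted copies are then concatenated column-wise into one DataFrame. The index is reset (dropped) and the column order is reversed, so for an ascending `back_day` the most-lagged column comes first and the unshifted (lag-0) column comes last.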

In [58]:
# One log-volatility Series per original column, in `namelist` order.
dfs = [data[column + "_logvol"] for column in namelist]

In [66]:
# Step-by-step replay of the first section of the preprocess constructor,
# using lags 0..14 on every log-volatility series.
lag_steps = range(0, 15)
x = []
for df in dfs:
    # One shifted copy per lag, joined column-wise; drop the index and flip the
    # column order so the largest lag comes first.
    lagged = [df.shift(step) for step in lag_steps]
    shifted_df = pd.concat(lagged, axis=1).reset_index(drop=True).loc[:, ::-1]
    # Trailing axis so every series becomes one channel of the final array.
    x.append(np.array(shifted_df)[:, :, np.newaxis])

# Stack the per-series blocks along the last (channel) axis.
x = np.concatenate(x, axis=2)

In [88]:
# Row-level validity flags: an entry is True only when that row's whole slice of
# `x` (all lags, all series) is free of NaNs, so the list has one flag per row.
idx1 = [~np.isnan(row).any() for row in x]

In [89]:
# Use the log-volatility series of the first column in `namelist` as a sample target.
first_name = namelist[0]
target_example = data[first_name + "_logvol"]

In [91]:
# Build the same frame twice, once with single and once with double parentheses
# around the argument (the spelling used in the original preprocess class).
# The extra parentheses only group the expression, so both calls receive the
# same object.
df1 = pd.DataFrame(target_example).reset_index(drop=True)
df2 = pd.DataFrame((target_example)).reset_index(drop=True)

In [95]:
# Display the double-parentheses variant, presumably to compare it with df1.
df2

Unnamed: 0,TMO_logvol
0,
1,
2,
3,
4,
...,...
7511,-2.613493
7512,-2.614106
7513,-2.613911
7514,-2.618732


In [96]:
# Row mask for the example target frame: True where the row contains no NaN.
idx2 = ~df1.isna().any(axis=1)

In [99]:
# Element-wise AND of the predictor mask and the target mask; a row survives
# only when both are True.
idx = np.logical_and(np.asarray(idx1, dtype=bool), idx2)

In [105]:
# Show the combined mask: one boolean per row.
idx

0       False
1       False
2       False
3       False
4       False
        ...  
7511     True
7512     True
7513     True
7514     True
7515     True
Length: 7516, dtype: bool

In [106]:
# Timestamps of the rows that pass the combined mask.
data.index[idx]

DatetimeIndex(['2019-10-15 15:40:00+00:00', '2019-10-15 16:45:00+00:00',
               '2019-10-15 17:50:00+00:00', '2019-10-15 18:55:00+00:00',
               '2019-10-16 13:30:00+00:00', '2019-10-16 14:35:00+00:00',
               '2019-10-16 15:40:00+00:00', '2019-10-16 16:45:00+00:00',
               '2019-10-16 17:50:00+00:00', '2019-10-16 18:55:00+00:00',
               ...
               '2023-10-06 15:40:00+00:00', '2023-10-06 16:45:00+00:00',
               '2023-10-06 17:50:00+00:00', '2023-10-06 18:55:00+00:00',
               '2023-10-09 13:30:00+00:00', '2023-10-09 14:35:00+00:00',
               '2023-10-09 15:40:00+00:00', '2023-10-09 16:45:00+00:00',
               '2023-10-09 17:50:00+00:00', '2023-10-09 18:55:00+00:00'],
              dtype='datetime64[ns, UTC]', name='datetime', length=6002, freq=None)

In [65]:


# NOTE(review): these two statements use `self` at cell level, outside any
# method, so running this cell as written raises NameError. They repeat two
# lines from the preprocess constructor — presumably a leftover paste; confirm
# whether this cell can be removed.
self.idx2 = self.y.notna().all(axis=1) # Create an index mask where none of the elements in the y dataframe are NaN
self.idx = np.logical_and(self.idx1, self.idx2) # Combine the two index masks

[array([[[        nan],
         [        nan],
         [        nan],
         ...,
         [        nan],
         [        nan],
         [        nan]],
 
        [[        nan],
         [        nan],
         [        nan],
         ...,
         [        nan],
         [        nan],
         [        nan]],
 
        [[        nan],
         [        nan],
         [        nan],
         ...,
         [        nan],
         [        nan],
         [        nan]],
 
        ...,
 
        [[-2.60993375],
         [-2.60983791],
         [-2.60973135],
         ...,
         [-2.61349264],
         [-2.61410586],
         [-2.61391108]],
 
        [[-2.60983791],
         [-2.60973135],
         [-2.61071067],
         ...,
         [-2.61410586],
         [-2.61391108],
         [-2.61873232]],
 
        [[-2.60973135],
         [-2.61071067],
         [-2.61130301],
         ...,
         [-2.61391108],
         [-2.61873232],
         [-2.61872203]]]),
 array([[[        n