In [1]:
import numpy as np
import pandas as pd

from qamsi.utils.data import read_csv

from run import Dataset

DATASET = Dataset.TOPN_US
TOP_N = 30

config = DATASET.value(topn=TOP_N)
config

ExperimentConfig(RANDOM_SEED=12, PATH_OUTPUT=PosixPath('/Users/buchkovv/qamsi/data/output'), SAVE_PATH=PosixPath('/Users/buchkovv/qamsi/backtests/runs'), DF_FILENAME='top30_data.csv', PRESENCE_MATRIX_FILENAME='top30_presence_matrix.csv', START_DATE=Timestamp('2000-12-18 00:00:00'), END_DATE=Timestamp('2024-08-30 00:00:00'), REBALANCE_FREQ=21, HEDGE_FREQ=1, N_LOOKBEHIND_PERIODS=252, MIN_ROLLING_PERIODS=252, CAUSAL_WINDOW_SIZE=21, FACTORS=('spx',), TARGETS=('vol', 'naive_vol', 'target', 'cgp_ucb', 'irl'), HEDGING_ASSETS=('spx_fut',), RF_NAME='acc_rate', MKT_NAME='spx', PATH_INPUT=PosixPath('/Users/buchkovv/qamsi/data/input'), INITIAL_DF_FILENAME='initial_df.csv', JKP_DATA_FILENAME='jkp_data.csv', STOCKS_LIST_FILENAME='spx_stocks_list.csv', INITIAL_FEATURES_FILENAME='initial_features_df.csv', RETURNS_FILENAME='returns_incl_div_consituents_w_name.csv', BETTER_RETURNS_FILENAME='returns_data_cleaned_better.parquet', ASSET_UNIVERSE=('spx',))

In [2]:
data = read_csv(config.PATH_OUTPUT, config.DF_FILENAME)
pm = read_csv(config.PATH_OUTPUT, config.PRESENCE_MATRIX_FILENAME)
ret = data[pm.columns]

In [4]:
# Function to compute rolling correlation
def rolling_feature(df, feature_fn, pm, name):
    # Initialize a list to store results
    results = []

    # Perform calculation for each rolling window
    for end in df.index:
        start = end - pd.DateOffset(months=1)

        curr_matrix = pm.loc[:end].iloc[-1]
        selection = curr_matrix[curr_matrix == 1].index.tolist()
        rolling_window = df[selection].loc[start:end]

        feature = feature_fn(rolling_window)

        results.append([end, feature])

    # Create a series with the results
    rolling_feat = pd.DataFrame(results, columns=["date", name])
    rolling_feat["date"] = pd.to_datetime(rolling_feat["date"])
    rolling_feat = rolling_feat.set_index("date")
    return rolling_feat

### 1. Avg Corr.

In [4]:
# Function to compute the average of non-diagonal elements in each correlation matrix
def avg_non_diagonal_elements(corr_matrix):
    # Select the non-diagonal elements using numpy
    non_diag = corr_matrix.values[np.triu_indices_from(corr_matrix, k=1)]
    return np.nanmean(non_diag)

def avg_corr(rolling_window):
    # Compute the correlation matrix for the rolling window
    corr_matrix = rolling_window.corr()

    # Compute the average of non-diagonal elements
    return avg_non_diagonal_elements(corr_matrix)

# Calculate rolling average correlation of non-diagonal elements
rolling_avg_corr = rolling_feature(ret, avg_corr, pm, "avg_corr")

# Output rolling_avg_corr as the result
rolling_avg_corr.shape

(11324, 1)

### 2. Average volatility.

In [5]:
avg_vol = rolling_feature(ret, lambda s: s.std(axis=0).mean(), pm, "avg_vol")
avg_vol.shape

(11324, 1)

### 3. EW Portfolio.

In [6]:
ew = rolling_feature(ret, lambda s: np.prod(1 + np.nanmean(s, axis=1)) - 1, pm, "ew")
ew

Unnamed: 0_level_0,ew
date,Unnamed: 1_level_1
1980-01-31,-0.009858
1980-02-01,0.000682
1980-02-04,-0.003901
1980-02-05,-0.002505
1980-02-06,0.018755
...,...
2024-12-24,0.027452
2024-12-26,0.026098
2024-12-27,0.006078
2024-12-30,-0.006538


In [7]:
ewma = []
for end in ew.index:
    start = end - pd.DateOffset(months=1)

    if end > ew.index[-1]:
        break

    sample = ew.loc[start:end]

    ma = sample.ewm(alpha=0.1).mean().iloc[-1].item()

    ewma.append([end, ma])

In [8]:
ewma = pd.DataFrame(ewma, columns=["date", "ewma"])
ewma["date"] = pd.to_datetime(ewma["date"])
ewma = ewma.set_index("date")
ewma

Unnamed: 0_level_0,ewma
date,Unnamed: 1_level_1
1980-01-31,-0.009858
1980-02-01,-0.004310
1980-02-04,-0.004159
1980-02-05,-0.003678
1980-02-06,0.001800
...,...
2024-12-24,0.033623
2024-12-26,0.032652
2024-12-27,0.029418
2024-12-30,0.024774


### 4. Ledoit-Wolf Shrinkage Intensity.

In [9]:
from sklearn.covariance import LedoitWolf

def get_intensity(s: pd.DataFrame):
    s = s.copy().fillna(0)
    lw = LedoitWolf()
    lw.fit(s)
    return lw.shrinkage_

lw = rolling_feature(ret, lambda s: get_intensity(s), pm, "lw_shrinkage")
lw

Unnamed: 0_level_0,lw_shrinkage
date,Unnamed: 1_level_1
1980-01-31,0.000000
1980-02-01,0.000000
1980-02-04,0.270941
1980-02-05,0.366452
1980-02-06,0.408118
...,...
2024-12-24,0.427849
2024-12-26,0.425807
2024-12-27,0.381702
2024-12-30,0.368413


### 5. Momentum

In [10]:
momentum = rolling_feature(ret, lambda s: np.nanmean(np.where(s, s > 0, 1), axis=0).mean(), pm, "momentum_feature")
momentum

Unnamed: 0_level_0,momentum_feature
date,Unnamed: 1_level_1
1980-01-31,0.233333
1980-02-01,0.433333
1980-02-04,0.411111
1980-02-05,0.466667
1980-02-06,0.520000
...,...
2024-12-24,0.530159
2024-12-26,0.520635
2024-12-27,0.484127
2024-12-30,0.453333


### 6. Trace.

In [9]:
trace = rolling_feature(ret, lambda s: np.trace(s.fillna(0).cov()), pm, "trace")
trace

Unnamed: 0_level_0,trace
date,Unnamed: 1_level_1
1980-01-31,
1980-02-01,0.013149
1980-02-04,0.010395
1980-02-05,0.007977
1980-02-06,0.012255
...,...
2024-12-24,0.012827
2024-12-26,0.012451
2024-12-27,0.012570
2024-12-30,0.013056


In [10]:
trace[trace.isna().any(axis=1)]

Unnamed: 0_level_0,trace
date,Unnamed: 1_level_1
1980-01-31,


### 7. Universe Volatility.

In [12]:
ew_vol = ew.rolling(window=252, min_periods=1).std().fillna(0)
ew_vol

Unnamed: 0_level_0,ew
date,Unnamed: 1_level_1
1980-01-31,0.000000
1980-02-01,0.007453
1980-02-04,0.005285
1980-02-05,0.004414
1980-02-06,0.010827
...,...
2024-12-24,0.027874
2024-12-26,0.027871
2024-12-27,0.027908
2024-12-30,0.027998


### Append features.

In [13]:
features = rolling_avg_corr.merge(avg_vol, how="inner", left_index=True, right_index=True)
features.shape

(11324, 2)

In [14]:
features = features.merge(ewma, how="inner", left_index=True, right_index=True)
features.shape

(11324, 3)

In [15]:
features = features.merge(lw, how="inner", left_index=True, right_index=True)
features.shape

(11324, 4)

In [16]:
features = features.merge(momentum, how="inner", left_index=True, right_index=True)
features.shape

(11324, 5)

In [17]:
features = features.merge(trace, how="inner", left_index=True, right_index=True)
features.shape

(11324, 6)

In [18]:
features = features.merge(
    ew_vol.rename(columns={"ew": "universe_vol"}), how="inner", left_index=True, right_index=True
)
features.shape

(11324, 7)

In [19]:
# features.to_csv("init_features.csv")

In [11]:
# features = pd.read_csv("init_features.csv")
# features["date"] = pd.to_datetime(features["date"])
# features = features.set_index("date")

In [12]:
# features = features.drop(columns=["trace"])

In [13]:
# features = features.drop(columns=["trace"])

# features = features.merge(trace, how="inner", left_index=True, right_index=True)
# features.shape

(11324, 7)

In [14]:
targets = pd.read_csv("targets.csv")
targets["start_date"] = pd.to_datetime(targets["start_date"])
targets["end_date"] = pd.to_datetime(targets["end_date"])

In [15]:
data_df = targets.merge(features, how="right", left_on="start_date", right_index=True)
data_df.shape

(11324, 12)

In [16]:
data_df = data_df.rename(columns={"start_date": "date"})
data_df = data_df.set_index("date")
data_df = data_df.drop(columns=["end_date"])
data_df

Unnamed: 0_level_0,vol,naive_vol,shrinkage,avg_corr,avg_vol,ewma,lw_shrinkage,momentum_feature,universe_vol,trace
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1980-01-31,,,,,,-0.009858,0.000000,0.233333,0.000000,
1980-02-01,,,,0.425287,0.016380,-0.004310,0.000000,0.433333,0.007453,0.013149
1980-02-04,,,,0.301685,0.014983,-0.004159,0.270941,0.411111,0.005285,0.010395
1980-02-05,,,,0.254350,0.013795,-0.003678,0.366452,0.466667,0.004414,0.007977
1980-02-06,,,,0.310843,0.017848,0.001800,0.408118,0.520000,0.010827,0.012255
...,...,...,...,...,...,...,...,...,...,...
2024-12-24,,,,0.233610,0.017114,0.033623,0.427849,0.530159,0.027874,0.012827
2024-12-26,,,,0.244026,0.016803,0.032652,0.425807,0.520635,0.027871,0.012451
2024-12-27,,,,0.245522,0.016762,0.029418,0.381702,0.484127,0.027908,0.012570
2024-12-30,,,,0.266836,0.017095,0.024774,0.368413,0.453333,0.027998,0.013056


In [17]:
data_df = data_df.rename(columns={"shrinkage": "target"})
data_df

Unnamed: 0_level_0,vol,naive_vol,target,avg_corr,avg_vol,ewma,lw_shrinkage,momentum_feature,universe_vol,trace
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1980-01-31,,,,,,-0.009858,0.000000,0.233333,0.000000,
1980-02-01,,,,0.425287,0.016380,-0.004310,0.000000,0.433333,0.007453,0.013149
1980-02-04,,,,0.301685,0.014983,-0.004159,0.270941,0.411111,0.005285,0.010395
1980-02-05,,,,0.254350,0.013795,-0.003678,0.366452,0.466667,0.004414,0.007977
1980-02-06,,,,0.310843,0.017848,0.001800,0.408118,0.520000,0.010827,0.012255
...,...,...,...,...,...,...,...,...,...,...
2024-12-24,,,,0.233610,0.017114,0.033623,0.427849,0.530159,0.027874,0.012827
2024-12-26,,,,0.244026,0.016803,0.032652,0.425807,0.520635,0.027871,0.012451
2024-12-27,,,,0.245522,0.016762,0.029418,0.381702,0.484127,0.027908,0.012570
2024-12-30,,,,0.266836,0.017095,0.024774,0.368413,0.453333,0.027998,0.013056


In [18]:
dnk_features = data_df.loc[:, ["target"]].copy()
dnk_features["target_rolling_mean"] = (
    dnk_features["target"].rolling(window=252, min_periods=1).mean()
)
dnk_features["target_rolling_vol"] = (
    data_df[["vol"]].rolling(window=252, min_periods=1).std().fillna(0)
)
dnk_features

Unnamed: 0_level_0,target,target_rolling_mean,target_rolling_vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1980-01-31,,,0.000000
1980-02-01,,,0.000000
1980-02-04,,,0.000000
1980-02-05,,,0.000000
1980-02-06,,,0.000000
...,...,...,...
2024-12-24,,0.296446,0.001180
2024-12-26,,0.294942,0.001175
2024-12-27,,0.293464,0.001172
2024-12-30,,0.291969,0.001168


In [19]:
data_df = data_df.merge(dnk_features, how="inner", left_index=True, right_index=True)
data_df.shape

(11324, 13)

In [20]:
data_df.columns

Index(['vol', 'naive_vol', 'target_x', 'avg_corr', 'avg_vol', 'ewma',
       'lw_shrinkage', 'momentum_feature', 'universe_vol', 'trace', 'target_y',
       'target_rolling_mean', 'target_rolling_vol'],
      dtype='object')

In [21]:
data_df = data_df.rename(columns={"target_x": "target", "target_y": "lagged_target"})

In [22]:
new_data = data.merge(data_df, left_index=True, right_index=True)
new_data.shape

(11324, 164)

In [23]:
new_data.columns

Index(['10078', '10104', '10107', '10147', '10401', '10604', '11042', '11081',
       '11308', '11471',
       ...
       'avg_corr_y', 'avg_vol_y', 'ewma_y', 'lw_shrinkage_y',
       'momentum_feature_y', 'universe_vol_y', 'trace_y', 'lagged_target_y',
       'target_rolling_mean_y', 'target_rolling_vol_y'],
      dtype='object', length=164)

In [43]:
new_data.to_csv(config.PATH_OUTPUT / config.DF_FILENAME)

In [24]:
data = data.drop(columns=["trace"])

data = data.merge(trace, how="inner", left_index=True, right_index=True)
data.shape

(11324, 151)

In [25]:
# data.to_csv(config.PATH_OUTPUT / config.DF_FILENAME)

In [26]:
# data.columns

Index(['10078', '10104', '10107', '10147', '10401', '10604', '11042', '11081',
       '11308', '11471',
       ...
       'avg_corr', 'avg_vol', 'ewma', 'lw_shrinkage', 'momentum_feature',
       'universe_vol', 'lagged_target', 'target_rolling_mean',
       'target_rolling_vol', 'trace'],
      dtype='object', length=151)