In [11]:
import numpy as np
import pandas as pd

from qamsi.utils.data import read_csv

from run import Dataset

# Dataset preset and universe size used to build the experiment configuration.
DATASET = Dataset.TOPN_US
TOP_N = 30

# NOTE(review): the recorded output of this cell shows DF_FILENAME='top50_data.csv' and
# PRESENCE_MATRIX_FILENAME='top50_presence_matrix.csv' although TOP_N is 30 — confirm the
# output is not stale and that `topn` is actually honoured by the config.
config = DATASET.value(topn=TOP_N)
config

ExperimentConfig(RANDOM_SEED=12, PATH_OUTPUT=PosixPath('/Users/buchkovv/qamsi/data/output'), SAVE_PATH=PosixPath('/Users/buchkovv/qamsi/backtests/runs'), DF_FILENAME='top50_data.csv', PRESENCE_MATRIX_FILENAME='top50_presence_matrix.csv', START_DATE=Timestamp('2000-12-18 00:00:00'), END_DATE=Timestamp('2024-07-31 00:00:00'), REBALANCE_FREQ=21, HEDGE_FREQ=1, N_LOOKBEHIND_PERIODS=252, MIN_ROLLING_PERIODS=252, CAUSAL_WINDOW_SIZE=21, FACTORS=('spx',), TARGETS=('vol', 'naive_vol', 'target', 'cgp_ucb', 'irl'), HEDGING_ASSETS=('spx_fut',), RF_NAME='acc_rate', MKT_NAME='spx', PATH_INPUT=PosixPath('/Users/buchkovv/qamsi/data/input'), INITIAL_DF_FILENAME='initial_df.csv', JKP_DATA_FILENAME='jkp_data.csv', STOCKS_LIST_FILENAME='spx_stocks_list.csv', INITIAL_FEATURES_FILENAME='initial_features_df.csv', RETURNS_FILENAME='returns_incl_div_consituents_w_name.csv', BETTER_RETURNS_FILENAME='returns_data_cleaned_better.parquet', ASSET_UNIVERSE=('spx',))

In [12]:
# Load the main data table and the presence matrix from the configured output directory.
# NOTE(review): the last cell of the notebook writes to config.PATH_OUTPUT / config.DF_FILENAME,
# presumably the very file read here — a second run would then start from the already-augmented table.
data = read_csv(config.PATH_OUTPUT, config.DF_FILENAME)
pm = read_csv(config.PATH_OUTPUT, config.PRESENCE_MATRIX_FILENAME)
# Keep only the columns that also appear in the presence matrix
# (presumably the per-asset return series — confirm).
ret = data[pm.columns]

In [13]:
data.shape

(11324, 238)

In [14]:
data.index.is_unique

True

In [15]:
def rolling_feature(df, feature_fn, pm, name, window=None):
    """Evaluate ``feature_fn`` on a trailing window ending at every row label of ``df``.

    For each label ``end`` in ``df.index`` the most recent row of ``pm`` at or
    before ``end`` is looked up, the columns flagged with ``1`` in that row are
    selected, and ``feature_fn`` receives ``df[selected].loc[end - window:end]``
    (label slicing, so both ends are inclusive).

    Parameters
    ----------
    df : pd.DataFrame
        Input data; its index labels must support subtraction of ``window``
        (e.g. a DatetimeIndex).
    feature_fn : callable
        Maps the window DataFrame to the feature value stored for ``end``.
    pm : pd.DataFrame
        Presence matrix; a value of ``1`` marks a column as selected. Its
        columns must exist in ``df``.
    name : str
        Column name of the returned frame.
    window : offset-like, optional
        Look-back length subtracted from ``end``. Defaults to one calendar
        month (``pd.DateOffset(months=1)``), the previously hard-coded value.

    Returns
    -------
    pd.DataFrame
        A single column ``name`` indexed by ``date``, one row per label of
        ``df``. Labels that precede the first row of ``pm`` get NaN.
    """
    if window is None:
        window = pd.DateOffset(months=1)

    records = []
    for end in df.index:
        presence_so_far = pm.loc[:end]
        if presence_so_far.empty:
            # No presence information is available yet, so the selection (and
            # therefore the feature) is undefined for this label.
            records.append([end, float("nan")])
            continue

        # Latest known presence row decides which columns enter the window.
        latest_presence = presence_so_far.iloc[-1]
        selection = latest_presence[latest_presence == 1].index.tolist()

        start = end - window
        rolling_window = df[selection].loc[start:end]
        records.append([end, feature_fn(rolling_window)])

    rolling_feat = pd.DataFrame(records, columns=["date", name])
    rolling_feat["date"] = pd.to_datetime(rolling_feat["date"])
    return rolling_feat.set_index("date")

### 1. Average correlation.

In [16]:
def avg_non_diagonal_elements(corr_matrix):
    """Return the NaN-ignoring mean of the entries strictly above the diagonal.

    For a symmetric matrix this equals the mean of all off-diagonal entries.

    Parameters
    ----------
    corr_matrix : pd.DataFrame or array-like
        Square 2-D matrix (e.g. the result of ``DataFrame.corr()``).

    Returns
    -------
    float
        Mean of the non-NaN upper-triangle entries, or NaN when there are none
        (1x1 or empty matrix, or every off-diagonal entry is NaN).
    """
    values = np.asarray(corr_matrix, dtype=float)
    upper = values[np.triu_indices_from(values, k=1)]
    # np.nanmean emits a RuntimeWarning on an empty or all-NaN input; return the
    # same NaN explicitly so degenerate windows do not flood the output.
    if np.isnan(upper).all():
        return np.nan
    return np.nanmean(upper)

def avg_corr(rolling_window):
    """Average pairwise correlation between the columns of ``rolling_window``."""
    pairwise = rolling_window.corr()
    return avg_non_diagonal_elements(pairwise)

# Average pairwise correlation of the selected columns over the trailing window ending at each date of `ret`.
rolling_avg_corr = rolling_feature(ret, avg_corr, pm, "avg_corr")

# Sanity check: one row per date and a single `avg_corr` column.
rolling_avg_corr.shape

(11324, 1)

### 2. Average volatility.

In [17]:
# Per-column standard deviation inside each window, averaged across the selected columns.
avg_vol = rolling_feature(ret, lambda s: s.std(axis=0).mean(), pm, "avg_vol")
avg_vol.shape

(11324, 1)

### 3. EW Portfolio.

In [18]:
# For each window: take the NaN-ignoring mean across columns of every row, then
# compound those row means over the window (prod(1 + r) - 1).
# NOTE(review): `ew` is therefore a compounded window figure, not a single-row value —
# keep that in mind where it is reused below.
ew = rolling_feature(ret, lambda s: np.prod(1 + np.nanmean(s, axis=1)) - 1, pm, "ew")
ew

Unnamed: 0_level_0,ew
date,Unnamed: 1_level_1
1980-01-31,-0.010653
1980-02-01,-0.000624
1980-02-04,-0.005363
1980-02-05,-0.002280
1980-02-06,0.012727
...,...
2024-12-24,0.004639
2024-12-26,0.001528
2024-12-27,-0.014119
2024-12-30,-0.025538


In [19]:
# Exponentially weighted mean (alpha=0.1) of `ew`, recomputed from scratch on the
# trailing one-month window ending at each date; only the last value of each
# window is kept. The result is collected as [date, value] pairs.
ewma = []
for end in ew.index:
    start = end - pd.DateOffset(months=1)

    # `end` iterates over ew.index itself, so it can never exceed the last label;
    # the former `if end > ew.index[-1]: break` guard was unreachable and is gone.
    sample = ew.loc[start:end]

    ma = sample.ewm(alpha=0.1).mean().iloc[-1].item()

    ewma.append([end, ma])

In [20]:
# Turn the collected [date, value] pairs into a single-column frame indexed by date.
ewma = (
    pd.DataFrame(ewma, columns=["date", "ewma"])
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index("date")
)
ewma

Unnamed: 0_level_0,ewma
date,Unnamed: 1_level_1
1980-01-31,-0.010653
1980-02-01,-0.005375
1980-02-04,-0.005370
1980-02-05,-0.004472
1980-02-06,-0.000272
...,...
2024-12-24,0.019695
2024-12-26,0.017447
2024-12-27,0.013544
2024-12-30,0.008303


### 4. Ledoit-Wolf Shrinkage Intensity.

In [21]:
from sklearn.covariance import LedoitWolf

def get_intensity(s: pd.DataFrame):
    """Fit a Ledoit-Wolf estimator on ``s`` and return its shrinkage coefficient.

    Missing values are replaced by 0 before fitting; ``s`` itself is not
    modified because ``fillna`` returns a new frame.
    """
    filled = s.fillna(0)
    estimator = LedoitWolf().fit(filled)
    return estimator.shrinkage_

# Ledoit-Wolf shrinkage coefficient of every trailing window.
lw = rolling_feature(ret, get_intensity, pm, "lw_shrinkage")
lw

Unnamed: 0_level_0,lw_shrinkage
date,Unnamed: 1_level_1
1980-01-31,0.000000e+00
1980-02-01,-1.102724e-16
1980-02-04,2.858294e-01
1980-02-05,3.661982e-01
1980-02-06,4.687317e-01
...,...
2024-12-24,4.683230e-01
2024-12-26,4.620570e-01
2024-12-27,4.185235e-01
2024-12-30,4.048315e-01


### 5. Momentum

In [22]:
def _share_of_up_moves(window):
    """Average, across columns, of the fraction of observed rows that are >= 0.

    The previous expression ``np.where(s, s > 0, 1)`` used the values themselves
    as a truth mask: NaN is truthy, so a missing observation was scored as a
    down move and the surrounding ``np.nanmean`` had nothing left to skip.
    Missing observations are now excluded from both numerator and denominator.

    Exact zeros still count as up moves, as in the original expression.
    NOTE(review): confirm that treating 0 as an up move is intended.

    Returns NaN when the window holds no observation at all.
    """
    values = window.to_numpy(dtype=float)
    observed = ~np.isnan(values)
    n_observed = observed.sum(axis=0)
    n_up = ((values >= 0) & observed).sum(axis=0)

    # Columns without a single observation carry no information; skip them
    # instead of letting them turn the whole feature into NaN.
    has_data = n_observed > 0
    if not has_data.any():
        return np.nan
    return (n_up[has_data] / n_observed[has_data]).mean()


momentum = rolling_feature(ret, _share_of_up_moves, pm, "momentum_feature")
momentum

Unnamed: 0_level_0,momentum_feature
date,Unnamed: 1_level_1
1980-01-31,0.260000
1980-02-01,0.470000
1980-02-04,0.426667
1980-02-05,0.480000
1980-02-06,0.508000
...,...
2024-12-24,0.518095
2024-12-26,0.509524
2024-12-27,0.476190
2024-12-30,0.447000


### 6. Trace.

In [23]:
# Trace of each window's sample covariance matrix (the sum of the per-column variances);
# NaNs are replaced by 0 before the covariance is computed.
# The first date comes out as NaN in the recorded output (see the check in the next cell).
trace = rolling_feature(ret, lambda s: np.trace(s.fillna(0).cov()), pm, "trace")
trace

Unnamed: 0_level_0,trace
date,Unnamed: 1_level_1
1980-01-31,
1980-02-01,0.022397
1980-02-04,0.017359
1980-02-05,0.014489
1980-02-06,0.020418
...,...
2024-12-24,0.017803
2024-12-26,0.017242
2024-12-27,0.017252
2024-12-30,0.017874


In [24]:
trace[trace.isna().any(axis=1)]

Unnamed: 0_level_0,trace
date,Unnamed: 1_level_1
1980-01-31,


### 7. Universe Volatility.

In [25]:
# Standard deviation of `ew` over the trailing 252 rows (fewer at the start, min_periods=1).
# The std of a single observation is NaN, so fillna(0) turns the first row into 0.
# The column is still called "ew" here; it is renamed to "universe_vol" when merged into `features`.
ew_vol = ew.rolling(window=252, min_periods=1).std().fillna(0)
ew_vol

Unnamed: 0_level_0,ew
date,Unnamed: 1_level_1
1980-01-31,0.000000
1980-02-01,0.007092
1980-02-04,0.005017
1980-02-05,0.004410
1980-02-06,0.008691
...,...
2024-12-24,0.024600
2024-12-26,0.024623
2024-12-27,0.024719
2024-12-30,0.024883


### Append features.

In [26]:
# Start the feature table: inner join of average correlation and average volatility on the date index.
features = rolling_avg_corr.merge(avg_vol, how="inner", left_index=True, right_index=True)
features.shape

(11324, 2)

In [27]:
# Add the `ewma` column (inner join on the date index).
features = features.merge(ewma, how="inner", left_index=True, right_index=True)
features.shape

(11324, 3)

In [28]:
# Add the `lw_shrinkage` column (inner join on the date index).
features = features.merge(lw, how="inner", left_index=True, right_index=True)
features.shape

(11324, 4)

In [29]:
# Add the `momentum_feature` column (inner join on the date index).
features = features.merge(momentum, how="inner", left_index=True, right_index=True)
features.shape

(11324, 5)

In [30]:
# Add the `trace` column (inner join on the date index).
features = features.merge(trace, how="inner", left_index=True, right_index=True)
features.shape

(11324, 6)

In [31]:
# Add the rolling std of `ew` under its final name `universe_vol` (inner join on the date index).
features = features.merge(
    ew_vol.rename(columns={"ew": "universe_vol"}), how="inner", left_index=True, right_index=True
)
features.shape

(11324, 7)

In [32]:
# features.to_csv("init_features.csv")

In [33]:
# features = pd.read_csv("init_features.csv")
# features["date"] = pd.to_datetime(features["date"])
# features = features.set_index("date")

In [34]:
# features = features.drop(columns=["trace"])

In [35]:
# features = features.drop(columns=["trace"])

# features = features.merge(trace, how="inner", left_index=True, right_index=True)
# features.shape

In [52]:
# Read the targets for the configured universe size (path is relative to the
# working directory) and convert both date columns to datetimes.
targets = pd.read_csv(f"targets_{TOP_N}.csv").assign(
    start_date=lambda frame: pd.to_datetime(frame["start_date"]),
    end_date=lambda frame: pd.to_datetime(frame["end_date"]),
)

In [53]:
# Right join: keep every feature date and attach the target row whose start_date equals
# that date; dates without a matching target row get NaN in the target columns.
data_df = targets.merge(features, how="right", left_on="start_date", right_index=True)
data_df.shape

(11324, 12)

In [54]:
# Use the target start date as the `date` index and discard the end date.
data_df = (
    data_df
    .rename(columns={"start_date": "date"})
    .set_index("date")
    .drop(columns=["end_date"])
)
data_df

Unnamed: 0_level_0,vol,naive_vol,shrinkage,avg_corr,avg_vol,ewma,lw_shrinkage,momentum_feature,trace,universe_vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1980-01-31,,,,,,-0.010653,0.000000e+00,0.260000,,0.000000
1980-02-01,,,,0.397551,0.017011,-0.005375,-1.102724e-16,0.470000,0.022397,0.007092
1980-02-04,,,,0.285805,0.015355,-0.005370,2.858294e-01,0.426667,0.017359,0.005017
1980-02-05,,,,0.252465,0.014577,-0.004472,3.661982e-01,0.480000,0.014489,0.004410
1980-02-06,,,,0.220178,0.017591,-0.000272,4.687317e-01,0.508000,0.020418,0.008691
...,...,...,...,...,...,...,...,...,...,...
2024-12-24,,,,0.259081,0.016060,0.019695,4.683230e-01,0.518095,0.017803,0.024600
2024-12-26,,,,0.265835,0.015733,0.017447,4.620570e-01,0.509524,0.017242,0.024623
2024-12-27,,,,0.267151,0.015642,0.013544,4.185235e-01,0.476190,0.017252,0.024719
2024-12-30,,,,0.290094,0.015918,0.008303,4.048315e-01,0.447000,0.017874,0.024883


In [55]:
# From here on the `shrinkage` column of the targets file is referred to as `target`.
data_df = data_df.rename(columns={"shrinkage": "target"})
data_df

Unnamed: 0_level_0,vol,naive_vol,target,avg_corr,avg_vol,ewma,lw_shrinkage,momentum_feature,trace,universe_vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1980-01-31,,,,,,-0.010653,0.000000e+00,0.260000,,0.000000
1980-02-01,,,,0.397551,0.017011,-0.005375,-1.102724e-16,0.470000,0.022397,0.007092
1980-02-04,,,,0.285805,0.015355,-0.005370,2.858294e-01,0.426667,0.017359,0.005017
1980-02-05,,,,0.252465,0.014577,-0.004472,3.661982e-01,0.480000,0.014489,0.004410
1980-02-06,,,,0.220178,0.017591,-0.000272,4.687317e-01,0.508000,0.020418,0.008691
...,...,...,...,...,...,...,...,...,...,...
2024-12-24,,,,0.259081,0.016060,0.019695,4.683230e-01,0.518095,0.017803,0.024600
2024-12-26,,,,0.265835,0.015733,0.017447,4.620570e-01,0.509524,0.017242,0.024623
2024-12-27,,,,0.267151,0.015642,0.013544,4.185235e-01,0.476190,0.017252,0.024719
2024-12-30,,,,0.290094,0.015918,0.008303,4.048315e-01,0.447000,0.017874,0.024883


In [56]:
# Target-derived features: a copy of the target plus trailing 252-row statistics.
dnk_features = data_df[["target"]].copy()

# Rolling mean of the target; each window includes the current row.
target_mean = dnk_features["target"].rolling(window=252, min_periods=1).mean()

# NOTE(review): despite its name, `target_rolling_vol` is the rolling std of the
# `vol` column, not of `target` — confirm this is intended.
vol_std = data_df["vol"].rolling(window=252, min_periods=1).std().fillna(0)

dnk_features["target_rolling_mean"] = target_mean
dnk_features["target_rolling_vol"] = vol_std
dnk_features

Unnamed: 0_level_0,target,target_rolling_mean,target_rolling_vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1980-01-31,,,0.000000
1980-02-01,,,0.000000
1980-02-04,,,0.000000
1980-02-05,,,0.000000
1980-02-06,,,0.000000
...,...,...,...
2024-12-24,,0.523562,0.000745
2024-12-26,,0.521987,0.000746
2024-12-27,,0.520368,0.000748
2024-12-30,,0.518686,0.000751


In [57]:
# Both frames carry a `target` column, so pandas disambiguates them with its default
# suffixes: `target_x` (from data_df) and `target_y` (from dnk_features).
data_df = data_df.merge(dnk_features, how="inner", left_index=True, right_index=True)
data_df.shape

(11324, 13)

In [58]:
data_df.columns

Index(['vol', 'naive_vol', 'target_x', 'avg_corr', 'avg_vol', 'ewma',
       'lw_shrinkage', 'momentum_feature', 'trace', 'universe_vol', 'target_y',
       'target_rolling_mean', 'target_rolling_vol'],
      dtype='object')

In [59]:
# `target_x` is the original target column; `target_y` is the copy that came back through dnk_features.
# NOTE(review): dnk_features["target"] was copied from data_df["target"] without any shift and merged
# back on the same index, so `lagged_target` looks identical to `target` on every row — confirm whether
# a lag is applied downstream; otherwise this column leaks the target.
data_df = data_df.rename(columns={"target_x": "target", "target_y": "lagged_target"})

In [60]:
# `data` is read from config.DF_FILENAME and the notebook later writes `new_data` back to
# that same file. On a re-run `data` therefore already contains the columns of `data_df`,
# and a plain merge would duplicate them with `_x`/`_y` suffixes. Drop any such stale
# columns first so the freshly computed values win and the notebook stays re-runnable.
stale_columns = data.columns.intersection(data_df.columns)
new_data = data.drop(columns=stale_columns).merge(data_df, left_index=True, right_index=True)
new_data.shape

(11324, 251)

In [61]:
new_data.columns

Index(['10078', '10104', '10107', '10145', '10147', '10161', '10401', '10604',
       '10890', '11042',
       ...
       'avg_corr', 'avg_vol', 'ewma', 'lw_shrinkage', 'momentum_feature',
       'trace', 'universe_vol', 'lagged_target', 'target_rolling_mean',
       'target_rolling_vol'],
      dtype='object', length=251)

In [63]:
new_data.index.is_unique

True

In [64]:
# Persist the augmented table.
# NOTE(review): this is presumably the same file `data` was read from at the top of the
# notebook, so the input is overwritten in place — consider writing to a separate file.
new_data.to_csv(config.PATH_OUTPUT / config.DF_FILENAME)

In [47]:
# data = data.drop(columns=["trace"])
#
# data = data.merge(trace, how="inner", left_index=True, right_index=True)
# data.shape

In [50]:
# data.to_csv(config.PATH_OUTPUT / config.DF_FILENAME)

In [49]:
# data.columns