In [3]:
import numpy as np
import pandas as pd

from qamsi.utils.data import read_csv

from run import Dataset

DATASET = Dataset.TOPN_US
TOP_N = 500

config = DATASET.value(topn=TOP_N)
config

ExperimentConfig(RANDOM_SEED=12, PATH_OUTPUT=PosixPath('/Users/buchkovv/qamsi/data/output'), SAVE_PATH=PosixPath('/Users/buchkovv/qamsi/backtests/runs'), DF_FILENAME='top500_data.csv', PRESENCE_MATRIX_FILENAME='top500_presence_matrix.csv', START_DATE=Timestamp('2000-12-18 00:00:00'), END_DATE=Timestamp('2024-07-31 00:00:00'), REBALANCE_FREQ=21, HEDGE_FREQ=1, N_LOOKBEHIND_PERIODS=252, MIN_ROLLING_PERIODS=252, CAUSAL_WINDOW_SIZE=30, FACTORS=('spx',), TARGETS=('vol', 'naive_vol', 'target', 'cgp_ucb', 'irl', 'irl_gail', 'qis_shrinkage', 'qis_vol'), HEDGING_ASSETS=('spx_fut',), RF_NAME='acc_rate', MKT_NAME='spx', PATH_INPUT=PosixPath('/Users/buchkovv/qamsi/data/input'), INITIAL_DF_FILENAME='initial_df.csv', JKP_DATA_FILENAME='jkp_data.csv', STOCKS_LIST_FILENAME='spx_stocks_list.csv', INITIAL_FEATURES_FILENAME='initial_features_df.csv', RETURNS_FILENAME='returns_incl_div_consituents_w_name.csv', BETTER_RETURNS_FILENAME='returns_data_cleaned_better.parquet', ASSET_UNIVERSE=('spx',))

In [4]:
data = read_csv(config.PATH_OUTPUT, config.DF_FILENAME)
pm = read_csv(config.PATH_OUTPUT, config.PRESENCE_MATRIX_FILENAME)
ret = data[pm.columns]

In [5]:
data.shape

(11324, 2465)

In [6]:
data.index.is_unique

True

In [7]:
# Function to compute rolling correlation
def rolling_feature(df, feature_fn, pm, name):
    # Initialize a list to store results
    results = []

    # Perform calculation for each rolling window
    for end in df.index:
        start = end - pd.DateOffset(months=1)

        curr_matrix = pm.loc[:end].iloc[-1]
        selection = curr_matrix[curr_matrix == 1].index.tolist()
        rolling_window = df[selection].loc[start:end]

        feature = feature_fn(rolling_window)

        results.append([end, feature])

    # Create a series with the results
    rolling_feat = pd.DataFrame(results, columns=["date", name])
    rolling_feat["date"] = pd.to_datetime(rolling_feat["date"])
    rolling_feat = rolling_feat.set_index("date")
    return rolling_feat

### 1. Avg Corr.

In [8]:
# Function to compute the average of non-diagonal elements in each correlation matrix
def avg_non_diagonal_elements(corr_matrix):
    # Select the non-diagonal elements using numpy
    non_diag = corr_matrix.values[np.triu_indices_from(corr_matrix, k=1)]
    return np.nanmean(non_diag)

def avg_corr(rolling_window):
    # Compute the correlation matrix for the rolling window
    corr_matrix = rolling_window.corr()

    # Compute the average of non-diagonal elements
    return avg_non_diagonal_elements(corr_matrix)

# Calculate rolling average correlation of non-diagonal elements
rolling_avg_corr = rolling_feature(ret, avg_corr, pm, "avg_corr")

# Output rolling_avg_corr as the result
rolling_avg_corr.shape

(11324, 1)

### 2. Average volatility.

In [9]:
avg_vol = rolling_feature(ret, lambda s: s.std(axis=0).mean(), pm, "avg_vol")
avg_vol.shape

(11324, 1)

### 3. EW Portfolio.

In [10]:
ew = rolling_feature(ret, lambda s: np.prod(1 + np.nanmean(s, axis=1)) - 1, pm, "ew")
ew

Unnamed: 0_level_0,ew
date,Unnamed: 1_level_1
1980-01-31,-0.005918
1980-02-01,-0.001735
1980-02-04,-0.005738
1980-02-05,-0.005099
1980-02-06,-0.000412
...,...
2024-12-24,-0.035007
2024-12-26,-0.042725
2024-12-27,-0.050011
2024-12-30,-0.060926


In [11]:
ewma = []
for end in ew.index:
    start = end - pd.DateOffset(months=1)

    if end > ew.index[-1]:
        break

    sample = ew.loc[start:end]

    ma = sample.ewm(alpha=0.1).mean().iloc[-1].item()

    ewma.append([end, ma])

In [12]:
ewma = pd.DataFrame(ewma, columns=["date", "ewma"])
ewma["date"] = pd.to_datetime(ewma["date"])
ewma = ewma.set_index("date")
ewma

Unnamed: 0_level_0,ewma
date,Unnamed: 1_level_1
1980-01-31,-0.005918
1980-02-01,-0.003717
1980-02-04,-0.004462
1980-02-05,-0.004648
1980-02-06,-0.003613
...,...
2024-12-24,0.004023
2024-12-26,-0.002005
2024-12-27,-0.008291
2024-12-30,-0.016266


### 4. Ledoit-Wolf Shrinkage Intensity.

In [13]:
from sklearn.covariance import LedoitWolf

def get_intensity(s: pd.DataFrame):
    s = s.copy().fillna(0)
    lw = LedoitWolf()
    lw.fit(s)
    return lw.shrinkage_

lw = rolling_feature(ret, lambda s: get_intensity(s), pm, "lw_shrinkage")
lw

Unnamed: 0_level_0,lw_shrinkage
date,Unnamed: 1_level_1
1980-01-31,0.000000
1980-02-01,0.000000
1980-02-04,0.307793
1980-02-05,0.469367
1980-02-06,0.560948
...,...
2024-12-24,0.498412
2024-12-26,0.507959
2024-12-27,0.475507
2024-12-30,0.448953


### 5. Momentum

In [14]:
momentum = rolling_feature(ret, lambda s: np.nanmean(np.where(s, s > 0, 1), axis=0).mean(), pm, "momentum_feature")
momentum

Unnamed: 0_level_0,momentum_feature
date,Unnamed: 1_level_1
1980-01-31,0.438000
1980-02-01,0.540000
1980-02-04,0.515333
1980-02-05,0.522500
1980-02-06,0.536000
...,...
2024-12-24,0.469143
2024-12-26,0.461143
2024-12-27,0.440762
2024-12-30,0.411200


### 6. Trace.

In [15]:
trace = rolling_feature(ret, lambda s: np.trace(s.fillna(0).cov()), pm, "trace")
trace

Unnamed: 0_level_0,trace
date,Unnamed: 1_level_1
1980-01-31,
1980-02-01,0.201057
1980-02-04,0.159017
1980-02-05,0.164364
1980-02-06,0.178381
...,...
2024-12-24,0.202987
2024-12-26,0.190999
2024-12-27,0.186034
2024-12-30,0.188898


In [16]:
trace[trace.isna().any(axis=1)]

Unnamed: 0_level_0,trace
date,Unnamed: 1_level_1
1980-01-31,


### 7. Universe Volatility.

In [17]:
ew_vol = ew.rolling(window=252, min_periods=1).std().fillna(0)
ew_vol

Unnamed: 0_level_0,ew
date,Unnamed: 1_level_1
1980-01-31,0.000000
1980-02-01,0.002958
1980-02-04,0.002365
1980-02-05,0.001957
1980-02-06,0.002533
...,...
2024-12-24,0.027635
2024-12-26,0.027736
2024-12-27,0.027887
2024-12-30,0.028117


### Append features.

In [18]:
features = rolling_avg_corr.merge(avg_vol, how="inner", left_index=True, right_index=True)
features.shape

(11324, 2)

In [19]:
features = features.merge(ewma, how="inner", left_index=True, right_index=True)
features.shape

(11324, 3)

In [20]:
features = features.merge(lw, how="inner", left_index=True, right_index=True)
features.shape

(11324, 4)

In [21]:
features = features.merge(momentum, how="inner", left_index=True, right_index=True)
features.shape

(11324, 5)

In [22]:
features = features.merge(trace, how="inner", left_index=True, right_index=True)
features.shape

(11324, 6)

In [23]:
features = features.merge(
    ew_vol.rename(columns={"ew": "universe_vol"}), how="inner", left_index=True, right_index=True
)
features.shape

(11324, 7)

In [24]:
# features.to_csv("init_features.csv")

In [25]:
# features = pd.read_csv("init_features.csv")
# features["date"] = pd.to_datetime(features["date"])
# features = features.set_index("date")

In [26]:
# features = features.drop(columns=["trace"])

In [27]:
# features = features.drop(columns=["trace"])

# features = features.merge(trace, how="inner", left_index=True, right_index=True)
# features.shape

In [28]:
targets = pd.read_csv(f"targets_{TOP_N}.csv")
targets["start_date"] = pd.to_datetime(targets["start_date"])
targets["end_date"] = pd.to_datetime(targets["end_date"])

In [29]:
data_df = targets.merge(features, how="right", left_on="start_date", right_index=True)
data_df.shape

(11324, 12)

In [30]:
data_df = data_df.rename(columns={"start_date": "date"})
data_df = data_df.set_index("date")
data_df = data_df.drop(columns=["end_date"])
data_df

Unnamed: 0_level_0,vol,naive_vol,shrinkage,avg_corr,avg_vol,ewma,lw_shrinkage,momentum_feature,trace,universe_vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1980-01-31,,,,,,-0.005918,0.000000,0.438000,,0.000000
1980-02-01,,,,0.039339,0.014624,-0.003717,0.000000,0.540000,0.201057,0.002958
1980-02-04,,,,0.066038,0.014648,-0.004462,0.307793,0.515333,0.159017,0.002365
1980-02-05,,,,0.044011,0.015299,-0.004648,0.469367,0.522500,0.164364,0.001957
1980-02-06,,,,0.045337,0.016311,-0.003613,0.560948,0.536000,0.178381,0.002533
...,...,...,...,...,...,...,...,...,...,...
2024-12-24,,,,0.280020,0.017608,0.004023,0.498412,0.469143,0.202987,0.027635
2024-12-26,,,,0.279611,0.016998,-0.002005,0.507959,0.461143,0.190999,0.027736
2024-12-27,,,,0.292113,0.016732,-0.008291,0.475507,0.440762,0.186034,0.027887
2024-12-30,,,,0.303308,0.016887,-0.016266,0.448953,0.411200,0.188898,0.028117


In [31]:
data_df = data_df.rename(columns={"shrinkage": "target"})
data_df

Unnamed: 0_level_0,vol,naive_vol,target,avg_corr,avg_vol,ewma,lw_shrinkage,momentum_feature,trace,universe_vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1980-01-31,,,,,,-0.005918,0.000000,0.438000,,0.000000
1980-02-01,,,,0.039339,0.014624,-0.003717,0.000000,0.540000,0.201057,0.002958
1980-02-04,,,,0.066038,0.014648,-0.004462,0.307793,0.515333,0.159017,0.002365
1980-02-05,,,,0.044011,0.015299,-0.004648,0.469367,0.522500,0.164364,0.001957
1980-02-06,,,,0.045337,0.016311,-0.003613,0.560948,0.536000,0.178381,0.002533
...,...,...,...,...,...,...,...,...,...,...
2024-12-24,,,,0.280020,0.017608,0.004023,0.498412,0.469143,0.202987,0.027635
2024-12-26,,,,0.279611,0.016998,-0.002005,0.507959,0.461143,0.190999,0.027736
2024-12-27,,,,0.292113,0.016732,-0.008291,0.475507,0.440762,0.186034,0.027887
2024-12-30,,,,0.303308,0.016887,-0.016266,0.448953,0.411200,0.188898,0.028117


In [32]:
dnk_features = data_df.loc[:, ["target"]].copy()
dnk_features["target_rolling_mean"] = (
    dnk_features["target"].rolling(window=252, min_periods=1).mean()
)
dnk_features["target_rolling_vol"] = (
    data_df[["vol"]].rolling(window=252, min_periods=1).std().fillna(0)
)
dnk_features

Unnamed: 0_level_0,target,target_rolling_mean,target_rolling_vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1980-01-31,,,0.000000
1980-02-01,,,0.000000
1980-02-04,,,0.000000
1980-02-05,,,0.000000
1980-02-06,,,0.000000
...,...,...,...
2024-12-24,,0.662188,0.000666
2024-12-26,,0.661763,0.000658
2024-12-27,,0.661501,0.000651
2024-12-30,,0.661335,0.000645


In [33]:
data_df = data_df.merge(dnk_features, how="inner", left_index=True, right_index=True)
data_df.shape

(11324, 13)

In [34]:
data_df.columns

Index(['vol', 'naive_vol', 'target_x', 'avg_corr', 'avg_vol', 'ewma',
       'lw_shrinkage', 'momentum_feature', 'trace', 'universe_vol', 'target_y',
       'target_rolling_mean', 'target_rolling_vol'],
      dtype='object')

In [35]:
data_df = data_df.rename(columns={"target_x": "target", "target_y": "lagged_target"})

In [36]:
new_data = data.merge(data_df, left_index=True, right_index=True)
new_data.shape

(11324, 2478)

In [37]:
new_data.columns

Index(['10006', '10078', '10095', '10104', '10107', '10108', '10119', '10137',
       '10138', '10145',
       ...
       'avg_corr', 'avg_vol', 'ewma', 'lw_shrinkage', 'momentum_feature',
       'trace', 'universe_vol', 'lagged_target', 'target_rolling_mean',
       'target_rolling_vol'],
      dtype='object', length=2478)

In [38]:
new_data.index.is_unique

True

In [39]:
new_data.to_csv(config.PATH_OUTPUT / config.DF_FILENAME)

In [40]:
# data = data.drop(columns=["trace"])
#
# data = data.merge(trace, how="inner", left_index=True, right_index=True)
# data.shape

In [41]:
# data.to_csv(config.PATH_OUTPUT / config.DF_FILENAME)

In [42]:
# data.columns