In [1]:
import pandas as pd

from run import Dataset

DATASET = Dataset.SPX_US

config = DATASET.value()
config

ExperimentConfig:
* RANDOM_SEED = 12
* PATH_OUTPUT = /Users/buchkovv/qamsi/data/output
* SAVE_PATH = /Users/buchkovv/qamsi/backtests/runs
* DF_FILENAME = spx_data.csv
* START_DATE = 2000-12-18 00:00:00
* END_DATE = 2024-12-31 00:00:00
* REBALANCE_FREQ = 21
* HEDGE_FREQ = 1
* N_LOOKBEHIND_PERIODS = 252
* MIN_ROLLING_PERIODS = 252
* CAUSAL_WINDOW_SIZE = None
* ASSET_UNIVERSE = ('spx',)
* FACTORS = ('spx',)
* TARGETS = ('vol', 'naive_vol', 'target')
* HEDGING_ASSETS = ('spx_fut',)
* RF_NAME = acc_rate
* MKT_NAME = spx
* PATH_FEATURES = /data/gw
* PATH_MARKET_DATA = /data/spx_stocks
* PATH_BETTER_MARKET_DATA = /data/spxc
* PATH_INPUT = /Users/buchkovv/qamsi/data/input
* INITIAL_DF_FILENAME = initial_df.csv
* JKP_DATA_FILENAME = jkp_data.csv
* STOCKS_LIST_FILENAME = spx_stocks_list.csv
* INITIAL_FEATURES_FILENAME = initial_features_df.csv
* RETURNS_FILENAME = returns_incl_div_consituents_w_name.csv
* BETTER_RETURNS_FILENAME = returns_data_cleaned_better.parquet
* PRESENCE_MATRIX_FILENAME = 

In [2]:
from pathlib import Path

PATH = Path("../../data/output")

data = pd.read_csv(PATH / "spx_data.csv")
data["date"] = pd.to_datetime(data["date"])
data = data.set_index("date")
data.shape

(11093, 1465)

In [3]:
# data = data.drop(columns=["gp_ucb_pred"])

In [14]:
stocks = list(
    tuple(
        pd.read_csv(config.PATH_OUTPUT / config.STOCKS_LIST_FILENAME)
        .iloc[:, 0]
        .astype(str)
        .tolist(),
    )
)

ret = data[stocks]
ret.shape

(11093, 1457)

### 1. Avg Corr.

In [15]:
# import pandas as pd
#
# # Assuming `ret` is your DataFrame of returns (date as index), and it is already loaded
#
# # Define the rolling window and minimum periods
# window = 252
# min_periods = 252
#
# # Function to compute the average of non-diagonal elements in each correlation matrix
# def avg_non_diagonal_elements(corr_matrix):
#     # Select the non-diagonal elements using numpy
#     non_diag = corr_matrix.values[np.triu_indices_from(corr_matrix, k=1)]
#     return np.nanmean(non_diag)
#
# # Function to compute rolling correlation
# def rolling_avg_non_diag_corr(df, window, min_periods):
#     # Initialize a list to store results
#     results = []
#
#     # Perform calculation for each rolling window
#     for start in range(len(df) - window + 1):
#         end = start + window
#         rolling_window = df.iloc[start:end]
#
#         # Compute the correlation matrix for the rolling window
#         corr_matrix = rolling_window.corr()
#
#         # Compute the average of non-diagonal elements
#         avg_correlation = avg_non_diagonal_elements(corr_matrix)
#         results.append(avg_correlation)
#
#     # Create a series with the results
#     rolling_avg_corr = pd.Series(results, index=df.index[window - 1 :])
#     return rolling_avg_corr
#
# # Calculate rolling average correlation of non-diagonal elements
# rolling_avg_corr = rolling_avg_non_diag_corr(ret, window, min_periods)
#
# # Output rolling_avg_corr as the result
# rolling_avg_corr.shape

(10842,)

In [16]:
rolling_avg_corr = pd.read_csv(PATH / "avg_corr.csv")

In [17]:
# rolling_avg_corr.to_csv(PATH / "avg_corr.csv")

### 2. Average volatility.

In [18]:
avg_vol = ret.rolling(window=252, min_periods=1).std().mean(axis=1)
avg_vol.shape

(11093,)

### 3. EW Portfolio.

In [19]:
import numpy as np

ew = ret.apply(np.nanmean, axis=1)
ew

date
1980-01-02   -0.016411
1980-01-03   -0.007948
1980-01-04    0.015455
1980-01-07    0.003652
1980-01-08    0.016765
                ...   
2023-12-22    0.003390
2023-12-26    0.005998
2023-12-27    0.001137
2023-12-28    0.001180
2023-12-29   -0.002990
Length: 11093, dtype: float64

In [20]:
ewma = []
for date in ew.index[21 - 1 :]:
    end = date
    sample = ew.loc[:end].iloc[-21:]

    ma = sample.ewm(alpha=0.1).mean().iloc[-1]

    ewma.append([date, ma])

In [21]:
ewma = pd.DataFrame(ewma, columns=["date", "ewma"]).set_index("date").iloc[:, 0]
ewma

date
1980-01-30    0.003116
1980-01-31    0.002309
1980-02-01    0.002758
1980-02-04    0.002024
1980-02-05    0.001906
                ...   
2023-12-22    0.003470
2023-12-26    0.003755
2023-12-27    0.003549
2023-12-28    0.003336
2023-12-29    0.002619
Name: ewma, Length: 11073, dtype: float64

### 4. Ledoit-Wolf Shrinkage Intensity.

In [22]:
from tqdm import tqdm
from sklearn.covariance import LedoitWolf

lw_intensity = []
for date in tqdm(ew.index[252 - 1 :]):
    end = date
    sample = ret.loc[:end].iloc[-252:].dropna(axis=1)

    lw = LedoitWolf()
    lw.fit(sample)

    lw_intensity.append([date, lw.shrinkage_])
lw_intensity = (
    pd.DataFrame(lw_intensity, columns=["date", "lw_intensity"])
    .set_index("date")
    .iloc[:, 0]
)

100%|██████████| 10842/10842 [04:39<00:00, 38.78it/s]


In [23]:
lw_intensity.head()

date
1980-12-30    0.103118
1980-12-31    0.105223
1981-01-02    0.105667
1981-01-05    0.106538
1981-01-06    0.106711
Name: lw_intensity, dtype: float64

In [24]:
sample.shape

(252, 432)

In [25]:
np.nanmean(np.where(sample, sample > 0, 1), axis=0).mean()

np.float64(0.5251322751322751)

### 5. Momentum

In [26]:
momentum = []
for date in tqdm(ew.index[21 - 1 :]):
    end = date
    sample = ret.loc[:end].iloc[-21:].dropna(axis=1)

    mom = np.nanmean(np.where(sample, sample > 0, 1), axis=0).mean()

    momentum.append([date, mom])
momentum = (
    pd.DataFrame(momentum, columns=["date", "momentum_feature"])
    .set_index("date")
    .iloc[:, 0]
)

100%|██████████| 11073/11073 [00:02<00:00, 3738.05it/s]


### 6. Trace.

In [27]:
np.trace(sample.cov())

np.float64(0.12288092370262858)

In [28]:
trace = []
for date in tqdm(ew.index[252 - 1 :]):
    end = date
    sample = ret.loc[:end].iloc[-252:].dropna(axis=1)

    mom = np.trace(sample.cov())

    trace.append([date, mom])
trace = pd.DataFrame(trace, columns=["date", "trace"]).set_index("date").iloc[:, 0]

100%|██████████| 10842/10842 [00:11<00:00, 952.87it/s] 


### 7. Universe Volatility.

In [29]:
ew_vol = ew.rolling(window=252, min_periods=1).std()
ew_vol

date
1980-01-02         NaN
1980-01-03    0.005984
1980-01-04    0.016506
1980-01-07    0.013878
1980-01-08    0.014485
                ...   
2023-12-22    0.009123
2023-12-26    0.009099
2023-12-27    0.009090
2023-12-28    0.009090
2023-12-29    0.009049
Length: 11093, dtype: float64

### Append features.

In [31]:
# rolling_avg_corr["date"] = pd.to_datetime(rolling_avg_corr["date"])
# rolling_avg_corr = rolling_avg_corr.set_index("date")
# rolling_avg_corr.columns = ["avg_corr_stocks"]

In [33]:
features = rolling_avg_corr.to_frame("avg_corr").merge(
    avg_vol.rename("stocks_vol"), how="inner", left_index=True, right_index=True
)
features.shape

(10842, 2)

In [34]:
features = features.merge(ewma, how="inner", left_index=True, right_index=True)
features.shape

(10842, 3)

In [35]:
features = features.merge(lw_intensity, how="inner", left_index=True, right_index=True)
features.shape

(10842, 4)

In [36]:
features = features.merge(momentum, how="inner", left_index=True, right_index=True)
features.shape

(10842, 5)

In [37]:
features = features.merge(trace, how="inner", left_index=True, right_index=True)
features.shape

(10842, 6)

In [38]:
features = features.merge(
    ew_vol.rename("universe_vol"), how="inner", left_index=True, right_index=True
)
features.shape

(10842, 7)

In [39]:
# features.to_csv("init_features.csv")

In [4]:
features = pd.read_csv("init_features.csv")
features["date"] = pd.to_datetime(features["date"])
features = features.set_index("date")

In [5]:
targets = pd.read_csv("targets.csv")
targets["start_date"] = pd.to_datetime(targets["start_date"])
targets["end_date"] = pd.to_datetime(targets["end_date"])

In [6]:
targets.shape

(10813, 5)

In [7]:
data_df = targets.merge(features, how="inner", left_on="start_date", right_index=True)
data_df.shape

(10813, 12)

In [8]:
data_df

Unnamed: 0,end_date,start_date,vol,naive_vol,shrinkage,avg_corr,stocks_vol,ewma,lw_intensity,momentum_feature,trace,universe_vol
0,1981-01-29,1980-12-31,0.006478,0.006508,0.167364,0.190375,0.021256,0.002557,0.105223,0.563100,0.216770,0.009231
1,1981-01-30,1981-01-02,0.006500,0.006614,0.382393,0.189589,0.021260,0.003088,0.105667,0.572604,0.216851,0.009219
2,1981-02-02,1981-01-05,0.005181,0.005253,0.197455,0.189760,0.021279,0.004551,0.106538,0.581325,0.217056,0.009223
3,1981-02-03,1981-01-06,0.004921,0.004940,0.145795,0.189453,0.021304,0.004652,0.106711,0.581031,0.217361,0.009225
4,1981-02-04,1981-01-07,0.004314,0.004395,0.210126,0.193561,0.021344,0.001649,0.103907,0.573192,0.217902,0.009310
...,...,...,...,...,...,...,...,...,...,...,...,...
10808,2023-12-12,2023-11-13,0.004721,0.004726,0.122149,0.266976,0.018326,0.001329,0.052586,0.506435,0.146626,0.009336
10809,2023-12-13,2023-11-14,0.004817,0.005012,0.410885,0.274954,0.018364,0.004217,0.050911,0.507400,0.147174,0.009479
10810,2023-12-14,2023-11-15,0.005649,0.005740,0.313682,0.274016,0.018356,0.004370,0.051184,0.505338,0.147103,0.009459
10811,2023-12-15,2023-11-16,0.005901,0.005952,0.283729,0.273183,0.018347,0.003686,0.051374,0.517937,0.146914,0.009443


In [9]:
data_df = data_df.rename(columns={"end_date": "date"})
data_df = data_df.set_index("date")
data_df = data_df.drop(columns=["start_date"])
data_df

Unnamed: 0_level_0,vol,naive_vol,shrinkage,avg_corr,stocks_vol,ewma,lw_intensity,momentum_feature,trace,universe_vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1981-01-29,0.006478,0.006508,0.167364,0.190375,0.021256,0.002557,0.105223,0.563100,0.216770,0.009231
1981-01-30,0.006500,0.006614,0.382393,0.189589,0.021260,0.003088,0.105667,0.572604,0.216851,0.009219
1981-02-02,0.005181,0.005253,0.197455,0.189760,0.021279,0.004551,0.106538,0.581325,0.217056,0.009223
1981-02-03,0.004921,0.004940,0.145795,0.189453,0.021304,0.004652,0.106711,0.581031,0.217361,0.009225
1981-02-04,0.004314,0.004395,0.210126,0.193561,0.021344,0.001649,0.103907,0.573192,0.217902,0.009310
...,...,...,...,...,...,...,...,...,...,...
2023-12-12,0.004721,0.004726,0.122149,0.266976,0.018326,0.001329,0.052586,0.506435,0.146626,0.009336
2023-12-13,0.004817,0.005012,0.410885,0.274954,0.018364,0.004217,0.050911,0.507400,0.147174,0.009479
2023-12-14,0.005649,0.005740,0.313682,0.274016,0.018356,0.004370,0.051184,0.505338,0.147103,0.009459
2023-12-15,0.005901,0.005952,0.283729,0.273183,0.018347,0.003686,0.051374,0.517937,0.146914,0.009443


In [10]:
data_df = data_df.rename(columns={"shrinkage": "target"})
data_df

Unnamed: 0_level_0,vol,naive_vol,target,avg_corr,stocks_vol,ewma,lw_intensity,momentum_feature,trace,universe_vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1981-01-29,0.006478,0.006508,0.167364,0.190375,0.021256,0.002557,0.105223,0.563100,0.216770,0.009231
1981-01-30,0.006500,0.006614,0.382393,0.189589,0.021260,0.003088,0.105667,0.572604,0.216851,0.009219
1981-02-02,0.005181,0.005253,0.197455,0.189760,0.021279,0.004551,0.106538,0.581325,0.217056,0.009223
1981-02-03,0.004921,0.004940,0.145795,0.189453,0.021304,0.004652,0.106711,0.581031,0.217361,0.009225
1981-02-04,0.004314,0.004395,0.210126,0.193561,0.021344,0.001649,0.103907,0.573192,0.217902,0.009310
...,...,...,...,...,...,...,...,...,...,...
2023-12-12,0.004721,0.004726,0.122149,0.266976,0.018326,0.001329,0.052586,0.506435,0.146626,0.009336
2023-12-13,0.004817,0.005012,0.410885,0.274954,0.018364,0.004217,0.050911,0.507400,0.147174,0.009479
2023-12-14,0.005649,0.005740,0.313682,0.274016,0.018356,0.004370,0.051184,0.505338,0.147103,0.009459
2023-12-15,0.005901,0.005952,0.283729,0.273183,0.018347,0.003686,0.051374,0.517937,0.146914,0.009443


In [11]:
data_df

Unnamed: 0_level_0,vol,naive_vol,target,avg_corr,stocks_vol,ewma,lw_intensity,momentum_feature,trace,universe_vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1981-01-29,0.006478,0.006508,0.167364,0.190375,0.021256,0.002557,0.105223,0.563100,0.216770,0.009231
1981-01-30,0.006500,0.006614,0.382393,0.189589,0.021260,0.003088,0.105667,0.572604,0.216851,0.009219
1981-02-02,0.005181,0.005253,0.197455,0.189760,0.021279,0.004551,0.106538,0.581325,0.217056,0.009223
1981-02-03,0.004921,0.004940,0.145795,0.189453,0.021304,0.004652,0.106711,0.581031,0.217361,0.009225
1981-02-04,0.004314,0.004395,0.210126,0.193561,0.021344,0.001649,0.103907,0.573192,0.217902,0.009310
...,...,...,...,...,...,...,...,...,...,...
2023-12-12,0.004721,0.004726,0.122149,0.266976,0.018326,0.001329,0.052586,0.506435,0.146626,0.009336
2023-12-13,0.004817,0.005012,0.410885,0.274954,0.018364,0.004217,0.050911,0.507400,0.147174,0.009479
2023-12-14,0.005649,0.005740,0.313682,0.274016,0.018356,0.004370,0.051184,0.505338,0.147103,0.009459
2023-12-15,0.005901,0.005952,0.283729,0.273183,0.018347,0.003686,0.051374,0.517937,0.146914,0.009443


In [12]:
dnk_features = data_df.loc[:, ["target"]].copy()
dnk_features["target_rolling_mean"] = dnk_features["target"].rolling(window=252, min_periods=1).mean()
dnk_features["target_rolling_vol"] = data_df[["vol"]].rolling(window=252, min_periods=1).std().fillna(0)
dnk_features

Unnamed: 0_level_0,target,target_rolling_mean,target_rolling_vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1981-01-29,0.167364,0.167364,0.000000
1981-01-30,0.382393,0.274879,0.000016
1981-02-02,0.197455,0.249071,0.000755
1981-02-03,0.145795,0.223252,0.000837
1981-02-04,0.210126,0.220627,0.000974
...,...,...,...
2023-12-12,0.122149,0.283967,0.001180
2023-12-13,0.410885,0.285598,0.001181
2023-12-14,0.313682,0.286508,0.001181
2023-12-15,0.283729,0.287319,0.001181


In [13]:
data_df = data_df.merge(dnk_features, how="inner", left_index=True, right_index=True)
data_df.shape

(10813, 13)

In [14]:
data_df

Unnamed: 0_level_0,vol,naive_vol,target_x,avg_corr,stocks_vol,ewma,lw_intensity,momentum_feature,trace,universe_vol,target_y,target_rolling_mean,target_rolling_vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1981-01-29,0.006478,0.006508,0.167364,0.190375,0.021256,0.002557,0.105223,0.563100,0.216770,0.009231,0.167364,0.167364,0.000000
1981-01-30,0.006500,0.006614,0.382393,0.189589,0.021260,0.003088,0.105667,0.572604,0.216851,0.009219,0.382393,0.274879,0.000016
1981-02-02,0.005181,0.005253,0.197455,0.189760,0.021279,0.004551,0.106538,0.581325,0.217056,0.009223,0.197455,0.249071,0.000755
1981-02-03,0.004921,0.004940,0.145795,0.189453,0.021304,0.004652,0.106711,0.581031,0.217361,0.009225,0.145795,0.223252,0.000837
1981-02-04,0.004314,0.004395,0.210126,0.193561,0.021344,0.001649,0.103907,0.573192,0.217902,0.009310,0.210126,0.220627,0.000974
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-12,0.004721,0.004726,0.122149,0.266976,0.018326,0.001329,0.052586,0.506435,0.146626,0.009336,0.122149,0.283967,0.001180
2023-12-13,0.004817,0.005012,0.410885,0.274954,0.018364,0.004217,0.050911,0.507400,0.147174,0.009479,0.410885,0.285598,0.001181
2023-12-14,0.005649,0.005740,0.313682,0.274016,0.018356,0.004370,0.051184,0.505338,0.147103,0.009459,0.313682,0.286508,0.001181
2023-12-15,0.005901,0.005952,0.283729,0.273183,0.018347,0.003686,0.051374,0.517937,0.146914,0.009443,0.283729,0.287319,0.001181


In [15]:
data_df = data_df.rename(columns={"target_x": "target", "target_y": "lagged_target"})

In [16]:
new_data = data.merge(data_df, left_index=True, right_index=True)
new_data.shape

(10813, 1477)

In [18]:
new_data.columns

Index(['10006', '10057', '10078', '10104', '10107', '10108', '10137', '10138',
       '10145', '10147',
       ...
       'avg_corr', 'stocks_vol', 'ewma', 'lw_intensity', 'momentum_feature',
       'trace', 'universe_vol', 'lagged_target', 'target_rolling_mean',
       'target_rolling_vol'],
      dtype='object', length=1477)

In [19]:
new_data.to_csv(PATH / "spx_data.csv")