In [2]:
import numpy as np
import pandas as pd
import datetime
import yfinance as yf
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# For importing universal scripts
import sys
import os
# Go up two levels from the subfolder
sys.path.append(os.path.abspath("../.."))
from indicators_returns import final_df #Universal script for indicator set and actuals

In [3]:
# --- Run configuration -------------------------------------------------------
ticker = 'QQQ'
returns = [5, 10, 20, 30, 45, 60, 90]  # presumably forward-return horizons; these match the `r` values in the CSVs loaded later
lb = 20  # passed straight to final_df; presumably a lookback length — confirm against indicators_returns

# Build the indicator/actuals frame, then trim and sanitise it in one chain:
#   * .iloc[:-101] drops the last 101 rows
#     (NOTE(review): presumably the tail where the longest forward return is not
#      yet observable — the 101 is undocumented, confirm and name it)
#   * +/-inf values are replaced with 0
df = (
    final_df(ticker, returns, lb)
    .iloc[:-101]
    .replace([np.inf, -np.inf], 0)
)

# Import CSVs from earlier univariate analysis

In [5]:
# Load the CSV outputs written by the earlier analysis notebooks.
# The paths are relative, so they resolve against the notebook's working directory.
# NOTE(review): the rendered outputs of logistic_results_df / ind_dynamics_df in this
# notebook show leftover 'Unnamed: 0' (and 'Unnamed: 0.1') columns — presumably a
# saved index; consider index_col=0 here or index=False where the CSVs are written.
cluster_df = pd.read_csv('../Indicator_Correlations/cluster.csv')  # includes 'indicator', 'cluster' columns
high_corr = pd.read_csv('../Indicator_Correlations/highly_correlated_pairs.csv')  # includes 'indicator_1', 'indicator_2'
ind_dynamics_df = pd.read_csv('../Indicator_Dynamics/indicator_dynamics.csv')  # includes 'r', 'indicator', volatility/zscore/autocorr columns
logistic_results_df = pd.read_csv('../Univariate_Predictive_Power/logistic_results_prod.csv')  # includes AUC, logloss, etc.

# Filter out indicators with poor AUC or KS first

In [6]:
# Keep a row when EITHER condition holds:
#   * its precomputed `flag` is anything other than 'no_signal', or
#   * its `ks_ts` statistic is above 0.2.
# NOTE(review): no AUC or coefficient column is tested directly here — presumably
# those checks are already folded into `flag` upstream; confirm.
has_signal_flag = logistic_results_df['flag'] != 'no_signal'
strong_ks = logistic_results_df['ks_ts'] > 0.2

logi_filtered = logistic_results_df[has_signal_flag | strong_ks]

logi_filtered

Unnamed: 0.1,Unnamed: 0,r,class_balance,flag,indicator,auc_rs,auc_ts,log_loss_rs,log_loss_ts,coef_rs,coef_ts,ks_rs,ks_ts
0,0,5,N,keep,100_EMA_200,0.494549,0.424242,0.676573,0.675429,-0.517016,0.021185,0.051531,0.118791
1,1,10,N,keep,100_EMA_200,0.503271,0.395961,0.658263,0.653156,-0.072074,0.385507,0.049863,0.201726
2,2,20,N,keep,100_EMA_200,0.527863,0.654937,0.642633,0.616307,-0.009967,-0.081769,0.102268,0.294863
3,3,30,N,keep,100_EMA_200,0.544469,0.701299,0.626834,0.620562,-0.316896,-0.300490,0.131156,0.316488
4,4,45,N,keep,100_EMA_200,0.512505,0.669870,0.605028,0.596595,-0.648517,-0.240014,0.077935,0.370564
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4540,4540,45,N,keep,vol_5_MA10,0.503338,0.556141,0.604589,0.594873,0.092188,0.062532,0.054201,0.157684
4544,4544,10,N,keep,vol_5_MA25,0.515533,0.550027,0.658223,0.652500,0.040007,0.004800,0.044251,0.132964
4547,4547,45,N,keep,vol_5_MA25,0.512318,0.556633,0.604223,0.593366,0.121215,0.092651,0.051115,0.162916
4561,4561,45,N,keep,vol_5_MA50,0.516008,0.555059,0.604317,0.593068,0.143012,0.111864,0.059223,0.174561


# Filter Based on Z-Score + Volatility

In [9]:
def is_too_flat(row, threshold=0.9, years=(5, 10, 15)):
    """Return True when every ``tight_zrange_pct_{y}y`` value of ``row`` exceeds ``threshold``.

    Intended for ``DataFrame.apply(is_too_flat, axis=1)``; the defaults reproduce
    the original hard-coded rule (all of the 5y/10y/15y values > 0.9).

    Parameters
    ----------
    row : mapping-like (``pd.Series`` or ``dict``)
        Must contain a ``tight_zrange_pct_{y}y`` entry for each ``y`` in ``years``.
    threshold : float, default 0.9
        Strict lower bound; a value equal to ``threshold`` does not count as flat.
    years : iterable of int, default (5, 10, 15)
        Window suffixes to check.

    Returns
    -------
    bool
        True only if all checked values are strictly greater than ``threshold``.
        A NaN value compares False, so a row with a missing value is never flagged.

    Raises
    ------
    KeyError
        If any expected ``tight_zrange_pct_{y}y`` entry is absent from ``row``.
    """
    # Look up every window first so a missing column always raises,
    # instead of being hidden by all() short-circuiting on an earlier False.
    values = [row[f'tight_zrange_pct_{y}y'] for y in years]
    return all(value > threshold for value in values)

def is_too_noisy(row, threshold=5, windows=(21, 42, 63)):
    """Return True when every ``avg_vol_{w}`` value of ``row`` exceeds ``threshold``.

    Intended for ``DataFrame.apply(is_too_noisy, axis=1)``; the defaults reproduce
    the original hard-coded rule (all of avg_vol_21/42/63 > 5).

    Parameters
    ----------
    row : mapping-like (``pd.Series`` or ``dict``)
        Must contain an ``avg_vol_{w}`` entry for each ``w`` in ``windows``.
    threshold : float, default 5
        Strict lower bound; tune as needed. A value equal to ``threshold`` does
        not count as noisy.
    windows : iterable of int, default (21, 42, 63)
        Window suffixes to check.

    Returns
    -------
    bool
        True only if all checked values are strictly greater than ``threshold``.
        A NaN value compares False, so a row with a missing value is never flagged.

    Raises
    ------
    KeyError
        If any expected ``avg_vol_{w}`` entry is absent from ``row``.
    """
    # Look up every window first so a missing column always raises,
    # instead of being hidden by all() short-circuiting on an earlier False.
    values = [row[f'avg_vol_{w}'] for w in windows]
    return all(value > threshold for value in values)

# Tag each row with the two row-wise checks (adds two boolean columns to ind_dynamics_df).
ind_dynamics_df['too_flat'] = ind_dynamics_df.apply(is_too_flat, axis=1)
ind_dynamics_df['too_noisy'] = ind_dynamics_df.apply(is_too_noisy, axis=1)

# A row is rejected if it trips either check; keep an independent copy of the rest.
rejected = ind_dynamics_df['too_flat'] | ind_dynamics_df['too_noisy']
ind_dynamics_filtered = ind_dynamics_df[~rejected].copy()

ind_dynamics_filtered

Unnamed: 0,r,indicator,avg_vol_21,avg_vol_42,avg_vol_63,tight_zrange_pct_5y,skew_5y,kurtosis_5y,tight_zrange_pct_10y,skew_10y,kurtosis_10y,tight_zrange_pct_15y,skew_15y,kurtosis_15y,autocorr_lag1,autocorr_lag5,autocorr_lag10,too_flat,too_noisy
0,5,QQQ_SMA_10,0.016324,0.018051,0.018748,0.0913,-1.060779,3.280899,0.1071,-1.074009,3.894510,0.0926,-1.156130,4.533029,0.810498,0.251337,0.012752,False,False
1,5,QQQ_EMA_10,0.013249,0.014828,0.015524,0.0706,-1.275087,3.887865,0.0754,-1.328596,4.671043,0.1093,-1.418590,5.514152,0.780154,0.332386,0.139805,False,False
2,5,QQQ_SMA_25,0.019650,0.024815,0.027178,0.0786,-1.247942,3.237450,0.0869,-1.302638,4.063127,0.0881,-1.339709,4.695366,0.925333,0.679746,0.415659,False,False
3,5,QQQ_EMA_25,0.016900,0.020614,0.022593,0.0714,-1.469925,4.045750,0.0786,-1.561139,5.278722,0.0749,-1.598159,5.687162,0.905915,0.648929,0.444341,False,False
4,5,QQQ_SMA_50,0.020345,0.027699,0.032400,0.0786,-1.371163,3.180743,0.0885,-1.479195,4.655209,0.1053,-1.439438,4.785597,0.962726,0.838188,0.693966,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4558,90,Typical Price,3.401498,4.859261,5.924167,0.1405,-0.301521,-0.104337,0.0552,0.743222,-0.734888,0.1085,1.330516,1.273958,0.999849,0.999070,0.998151,False,False
4560,90,Median Close,1.428442,2.709632,3.881820,0.1373,-0.347515,-0.045589,0.0409,0.782887,-0.671036,0.1026,1.286512,1.124756,0.999993,0.999879,0.999599,False,False
4561,90,Liquidity Score,0.345174,0.386972,0.410124,0.1016,1.707394,5.679392,0.0964,1.878374,7.061227,0.1042,2.632067,11.859544,0.658589,0.370517,0.221622,False,False
4562,90,Regime,0.824756,1.025321,1.141540,0.0960,3.123725,16.078357,0.1079,5.618137,58.775770,0.1074,5.163241,41.095058,0.858285,0.629137,0.393121,False,False


# Filter out anything that fails the flat/noisy or low AUC test

## First block is an OR (union) across the two dfs: keep a pair if it passed either filter

In [17]:
# Step 1: Build sets of (indicator, r) pairs from both filters
logi_pairs = set(zip(logi_filtered['indicator'], logi_filtered['r']))
dyn_pairs = set(zip(ind_dynamics_filtered['indicator'], ind_dynamics_filtered['r']))

# Step 2: Union — a pair is kept if it passed EITHER filter.
# NOTE: despite the "and" in its name, `valid_and_df` holds this OR/union result.
# The name is left unchanged because later cells may reference it.
union_pairs = logi_pairs | dyn_pairs
print(len(union_pairs))

# Step 3: Convert to a DataFrame for easy counting.
# Set iteration order is arbitrary (string hashing is randomised per process),
# so sort the rows to get the same frame on every Restart & Run All.
valid_and_df = (
    pd.DataFrame(list(union_pairs), columns=['indicator', 'r'])
    .sort_values(['r', 'indicator'])
    .reset_index(drop=True)
)
print(valid_and_df.groupby('r')['indicator'].nunique())

4197
r
5     589
10    597
20    597
30    602
45    601
60    601
90    610
Name: indicator, dtype: int64


## Second block is an AND (intersection) across the two dfs: keep a pair only if it passed both filters

In [18]:
# (indicator, r) pairs that PASSED each individual test
logi_pairs = set(zip(logi_filtered['indicator'], logi_filtered['r']))
dyn_pairs = set(zip(ind_dynamics_filtered['indicator'], ind_dynamics_filtered['r']))

# Intersection — a pair is kept only if it passed BOTH filters.
# NOTE: despite the "or" in its name, `valid_or_df` holds this AND/intersection result.
# The name is left unchanged because later cells may reference it.
common_pairs = logi_pairs & dyn_pairs
print(len(common_pairs))

# Convert to a DataFrame for easy counting.
# Set iteration order is arbitrary (string hashing is randomised per process),
# so sort the rows to get the same frame on every Restart & Run All.
valid_or_df = (
    pd.DataFrame(list(common_pairs), columns=['indicator', 'r'])
    .sort_values(['r', 'indicator'])
    .reset_index(drop=True)
)
print(valid_or_df.groupby('r')['indicator'].nunique())

1355
r
5     112
10    162
20    163
30    214
45    247
60    205
90    252
Name: indicator, dtype: int64


# Break down into categorical indicator sets for each r