In [29]:
import performance_flow
import importlib
importlib.reload(performance_flow)
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UserWarning)
import train_flow
importlib.reload(train_flow)
import pandas as pd
import numpy as np
from xgboost import XGBClassifier

# --------- #
# LEAVE ME  #
# --------- #
# Inputs handed to train_flow.import_data for this notebook run.
ticker = 'QQQ'
include_minute_feats = "N"
# presumably forward-return horizons (Return_<k> columns) -- confirm against train_flow
returns = [1, 2, 3, 5, 10, 20, 30]

# Load the daily frame together with the feature lookup structures that the
# later cells rely on (notably df_daily and feature_dict).
(
    df_daily,
    feature_sets,
    return_cols,
    daily_cols,
    feature_dict,
    features,
) = train_flow.import_data(ticker, include_minute_feats, returns)

Available Feature Sets: dict_keys(['ma', 'rsi', 'macd', 'volume', 'atr_adx', 'volatility', 'vix_skew', 'experimental_slope', 'past_return'])


# Full Run
- Retrain ALL models through the most recent as-of date (AOD)
- Calculate performance
- Select and save top n
- Make predictions
- Ensemble

In [36]:
from itertools import chain
import deployment_flow, performance_flow
import importlib
importlib.reload(deployment_flow)
importlib.reload(performance_flow)

def resolve_feature_cols(feature_set_name: str, features_dict: dict, sep: str = "-") -> list[str]:
    """Expand a feature-set name into a flat, de-duplicated list of columns.

    Args:
        feature_set_name: Either the literal ``"kitch_sink"`` (meaning every
            group in ``features_dict``) or one or more group names joined by
            ``sep``.
        features_dict: Mapping of group name -> iterable of column names.
        sep: Delimiter between group names in a composite name.

    Returns:
        The columns of the selected groups, concatenated in group order with
        later duplicates dropped (first occurrence wins).

    Raises:
        KeyError: If a composite name refers to a group that is not a key of
            ``features_dict``.
    """
    if feature_set_name == "kitch_sink":
        # Kitchen sink: take every group, in the dict's own order.
        selected_groups = features_dict.values()
    else:
        selected_groups = []
        for group_name in feature_set_name.split(sep):
            if group_name not in features_dict:
                raise KeyError(f"{group_name} not in features_dict")
            selected_groups.append(features_dict[group_name])

    # dict.fromkeys keeps insertion order, so this is an order-preserving dedupe.
    return list(dict.fromkeys(chain.from_iterable(selected_groups)))

# --- Run configuration ---
h = [30]                         # horizons; every loop in this cell iterates over these
n = 2                            # number of top models to select
file_ext = "performance_all"     # results CSVs are named h{r}_{file_ext}.csv
perf_cutoff_date = '2025-09-01'  # only rows with test_start >= this date are scored

# Thresholds passed through to performance_flow (run_performance / bucket_scores).
min_th = 0.55
cov_th = 0.75

# Accumulators filled by the retrain loop and the prediction loop respectively.
master_results = []
master_preds = []

# Retrain ALL
# For every model configuration already recorded in h{r}_{file_ext}.csv, run the
# deployment flow over the days newer than that configuration's latest
# test_start, then append the new rows to the same CSV.
for r in h:

    results_path = f"h{r}_{file_ext}.csv"
    df = pd.read_csv(results_path)
    df = df.dropna().copy()

    # Expand each feature-set name (e.g. "ma_lag-ma_rel") into its column list.
    df["feature_cols"] = df["features"].apply(lambda x: resolve_feature_cols(x, feature_dict))

    grain_cols = ["horizon","features","train_years","min_feats","pi_size","model","pi_handling"]

    # Latest test_start already on file for each configuration.
    max_train = (
        df.groupby(grain_cols, as_index=False)["test_start"]
        .max()
        .rename(columns={"test_start": "max_test_start"})
    )

    # One row per configuration with its feature columns and latest test_start.
    # (feature_cols is selected together with the grain columns, so no
    # self-merge is needed to attach it.)
    models = (
        df[grain_cols + ["feature_cols"]]
        .drop_duplicates(subset=grain_cols, keep="first")
        .merge(max_train, on=grain_cols, how="left")
    )

    # Results for THIS horizon only. Writing the shared master_results list to
    # the CSV would leak earlier horizons' rows into later horizons' files
    # whenever h holds more than one horizon.
    horizon_results = []

    for row in models.itertuples(index=False):

        target_horizon = row.horizon
        pi_handling    = 'run_separately' #row.pi_handling
        run_type       = 'Actualized'     # not named `type`: that would rebind the builtin for the whole kernel
        feature_cols   = row.feature_cols # list of column names for this feature set
        list_name      = row.features
        train_year     = row.train_years
        pi_year        = row.pi_size
        min_feat       = row.min_feats
        max_test_start = row.max_test_start
        groups = list_name.split("-")

        # Count the rows dated after the last test_start on file. The mask is
        # built on the same slice it is counted over, so no index realignment
        # is involved.
        # NOTE(review): .iloc[r:] drops the FIRST r rows, which cannot affect a
        # count of the newest dates -- confirm whether [:-r] was intended.
        days_assessed = int((df_daily['Date'].iloc[r:] > max_test_start).sum())

        if days_assessed <= 0:
            # Already trained through the most recent date; nothing to do.
            continue

        model = XGBClassifier(n_estimators=300, random_state=42, n_jobs=-1)
        model_name = "xgboost-3"

        print(f"{target_horizon} | {pi_handling} | {list_name} | {train_year} | {pi_year} | {min_feat} | {days_assessed}")
        results_df = deployment_flow.run_deploy_flow(days_assessed, r, pi_handling, feature_cols, df_daily, model_name, model,
                        train_year, pi_year, min_feat, list_name, feature_dict, groups, run_type)

        horizon_results.append(results_df)
        master_results.append(results_df)

    print(f"Retrainig Done")
    if horizon_results:

        master_results_df = pd.concat(horizon_results, ignore_index=True)
        # Re-read the file so rows removed by dropna() above are preserved on disk.
        performance_df = pd.read_csv(results_path)
        df_concat = pd.concat([performance_df, master_results_df], ignore_index=True)
        df_concat.to_csv(results_path, index=False)

# Performance and Top n
# Score every configuration for each horizon, keep the best n (one per feature
# set) and save their prediction rows to h{r}_top{n}_{file_ext}.csv.
for r in h:

    keys = ["horizon", "features", "train_years", "min_feats", "pi_size", "pi_handling", "model"]

    results_file_name = f"h{r}_{file_ext}.csv" # Match prior cell saved as file name horizon_2_baseline_new
    return_cols, perf_df = performance_flow.import_data(results_file_name, df_daily)
    perf_df = perf_df[perf_df['test_start'] >= perf_cutoff_date].rename(columns={"feature_set": "features"})

    # Restrict to this horizon once, up front. Without this, the horizon
    # assignment below would relabel rows of any other horizon as `r` and let
    # them through the key join.
    perf_df = perf_df[perf_df['horizon'] == r].copy()

    # Each call gets its own dropna() copy, as before, in case a callee mutates its input.
    composite_score = performance_flow.run_performance(perf_df.dropna(), min_th, cov_th)
    bucket_df = performance_flow.bucket_scores(df_daily.dropna(), perf_df.dropna(), returns, min_th, keys)

    # Best-scoring configuration per feature set, then the top n of those.
    top_n = (
        composite_score.sort_values("composite", ascending=False)
        .drop_duplicates(subset=["features"], keep="first")
        .head(n)
        .copy()
    )

    # Ensure dtypes match so the join actually hits.
    # pi_size is left as loaded on both sides (the original self-assignment was a no-op).
    key_dtypes = {
        "features": str,
        "model": str,
        "pi_handling": str,
        "train_years": int,
        "min_feats": int,
    }
    top_n = top_n.assign(horizon=r).astype(key_dtypes)
    perf_df = perf_df.assign(horizon=r).astype(key_dtypes)

    # Filter predictions to only the rows matching one of the top-n configs
    pred_filtered = perf_df.merge(top_n[keys].drop_duplicates(), on=keys, how="inner")
    pred_filtered.to_csv(f"h{r}_top{n}_{file_ext}.csv", index=False)
    print(f"Horizon {r} Top {n} Models Saved")

# Predictions Top n
# Re-run the deployment flow in 'New_Predict' mode for each saved top-n
# configuration and collect the resulting frames in master_preds.
for r in h:

    # Rows of df_daily whose Return_{r} is still missing.
    days_assessed = int(df_daily[f"Return_{r}"].isna().sum())

    df = pd.read_csv(f"h{r}_top{n}_{file_ext}.csv")

    df["feature_cols"] = df["features"].apply(lambda x: resolve_feature_cols(x, feature_dict))

    grain_cols = ["horizon","features","train_years","min_feats","pi_size","model","pi_handling"]

    # One row per configuration. feature_cols is selected together with the
    # grain columns, so no self-merge is needed to attach it.
    top_n = (
        df[grain_cols + ["feature_cols"]]
        .drop_duplicates(subset=grain_cols, keep="first")
        .reset_index(drop=True)
    )

    for row in top_n.itertuples(index=False):

        target_horizon = row.horizon
        pi_handling    = 'run_separately' #row.pi_handling
        run_type       = 'New_Predict'    # not named `type`: that would rebind the builtin for the whole kernel
        feature_cols   = row.feature_cols # list of column names for this feature set
        list_name      = row.features
        train_year     = row.train_years
        pi_year        = row.pi_size
        min_feat       = row.min_feats
        groups = list_name.split("-")

        model = XGBClassifier(n_estimators=300, random_state=42, n_jobs=-1)
        model_name = "xgboost-3"

        print(f"{target_horizon} | {pi_handling} | {list_name} | {train_year} | {pi_year} | {min_feat} | {days_assessed}")
        results_df = deployment_flow.run_deploy_flow(days_assessed, r, pi_handling, feature_cols, df_daily, model_name, model,
                        train_year, pi_year, min_feat, list_name, feature_dict, groups, run_type)

        master_preds.append(results_df)

    # Reported once per horizon, after all of its models have been run.
    print(f"Horizon {r} Top {n} Models Predicted")

# Built once after all horizons; only the final concatenation is ever used.
master_preds_df = pd.concat(master_preds, ignore_index=True)
predictions_df = master_preds_df.copy()

# Attach each selected configuration's precision stats (pprec / nprec) to its
# predictions, then reduce to one ensemble row per prediction date.
# Note: composite_score and keys are whatever the LAST iteration of the
# performance loop left behind.
output_df = composite_score[['pprec', 'nprec'] + keys].drop_duplicates().merge(predictions_df, on=keys, how="inner")
output_df = output_df.rename(columns={"test_start": "Date"})
cols = ['Date', 'features', 'pred', 'pprec', 'nprec']
output_df = output_df[cols].sort_values(by='Date').copy()

# 'Predicted_Price' is df_daily's Close on the row's Date; 'Last_Close' is the
# Close of the most recent row in df_daily.
output_df = output_df.merge(df_daily[['Close', 'Date']].round(2), on='Date', how="inner")
output_df = output_df.rename(columns={"Close": "Predicted_Price"})
last_close = (df_daily.sort_values("Date", ascending=False).iloc[0]["Close"].round(2))
output_df['Last_Close'] = last_close
output_df['LC_R_PP'] = (output_df['Last_Close'] / output_df['Predicted_Price'] - 1).round(3)

# Shift predictions below 0.45 into negative territory so the sign of `pred`
# selects which precision is used below.
# NOTE(review): predictions in [0.45, 0.5) stay positive and are therefore
# scored with pprec -- confirm that 0.45 (rather than 0.5) is intentional.
output_df.loc[output_df['pred'] < 0.45, 'pred'] = output_df['pred'] - 1

# Signed edge over a coin flip: +(pprec - 0.5) for positive calls, -(nprec - 0.5) otherwise.
output_df['pred_edge'] = np.where(output_df['pred'] > 0, output_df['pprec'] - 0.5, -output_df['nprec'] + 0.5).round(2)

# 'Date' is the group key and is already emitted as a column by
# as_index=False, so it is not aggregated again.
ensemble_df = (
    output_df
    .groupby('Date', as_index=False)
    .agg({
        'Predicted_Price': 'first',
        'Last_Close': 'first',
        'LC_R_PP': 'first',
        'pred_edge': 'sum'
    })
    .rename(columns={'pred_edge': 'ensemble_edge'})
)
# NOTE(review): this divides by the configured n, not by the number of models
# that actually contributed to each Date -- confirm the two are always equal.
ensemble_df['ensemble_pred'] = (ensemble_df['ensemble_edge'] / n + .5).round(2)
ensemble_df

30 | run_separately | ma_lag-ma_rel | 3 | 1.5 | 4 | 4
Running actualized predictions for horizon 30 | run_separately
30 | run_separately | ma_lag-ma_sma | 3 | 1.5 | 4 | 4
Running actualized predictions for horizon 30 | run_separately
30 | run_separately | ma_lag-ma_num | 3 | 1.5 | 4 | 4
Running actualized predictions for horizon 30 | run_separately
30 | run_separately | ma_lag-rsi_macd | 3 | 1.5 | 4 | 4
Running actualized predictions for horizon 30 | run_separately
30 | run_separately | ma_lag-volu | 3 | 1.5 | 4 | 4
Running actualized predictions for horizon 30 | run_separately
30 | run_separately | ma_lag-atr_adxvola | 3 | 1.5 | 4 | 4
Running actualized predictions for horizon 30 | run_separately
30 | run_separately | ma_lag-vix_skew | 3 | 1.5 | 4 | 4
Running actualized predictions for horizon 30 | run_separately
30 | run_separately | ma_lag-experimental_slope | 3 | 1.5 | 4 | 4
Running actualized predictions for horizon 30 | run_separately
30 | run_separately | ma_lag-past_return | 3 

Unnamed: 0,Date,Predicted_Price,Last_Close,LC_R_PP,ensemble_edge,ensemble_pred
0,2026-01-08,620.47,608.81,-0.019,-0.13,0.44
1,2026-01-09,626.65,608.81,-0.028,0.11,0.56
2,2026-01-12,627.17,608.81,-0.029,-0.13,0.44
3,2026-01-13,626.24,608.81,-0.028,-0.13,0.44
4,2026-01-14,619.55,608.81,-0.017,-0.13,0.44
5,2026-01-15,621.78,608.81,-0.021,-0.13,0.44
6,2026-01-16,621.26,608.81,-0.02,0.11,0.56
7,2026-01-20,608.06,608.81,0.001,0.11,0.56
8,2026-01-21,616.28,608.81,-0.012,0.11,0.56
9,2026-01-22,620.76,608.81,-0.019,0.56,0.78


In [37]:
# Per-model rows behind a single ensemble date.
output_df.loc[output_df['Date'].eq('2026-01-08')]

Unnamed: 0,Date,features,pred,pprec,nprec,Predicted_Price,Last_Close,LC_R_PP,pred_edge
0,2026-01-08,ma_lag-ma_sma,-0.9,0.84,0.61,620.47,608.81,-0.019,-0.11
1,2026-01-08,ma_lag-vix_skew,-0.65,0.72,0.52,620.47,608.81,-0.019,-0.02
