In [3]:
import performance_flow
import importlib
importlib.reload(performance_flow)
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UserWarning)
import train_flow
importlib.reload(train_flow)
import pandas as pd
import numpy as np
from xgboost import XGBClassifier

# --------- #
# LEAVE ME  #
# --------- #
# Notebook-wide inputs; the names bound here are read by the cells further down.
ticker = 'QQQ'  # symbol handed to train_flow.import_data
include_minute_feats = "N"  # passed straight to import_data; presumably a "Y"/"N" switch for minute-level features — TODO confirm
returns = [1, 2, 3, 5, 10, 20, 30]  # passed to import_data; later cells also hand it to bucket_scores and loop over it for the h{r}_baseline.csv files
# import_data returns six objects. Later cells read df_daily's 'Date', 'Close' and
# 'Return_{r}' columns and use feature_dict as a mapping of feature-group name -> column names.
df_daily, feature_sets, return_cols, daily_cols, feature_dict, features = train_flow.import_data(ticker, include_minute_feats, returns)

Available Feature Sets: dict_keys(['ma', 'rsi', 'macd', 'volume', 'atr_adx', 'volatility', 'vix_skew', 'experimental_slope', 'past_return'])


# Full Run
- Retrain ALL models through the most recent as_of_date
- Calculate performance
- Select and save top n
- Make predictions
- Ensemble

In [None]:
from itertools import chain
import deployment_flow, performance_flow
import importlib
importlib.reload(deployment_flow)
importlib.reload(performance_flow)

def resolve_feature_cols(feature_set_name: str, features_dict: dict, sep: str = "-") -> list[str]:
    """Expand a feature-set name into a flat, de-duplicated list of column names.

    Args:
        feature_set_name: Either the literal ``"kitch_sink"`` (use every group in
            ``features_dict``) or one or more group names joined by ``sep``.
        features_dict: Mapping of group name -> iterable of column names.
        sep: Separator between group names in a composite name.

    Returns:
        Column names in first-seen order with duplicates removed.

    Raises:
        KeyError: If a group named in ``feature_set_name`` is not in ``features_dict``.
    """
    if feature_set_name == "kitch_sink":
        # Kitchen sink: every group, in the dict's own order.
        column_groups = list(features_dict.values())
    else:
        column_groups = []
        for group_name in feature_set_name.split(sep):
            if group_name not in features_dict:
                raise KeyError(f"{group_name} not in features_dict")
            column_groups.append(features_dict[group_name])

    # dict.fromkeys keeps the first occurrence of each column and preserves order.
    return list(dict.fromkeys(chain.from_iterable(column_groups)))

h=[2]
master_results = []
master_preds = []
n = 4 # number of top models to select 
file_ext = "performance_all"
min_th = 0.55
cov_th = 0.75

# Retrain ALL
# For every model configuration recorded in h{r}_{file_ext}.csv, run the deploy flow over
# the df_daily rows dated after that configuration's latest recorded test_start, then
# append the newly produced rows to the same CSV.
for r in h:

    # Rows produced for THIS horizon only. Writing the notebook-wide master_results
    # here would append earlier horizons' rows to this horizon's file when len(h) > 1.
    horizon_results = []

    df = pd.read_csv(f"h{r}_{file_ext}.csv")
    df = df.dropna().copy()
    #df = df.rename(columns={"feature_set": "features"})

    df["feature_cols"] = df["features"].apply(lambda x: resolve_feature_cols(x, feature_dict))

    grain_cols = ["horizon","features","train_years","min_feats","pi_size","model","pi_handling"]

    # Latest test_start already recorded for each configuration.
    max_train = (
        df.groupby(grain_cols, as_index=False)["test_start"]
        .max()
        .rename(columns={"test_start": "max_test_start"})
    )

    # One row per configuration, carrying its resolved columns and latest test_start.
    models = (
        df[grain_cols + ["feature_cols"]]
        .drop_duplicates(subset=grain_cols, keep="first")
        .merge(max_train, on=grain_cols, how="left")
    )

    # Dates eligible for assessment: everything except the first r rows of df_daily.
    # Comparing on the sliced Series keeps the mask aligned with the rows it filters,
    # so nothing depends on pandas silently reindexing a full-length mask.
    candidate_dates = df_daily["Date"].iloc[r:]

    for row in models.itertuples(index=False):

        target_horizon = row.horizon
        pi_handling    = 'run_separately' #row.pi_handling
        run_type       = 'Actualized'     # named run_type so the builtin type() stays usable
        feature_cols   = row.feature_cols   # flat list of column names
        list_name      = row.features
        train_year     = row.train_years
        pi_year        = row.pi_size
        min_feat       = row.min_feats
        max_test_start = row.max_test_start
        days_assessed  = int((candidate_dates > max_test_start).sum())
        groups = list_name.split("-")

        if days_assessed > 0:

            model = XGBClassifier(n_estimators=300, random_state=42, n_jobs=-1)
            model_name = "xgboost-3"

            print(f"{target_horizon} | {pi_handling} | {list_name} | {train_year} | {pi_year} | {min_feat} | {days_assessed}")
            results_df = deployment_flow.run_deploy_flow(days_assessed, r, pi_handling, feature_cols, df_daily, model_name, model,
                            train_year, pi_year, min_feat, list_name, feature_dict, groups, run_type)

            horizon_results.append(results_df)
            master_results.append(results_df)

        else:

            print(f"{list_name} already trained through most recent as_of_date")

    print(f"Retrainig Done")
    if horizon_results:

        # Notebook-wide view of everything retrained so far (all horizons).
        master_results_df = pd.concat(master_results, ignore_index=True)

        # Only this horizon's new rows are appended to this horizon's file.
        horizon_results_df = pd.concat(horizon_results, ignore_index=True)
        performance_df = pd.read_csv(f"h{r}_{file_ext}.csv")
        df_concat = pd.concat([performance_df, horizon_results_df], ignore_index=True)
        df_concat.to_csv(f"h{r}_{file_ext}.csv", index=False)

# Performance and Top n
# Score every configuration for horizon r, keep the best-scoring configuration per
# feature set, take the top n of those, and save their rows to h{r}_top{n}_{file_ext}.csv.
for r in h:

    keys = ["horizon", "features", "train_years", "min_feats", "pi_size", "pi_handling", "model"]

    results_file_name = f"h{r}_{file_ext}.csv"  # file written by the retrain step
    return_cols, perf_df = performance_flow.import_data(results_file_name, df_daily)
    perf_df = perf_df.rename(columns={"feature_set": "features"})

    # Each call gets its own freshly filtered frame, as before.
    horizon_mask = perf_df['horizon'] == r
    composite_score = performance_flow.run_performance(perf_df[horizon_mask].dropna(), min_th, cov_th)
    bucket_df = performance_flow.bucket_scores(df_daily.dropna(), perf_df[horizon_mask].dropna(), returns, min_th, keys)

    top_n = (
        composite_score.sort_values("composite", ascending=False)
        .drop_duplicates(subset=["features"], keep="first")
        .head(n)
    )

    # Ensure dtypes match so the join actually hits. pi_size is deliberately left
    # untouched (the original self-assignment was a no-op).
    key_dtypes = {
        "features": str,
        "model": str,
        "pi_handling": str,
        "train_years": int,
        "min_feats": int,
    }
    # NOTE(review): horizon is overwritten with r on ALL perf_df rows, not only the
    # rows selected by horizon_mask. Presumably each h{r} file only holds horizon r — confirm.
    top_n = top_n.assign(horizon=r).astype(key_dtypes)
    perf_df = perf_df.assign(horizon=r).astype(key_dtypes)

    # Keep only the rows belonging to one of the top-n configurations
    pred_filtered = perf_df.merge(top_n[keys].drop_duplicates(), on=keys, how="inner")
    #print(len(pred_filtered))
    pred_filtered.to_csv(f"h{r}_top{n}_{file_ext}.csv", index=False)
    print(f"Horizon {r} Top {n} Models Saved")

# Predictions Top n
# Run each saved top-n configuration over the df_daily rows whose Return_{r} is
# still missing, and collect the resulting predictions.
for r in h:

    # Rows with no Return_{r} value yet.
    days_assessed = len(df_daily[df_daily[f"Return_{r}"].isna()])

    df = pd.read_csv(f"h{r}_top{n}_{file_ext}.csv")

    df["feature_cols"] = df["features"].apply(lambda x: resolve_feature_cols(x, feature_dict))

    grain_cols = ["horizon","features","train_years","min_feats","pi_size","model","pi_handling"]

    # One row per configuration, carrying its resolved feature columns.
    top_n = (
        df[grain_cols + ["feature_cols"]]
        .drop_duplicates(subset=grain_cols, keep="first")
        .reset_index(drop=True)
    )

    for row in top_n.itertuples(index=False):

        target_horizon = row.horizon
        pi_handling    = 'run_separately' #row.pi_handling
        run_type       = 'New_Predict'    # named run_type so the builtin type() stays usable
        feature_cols   = row.feature_cols   # flat list of column names
        list_name      = row.features
        train_year     = row.train_years
        pi_year        = row.pi_size
        min_feat       = row.min_feats
        groups = list_name.split("-")

        model = XGBClassifier(n_estimators=300, random_state=42, n_jobs=-1)
        model_name = "xgboost-3"

        print(f"{target_horizon} | {pi_handling} | {list_name} | {train_year} | {pi_year} | {min_feat} | {days_assessed}")
        results_df = deployment_flow.run_deploy_flow(days_assessed, r, pi_handling, feature_cols, df_daily, model_name, model,
                        train_year, pi_year, min_feat, list_name, feature_dict, groups, run_type)

        master_preds.append(results_df)

    # Reported once per horizon, after all of its models have run.
    print(f"Horizon {r} Top {n} Models Predicted")

    master_preds_df = pd.concat(master_preds, ignore_index=True)
    predictions_df = master_preds_df.copy()
    # The join of predictions_df with composite_score is done in the next cell;
    # the copy that used to sit here computed the merge and discarded the result.

predictions_df.sort_values(by='test_start', ascending=False).head(n)


ma_lag-ma_rel already trained through most recent as_of_date
ma_lag-ma_sma already trained through most recent as_of_date
ma_lag-ma_num already trained through most recent as_of_date
ma_lag-rsi_macd already trained through most recent as_of_date
ma_lag-volu already trained through most recent as_of_date
ma_lag-atr_adxvola already trained through most recent as_of_date
ma_lag-vix_skew already trained through most recent as_of_date
ma_lag-experimental_slope already trained through most recent as_of_date
ma_lag-past_return already trained through most recent as_of_date
ma_rel-ma_sma already trained through most recent as_of_date
ma_rel-ma_num already trained through most recent as_of_date
ma_rel-rsi_macd already trained through most recent as_of_date
ma_rel-volu already trained through most recent as_of_date
ma_rel-atr_adxvola already trained through most recent as_of_date
ma_rel-vix_skew already trained through most recent as_of_date
ma_rel-experimental_slope already trained through most

Unnamed: 0,run,model,test_days,pred,train_n,train_start,train_end,test_start,test_end,train_years,n_features,pi_size,pi_handling,min_feats,features,horizon
1,2,xgboost-3,1,0.8,735,2023-03-10,2026-02-12,2026-02-18,2026-02-18,3,10,1.5,run_separately,4,rsi_macd-volu,2
3,2,xgboost-3,1,0.9,735,2023-03-10,2026-02-12,2026-02-18,2026-02-18,3,12,1.5,run_separately,4,rsi_macd-atr_adxvola,2
5,2,xgboost-3,1,0.0,735,2023-03-10,2026-02-12,2026-02-18,2026-02-18,3,10,1.5,run_separately,4,volu-past_return,2
7,2,xgboost-3,1,0.55,735,2023-03-10,2026-02-12,2026-02-18,2026-02-18,3,12,1.5,run_separately,4,atr_adxvola-past_return,2


In [None]:
# Columns kept from the scored predictions, and the close on the most recent date in df_daily.
cols = ['Date', 'features', 'pred', 'pprec', 'nprec']
last_close = df_daily.sort_values("Date", ascending=False).iloc[0]["Close"].round(2)

# Attach pprec/nprec to every prediction, then look up the close on each prediction date.
output_df = (
    composite_score[['pprec', 'nprec'] + keys]
    .drop_duplicates()
    .merge(predictions_df, on=keys, how="inner")
    .rename(columns={"test_start": "Date"})
    .loc[:, cols]
    .sort_values(by='Date')
    .merge(df_daily[['Close', 'Date']].round(2), on='Date', how="inner")
    .rename(columns={"Close": "Predicted_Price"})
    .assign(Last_Close=last_close)
)

# Ratio of the latest close to the close on the prediction date, minus 1.
output_df['LC_R_PP'] = round(output_df['Last_Close'] / output_df['Predicted_Price'] - 1, 3)

output_df

Unnamed: 0,Date,features,pred,pprec,nprec,Predicted_Price,Last_Close,LC_R_PP
0,2026-02-18,rsi_macd-volu,0.8,0.62,0.5,605.79,603.47,-0.004
1,2026-02-18,volu-past_return,0.0,0.63,0.49,605.79,603.47,-0.004
2,2026-02-18,rsi_macd-atr_adxvola,0.9,0.6,0.49,605.79,603.47,-0.004
3,2026-02-18,ma_sma-volu,0.65,0.59,0.48,605.79,603.47,-0.004
4,2026-02-19,rsi_macd-volu,0.95,0.62,0.5,603.47,603.47,0.0
5,2026-02-19,volu-past_return,0.1,0.63,0.49,603.47,603.47,0.0
6,2026-02-19,rsi_macd-atr_adxvola,0.65,0.6,0.49,603.47,603.47,0.0
7,2026-02-19,ma_sma-volu,0.45,0.59,0.48,603.47,603.47,0.0


# Parse Top Models per Horizon

In [128]:
import importlib
importlib.reload(performance_flow)
# ---------- #
# UPDATE ME  #
# ---------- #
min_th = 0.55
cov_th = 0.75
horizons = [2]#[2, 5, 10, 20, 30]

for r in horizons:

    # Results file for this horizon; the name must match what the earlier step saved.
    results_file_name = f"h{r}_performance.csv"
    return_cols, perf_df = performance_flow.import_data(results_file_name, df_daily)
    perf_df = perf_df.rename(columns={"feature_set": "features"})
    composite_score = performance_flow.run_performance(perf_df[perf_df['horizon'] == r], min_th, cov_th)
    #bucket_df = performance_flow.bucket_scores(df_daily, perf_df[perf_df['horizon'] == r], returns, min_th, keys)

    # Best-scoring configuration per feature set, then the first five of those.
    ranked = composite_score.sort_values("composite", ascending=False)
    top_10_r = ranked.drop_duplicates(subset=["features"], keep="first").head(5).copy()

    keys = ["horizon", "features", "train_years", "min_feats", "pi_size", "pi_handling", "model"]

    # Align the join-key dtypes on both frames so the merge below matches.
    for df in (top_10_r, perf_df):

        df["horizon"] = r
        for text_col in ("features", "model"):
            df[text_col] = df[text_col].astype(str)
        df["pi_size"] = df["pi_size"]  # no-op: pi_size keeps whatever dtype it already has
        df["pi_handling"] = df["pi_handling"].astype(str)
        for int_col in ("train_years", "min_feats"):
            df[int_col] = df[int_col].astype(int)

    # Keep only the rows belonging to one of the selected configurations
    selected_configs = top_10_r[keys].drop_duplicates()
    pred_filtered = perf_df.merge(selected_configs, on=keys, how="inner")
    #print(len(pred_filtered))
    pred_filtered.to_csv(f"h{r}_top10_raw.csv", index=False)

# Train Top Performers through most recent as_of_date

In [123]:
from itertools import chain
import deployment_flow
import importlib
importlib.reload(deployment_flow)

def resolve_feature_cols(feature_set_name: str, features_dict: dict, sep: str = "-") -> list[str]:
    """Expand a feature-set name into a flat, de-duplicated list of column names.

    Args:
        feature_set_name: Either the literal ``"kitch_sink"`` (use every group in
            ``features_dict``) or one or more group names joined by ``sep``.
        features_dict: Mapping of group name -> iterable of column names.
        sep: Separator between group names in a composite name.

    Returns:
        Column names in first-seen order with duplicates removed.

    Raises:
        KeyError: If a group named in ``feature_set_name`` is not in ``features_dict``.
    """
    if feature_set_name == "kitch_sink":
        # Kitchen sink: every group, in the dict's own order.
        column_groups = list(features_dict.values())
    else:
        column_groups = []
        for group_name in feature_set_name.split(sep):
            if group_name not in features_dict:
                raise KeyError(f"{group_name} not in features_dict")
            column_groups.append(features_dict[group_name])

    # dict.fromkeys keeps the first occurrence of each column and preserves order.
    return list(dict.fromkeys(chain.from_iterable(column_groups)))

h=[2]
master_results = []

# For each configuration in h{r}_top10_raw.csv, run the deploy flow over the df_daily
# rows dated after that configuration's latest recorded test_start.
for r in h:

    df = pd.read_csv(f"h{r}_top10_raw.csv")
    df = df.rename(columns={"feature_set": "features"})

    df["feature_cols"] = df["features"].apply(lambda x: resolve_feature_cols(x, feature_dict))

    grain_cols = ["horizon","features","train_years","min_feats","pi_size","model","pi_handling"]

    # Latest test_start already recorded for each configuration.
    max_train = (
        df.groupby(grain_cols, as_index=False)["test_start"]
        .max()
        .rename(columns={"test_start": "max_test_start"})
    )

    # One row per configuration, carrying its resolved columns and latest test_start.
    top_10_unique = (
        df[grain_cols + ["feature_cols"]]
        .drop_duplicates(subset=grain_cols, keep="first")
        .merge(max_train, on=grain_cols, how="left")
    )

    # Dates eligible for assessment: everything except the first r rows of df_daily.
    # Comparing on the sliced Series keeps the mask aligned with the rows it filters,
    # so nothing depends on pandas silently reindexing a full-length mask.
    candidate_dates = df_daily["Date"].iloc[r:]

    for row in top_10_unique.itertuples(index=False):

        target_horizon = row.horizon
        pi_handling    = 'run_separately' #row.pi_handling
        feature_cols   = row.feature_cols   # flat list of column names
        list_name      = row.features
        train_year     = row.train_years
        pi_year        = row.pi_size
        min_feat       = row.min_feats
        max_test_start = row.max_test_start
        days_assessed  = int((candidate_dates > max_test_start).sum())
        groups = list_name.split("-")

        # Configurations already trained through the latest date are skipped silently.
        if days_assessed > 0:

            model = XGBClassifier(n_estimators=300, random_state=42, n_jobs=-1)
            model_name = "xgboost-3"

            print(f"{target_horizon} | {pi_handling} | {list_name} | {train_year} | {pi_year} | {min_feat} | {days_assessed}")
            # NOTE(review): the other cells pass a 14th positional argument
            # ('Actualized' / 'New_Predict') to run_deploy_flow; this call does not.
            # Confirm the parameter has a default that suits this step.
            results_df = deployment_flow.run_deploy_flow(days_assessed, r, pi_handling, feature_cols, df_daily, model_name, model,
                            train_year, pi_year, min_feat, list_name, feature_dict, groups)

            master_results.append(results_df)

        #else: print(f"{list_name} already trained through most recent as_of_date")

if len(master_results) > 0: 
    master_results_df = pd.concat(master_results, ignore_index=True)

#performance_df = pd.read_csv(f"h{r}_baseline.csv")
#df_concat = pd.concat([performance_df, results_df], ignore_index=True)    
#df_concat.to_csv(f"h{r}_baseline_v2.csv", index=False)

ma_lag-past_return already trained through most recent as_of_date
ma_rel-past_return already trained through most recent as_of_date
ma_sma-past_return already trained through most recent as_of_date
ma_num-past_return already trained through most recent as_of_date
rsi_macd-past_return already trained through most recent as_of_date
volu-past_return already trained through most recent as_of_date
atr_adxvola-vix_skew already trained through most recent as_of_date
atr_adxvola-past_return already trained through most recent as_of_date
vix_skew-past_return already trained through most recent as_of_date
experimental_slope-past_return already trained through most recent as_of_date


- Train through most recent actualized data (Done)
- Append all actuals to raw files
- Recalc performance 
- Predict on new horizon (Done)
- Show prediction (Done)
- Bring in performance
- Ensemble prediction

# Rerun Performance with updated as_of_dates 
- (to be merged with prior step)

In [165]:
import importlib
importlib.reload(performance_flow)
# ---------- #
# UPDATE ME  #
# ---------- #
min_th = 0.55
cov_th = 0.75
horizons = [2]#[2, 5, 10, 20, 30]

for r in horizons:

    keys = ["horizon", "features", "train_years", "min_feats", "pi_size", "pi_handling", "model"]

    # file_ext is not set in this cell; it must already exist in the kernel.
    results_file_name = f"h{r}_{file_ext}.csv"
    return_cols, perf_df = performance_flow.import_data(results_file_name, df_daily)
    perf_df = perf_df.rename(columns={"feature_set": "features"})

    # Each call receives its own freshly filtered, NaN-free frame.
    horizon_mask = perf_df['horizon'] == r
    composite_score = performance_flow.run_performance(perf_df[horizon_mask].dropna(), min_th, cov_th)
    bucket_df = performance_flow.bucket_scores(df_daily.dropna(), perf_df[horizon_mask].dropna(), returns, min_th, keys)

    # Best-scoring configuration per feature set, then the first ten of those.
    ranked = composite_score.sort_values("composite", ascending=False)
    top_10_r = ranked.drop_duplicates(subset=["features"], keep="first").head(10).copy()

    # Align the join-key dtypes on both frames.
    for df in (top_10_r, perf_df):

        df["horizon"] = r
        for text_col in ("features", "model"):
            df[text_col] = df[text_col].astype(str)
        df["pi_size"] = df["pi_size"]  # no-op: pi_size keeps whatever dtype it already has
        df["pi_handling"] = df["pi_handling"].astype(str)
        for int_col in ("train_years", "min_feats"):
            df[int_col] = df[int_col].astype(int)

    # Filter master predictions to only rows matching one of the 10 configs
    #pred_filtered = perf_df.merge(top_10_r[keys].drop_duplicates(), on=keys, how="inner")
    #print(len(pred_filtered))
    #pred_filtered.to_csv(f"h{r}_top10_raw.csv", index=False)

# Predictions for un-actualized records

In [142]:
import deployment_flow
import importlib
importlib.reload(deployment_flow)

h = [2]
master_results = []

# Run each configuration in h{r}_top10_raw.csv over the df_daily rows whose
# Return_{r} is still missing, and collect the resulting predictions.
for r in h:

    # Rows with no Return_{r} value yet.
    days_assessed = len(df_daily[df_daily[f"Return_{r}"].isna()])

    df = pd.read_csv(f"h{r}_top10_raw.csv")

    df["feature_cols"] = df["features"].apply(lambda x: resolve_feature_cols(x, feature_dict))

    grain_cols = ["horizon","features","train_years","min_feats","pi_size","model","pi_handling"]

    # One row per configuration, carrying its resolved feature columns.
    top_10_unique = (
        df[grain_cols + ["feature_cols"]]
        .drop_duplicates(subset=grain_cols, keep="first")
        .reset_index(drop=True)
    )

    for row in top_10_unique.itertuples(index=False):

        target_horizon = row.horizon
        pi_handling    = 'run_separately' #row.pi_handling
        run_type       = 'New_Predict'    # named run_type so the builtin type() stays usable
        feature_cols   = row.feature_cols   # flat list of column names
        list_name      = row.features
        train_year     = row.train_years
        pi_year        = row.pi_size
        min_feat       = row.min_feats
        groups = list_name.split("-")

        model = XGBClassifier(n_estimators=300, random_state=42, n_jobs=-1)
        model_name = "xgboost-3"

        print(f"{target_horizon} | {pi_handling} | {list_name} | {train_year} | {pi_year} | {min_feat} | {days_assessed}")
        results_df = deployment_flow.run_deploy_flow(days_assessed, r, pi_handling, feature_cols, df_daily, model_name, model,
                        train_year, pi_year, min_feat, list_name, feature_dict, groups, run_type)

        master_results.append(results_df)

master_results_df = pd.concat(master_results, ignore_index=True)

2 | run_separately | ma_lag-vix_skew | 3 | 1.5 | 4 | 2
Running new predictions for horizon 2 | run_separately
ma_lag: 60 | 5 | ['SMA_10_Lag100_min', 'SMA_10_Lag10_max', 'SMA_10_Lag10_min', 'SMA_10_Lag150_min', 'SMA_10_Lag50_min']
vix_skew: 12 | 5 | ['VIX_10_change', 'skew', 'skew_10_change', 'skew_5_change', 'skew_rolling_std']
Run 1/2 | Train: 2023-03-08 → 2026-02-10 | Test: 2026-02-13 → 2026-02-13 | Train_n=735 | Test_n=1 | (PI Years: 1.5 - Feats: 4)
ma_lag: 60 | 5 | ['SMA_10_Lag100_min', 'SMA_10_Lag10_max', 'SMA_10_Lag10_min', 'SMA_10_Lag50_min', 'SMA_50_Lag150_min']
vix_skew: 12 | 5 | ['VIX_10_change', 'VIX_1_change', 'skew', 'skew_10_change', 'skew_rolling_std']
Run 2/2 | Train: 2023-03-07 → 2026-02-09 | Test: 2026-02-12 → 2026-02-12 | Train_n=735 | Test_n=1 | (PI Years: 1.5 - Feats: 4)
2 | run_separately | ma_sma-volu | 3 | 1.5 | 4 | 2
Running new predictions for horizon 2 | run_separately
ma_sma: 15 | 11 | ['10_SMA_200', '10_SMA_25', '10_SMA_50', '25_SMA_100', '25_SMA_200', '50_

# Join in Performance
- TODO: add the close on the prediction date, the current close, and whether the prediction is in the money

In [148]:
# Work on a copy of the collected predictions, then attach each configuration's
# pprec / nprec by joining on the configuration keys.
predictions_df = master_results_df.copy()
precision_by_config = top_10_r[['pprec', 'nprec'] + keys].drop_duplicates()
precision_by_config.merge(predictions_df, on=keys, how="inner")
#predictions_df.head(1)

Unnamed: 0,pprec,nprec,horizon,features,train_years,min_feats,pi_size,pi_handling,model,run,test_days,pred,train_n,train_start,train_end,test_start,test_end,n_features
0,0.64,0.53,2,volu-past_return,3,4,1.5,run_separately,xgboost-3,1,1,0.95,735,2023-03-08,2026-02-10,2026-02-13,2026-02-13,10
1,0.64,0.53,2,volu-past_return,3,4,1.5,run_separately,xgboost-3,2,1,0.75,735,2023-03-07,2026-02-09,2026-02-12,2026-02-12,10
2,0.62,0.54,2,ma_sma-volu,3,4,1.5,run_separately,xgboost-3,1,1,0.2,735,2023-03-08,2026-02-10,2026-02-13,2026-02-13,15
3,0.62,0.54,2,ma_sma-volu,3,4,1.5,run_separately,xgboost-3,2,1,0.05,735,2023-03-07,2026-02-09,2026-02-12,2026-02-12,15
4,0.64,0.5,2,ma_num-vix_skew,3,4,1.5,run_separately,xgboost-3,1,1,0.7,735,2023-03-08,2026-02-10,2026-02-13,2026-02-13,13
5,0.64,0.5,2,ma_num-vix_skew,3,4,1.5,run_separately,xgboost-3,2,1,0.9,735,2023-03-07,2026-02-09,2026-02-12,2026-02-12,13
6,0.61,0.49,2,ma_lag-vix_skew,3,4,1.5,run_separately,xgboost-3,1,1,0.5,735,2023-03-08,2026-02-10,2026-02-13,2026-02-13,10
7,0.61,0.49,2,ma_lag-vix_skew,3,4,1.5,run_separately,xgboost-3,2,1,0.9,735,2023-03-07,2026-02-09,2026-02-12,2026-02-12,10
8,0.57,0.54,2,atr_adxvola-past_return,3,4,1.5,run_separately,xgboost-3,1,1,0.9,735,2023-03-08,2026-02-10,2026-02-13,2026-02-13,13
9,0.57,0.54,2,atr_adxvola-past_return,3,4,1.5,run_separately,xgboost-3,2,1,0.95,735,2023-03-07,2026-02-09,2026-02-12,2026-02-12,12


# Archived

In [158]:
def normalize_csvs(horizons=None):
    """Rewrite each ``h{r}_baseline.csv`` in place so pi_size / pi_handling are split.

    The string-encoded ``pi_size`` column is turned into a ``pi_handling`` column
    ("1.5-r" -> "run_separately", "1.5" -> "include_new"), after which ``pi_size``
    is set to the number 1.5 and the file is written back to the same path.

    A file that is already in that shape (``pi_handling`` present and ``pi_size``
    numeric) is left untouched, so the function can be run more than once.
    Previously a second run failed because ``.str`` does not work on a float column.

    Args:
        horizons: Iterable of horizon values ``r`` selecting the files to process.
            Defaults to the notebook-level ``returns`` list.
    """
    if horizons is None:
        horizons = returns

    for r in horizons:

        path = f"h{r}_baseline.csv"
        past_performance = pd.read_csv(path)

        # Disabled legacy clean-up of feature_set names, kept for reference:
        # past_performance["features"] = (
        # past_performance["feature_set"]
        #     .str.replace("_ba$", "", regex=True)
        #     .str.replace("-baseline$", "", regex=True)
        #     .str.replace("-pr$", "-past_return", regex=True)
        #     .str.replace("past_ret_cols", "past_return", regex=False)
        # )

        pi_raw = past_performance["pi_size"]
        already_normalized = (
            "pi_handling" in past_performance.columns
            and pd.api.types.is_numeric_dtype(pi_raw)
        )
        if already_normalized:
            # Re-deriving pi_handling from the numeric column would raise.
            continue

        # Order matters: "1.5-r" must be replaced before its prefix "1.5".
        past_performance["pi_handling"] = (
            pi_raw
            .str.replace("1.5-r", "run_separately", regex=False)
            .str.replace("1.5", "include_new", regex=False)
        )

        past_performance["pi_size"] = 1.5
        past_performance.to_csv(path, index=False)