### Libraries

In [None]:
pip install pandas_ta

In [None]:
import pandas as pd
import numpy as np
import pandas_ta as ta

### Loading Dataset

In [None]:
# Read both raw CSV exports (2024 data first, then the full history).
df_2024, df_all = (
    pd.read_csv(csv_name) for csv_name in ('df_2024.csv', 'df_all.csv')
)

In [None]:
# Parse the raw timestamp columns into datetimes on both frames.
df_2024['timestamp'] = df_2024['timestamp'].pipe(pd.to_datetime)
df_all['timestamp'] = df_all['timestamp'].pipe(pd.to_datetime)

# Keep only the pre-2024 part of the full history, then append the 2024 rows
# underneath it with a fresh 0..n-1 index.
df_all = df_all.loc[df_all['timestamp'] < '2024-01-01 00:00:00']
df_combined = pd.concat((df_all, df_2024), ignore_index=True)

### Adding indicators

In [None]:
def preprocess(df):
    """Index candles by time and add candle-shape statistics.

    The caller's frame is never modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``open``, ``high``, ``low`` and ``close`` columns, plus
        either a ``timestamp`` column or a ``DatetimeIndex``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by time (UTC-aware when built from the ``timestamp``
        column) with the extra columns ``body_ratio``, ``upper_wick_ratio``,
        ``lower_wick_ratio`` and ``day_of_week``. Rows where any of the three
        ratios is undefined (e.g. zero high-low range) are dropped.

    Raises
    ------
    ValueError
        If there is neither a ``timestamp`` column nor a ``DatetimeIndex``.
    """
    # ----- 1) Timestamp handling
    if 'timestamp' in df.columns:
        # Unparseable values become NaT instead of raising.
        ts = pd.to_datetime(df['timestamp'], utc=True, errors='coerce')
        # set_index returns a new frame; the column is then redundant.
        df = df.set_index(ts).drop(columns=['timestamp'])
    elif isinstance(df.index, pd.DatetimeIndex):
        # Work on a copy so the columns added below do not leak into the
        # caller's frame.
        df = df.copy()
    else:
        raise ValueError("No datetime index or column found")

    # ----- 2) Candle statistics
    rng = df['high'] - df['low']
    body = (df['close'] - df['open']).abs()
    upper_wick = df['high'] - df[['open', 'close']].max(axis=1)
    lower_wick = df[['open', 'close']].min(axis=1) - df['low']

    # Division by a zero range yields inf/NaN; normalise every ratio to NaN
    # so the final dropna treats them uniformly.
    infinities = [np.inf, -np.inf]
    df['body_ratio'] = body.div(rng).replace(infinities, np.nan)
    df['upper_wick_ratio'] = upper_wick.div(rng).replace(infinities, np.nan)
    df['lower_wick_ratio'] = lower_wick.div(rng).replace(infinities, np.nan)

    df['day_of_week'] = df.index.day_name()

    return df.dropna(subset=['body_ratio', 'upper_wick_ratio', 'lower_wick_ratio'])

def add_indicators(df, price='close'):
    """Return a copy of ``df`` enriched with trend and momentum indicators.

    Parameters
    ----------
    df : pandas.DataFrame
        Candle data with ``high``, ``low``, ``close`` columns and the column
        named by ``price``.
    price : str, default 'close'
        Column the EMA, MACD, RSI and Bollinger calculations are based on.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with EMA columns (``ema_9`` ... ``ema_200``), ``macd``,
        ``macd_signal``, ``macd_hist``, ``rsi_14``, ``atr_14``, ``adx_14``,
        the columns returned by ``ta.bbands`` and 0/1 helper flags.

    Raises
    ------
    KeyError
        If ``ta.macd`` does not return the expected ``MACD_12_26_9``,
        ``MACDh_12_26_9`` and ``MACDs_12_26_9`` columns.
    """
    df = df.copy()

    # -------- Trend structure (200 acts as the long-term trend filter)
    for span in (9, 21, 50, 100, 200):
        df[f'ema_{span}'] = df[price].ewm(span=span, adjust=False).mean()

    # -------- Momentum / volatility
    macd = ta.macd(df[price], fast=12, slow=26, signal=9)
    # pandas_ta returns the columns in the order MACD, histogram (MACDh),
    # signal (MACDs). Rename by name, not by position, so the histogram
    # and signal line are not swapped; the selection fails loudly if the
    # library ever changes its naming.
    macd = macd.rename(columns={
        'MACD_12_26_9': 'macd',
        'MACDs_12_26_9': 'macd_signal',
        'MACDh_12_26_9': 'macd_hist',
    })[['macd', 'macd_signal', 'macd_hist']]
    df = df.join(macd)

    df['rsi_14'] = ta.rsi(df[price], length=14)
    df['atr_14'] = ta.atr(df.high, df.low, df.close, length=14)
    df['adx_14'] = ta.adx(df.high, df.low, df.close, length=14)['ADX_14']  # trend strength
    bb = ta.bbands(df[price], length=20, std=2)
    df = df.join(bb)   # band columns keep the names pandas_ta assigns

    # -------- Boolean helpers (vectorised, stored as 0/1)
    df['bull_9_21'] = (df['ema_9'] > df['ema_21']).astype(int)
    df['bull_21_50'] = (df['ema_21'] > df['ema_50']).astype(int)
    df['bull_50_100'] = (df['ema_50'] > df['ema_100']).astype(int)
    df['bull_100_200'] = (df['ema_100'] > df['ema_200']).astype(int)
    df['above_200'] = (df[price] > df['ema_200']).astype(int)

    # A cross is the bar where the histogram changes sign relative to the
    # previous bar.
    hist = df['macd_hist']
    prev_hist = hist.shift()
    df['macd_cross_up'] = ((hist > 0) & (prev_hist <= 0)).astype(int)
    df['macd_cross_down'] = ((hist < 0) & (prev_hist >= 0)).astype(int)

    return df

# Discard incomplete rows before deriving anything from them.
df_combined = df_combined.dropna()

# Candle statistics first, then indicators; the indicator warm-up rows
# (NaN until enough history exists) are dropped at the end.
df_cleaned = preprocess(df_combined)
df_indicators = add_indicators(df_cleaned).dropna()

### Generating Trade Labels

In [None]:
def generate_trade_labels(df, window=20, time_limit=36, max_R=2):
    """Generate TP/SL outcome labels, assuming only BUY trades.

    For every candle a hypothetical long trade is opened at the candle's
    ``open``. The risk ``R`` is the absolute distance from that entry to the
    lowest ``low`` of the previous ``window`` candles, capped at ``max_R``.
    The stop is placed at ``entry - R`` and targets at ``entry + 0.5R``,
    ``entry + 1R`` and ``entry + 1.5R``.

    The following ``time_limit`` candles are scanned in order. The first
    candle that touches the stop or any target decides the outcome: the stop
    wins when it is touched in that candle, otherwise every target reached by
    that candle's ``high`` is marked as hit.

    NOTE(review): because the scan ends at the first decisive candle, TP2/TP3
    are only flagged when they are reached in the same candle that first
    reaches TP1. Confirm this is the intended labelling rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Candle data with ``open``, ``high`` and ``low`` columns. The index
        may contain duplicate labels; rows are processed by position.
    window : int, default 20
        Number of past candles used for the swing low.
    time_limit : int, default 36
        Maximum number of following candles to scan.
    max_R : float, default 2
        Upper bound for ``R``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``entry_price``, ``R``, ``SL_price``, ``TP1``,
        ``TP2``, ``TP3`` and the 0/1 columns ``TP1_hit``, ``TP2_hit``,
        ``TP3_hit``, ``SL_hit``. Rows without enough history have NaN levels
        and all-zero labels.
    """
    df = df.copy()

    # 1) Entry price is the open of the candle
    df['entry_price'] = df['open']

    # 2) Past swing low (shifted by one bar to avoid lookahead)
    swing_low = df['low'].rolling(window).min().shift(1)

    # 3) R = distance from entry to that swing low, capped at max_R
    df['R'] = (df['entry_price'] - swing_low).abs().clip(upper=max_R)

    # 4) SL and TP levels (all BUY trades)
    df['SL_price'] = df['entry_price'] - df['R']
    df['TP1'] = df['entry_price'] + 0.5 * df['R']
    df['TP2'] = df['entry_price'] + 1 * df['R']
    df['TP3'] = df['entry_price'] + 1.5 * df['R']

    # 5) Positional arrays: independent of index uniqueness and far cheaper
    #    than row-wise label lookups.
    n = len(df)
    low = df['low'].to_numpy(dtype=float)
    high = df['high'].to_numpy(dtype=float)
    sl = df['SL_price'].to_numpy(dtype=float)
    tp1 = df['TP1'].to_numpy(dtype=float)
    tp2 = df['TP2'].to_numpy(dtype=float)
    tp3 = df['TP3'].to_numpy(dtype=float)

    hit_tp1 = np.zeros(n, dtype=bool)
    hit_tp2 = np.zeros(n, dtype=bool)
    hit_tp3 = np.zeros(n, dtype=bool)
    hit_sl = np.zeros(n, dtype=bool)
    decided = np.zeros(n, dtype=bool)

    # 6) Walk forward one bar offset at a time, for all entries at once.
    #    Comparisons against NaN levels are False, so rows without a valid
    #    swing low never register a hit.
    for offset in range(1, min(time_limit, n - 1) + 1):
        m = n - offset                      # entries that have a bar at +offset
        fut_low = low[offset:]
        fut_high = high[offset:]

        pending = ~decided[:m]
        stopped = pending & (fut_low <= sl[:m])

        # Same precedence as an if/elif chain checking TP3, TP2, TP1.
        reach3 = fut_high >= tp3[:m]
        reach2 = reach3 | (fut_high >= tp2[:m])
        reach1 = reach2 | (fut_high >= tp1[:m])
        target = pending & ~stopped & reach1

        hit_sl[:m] |= stopped
        hit_tp1[:m] |= target
        hit_tp2[:m] |= target & reach2
        hit_tp3[:m] |= target & reach3
        decided[:m] |= stopped | target

    # 7) Store labels as 0/1 integers
    df['TP1_hit'] = hit_tp1.astype(np.int64)
    df['TP2_hit'] = hit_tp2.astype(np.int64)
    df['TP3_hit'] = hit_tp3.astype(np.int64)
    df['SL_hit'] = hit_sl.astype(np.int64)

    return df

# Label every candle, then drop rows whose levels could not be computed (NaN).
df_labels = generate_trade_labels(df_indicators).dropna()

### Model Training

In [None]:
import warnings, json, numpy as np, pandas as pd, joblib, catboost as cb
warnings.filterwarnings("ignore")

from sklearn.experimental import enable_halving_search_cv        # noqa: F401
from sklearn.model_selection import TimeSeriesSplit, HalvingGridSearchCV
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# ---------- 1. DATA & FEATURE ENGINEERING -------------------------------

# Work on a chronologically ordered copy of the labelled frame.
df = df_labels.sort_values("timestamp")

# Basic returns over 1/6/24 rows and a 12-row rolling volatility of the
# 1-row return.
df = df.assign(
    ret_1h=df["close"].pct_change(1),
    ret_6h=df["close"].pct_change(6),
    ret_24h=df["close"].pct_change(24),
    vol_12=df["close"].pct_change().rolling(12).std(),
)

# Encode the weekday names as integers and persist the fitted encoder.
le_day_of_week = LabelEncoder().fit(df["day_of_week"])
df["day_of_week"] = le_day_of_week.transform(df["day_of_week"])
joblib.dump(le_day_of_week, 'label_encoder_day_of_week.pkl')

# The return/volatility warm-up rows contain NaN; remove them.
df = df.dropna()

# ---------- 2.  FEATURES / LABELS ---------------------------------------
# Targets are the three take-profit flags; everything else except the
# timestamp and the stop-loss flag is used as a feature.
label_cols = ["TP1_hit", "TP2_hit", "TP3_hit"]
feature_cols = sorted(
    set(df.columns).difference(label_cols, {"timestamp"}, {"SL_hit"})
)

print(feature_cols)

X = df[feature_cols]
y = df[label_cols].astype(int)

# Chronological 80/20 split: the first 80 % of rows train, the rest test.
split = int(0.80 * len(df))
X_train = X.iloc[:split]
X_test = X.iloc[split:]
y_train = y.iloc[:split]
y_test = y.iloc[split:]

# ---------- 3.  CLASS WEIGHTS -------------------------------------------
# Balanced class weights per label, as an array [weight_for_0, weight_for_1].
# `classes` is passed as an ndarray: compute_class_weight documents it as
# such and recent scikit-learn releases validate the type.
class_wt = {
    c: compute_class_weight(
        class_weight='balanced',
        classes=np.array([0, 1]),
        y=y_train[c],
    )
    for c in label_cols
}

# ---------- 4.  CatBoost factory ----------------------------------------
def make_cat(spw):
    """Build the base CatBoost binary classifier for one label.

    Parameters
    ----------
    spw : sequence of two floats
        Class weights ordered ``[weight_for_0, weight_for_1]``.

    Returns
    -------
    catboost.CatBoostClassifier
        Unfitted GPU classifier whose positive class is re-weighted by
        ``spw[1] / spw[0]``.
    """
    return cb.CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="F1",
        task_type="GPU",
        devices="0",
        iterations=4000,
        learning_rate=0.03,
        depth=6,
        # Apply the imbalance correction as a positive-class weight. It was
        # previously fed (inverted) into l2_leaf_reg, which the search grid
        # overrides anyway, so class weighting never took effect.
        scale_pos_weight=spw[1] / spw[0],
        random_seed=42,
        verbose=False,
    )

# Hyper-parameter candidates explored for every label.
param_grid = dict(
    learning_rate=[0.02, 0.05],
    depth=[4, 6, 8],
    bagging_temperature=[0.5, 1.0],
    l2_leaf_reg=[1, 5, 10],
)

# Chronological 3-fold splitter, plus per-label containers for the winning
# estimator and its parameters.
tscv = TimeSeriesSplit(n_splits=3)
best_models, best_params = {}, {}

# Reserve the most recent 15 % of the training window as the early-stopping
# set. The hold-out test set must not be used here: it would leak into model
# selection and make the final report optimistic.
val_start = int(len(X_train) * 0.85)
X_fit, X_val = X_train.iloc[:val_start], X_train.iloc[val_start:]
y_fit, y_val = y_train.iloc[:val_start], y_train.iloc[val_start:]

for col in label_cols:
    print(f"\n🔍  Optimising label: {col}")
    cat = make_cat(class_wt[col])

    # Successive halving over the grid, using the number of boosting
    # iterations as the resource that grows between rounds.
    search = HalvingGridSearchCV(
        estimator      = cat,
        param_grid     = param_grid,
        resource       = "iterations",
        max_resources  = 4000,
        min_resources  = 400,
        factor         = 3,
        scoring        = "f1",
        cv             = tscv,
        n_jobs         = 1,
        verbose        = 1,
        refit          = True,
    )

    # ★ single eval_set (GPU requirement); it is chronologically after every
    #   CV fold and strictly before the test period.
    search.fit(
        X_fit, y_fit[col],
        eval_set=(X_val, y_val[col]),
        early_stopping_rounds=100
    )

    best_models[col] = search.best_estimator_
    best_params[col] = search.best_params_  # Save the best parameters
    print("   best CV-F1 :", search.best_score_)

# ---------- 5. Set a Fixed Threshold of 0.5 ----------------------------------
# One decision threshold per label (currently fixed at 0.5). The report below
# reads these values so it always matches the thresholds that get saved.
best_thresh = {col: 0.5 for col in label_cols}

# ---------- 6. Report ---------------------------------------------------
# Positive-class probability >= threshold -> predicted 1, per label.
y_pred = pd.DataFrame({
    c: (best_models[c].predict_proba(X_test)[:, 1] >= best_thresh[c]).astype(int)
    for c in label_cols
}, index=y_test.index)

print("\n" + "="*60)
print("📊  HOLD-OUT PERFORMANCE")
print("="*60)
print(classification_report(
    y_test, y_pred,
    target_names=label_cols,
    digits=4, zero_division=0
))

# ------------------------------------------ 7. Save artefacts ------------------------------------------
# One pickled estimator per label.
for c in label_cols:
    joblib.dump(best_models[c], f"catboost_{c}.pkl")

# Winning grid-search parameters, as JSON.
with open("catboost_best_params.json", "w") as fp:
    fp.write(json.dumps(best_params, indent=2))

# Decision thresholds, as JSON.
with open("catboost_thresholds.json", "w") as fp:
    fp.write(json.dumps(best_thresh, indent=2))

# Weekday label encoder.
joblib.dump(le_day_of_week, 'label_encoder_day_of_week.pkl')

print("\n✅  Saved models, thresholds, best parameters, and label encoder.")
