In [16]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the data by replacing infinities with NaN and dropping rows with NaN."""
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)
    return df


def generate_labels(df: pd.DataFrame, horizon=5, atr_mult=1.5, threshold = 0.2) -> pd.DataFrame:
    df = df.copy()
    df['future_close'] = df['Close'].shift(-horizon)
    df['future_ret'] = (df['future_close'] - df['Close']) / df['Close']

    # Consensus of indicators
    consensus = (
        0.25 * np.sign(df.get('ema_diff_9_21', 0)) +
        0.20 * np.sign(df.get('macd_hist', 0)) +
        0.15 * np.sign(df.get('rsi_norm', 0)) +
        0.15 * df.get('supertrend_dir', 0) +
        0.10 * np.sign(df.get('cci_norm', 0)) +
        0.10 * np.sign(df.get('obv_slope', 0)) +
        0.05 * np.sign(df.get('mfi_norm', 0))
    )
    df['consensus_score'] = consensus.clip(-1,1)

    atr_threshold = df['atr_pct'].rolling(20).mean() * atr_mult
    df['label_prob'] = np.tanh(df['future_ret'] / (atr_threshold + 1e-6)) * df['consensus_score']

    buy_cond = df['label_prob'] > threshold
    sell_cond = df['label_prob'] < -threshold
    df['label'] = 0
    df.loc[buy_cond, 'label'] = 1
    df.loc[sell_cond, 'label'] = -1

    df.dropna(subset=['label_prob'], inplace=True)
    return df



def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """
    df: pandas DataFrame with columns ['open','high','low','close','volume']
    Returns: df with features for Core 12 indicators ready for ML/backtesting
    """

    df = df.copy()

    df['Close'] = pd.to_numeric(df['Close'], errors='coerce')
    df['Open'] = pd.to_numeric(df['Open'], errors='coerce')
    df['High'] = pd.to_numeric(df['High'], errors='coerce')
    df['Low'] = pd.to_numeric(df['Low'], errors='coerce')
    df['Volume'] = pd.to_numeric(df['Volume'], errors='coerce')

    df = clean_data(df)
    return df

def preprocess_data(df):
    #Load the dataset
    # df = pd.read_csv(file_path)
    
    #Clean Data
    df = clean_data(df)

    #Convert ''Open Time' and 'Close Time' to datetime
    df['Open Time'] = pd.to_datetime(df['Open Time'], unit='ms')
    df['Close Time'] = pd.to_datetime(df['Close Time'], unit='ms')

    # Set the 'Open Time' as the index
    df.set_index('Open Time', inplace=True)

    # Feature engineering and label generation
    df = feature_engineering(df)

    # Replace infinite values with NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Drop rows with NaN values generated during calculations
    df.dropna(inplace=True)

    return df

In [2]:
import os
import sys
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Layer
from tensorflow.keras.metrics import MeanSquaredError, MeanAbsoluteError
from tensorflow.keras import backend as K
import tensorflow as tf

2025-12-07 17:20:43.261497: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [141]:
from config_loader import load_config
config = load_config()

In [142]:
# Define the Attention layer
@tf.keras.utils.register_keras_serializable()
class Attention(Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(
            name='attention_weight',
            shape=(input_shape[-1], 1),
            initializer='random_normal',
            trainable=True
        )
        self.b = self.add_weight(
            name='attention_bias',
            shape=(input_shape[1], 1),
            initializer='zeros',
            trainable=True
        )
        super(Attention, self).build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x, self.W) + self.b)
        e = K.squeeze(e, axis=-1)
        alpha = K.softmax(e)
        alpha = K.expand_dims(alpha, axis=-1)
        context = x * alpha
        context = K.sum(context, axis=1)
        return context

In [23]:
symbol = config['data']['symbol']
interval = config['data']['interval']
start_time = int(pd.Timestamp(config['backtesting']['start_date']).timestamp() * 1000)
end_time = int(pd.Timestamp(config['backtesting']['end_date']).timestamp() * 1000)

# df = ingest_data(symbol, interval, start_time, end_time,r'C:/Users/arunm/Documents/Projects/Trading-App/Data/Raw/Pipeline_raw.csv')

df = pd.read_csv(r'Pipeline_raw.csv')
# logger.info("Data ingestion completed successfully.")

In [24]:
preprocessed_df = preprocess_data(df)
preprocessed_df.dropna(inplace=True)
# logger.info("Data preprocessing completed successfully.")

feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

# Scale features once here
preprocessed_df[['Open', 'High', 'Low', 'Volume']] = feature_scaler.fit_transform(
    preprocessed_df[['Open', 'High', 'Low', 'Volume']]
)
preprocessed_df['Close'] = target_scaler.fit_transform(preprocessed_df[['Close']])

In [25]:
custom_objects = {
    "Custom>Attention": Attention,
    "Attention": Attention,
    "mse": MeanSquaredError(),
    "mae": MeanAbsoluteError(),
}

model_path = r"gru_model.h5"
model = load_model(model_path, custom_objects=custom_objects, compile=False)

In [26]:
features = ['Open', 'High', 'Low', 'Close', 'Volume']
X_test = preprocessed_df[features].copy()   # <<< already scaled; no second scaler

SEQ_LENGTH = 60
FORECAST_LENGTH = 25

def create_sequences(df, seq_length, forecast_length):
    X, y = [], []
    for i in range(len(df) - seq_length - forecast_length):
        seq = df.iloc[i:i + seq_length].values
        tgt = df['Close'].iloc[i + seq_length:i + seq_length + forecast_length]
        X.append(seq)
        y.append(tgt)
    return X, y

X, y = create_sequences(X_test, SEQ_LENGTH, FORECAST_LENGTH)
X = np.array(X, dtype=np.float32)

In [27]:
y_pred = model.predict(X)

[1m19919/19919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m472s[0m 24ms/step


In [109]:
import numpy as np
import pandas as pd


def backtest_regression_simple_centered_v4(
    df: pd.DataFrame,
    y_pred: np.ndarray,
    config,
    horizon: int = 50,
    quantile: float = 0.90,
    max_hold: int = None,
    invert_signal: bool = True,
    side_mode: str = "both",   # "both", "long_only", "short_only"
    max_loss_cap: float = -3.5,
    decay_factor: float = 0.5,  # exit when |pred_now| < decay_factor * |pred_at_entry|
):
    """
    Simple but enhanced regression backtester:

      - Centered predicted returns
      - Quantile threshold -> signals
      - Optional inversion (model anti-directional)
      - Optional side filter: both / long_only / short_only
      - Fixed-horizon exit + predictive exits:
          * HARD_STOP (pnl <= max_loss_cap)
          * MODEL_FLIP (pred sign flips vs entry)
          * DECAY (pred magnitude collapses vs entry)
          * TIME (max_hold reached)
      - Diagnostics for analysis

    All trades are implemented as long-only in PnL math (position > 0),
    but side_mode controls which signal directions are allowed to enter.
    """

    df = df.copy()

    # -----------------------------
    # 1. Align predictions
    # -----------------------------
    if y_pred.ndim > 1:
        # Average first `horizon` steps if shape is (N, H)
        future_pred = np.mean(y_pred[:, :horizon], axis=1)
    else:
        future_pred = y_pred

    # Align df to prediction length
    df = df.iloc[-len(future_pred):].copy()
    df["future_pred"] = future_pred

    # -----------------------------
    # 2. Predicted return (raw)
    # -----------------------------
    df["pred_ret_raw"] = (df["future_pred"] - df["Close"]) / df["Close"]

    # Model seems anti-directional → invert by default
    if invert_signal:
        df["pred_ret_raw"] = -df["pred_ret_raw"]

    # -----------------------------
    # 3. Centering (CRITICAL)
    # -----------------------------
    center_lb = 200  # rolling median window
    df["center"] = df["pred_ret_raw"].rolling(center_lb).median().fillna(0.0)
    df["pred_ret"] = df["pred_ret_raw"] - df["center"]

    # -----------------------------
    # 4. Quantile threshold on centered predictions
    # -----------------------------
    thr = df["pred_ret"].abs().quantile(quantile)
    if (not np.isfinite(thr)) or thr == 0:
        thr = df["pred_ret"].abs().mean()  # fallback

    df["signal"] = 0
    df.loc[df["pred_ret"] > thr, "signal"] = 1
    df.loc[df["pred_ret"] < -thr, "signal"] = -1

    # Only enter when signal changes (avoid spam)
    df["entry"] = (df["signal"] != 0) & df["signal"].ne(df["signal"].shift(1))

    # -----------------------------
    # 5. Backtest core
    # -----------------------------
    if max_hold is None:
        max_hold = horizon

    initial_balance = config["backtesting"]["initial_balance"]
    risk_pct = config["risk_management"]["risk_percentage"]

    balance = initial_balance
    position = 0.0
    entry_price = 0.0
    hold = 0

    entry_signal = None
    entry_pred = None

    trades: list[dict] = []
    equity_curve: list[float] = []

    for i, row in df.iterrows():
        price = float(row["Close"])
        pred_now = float(row["pred_ret"])
        equity_curve.append(balance + position * price)

        # --------------------------------
        # Decide if this signal side is allowed
        # --------------------------------
        side_ok = True
        if side_mode == "long_only":
            side_ok = (row["signal"] == 1)
        elif side_mode == "short_only":
            side_ok = (row["signal"] == -1)

        # --------------------------------
        # ENTRY
        # --------------------------------
        if position == 0 and row["entry"] and side_ok:
            size = balance * (risk_pct / 100.0)
            if size <= 0:
                continue

            position = size / price
            balance -= size
            entry_price = price
            hold = 0
            entry_signal = row["signal"]
            entry_pred = pred_now

            trades.append({
                "type": "ENTRY",
                "signal": entry_signal,
                "price": price,
                "index": i,
                "pred_ret_entry": entry_pred,
            })

        # --------------------------------
        # EXIT logic
        # --------------------------------
        elif position > 0:
            hold += 1
            pnl = (price - entry_price) * position
            exit_reason = None

            # 1) HARD_STOP
            if pnl <= max_loss_cap:
                exit_reason = "HARD_STOP"

            # 2) MODEL_FLIP (prediction sign flips vs entry)
            elif entry_pred is not None and entry_signal is not None:
                if np.sign(pred_now) * np.sign(entry_pred) < 0:
                    exit_reason = "MODEL_FLIP"

            # 3) DECAY (prediction magnitude has collapsed vs entry)
            if exit_reason is None and entry_pred is not None:
                if abs(pred_now) < decay_factor * abs(entry_pred):
                    exit_reason = "DECAY"

            # 4) TIME EXIT
            if exit_reason is None and hold >= max_hold:
                exit_reason = "TIME"

            if exit_reason is not None:
                balance += position * price
                trades.append({
                    "type": "EXIT",
                    "price": price,
                    "index": i,
                    "pnl": pnl,
                    "hold": hold,
                    "reason": exit_reason,
                })
                position = 0.0
                entry_signal = None
                entry_pred = None

    # -----------------------------
    # 6. Final liquidation if needed
    # -----------------------------
    if position > 0:
        balance += position * df["Close"].iloc[-1]
        position = 0.0

    final_balance = balance
    profit_pct = 100.0 * (final_balance - initial_balance) / initial_balance

    trades_df = pd.DataFrame(trades)
    trades_df["pnl"] = trades_df.get("pnl", np.nan)

    if len(equity_curve) > 0:
        equity = pd.Series(equity_curve, index=df.index[:len(equity_curve)])
        max_dd = (equity / equity.cummax() - 1).min()
    else:
        equity = pd.Series(dtype=float)
        max_dd = 0.0

    # -----------------------------
    # 7. Diagnostics
    # -----------------------------
    diagnostics = {}

    diagnostics["pred_ret_raw_stats"] = df["pred_ret_raw"].describe()
    diagnostics["pred_ret_centered_stats"] = df["pred_ret"].describe()
    diagnostics["signal_counts"] = df["signal"].value_counts(dropna=False)
    diagnostics["threshold_value"] = thr
    diagnostics["center_median"] = df["center"].median()
    diagnostics["entry_rate_pct"] = 100.0 * df["entry"].sum() / len(df)
    diagnostics["trade_count"] = len(trades_df)
    diagnostics["win_rate"] = (trades_df["pnl"] > 0).mean() if len(trades_df) else np.nan
    diagnostics["avg_pnl"] = trades_df["pnl"].mean() if len(trades_df) else np.nan
    diagnostics["max_loss_cap"] = max_loss_cap
    diagnostics["decay_factor"] = decay_factor
    diagnostics["side_mode"] = side_mode

    if "hold" in trades_df:
        diagnostics["hold_distribution"] = trades_df["hold"].describe()

    if "reason" in trades_df:
        diagnostics["exit_reason_counts"] = trades_df["reason"].value_counts(dropna=False)

    if "pnl" in trades_df and len(trades_df) > 0:
        diagnostics["top_5_winners"] = trades_df.nlargest(5, "pnl")
        diagnostics["top_5_losers"] = trades_df.nsmallest(5, "pnl")

        joined = trades_df.join(df["pred_ret"], on="index", rsuffix="_pred")
        diagnostics["pred_ret_pnl_corr"] = joined[["pred_ret", "pnl"]].corr().iloc[0, 1]
    else:
        diagnostics["top_5_winners"] = pd.DataFrame()
        diagnostics["top_5_losers"] = pd.DataFrame()
        diagnostics["pred_ret_pnl_corr"] = np.nan

    return {
        "final_balance": final_balance,
        "profit_pct": profit_pct,
        "trades": trades_df,
        "equity_curve": equity,
        "max_drawdown": max_dd,
        "diagnostics": diagnostics,
    }


In [48]:
y_pred_flat = y_pred.reshape(-1, 1)
y_pred_unscaled_flat = target_scaler.inverse_transform(y_pred_flat)
y_pred_unscaled = y_pred_unscaled_flat.reshape(y_pred.shape)

In [143]:
results = backtest_regression_simple_centered_v4(
    df=df,
    y_pred=y_pred_unscaled,
    config=config,
    horizon=50,
    quantile=0.98,
    max_hold=50,
    invert_signal=True,
    side_mode="both",   # try "both", "long_only", "short_only"
    max_loss_cap=-3.0,
    decay_factor=0.1,
)

print("\n--- Backtest Results ---")
print(f"Final Balance: {results['final_balance']:.2f}")
print(f"Profit %: {results['profit_pct']:.2f}%")
print(f"Trades: {len(results['trades'])}")
print(f"Win Rate: {results['diagnostics']['win_rate']:.2%}")
print(f"Avg PnL: {results['diagnostics']['avg_pnl']:.6f}")
print(f"Max DD: {results['max_drawdown']:.2%}")

print("\n=== DIAGNOSTICS ===")
for k, v in results["diagnostics"].items():
    print(f"\n--- {k} ---\n{v}")



--- Backtest Results ---
Final Balance: 10130.13
Profit %: 1.30%
Trades: 1156
Win Rate: 27.85%
Avg PnL: 0.225137
Max DD: -0.30%

=== DIAGNOSTICS ===

--- pred_ret_raw_stats ---
count    637393.000000
mean          0.067066
std           0.043382
min          -0.084849
25%           0.027922
50%           0.059439
75%           0.112999
max           0.165325
Name: pred_ret_raw, dtype: float64

--- pred_ret_centered_stats ---
count    637393.000000
mean         -0.000030
std           0.004890
min          -0.093883
25%          -0.001774
50%           0.000005
75%           0.001823
max           0.064364
Name: pred_ret, dtype: float64

--- signal_counts ---
signal
 0    624645
-1      6749
 1      5999
Name: count, dtype: int64

--- threshold_value ---
0.014644089453639372

--- center_median ---
0.058381780818469266

--- entry_rate_pct ---
0.20160246504119123

--- trade_count ---
1156

--- win_rate ---
0.27854671280276816

--- avg_pnl ---
0.22513745480994052

--- max_loss_cap ---
-3.