# RL Trading Agent v1.0

**A reinforcement learning agent that learns to trade Gold by doing.**

- Starts with $1,000
- Can go LONG or SHORT any size (1+ units)
- Can pyramid, scale in/out, set stops
- Gets liquidated when equity falls to 30% of starting capital ($300)
- Learns from blowups and successes
- Goal: MAXIMIZE PROFIT

No rules. Just profit.

In [None]:
# Setup
# Install the third-party packages (IPython shell escape; -q keeps pip quiet).
!pip install xgboost scikit-learn pandas numpy matplotlib seaborn -q

import os
# Reuse an existing checkout of the repository (refreshing it with `git pull`),
# otherwise clone it. Either branch leaves the working directory inside
# `gold-ml-trading`, so later relative paths are resolved from there.
if os.path.exists('gold-ml-trading'):
    %cd gold-ml-trading
    !git pull
else:
    !git clone https://github.com/altommo/gold-ml-trading.git
    %cd gold-ml-trading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import random
import json
import joblib
import warnings
warnings.filterwarnings('ignore')

# For the neural network
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

print("Libraries loaded!")

In [None]:
# ============================================================
# TRADING ENVIRONMENT
# - Reward = Points (% account change)
# - Losses hurt 3x
# - Winning trades get +2 bonus
# - Drawdown penalty when equity drops from peak
# - Consistency bonus (rolling Sharpe)
# - Liquidation penalty scales by opportunity lost
# ============================================================

class TradingEnvironment:
    def __init__(self, df, initial_capital=1000, margin_per_unit=36.57,
                 unit_size=0.15, spread_pct=0.0004, liquidation_pct=0.30,
                 max_bars=5000):
        """
        Real broker simulation with robust reward system.
        """
        self.df = df.reset_index(drop=True)
        self.initial_capital = initial_capital
        self.margin_per_unit = margin_per_unit
        self.unit_size = unit_size
        self.spread_pct = spread_pct
        self.liquidation_level = initial_capital * liquidation_pct
        self.max_bars = max_bars
        
        # Reward parameters
        self.loss_multiplier = 3.0  # Losses hurt 3x
        self.win_bonus = 2.0  # Bonus points for profitable trades
        self.drawdown_penalty_threshold = 0.05  # Start penalizing at 5% drawdown
        self.drawdown_penalty_rate = 0.5  # Penalty per % drawdown above threshold
        self.sharpe_window = 100  # Bars for rolling Sharpe
        self.sharpe_bonus_interval = 100  # Give bonus every N bars
        
        self.reset()
    
    def reset(self, start_idx=None):
        """Reset environment for new episode."""
        self.capital = self.initial_capital
        self.position = 0
        self.entry_price = 0
        self.entry_idx = 0
        self.entry_indicators = {}
        self.stop_loss = None
        self.trailing_stop = None
        self.trailing_stop_pct = None
        self.highest_since_entry = 0
        self.lowest_since_entry = float('inf')
        
        self.trades = []  # Detailed trade log
        self.equity_curve = [self.initial_capital]
        self.bars_since_trade = 0
        
        # For drawdown tracking
        self.peak_equity = self.initial_capital
        self.current_drawdown = 0
        
        # For rolling Sharpe
        self.bar_returns = []  # % returns per bar
        self.last_equity = self.initial_capital
        
        if start_idx is None:
            self.current_idx = random.randint(200, len(self.df) - self.max_bars - 100)
        else:
            self.current_idx = start_idx
        
        self.start_idx = self.current_idx
        self.done = False
        
        return self._get_state()
    
    def _get_price(self):
        return self.df.iloc[self.current_idx]['close']
    
    def _get_indicators(self):
        """Get current indicator values for logging."""
        row = self.df.iloc[self.current_idx]
        return {
            'idx': self.current_idx,
            'price': row['close'],
            'wt1': row.get('wt1', 0),
            'wt2': row.get('wt2', 0),
            'rsi': row.get('rsi', 50),
            'macd_hist': row.get('macd_hist', 0),
            'bb_pct': row.get('bb_pct', 0.5),
            'trend_score': row.get('trend_score', 0),
            'atr_pct': row.get('atr_pct', 0),
            'hour': row.get('hour', 12),
        }
    
    def _get_state(self):
        """Get current state for agent."""
        row = self.df.iloc[self.current_idx]
        price = row['close']
        
        if self.position != 0:
            if self.position > 0:
                unrealized_pnl = (price - self.entry_price) / self.entry_price
            else:
                unrealized_pnl = (self.entry_price - price) / self.entry_price
            unrealized_pnl *= abs(self.position)
        else:
            unrealized_pnl = 0
        
        margin_used = abs(self.position) * self.margin_per_unit
        margin_pct = margin_used / self.capital if self.capital > 0 else 1
        available_margin = self.capital - margin_used
        max_units = int(available_margin / self.margin_per_unit)
        
        state = {
            'wt1': row.get('wt1', 0),
            'wt2': row.get('wt2', 0),
            'wolfpack': row.get('wolfpack', 0),
            'rsi': row.get('rsi', 50),
            'stoch_rsi': row.get('stoch_rsi', 50),
            'macd_hist': row.get('macd_hist', 0),
            'bb_pct': row.get('bb_pct', 0.5),
            'atr_pct': row.get('atr_pct', 0),
            'roc_5': row.get('roc_5', 0),
            'roc_10': row.get('roc_10', 0),
            'price_vs_ma20': row.get('price_vs_ma20', 0),
            'price_vs_ma50': row.get('price_vs_ma50', 0),
            'trend_score': row.get('trend_score', 0),
            'volatility_24h': row.get('volatility_24h', 0),
            'hour': row.get('hour', 12),
            'day_of_week': row.get('day_of_week', 2),
            'position': self.position,
            'position_side': 1 if self.position > 0 else (-1 if self.position < 0 else 0),
            'unrealized_pnl': unrealized_pnl,
            'margin_pct': margin_pct,
            'max_units': max_units,
            'capital': self.capital,
            'capital_pct': self.capital / self.initial_capital,
            'bars_since_trade': min(self.bars_since_trade / 168, 1),
            'drawdown_pct': self.current_drawdown,  # Add drawdown to state
        }
        
        return state
    
    def _calculate_equity(self):
        if self.position == 0:
            return self.capital
        
        price = self._get_price()
        notional = abs(self.position) * self.unit_size * price
        
        if self.position > 0:
            pnl = (price - self.entry_price) / self.entry_price * notional
        else:
            pnl = (self.entry_price - price) / self.entry_price * notional
        
        return self.capital + pnl
    
    def _pnl_to_points(self, pnl_dollars, capital_before, is_trade_close=False):
        """Convert P&L to points. Losses hurt 3x. Wins get bonus."""
        if capital_before <= 0:
            return 0
        
        pnl_pct = pnl_dollars / capital_before * 100
        
        if pnl_pct < 0:
            return pnl_pct * self.loss_multiplier  # Losses hurt 3x
        else:
            bonus = self.win_bonus if is_trade_close and pnl_pct > 0 else 0
            return pnl_pct + bonus  # Wins get +2 bonus
    
    def _calculate_drawdown_penalty(self, equity):
        """Penalize when in drawdown above threshold."""
        if equity > self.peak_equity:
            self.peak_equity = equity
            self.current_drawdown = 0
            return 0
        
        self.current_drawdown = (self.peak_equity - equity) / self.peak_equity
        
        if self.current_drawdown > self.drawdown_penalty_threshold:
            excess_dd = (self.current_drawdown - self.drawdown_penalty_threshold) * 100
            return -excess_dd * self.drawdown_penalty_rate
        
        return 0
    
    def _calculate_sharpe_bonus(self):
        """Bonus for consistent returns (rolling Sharpe)."""
        if len(self.bar_returns) < self.sharpe_window:
            return 0
        
        recent_returns = self.bar_returns[-self.sharpe_window:]
        mean_ret = np.mean(recent_returns)
        std_ret = np.std(recent_returns)
        
        if std_ret < 0.001:  # Avoid division by zero
            if mean_ret > 0:
                sharpe = 2.0  # Cap at 2 for very consistent positive returns
            else:
                sharpe = -2.0
        else:
            sharpe = mean_ret / std_ret
        
        # Reward positive Sharpe, penalize negative
        # Scale: Sharpe of 1.0 = +5 points, Sharpe of -1.0 = -5 points
        return sharpe * 5
    
    def step(self, action):
        """Advance the simulation by one bar and return the transition.

        Processing order on the current bar:
          1. refresh the trailing-stop level from the bar's close,
          2. fire a fixed or trailing stop touched by the bar's low/high,
          3. apply the agent's action,
          4. mark equity to market and record the bar return,
          5. add drawdown penalty, periodic Sharpe bonus and liquidation penalty,
          6. move to the next bar and end the episode if the data or the
             bar budget is exhausted.

        Args:
            action: dict with 'type' ('LONG', 'SHORT' or 'CLOSE'; anything
                else is treated as holding) and optional 'units',
                'stop_loss' and 'trailing_stop' (fractions of price).

        Returns:
            Tuple ``(state, reward, done, info)``. ``info['trade']`` describes
            the last trade event of the bar (or None); 'sharpe_bonus' and
            'liquidated' keys are added when those events occur.
        """
        # A finished episode is inert: no reward, no state change.
        if self.done:
            return self._get_state(), 0, True, {}
        
        price = self._get_price()
        reward = 0
        info = {'trade': None}
        capital_before = self.capital
        # Computed before the index is advanced, so it is the count of bars
        # completed prior to this one.
        bars_into_episode = self.current_idx - self.start_idx
        
        # Update trailing stop
        # The extreme is tracked on closes; the stop level only exists while
        # a trailing percentage is configured.
        if self.position > 0:
            self.highest_since_entry = max(self.highest_since_entry, price)
            if self.trailing_stop_pct:
                self.trailing_stop = self.highest_since_entry * (1 - self.trailing_stop_pct)
        elif self.position < 0:
            self.lowest_since_entry = min(self.lowest_since_entry, price)
            if self.trailing_stop_pct:
                self.trailing_stop = self.lowest_since_entry * (1 + self.trailing_stop_pct)
        
        # Check stops
        # Stops are tested against the bar's low (longs) or high (shorts) and
        # fill exactly at the stop level. The fixed stop takes precedence
        # over the trailing stop.
        stop_triggered = False
        exit_reason = None
        if self.position > 0:
            low = self.df.iloc[self.current_idx]['low']
            if self.stop_loss and low <= self.stop_loss:
                stop_triggered = True
                exit_price = self.stop_loss
                exit_reason = 'stop_loss'
            elif self.trailing_stop and low <= self.trailing_stop:
                stop_triggered = True
                exit_price = self.trailing_stop
                exit_reason = 'trailing_stop'
        elif self.position < 0:
            high = self.df.iloc[self.current_idx]['high']
            if self.stop_loss and high >= self.stop_loss:
                stop_triggered = True
                exit_price = self.stop_loss
                exit_reason = 'stop_loss'
            elif self.trailing_stop and high >= self.trailing_stop:
                stop_triggered = True
                exit_price = self.trailing_stop
                exit_reason = 'trailing_stop'
        
        if stop_triggered:
            pnl = self._close_position(exit_price, exit_reason)
            reward += self._pnl_to_points(pnl, capital_before, is_trade_close=True)
            # Re-base so the action below is scored against post-stop capital.
            capital_before = self.capital
            info['trade'] = exit_reason
        
        # Process action
        # NOTE: the action is still applied on a bar where a stop just fired,
        # and it overwrites info['trade'] if it trades.
        action_type = action.get('type', 'HOLD')
        units = action.get('units', 0)
        
        if action_type == 'LONG' and units > 0:
            # `cost` is the value returned by _open_long (0 when no units could
            # be bought); it is not checked, so the stop/bookkeeping lines
            # below run even if nothing was opened.
            cost = self._open_long(units, price)
            reward += self._pnl_to_points(cost, capital_before)
            capital_before = self.capital
            if action.get('stop_loss'):
                self.stop_loss = price * (1 - action['stop_loss'])
            if action.get('trailing_stop'):
                self.trailing_stop_pct = action['trailing_stop']
                self.highest_since_entry = price
            self.bars_since_trade = 0
            info['trade'] = f'LONG {units}'
            
        elif action_type == 'SHORT' and units > 0:
            # Mirror image of the LONG branch.
            cost = self._open_short(units, price)
            reward += self._pnl_to_points(cost, capital_before)
            capital_before = self.capital
            if action.get('stop_loss'):
                self.stop_loss = price * (1 + action['stop_loss'])
            if action.get('trailing_stop'):
                self.trailing_stop_pct = action['trailing_stop']
                self.lowest_since_entry = price
            self.bars_since_trade = 0
            info['trade'] = f'SHORT {units}'
            
        elif action_type == 'CLOSE' and self.position != 0:
            pnl = self._close_position(price, 'manual_close')
            reward += self._pnl_to_points(pnl, capital_before, is_trade_close=True)
            self.bars_since_trade = 0
            info['trade'] = 'CLOSE'
            
        else:
            # Hold (also reached by LONG/SHORT with units <= 0 and by CLOSE
            # with no position). Being flat for more than 168 bars costs a
            # small penalty on every further idle bar.
            self.bars_since_trade += 1
            if self.bars_since_trade > 168 and self.position == 0:
                reward -= 0.1
        
        # Update equity and track bar return
        equity = self._calculate_equity()
        bar_return = (equity - self.last_equity) / self.last_equity * 100 if self.last_equity > 0 else 0
        self.bar_returns.append(bar_return)
        self.last_equity = equity
        self.equity_curve.append(equity)
        
        # Drawdown penalty
        dd_penalty = self._calculate_drawdown_penalty(equity)
        reward += dd_penalty
        
        # Sharpe bonus every N bars
        if bars_into_episode > 0 and bars_into_episode % self.sharpe_bonus_interval == 0:
            sharpe_bonus = self._calculate_sharpe_bonus()
            reward += sharpe_bonus
            info['sharpe_bonus'] = sharpe_bonus
        
        # Check liquidation
        if equity <= self.liquidation_level:
            self.done = True
            # Penalty scales by opportunity lost
            bars_remaining = self.max_bars - bars_into_episode
            opportunity_lost = bars_remaining / self.max_bars * 100
            reward -= (50 + opportunity_lost)  # -50 base + opportunity cost
            info['liquidated'] = True
            # The forced close is logged, but its P&L is not added to the
            # reward (the return value is discarded).
            if self.position != 0:
                self._close_position(price, 'liquidation')
        
        # Move to next bar
        self.current_idx += 1
        # The budget test uses the pre-increment bar count computed above.
        if self.current_idx >= len(self.df) - 1 or bars_into_episode >= self.max_bars:
            self.done = True
            if self.position != 0:
                # Force-close at the new bar's close; `self.capital - pnl`
                # reconstructs the capital before this close for scoring.
                pnl = self._close_position(self._get_price(), 'end_of_episode')
                reward += self._pnl_to_points(pnl, self.capital - pnl, is_trade_close=True)
            
            # Final Sharpe bonus at end
            final_sharpe = self._calculate_sharpe_bonus()
            reward += final_sharpe * 2  # Double weight for final Sharpe
        
        return self._get_state(), reward, self.done, info
    
    def _open_long(self, units, price):
        margin_needed = units * self.margin_per_unit
        margin_used = abs(self.position) * self.margin_per_unit
        available = self.capital - margin_used
        
        if margin_needed > available:
            units = int(available / self.margin_per_unit)
            if units <= 0:
                return 0
        
        spread_cost = units * self.unit_size * price * self.spread_pct
        self.capital -= spread_cost
        
        if self.position < 0:
            pnl = self._close_position(price, 'reversed')
            self.position = units
            self.entry_price = price
            self.entry_idx = self.current_idx
            self.entry_indicators = self._get_indicators()
            return pnl - spread_cost
        
        if self.position > 0:
            total_cost = self.position * self.entry_price + units * price
            self.position += units
            self.entry_price = total_cost / self.position
        else:
            self.position = units
            self.entry_price = price
            self.entry_idx = self.current_idx
            self.entry_indicators = self._get_indicators()
            self.highest_since_entry = price
        
        return -spread_cost
    
    def _open_short(self, units, price):
        margin_needed = units * self.margin_per_unit
        margin_used = abs(self.position) * self.margin_per_unit
        available = self.capital - margin_used
        
        if margin_needed > available:
            units = int(available / self.margin_per_unit)
            if units <= 0:
                return 0
        
        spread_cost = units * self.unit_size * price * self.spread_pct
        self.capital -= spread_cost
        
        if self.position > 0:
            pnl = self._close_position(price, 'reversed')
            self.position = -units
            self.entry_price = price
            self.entry_idx = self.current_idx
            self.entry_indicators = self._get_indicators()
            return pnl - spread_cost
        
        if self.position < 0:
            total_cost = abs(self.position) * self.entry_price + units * price
            self.position -= units
            self.entry_price = total_cost / abs(self.position)
        else:
            self.position = -units
            self.entry_price = price
            self.entry_idx = self.current_idx
            self.entry_indicators = self._get_indicators()
            self.lowest_since_entry = price
        
        return -spread_cost
    
    def _close_position(self, price, exit_reason='unknown'):
        """Close position with detailed logging."""
        if self.position == 0:
            return 0
        
        notional = abs(self.position) * self.unit_size * price
        
        if self.position > 0:
            pnl_pct = (price - self.entry_price) / self.entry_price
        else:
            pnl_pct = (self.entry_price - price) / self.entry_price
        
        pnl_dollars = pnl_pct * notional
        spread_cost = notional * self.spread_pct
        pnl_dollars -= spread_cost
        
        # Calculate account impact
        account_pct_change = pnl_dollars / self.capital * 100 if self.capital > 0 else 0
        
        self.capital += pnl_dollars
        
        # Detailed trade log
        exit_indicators = self._get_indicators()
        self.trades.append({
            'entry_idx': self.entry_idx,
            'exit_idx': self.current_idx,
            'bars_held': self.current_idx - self.entry_idx,
            'position': self.position,
            'entry_price': self.entry_price,
            'exit_price': price,
            'pnl_dollars': pnl_dollars,
            'pnl_pct': pnl_pct * 100,
            'account_pct_change': account_pct_change,
            'exit_reason': exit_reason,
            'entry_indicators': self.entry_indicators.copy(),
            'exit_indicators': exit_indicators,
        })
        
        self.position = 0
        self.entry_price = 0
        self.entry_idx = 0
        self.entry_indicators = {}
        self.stop_loss = None
        self.trailing_stop = None
        self.trailing_stop_pct = None
        
        return pnl_dollars
    
    def get_episode_stats(self):
        """Get episode statistics."""
        trades_df = pd.DataFrame(self.trades) if self.trades else pd.DataFrame()
        
        # Calculate episode Sharpe
        if len(self.bar_returns) > 10:
            mean_ret = np.mean(self.bar_returns)
            std_ret = np.std(self.bar_returns)
            episode_sharpe = mean_ret / std_ret if std_ret > 0.001 else 0
        else:
            episode_sharpe = 0
        
        stats = {
            'starting_capital': self.initial_capital,
            'ending_capital': self.capital,
            'total_return_pct': (self.capital - self.initial_capital) / self.initial_capital * 100,
            'num_trades': len(self.trades),
            'bars_traded': self.current_idx - self.start_idx,
            'liquidated': self.capital <= self.liquidation_level,
            'max_drawdown': self.current_drawdown * 100 if hasattr(self, 'current_drawdown') else 0,
            'episode_sharpe': episode_sharpe,
            'trades': self.trades,  # Full trade log for analysis
        }
        
        if len(trades_df) > 0:
            stats['win_rate'] = (trades_df['pnl_dollars'] > 0).mean() * 100
            stats['avg_win'] = trades_df[trades_df['pnl_dollars'] > 0]['pnl_dollars'].mean() if (trades_df['pnl_dollars'] > 0).any() else 0
            stats['avg_loss'] = trades_df[trades_df['pnl_dollars'] < 0]['pnl_dollars'].mean() if (trades_df['pnl_dollars'] < 0).any() else 0
            stats['best_trade'] = trades_df['pnl_dollars'].max()
            stats['worst_trade'] = trades_df['pnl_dollars'].min()
            stats['avg_bars_held'] = trades_df['bars_held'].mean()
            
            # Outliers
            stats['big_wins'] = trades_df[trades_df['account_pct_change'] > 5].to_dict('records')
            stats['big_losses'] = trades_df[trades_df['account_pct_change'] < -3].to_dict('records')
        
        # Calculate max drawdown from equity curve
        equity = np.array(self.equity_curve)
        rolling_max = np.maximum.accumulate(equity)
        drawdown = (rolling_max - equity) / rolling_max * 100
        stats['max_drawdown'] = drawdown.max()
        
        return stats

print("TradingEnvironment defined!")
print("  - Losses hurt 3x")
print("  - Winning trades get +2 bonus")
print("  - Drawdown penalty above 5%")
print("  - Rolling Sharpe bonus every 100 bars")

In [None]:
# ============================================================
# RL TRADING AGENT
# - Epsilon decays only on NEW BEST
# - Trains during episodes + full episode replay at end
# ============================================================

class TradingAgent:
    def __init__(self, state_size=25, learning_rate=0.001, gamma=0.95,
                 epsilon=1.0, epsilon_min=0.1, epsilon_decay=0.95):
        """
        Set up an epsilon-greedy Q-learning agent backed by an MLP regressor.

        Args:
            state_size: number of features in a state vector.
            learning_rate: initial learning rate of the network's optimiser.
            gamma: discount factor applied to the next state's best Q-value.
            epsilon: initial probability of taking a random action.
            epsilon_min: value at or below which epsilon stops decaying.
            epsilon_decay: multiplier applied by ``decay_epsilon``.
        """
        # Hyper-parameters.
        self.state_size = state_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon, self.epsilon_min, self.epsilon_decay = epsilon, epsilon_min, epsilon_decay

        # Discrete action space: indices 0..27 (see action_to_dict).
        self.action_size = 28

        # Bounded replay buffer over all episodes, plus the transitions of
        # the episode in progress (used for end-of-episode training).
        self.memory = deque(maxlen=100000)
        self.episode_memory = []

        # Q-network: one output per action. With max_iter=1 and
        # warm_start=True each fit() call performs a single further
        # iteration from the current weights.
        q_network = MLPRegressor(
            hidden_layer_sizes=(128, 64, 32),
            activation='relu',
            solver='adam',
            learning_rate_init=learning_rate,
            max_iter=1,
            warm_start=True
        )

        # One fit on random data so the network has weights and predict()
        # works before any real training.
        warmup_inputs = np.random.randn(100, state_size)
        warmup_targets = np.random.randn(100, self.action_size)
        q_network.fit(warmup_inputs, warmup_targets)
        self.model = q_network

        # Feature scaler; fitted lazily on the first training batch.
        self.scaler = StandardScaler()
        self.scaler_fitted = False
    
    def state_to_array(self, state):
        keys = ['wt1', 'wt2', 'wolfpack', 'rsi', 'stoch_rsi', 'macd_hist',
                'bb_pct', 'atr_pct', 'roc_5', 'roc_10', 'price_vs_ma20',
                'price_vs_ma50', 'trend_score', 'volatility_24h', 'hour',
                'day_of_week', 'position', 'position_side', 'unrealized_pnl',
                'margin_pct', 'max_units', 'capital', 'capital_pct', 
                'bars_since_trade', 'drawdown_pct']  # Added drawdown_pct
        
        arr = np.array([state.get(k, 0) for k in keys], dtype=np.float32)
        arr = np.nan_to_num(arr, nan=0, posinf=0, neginf=0)
        return arr
    
    def action_to_dict(self, action_idx, max_units):
        if action_idx == 0:
            return {'type': 'HOLD'}
        elif 1 <= action_idx <= 10:
            units = min(action_idx, max_units)
            return {'type': 'LONG', 'units': max(1, units)}
        elif 11 <= action_idx <= 20:
            units = min(action_idx - 10, max_units)
            return {'type': 'SHORT', 'units': max(1, units)}
        elif action_idx == 21:
            return {'type': 'CLOSE'}
        elif action_idx == 22:
            return {'type': 'LONG', 'units': min(3, max_units), 'stop_loss': 0.01}
        elif action_idx == 23:
            return {'type': 'LONG', 'units': min(5, max_units), 'stop_loss': 0.02}
        elif action_idx == 24:
            return {'type': 'LONG', 'units': min(3, max_units), 'trailing_stop': 0.02}
        elif action_idx == 25:
            return {'type': 'SHORT', 'units': min(3, max_units), 'stop_loss': 0.01}
        elif action_idx == 26:
            return {'type': 'SHORT', 'units': min(5, max_units), 'stop_loss': 0.02}
        elif action_idx == 27:
            return {'type': 'SHORT', 'units': min(3, max_units), 'trailing_stop': 0.02}
        else:
            return {'type': 'HOLD'}
    
    def choose_action(self, state, training=True):
        if training and random.random() < self.epsilon:
            return random.randint(0, self.action_size - 1)
        
        state_arr = self.state_to_array(state).reshape(1, -1)
        if self.scaler_fitted:
            state_arr = self.scaler.transform(state_arr)
        q_values = self.model.predict(state_arr)
        return np.argmax(q_values[0])
    
    def remember(self, state, action, reward, next_state, done):
        """Store in both global and episode memory."""
        experience = (state, action, reward, next_state, done)
        self.memory.append(experience)
        self.episode_memory.append(experience)
    
    def start_episode(self):
        """Clear episode memory for new episode."""
        self.episode_memory = []
    
    def replay(self, batch_size=64):
        """Train on random batch from all experiences."""
        if len(self.memory) < batch_size:
            return 0
        
        batch = random.sample(self.memory, batch_size)
        return self._train_on_batch(batch)
    
    def replay_episode(self, boost_factor=1.0):
        """
        Train on the full episode that just completed.
        boost_factor: multiply rewards (e.g., 2.0 for good episodes, 0.5 for bad)
        """
        if len(self.episode_memory) < 10:
            return 0
        
        # Train on episode experiences multiple times for reinforcement
        batch = self.episode_memory.copy()
        
        # Apply boost factor to rewards
        if boost_factor != 1.0:
            batch = [(s, a, r * boost_factor, ns, d) for s, a, r, ns, d in batch]
        
        return self._train_on_batch(batch)
    
    def _train_on_batch(self, batch):
        """Internal training on a batch of experiences."""
        if len(batch) == 0:
            return 0
        
        states = np.array([self.state_to_array(s) for s, _, _, _, _ in batch])
        next_states = np.array([self.state_to_array(ns) for _, _, _, ns, _ in batch])
        
        if not self.scaler_fitted:
            self.scaler.fit(states)
            self.scaler_fitted = True
        
        states = self.scaler.transform(states)
        next_states = self.scaler.transform(next_states)
        
        current_q = self.model.predict(states)
        next_q = self.model.predict(next_states)
        
        for i, (state, action, reward, next_state, done) in enumerate(batch):
            if done:
                target = reward
            else:
                target = reward + self.gamma * np.max(next_q[i])
            current_q[i][action] = target
        
        self.model.fit(states, current_q)
        return np.mean(np.abs(current_q))
    
    def decay_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

print("TradingAgent defined!")
print("  - State size: 25 features (includes drawdown_pct)")

In [None]:
# ============================================================
# INDICATOR FUNCTIONS (same as before)
# ============================================================

def calculate_wavetrend(df, n1=10, n2=21):
    """Return a copy of ``df`` with WaveTrend columns 'wt1' and 'wt2'.

    'wt1' is an EMA (span ``n2``) of the typical price's deviation from its
    own EMA (span ``n1``), normalised by 0.015 times the smoothed absolute
    deviation; 'wt2' is a 4-bar simple average of 'wt1'.
    """
    out = df.copy()
    typical_price = (out['high'] + out['low'] + out['close']) / 3
    baseline = typical_price.ewm(span=n1, adjust=False).mean()
    deviation = typical_price - baseline
    smoothed_abs_dev = deviation.abs().ewm(span=n1, adjust=False).mean()
    channel_index = deviation / (0.015 * smoothed_abs_dev)
    out['wt1'] = channel_index.ewm(span=n2, adjust=False).mean()
    out['wt2'] = out['wt1'].rolling(4).mean()
    return out

def calculate_wolfpack(df):
    """Return a copy of ``df`` with 'wolfpack': the 3-span EMA of close minus the 8-span EMA."""
    out = df.copy()
    fast_ema = out['close'].ewm(span=3, adjust=False).mean()
    slow_ema = out['close'].ewm(span=8, adjust=False).mean()
    out['wolfpack'] = fast_ema - slow_ema
    return out

def calculate_rsi(df, period=14):
    """Return a copy of ``df`` with an 'rsi' column.

    Uses simple rolling means of gains and losses over ``period`` bars
    (not Wilder smoothing). The first ``period`` rows are NaN.
    """
    out = df.copy()
    change = out['close'].diff()
    avg_gain = change.clip(lower=0).rolling(period).mean()
    avg_loss = (-change.clip(upper=0)).rolling(period).mean()
    relative_strength = avg_gain / avg_loss
    out['rsi'] = 100 - (100 / (1 + relative_strength))
    return out

def calculate_atr(df, period=14):
    """Return a copy of ``df`` with 'atr' and 'atr_pct'.

    'atr' is the rolling mean of the bar range (high - low) over ``period``
    bars; gaps versus the previous close are not considered. 'atr_pct'
    expresses it as a percentage of close.
    """
    out = df.copy()
    bar_range = out['high'] - out['low']
    out['atr'] = bar_range.rolling(period).mean()
    out['atr_pct'] = out['atr'] / out['close'] * 100
    return out

def calculate_moving_averages(df):
    """Return a copy of ``df`` with 20/50-bar simple moving averages of close
    ('ma20', 'ma50') and close's percentage distance from each
    ('price_vs_ma20', 'price_vs_ma50')."""
    out = df.copy()
    close = out['close']
    out['ma20'] = close.rolling(20).mean()
    out['ma50'] = close.rolling(50).mean()
    for window in ('ma20', 'ma50'):
        out[f'price_vs_{window}'] = (out['close'] - out[window]) / out[window] * 100
    return out

def calculate_returns(df):
    """Return a copy of ``df`` with 'ret_1h': the bar-over-bar percentage change of close."""
    out = df.copy()
    fractional_change = out['close'].pct_change()
    out['ret_1h'] = fractional_change * 100
    return out

def calculate_bollinger_bands(df, period=20, std_dev=2):
    """Return a copy of ``df`` with Bollinger Band columns.

    Adds 'bb_mid' (rolling mean of close), 'bb_std' (rolling sample
    standard deviation), 'bb_upper'/'bb_lower' (mid +/- ``std_dev``
    deviations) and 'bb_pct' (position of close between the bands: 0 at
    the lower band, 1 at the upper).
    """
    out = df.copy()
    window = out['close'].rolling(period)
    out['bb_mid'] = window.mean()
    out['bb_std'] = window.std()
    half_width = out['bb_std'] * std_dev
    out['bb_upper'] = out['bb_mid'] + half_width
    out['bb_lower'] = out['bb_mid'] - half_width
    band_width = out['bb_upper'] - out['bb_lower']
    out['bb_pct'] = (out['close'] - out['bb_lower']) / band_width
    return out

def calculate_momentum(df):
    """Return a copy of ``df`` with momentum columns.

    Adds 5- and 10-bar rate of change ('roc_5', 'roc_10', in %), MACD
    (12/26 EMAs with a 9-span signal: 'macd', 'macd_signal', 'macd_hist')
    and 'stoch_rsi' (where RSI sits in its own 14-bar range, 0-100).
    Requires an existing 'rsi' column.
    """
    out = df.copy()
    close = out['close']

    for lag in (5, 10):
        out[f'roc_{lag}'] = (out['close'] / out['close'].shift(lag) - 1) * 100

    fast_ema = close.ewm(span=12, adjust=False).mean()
    slow_ema = close.ewm(span=26, adjust=False).mean()
    out['macd'] = fast_ema - slow_ema
    out['macd_signal'] = out['macd'].ewm(span=9, adjust=False).mean()
    out['macd_hist'] = out['macd'] - out['macd_signal']

    rsi = out['rsi']
    rsi_window = rsi.rolling(14)
    rsi_low = rsi_window.min()
    rsi_high = rsi_window.max()
    out['stoch_rsi'] = (rsi - rsi_low) / (rsi_high - rsi_low) * 100
    return out

def calculate_time_features(df):
    """Return a copy of ``df`` with 'hour' and 'day_of_week' taken from its index.

    The columns are only added when the index is a DatetimeIndex; otherwise
    the copy is returned unchanged.
    """
    out = df.copy()
    if not isinstance(out.index, pd.DatetimeIndex):
        return out

    timestamps = out.index
    out['hour'] = timestamps.hour
    out['day_of_week'] = timestamps.dayofweek
    return out

def calculate_trend(df):
    """Return a copy of ``df`` with 'trend_score': 1 where ma20 is above ma50, otherwise -1.

    Requires 'ma20' and 'ma50' columns. Rows where either average is NaN
    compare as not-above and therefore score -1.
    """
    out = df.copy()
    fast_above_slow = out['ma20'] > out['ma50']
    out['trend_score'] = np.where(fast_above_slow, 1, -1)
    return out

def calculate_volatility(df):
    """Return a copy of ``df`` with 'volatility_24h': the 24-bar rolling sample
    standard deviation of 'ret_1h' (which must already exist)."""
    out = df.copy()
    returns_window = out['ret_1h'].rolling(24)
    out['volatility_24h'] = returns_window.std()
    return out

def add_all_indicators(df):
    """Run every indicator helper over ``df`` in sequence and return the result.

    The order is significant: ``calculate_momentum`` reads the ``rsi`` column
    (presumably produced by ``calculate_rsi``), ``calculate_trend`` reads
    ``ma20``/``ma50`` from ``calculate_moving_averages``, and
    ``calculate_volatility`` reads ``ret_1h`` from ``calculate_returns``.
    """
    pipeline = (
        calculate_wavetrend,
        calculate_wolfpack,
        calculate_rsi,
        calculate_atr,
        calculate_moving_averages,
        calculate_returns,
        calculate_bollinger_bands,
        calculate_momentum,
        calculate_time_features,
        calculate_trend,
        calculate_volatility,
    )
    for step in pipeline:
        df = step(df)
    return df

print("Indicators defined!")

In [None]:
# ============================================================
# LOAD DATA
# ============================================================

print("Loading data...")
# The 'datetime' column is parsed as timestamps and becomes the index, which
# is what calculate_time_features needs to emit hour/day_of_week.
df = pd.read_csv(
    'data/XAUUSD_KAGGLE_1h.csv',
    index_col='datetime',
    parse_dates=['datetime'],
)
print(f"Loaded {len(df):,} bars")
print(f"Date range: {df.index.min()} to {df.index.max()}")

print("\nCalculating indicators...")
# Rolling/shifted indicators leave NaN in their warm-up rows; drop every row
# that still has a missing value so downstream code sees complete bars only.
df = add_all_indicators(df).dropna()
print(f"Clean data: {len(df):,} bars")

In [None]:
# ============================================================
# TRAINING LOOP
# - Train during episodes (every 4 steps) + FULL episode replay at end
# - Episode replay boosted for good results, reduced for bad
# - Phase 1: Pure exploration (eps 1-100, epsilon=1.0)
# - Phase 2: Learning (epsilon decays on new best until 0.1)
# - Phase 3: Baselined (epsilon=0.1), plateau detection starts
# - BEST = highest total POINTS (not just P&L)
# ============================================================

print("="*70)
print("RL TRADING AGENT - TRAIN UNTIL PLATEAU")
print("="*70)

# Train on the first 80% of bars only. The evaluation cell tests on
# df.iloc[int(len(df) * 0.8):] and reports it as unseen data; building the
# training environment from the full frame would leak those bars into training
# and make the test results meaningless.
TRAIN_FRACTION = 0.8  # must match the split used by the test cell
train_df = df.iloc[:int(len(df) * TRAIN_FRACTION)]
print(f"Training bars: {len(train_df):,} ({train_df.index.min()} to {train_df.index.max()})")

# Initialize
env = TradingEnvironment(train_df, initial_capital=1000, max_bars=5000)
agent = TradingAgent(epsilon=1.0, epsilon_min=0.1, epsilon_decay=0.95)

# Training parameters
MIN_EXPLORATION_EPISODES = 100   # Pure exploration (epsilon=1.0)
PLATEAU_THRESHOLD = 150          # Episodes without improvement after baselined
BATCH_SIZE = 64                  # Experiences per in-episode replay call
TRAIN_EVERY = 4                  # Replay once every this many environment steps

# Tracking
episode_rewards = []             # Total points per episode
episode_stats = []               # Stats dict per episode (from env.get_episode_stats)
all_trades = []                  # Every trade from every episode, for outlier analysis
best_stats = None                # Stats of the best non-liquidated episode so far
best_points = -float('inf')  # Track best by POINTS, not P&L
episodes_since_best = 0
plateau_counter = 0  # Only counts when baselined (epsilon=0.1)
episode = 0

print(f"\nReward System:")
print(f"  - Losses hurt 3x")
print(f"  - Winning trades get +2 bonus")
print(f"  - Drawdown penalty above 5%")
print(f"  - Rolling Sharpe bonus every 100 bars")
print(f"  - BEST = highest total POINTS (not just P&L)")
print(f"\nPhase 1: Episodes 1-{MIN_EXPLORATION_EPISODES} = pure exploration (epsilon=1.0)")
print(f"Phase 2: Learning until epsilon reaches 0.1")
print(f"Phase 3: BASELINED (epsilon=0.1) - plateau detection ({PLATEAU_THRESHOLD} eps no improvement)")
print()
print(f"{'Ep':>5} | {'Points':>8} | {'Return':>8} | {'MaxDD':>6} | {'Sharpe':>6} | {'Trades':>6} | {'WR':>5} | {'Eps':>5} | {'Status':<12}")
print("-" * 100)

# Main training loop: one iteration per episode. Stops when the agent is
# baselined (epsilon at its floor) and PLATEAU_THRESHOLD episodes pass without
# a new best, or after 10000 episodes as a safety cap.
while True:
    # Start new episode
    agent.start_episode()
    state = env.reset()
    total_reward = 0
    step = 0

    # Run episode
    while not env.done:
        action_idx = agent.choose_action(state)
        action = agent.action_to_dict(action_idx, state['max_units'])
        next_state, reward, done, info = env.step(action)

        agent.remember(state, action_idx, reward, next_state, done)

        # Train during episode on random past experiences
        if step % TRAIN_EVERY == 0 and len(agent.memory) >= BATCH_SIZE:
            agent.replay(BATCH_SIZE)

        total_reward += reward
        state = next_state
        step += 1

    # Episode complete - get stats
    stats = env.get_episode_stats()
    stats['total_points'] = total_reward  # Add points to stats
    episode_rewards.append(total_reward)
    episode_stats.append(stats)
    all_trades.extend(stats.get('trades', []))

    liquidated = bool(stats.get('liquidated'))

    # Determine boost factor for episode replay
    if liquidated:
        boost_factor = 0.25  # Reduce learning from liquidated episodes
    elif total_reward > 50:
        boost_factor = 3.0   # Strong boost for high-scoring episodes
    elif total_reward > 0:
        boost_factor = 2.0   # Boost positive point episodes
    elif total_reward > -20:
        boost_factor = 0.75  # Slight reduction for small negative
    else:
        boost_factor = 0.5   # Reduce learning from bad episodes

    # Train on full episode with boost
    agent.replay_episode(boost_factor=boost_factor)

    # Check for new best - BY POINTS, not P&L. Liquidated episodes never count.
    is_new_best = total_reward > best_points and not liquidated
    if is_new_best:
        best_points = total_reward
        best_stats = stats.copy()
        episodes_since_best = 0
        plateau_counter = 0  # Reset plateau counter on new best

        # Epsilon only decays on improvement, and only after pure exploration.
        if episode >= MIN_EXPLORATION_EPISODES:
            agent.decay_epsilon()

        status = f"BEST! ε→{agent.epsilon:.2f}"

    # Check if baselined (epsilon at minimum). Evaluated after any decay above
    # so the phase label below reflects this episode's epsilon; for non-best
    # episodes epsilon is unchanged, so the counters behave as before.
    is_baselined = agent.epsilon <= agent.epsilon_min

    if not is_new_best:
        episodes_since_best += 1
        if is_baselined:
            plateau_counter += 1
        if liquidated:
            status = "LIQUIDATED"
        elif total_reward > 0:
            status = f"+pts x{boost_factor}"
        else:
            status = f"-pts x{boost_factor}"

    # Print every episode
    max_dd = stats.get('max_drawdown', 0)
    ep_sharpe = stats.get('episode_sharpe', 0)
    win_rate = stats.get('win_rate', 0)

    print(f"{episode+1:5d} | {total_reward:+7.1f} | {stats['total_return_pct']:+7.1f}% | {max_dd:5.1f}% | {ep_sharpe:+5.2f} | "
          f"{stats['num_trades']:6d} | {win_rate:4.1f}% | {agent.epsilon:.3f} | {status}")

    # Progress summary every 50 episodes
    if (episode + 1) % 50 == 0:
        print("-" * 100)
        if not is_baselined:
            phase = "EXPLORATION" if episode < MIN_EXPLORATION_EPISODES else f"LEARNING (ε={agent.epsilon:.2f})"
        else:
            phase = f"BASELINED - plateau {plateau_counter}/{PLATEAU_THRESHOLD}"
        print(f"Episode {episode+1} [{phase}] | Best points: {best_points:.1f} | "
              f"Eps since best: {episodes_since_best}")
        if best_stats:
            print(f"Best episode: {best_stats.get('total_points', 0):.1f} pts, "
                  f"{best_stats['total_return_pct']:.1f}% return, "
                  f"{best_stats['num_trades']} trades, "
                  f"{best_stats.get('win_rate', 0):.1f}% WR, "
                  f"Sharpe: {best_stats.get('episode_sharpe', 0):.2f}")
        print(f"Total experiences in memory: {len(agent.memory):,}")
        print("-" * 100)

    episode += 1

    # Check for plateau - ONLY when baselined (epsilon at floor).
    # Epsilon only decays after a new best, so best_stats is set by this point.
    if is_baselined and plateau_counter >= PLATEAU_THRESHOLD:
        print("\n" + "="*100)
        print(f"PLATEAU REACHED: Baselined and no improvement for {plateau_counter} episodes. Stopping.")
        print(f"Best points: {best_points:.1f} | Best return: {best_stats['total_return_pct']:.1f}%")
        print("="*100)
        break

    # Safety max (very high)
    if episode >= 10000:
        print("\nMax episodes reached (10000). Stopping.")
        break

print(f"\nTraining complete after {episode} episodes!")
if best_stats is not None:
    print(f"Best total points: {best_points:.1f}")
    print(f"Best P&L: {best_stats['total_return_pct']:.1f}%")
else:
    # Every episode was liquidated, so no best episode was ever recorded.
    print("No episode finished without liquidation; no best episode recorded.")
print(f"Total experiences collected: {len(agent.memory):,}")

In [None]:
# ============================================================
# OUTLIER ANALYSIS - What patterns lead to big wins/losses?
# ============================================================
# Summarises the entry-time indicator snapshots of every training trade,
# split into big wins, big losses, and all wins vs. all losses.

print("\n" + "="*70)
print("OUTLIER ANALYSIS")
print("="*70)

if not all_trades:
    print("No trades to analyze.")
else:
    trades_df = pd.DataFrame(all_trades)
    # Indicators reported for the outlier groups; the win/loss comparison
    # additionally reports atr_pct.
    core_indicators = ['wt1', 'rsi', 'macd_hist', 'bb_pct', 'trend_score']

    print(f"\nTotal trades analyzed: {len(trades_df):,}")

    # Big wins (>5% account growth)
    big_wins = trades_df[trades_df['account_pct_change'] > 5]
    print(f"\n{'='*50}")
    print(f"BIG WINS (>5% account growth): {len(big_wins)}")
    print(f"{'='*50}")

    if len(big_wins) > 0:
        print("\nAverage entry conditions for big wins:")
        for col in core_indicators:
            # Each cell of 'entry_indicators' is a dict; missing keys count as 0.
            vals = [snapshot.get(col, 0) for snapshot in big_wins['entry_indicators']]
            print(f"  {col}: {np.mean(vals):.2f}")

        print(f"\nBig wins by position type:")
        print(f"  Long:  {(big_wins['position'] > 0).sum()}")
        print(f"  Short: {(big_wins['position'] < 0).sum()}")

        print(f"\nAvg bars held for big wins: {big_wins['bars_held'].mean():.0f}")

        print("\nTop 3 biggest wins:")
        for _, trade in big_wins.nlargest(3, 'account_pct_change').iterrows():
            entry = trade['entry_indicators']
            print(f"  +{trade['account_pct_change']:.1f}% | "
                  f"{'LONG' if trade['position'] > 0 else 'SHORT'} | "
                  f"Held {trade['bars_held']} bars | "
                  f"RSI={entry.get('rsi', 0):.0f}, WT={entry.get('wt1', 0):.0f}")

    # Big losses (>3% account loss)
    big_losses = trades_df[trades_df['account_pct_change'] < -3]
    print(f"\n{'='*50}")
    print(f"BIG LOSSES (>3% account loss): {len(big_losses)}")
    print(f"{'='*50}")

    if len(big_losses) > 0:
        print("\nAverage entry conditions for big losses:")
        for col in core_indicators:
            vals = [snapshot.get(col, 0) for snapshot in big_losses['entry_indicators']]
            print(f"  {col}: {np.mean(vals):.2f}")

        print(f"\nBig losses by position type:")
        print(f"  Long:  {(big_losses['position'] > 0).sum()}")
        print(f"  Short: {(big_losses['position'] < 0).sum()}")

        print(f"\nBig losses by exit reason:")
        print(big_losses['exit_reason'].value_counts().to_string())

        print("\nTop 3 biggest losses:")
        for _, trade in big_losses.nsmallest(3, 'account_pct_change').iterrows():
            entry = trade['entry_indicators']
            print(f"  {trade['account_pct_change']:.1f}% | "
                  f"{'LONG' if trade['position'] > 0 else 'SHORT'} | "
                  f"Exit: {trade['exit_reason']} | "
                  f"RSI={entry.get('rsi', 0):.0f}, WT={entry.get('wt1', 0):.0f}")

    # Pattern comparison
    print(f"\n{'='*50}")
    print("PATTERN COMPARISON: Wins vs Losses")
    print(f"{'='*50}")

    # Break-even trades (pnl_dollars == 0) fall into neither group.
    wins = trades_df[trades_df['pnl_dollars'] > 0]
    losses = trades_df[trades_df['pnl_dollars'] < 0]

    if len(wins) > 0 and len(losses) > 0:
        print(f"\n{'Indicator':<15} | {'Wins Avg':>10} | {'Losses Avg':>10} | {'Diff':>10}")
        print("-" * 50)
        for col in core_indicators + ['atr_pct']:
            win_avg = np.mean([snapshot.get(col, 0) for snapshot in wins['entry_indicators']])
            loss_avg = np.mean([snapshot.get(col, 0) for snapshot in losses['entry_indicators']])
            diff = win_avg - loss_avg
            print(f"{col:<15} | {win_avg:>10.2f} | {loss_avg:>10.2f} | {diff:>+10.2f}")

    # Exit reason analysis
    print(f"\n{'='*50}")
    print("EXIT REASON ANALYSIS")
    print(f"{'='*50}")
    exit_summary = trades_df.groupby('exit_reason')['pnl_dollars'].agg(['count', 'mean', 'sum'])
    print(exit_summary.round(2))

In [None]:
# ============================================================
# TEST THE TRAINED AGENT
# ============================================================
# Runs one greedy episode over the last 20% of the data and prints a report.

print("\n" + "="*60)
print("TESTING TRAINED AGENT")
print("="*60)

# Test on last 20% of data.
# NOTE(review): these bars are only out-of-sample if the training environment
# was built from the first 80% of df - confirm against the training cell.
test_start = int(len(df) * 0.8)
test_df = df.iloc[test_start:].copy()
print(f"\nTest period: {test_df.index.min().date()} to {test_df.index.max().date()}")
print(f"Test bars: {len(test_df):,}")

# Create test environment.
# NOTE(review): TradingEnvironment defaults to max_bars=5000; confirm whether
# that caps this episode below len(test_df).
test_env = TradingEnvironment(test_df, initial_capital=1000)

# Run test episode (no exploration). Epsilon stays at 0 on the agent afterwards.
agent.epsilon = 0  # Pure exploitation
state = test_env.reset(start_idx=0)

while not test_env.done:
    action_idx = agent.choose_action(state, training=False)
    action = agent.action_to_dict(action_idx, state['max_units'])
    state, reward, done, info = test_env.step(action)

# Get results
test_stats = test_env.get_episode_stats()


def _dollars(amount):
    """Format a dollar amount for the 15-character value column."""
    return f"${amount:>14,.2f}"


def _percent(amount):
    """Format a percentage for the 15-character value column."""
    return f"{amount:>14.1f}%"


print(f"\n{'Metric':<20} {'Value':>15}")
print("-" * 35)
report_rows = [
    ('Starting Capital', _dollars(test_stats['starting_capital'])),
    ('Ending Capital', _dollars(test_stats['ending_capital'])),
    ('Total Return', _percent(test_stats['total_return_pct'])),
    ('Trades', f"{test_stats['num_trades']:>15,}"),
    ('Win Rate', _percent(test_stats.get('win_rate', 0))),
    ('Avg Win', _dollars(test_stats.get('avg_win', 0))),
    ('Avg Loss', _dollars(test_stats.get('avg_loss', 0))),
    ('Best Trade', _dollars(test_stats.get('best_trade', 0))),
    ('Worst Trade', _dollars(test_stats.get('worst_trade', 0))),
    ('Max Drawdown', _percent(test_stats.get('max_drawdown', 0))),
    ('Liquidated', f"{'YES' if test_stats['liquidated'] else 'NO':>15}"),
]
for label, value in report_rows:
    print(f"{label:<20} {value}")

In [None]:
# ============================================================
# TEST EQUITY CURVE
# ============================================================
# Plots the test episode's equity per bar against the starting capital and
# the liquidation threshold.

if len(test_env.equity_curve) > 1:
    plt.figure(figsize=(12, 5))

    equity = test_env.equity_curve
    plt.plot(equity, linewidth=2)
    # Take the reference levels from the environment instead of hard-coding
    # 1000/300, so the lines stay correct if the test environment is created
    # with a different initial_capital or liquidation_pct.
    plt.axhline(y=test_env.initial_capital, color='gray', linestyle='--', label='Starting capital')
    plt.axhline(y=test_env.liquidation_level, color='red', linestyle='--', label='Liquidation level')

    plt.xlabel('Bar')
    plt.ylabel('Equity ($)')
    plt.title(f'Test Period Equity Curve\nFinal: ${equity[-1]:,.2f} ({test_stats["total_return_pct"]:+.1f}%)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
# ============================================================
# TRADE ANALYSIS
# ============================================================
# Long/short counts plus two charts for the test episode's trades:
# per-trade P&L histogram (left) and running P&L total (right).

if test_env.trades:
    trades_df = pd.DataFrame(test_env.trades)
    pnl = trades_df['pnl_dollars']

    print("\nTrade Distribution:")
    print(f"  Long trades:  {(trades_df['position'] > 0).sum()}")
    print(f"  Short trades: {(trades_df['position'] < 0).sum()}")

    fig, (hist_ax, cum_ax) = plt.subplots(1, 2, figsize=(12, 4))

    hist_ax.hist(pnl, bins=30, edgecolor='black', alpha=0.7)
    hist_ax.axvline(x=0, color='red', linestyle='--')  # break-even marker
    hist_ax.set_xlabel('P&L ($)')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Trade P&L Distribution')

    cumulative_pnl = pnl.cumsum()
    cum_ax.plot(cumulative_pnl, linewidth=2)
    cum_ax.axhline(y=0, color='gray', linestyle='--')
    cum_ax.set_xlabel('Trade #')
    cum_ax.set_ylabel('Cumulative P&L ($)')
    cum_ax.set_title('Cumulative P&L by Trade')

    plt.tight_layout()
    plt.show()

In [None]:
# ============================================================
# SAVE MODEL
# ============================================================
# Persists the Q-network and scaler with joblib and writes a JSON config
# describing the run (environment parameters, test results, best episode).

import os
os.makedirs('models/rl_v1', exist_ok=True)

# Save agent components
joblib.dump(agent.model, 'models/rl_v1/q_network.pkl')
joblib.dump(agent.scaler, 'models/rl_v1/scaler.pkl')


def _json_default(value):
    """Convert values the json encoder cannot handle natively.

    numpy scalars and arrays become their Python equivalents so numbers and
    booleans stay typed in the file; anything else falls back to ``str``.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return str(value)


# Save config
config = {
    'version': 'rl_v1',
    'type': 'reinforcement_learning',
    'goal': 'Maximum profit through learning',
    'training_episodes': episode,
    'action_space': agent.action_size,
    'state_size': agent.state_size,
    # Read from the training environment rather than repeating its defaults,
    # so the saved config cannot drift from the parameters actually used.
    'trading_params': {
        'initial_capital': env.initial_capital,
        'margin_per_unit': env.margin_per_unit,
        'unit_size_oz': env.unit_size,
        'spread_pct': env.spread_pct,
        # The environment stores the absolute level; recover the fraction.
        'liquidation_pct': env.liquidation_level / env.initial_capital
    },
    'test_performance': {
        'period': f"{test_df.index.min().date()} to {test_df.index.max().date()}",
        'starting_capital': test_stats['starting_capital'],
        'ending_capital': test_stats['ending_capital'],
        'total_return_pct': test_stats['total_return_pct'],
        'trades': test_stats['num_trades'],
        'win_rate': test_stats.get('win_rate', 0),
        'max_drawdown': test_stats.get('max_drawdown', 0),
        'liquidated': test_stats['liquidated']
    },
    # None (JSON null) when no training episode finished without liquidation.
    'best_training_episode': best_stats
}

with open('models/rl_v1/config.json', 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2, default=_json_default)

print("Model saved to models/rl_v1/")

In [None]:
# ============================================================
# DOWNLOAD MODEL
# ============================================================
# Zips models/rl_v1 into models/rl_v1.zip and, when running in Google Colab,
# triggers a browser download of the archive.

import shutil

# Equivalent of `cd models && zip -r rl_v1.zip rl_v1/` without depending on
# the zip binary: entries are stored under the rl_v1/ prefix, and the archive
# is rebuilt from scratch so stale files from an earlier run are not kept.
archive_path = shutil.make_archive('models/rl_v1', 'zip', root_dir='models', base_dir='rl_v1')

try:
    from google.colab import files
except ImportError:
    # google.colab only exists inside Colab; elsewhere just report the path.
    print(f"\nNot running in Colab - archive written to {archive_path}")
else:
    files.download('models/rl_v1.zip')
    print("\nDownload started!")

In [None]:
# ============================================================
# SUMMARY
# ============================================================
# Prints a closing recap of the training run and the test episode.

# Calculate liquidation rate from episode stats
stats_df = pd.DataFrame(episode_stats)
liquidation_rate = stats_df['liquidated'].mean() * 100 if len(stats_df) > 0 else 0

# best_stats is still None when every training episode was liquidated;
# report "n/a" instead of failing on the subscript.
if best_stats is not None:
    best_return_text = f"{best_stats['total_return_pct']:.1f}%"
else:
    best_return_text = "n/a"

print("\n" + "="*60)
print("RL TRADING AGENT v1.0 - COMPLETE")
print("="*60)
print(f"""
This agent learned to trade Gold by doing.

Training:
- {episode} episodes of trial and error
- Started with $1,000 each episode
- Could go long, short, pyramid, use stops
- Learned from blowups and wins

What it learned:
- Best episode return: {best_return_text} (training)
- Liquidation rate: {liquidation_rate:.1f}%

Test results ({test_df.index.min().date()} to {test_df.index.max().date()}):
- Return: {test_stats['total_return_pct']:.1f}%
- Trades: {test_stats['num_trades']}
- Win Rate: {test_stats.get('win_rate', 0):.1f}%
- Max Drawdown: {test_stats.get('max_drawdown', 0):.1f}%

Model saved and ready to download.
""")