In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
import pandas as pd
import yfinance as yf
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

2025-04-02 18:22:30.591396: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743618150.611528   26583 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743618150.618332   26583 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743618150.633618   26583 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743618150.633637   26583 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743618150.633639   26583 computation_placer.cc:177] computation placer alr

In [4]:
# ======================
# Technical Indicators
# ======================
def compute_rsi(prices, window=14):
    delta = prices.diff()
    gain = delta.where(delta > 0, 0).fillna(0)
    loss = -delta.where(delta < 0, 0).fillna(0)
    
    avg_gain = gain.rolling(window=window).mean()
    avg_loss = loss.rolling(window=window).mean()
    
    rs = avg_gain / (avg_loss + 1e-10)
    return 100 - (100 / (1 + rs))

def compute_bollinger_bands(prices, window=20):
    sma = prices.rolling(window=window).mean()
    std = prices.rolling(window=window).std()
    return sma + 2 * std, sma - 2 * std

def compute_macd(prices, fast=12, slow=26, signal=9):
    ema_fast = prices.ewm(span=fast, adjust=False).mean()
    ema_slow = prices.ewm(span=slow, adjust=False).mean()
    macd = ema_fast - ema_slow
    signal_line = macd.ewm(span=signal, adjust=False).mean()
    return macd, signal_line

In [5]:
# ======================
# Trading Environment (Gymnasium)
# ======================
class AdvancedTradingEnv(gym.Env):
    def __init__(self, df, initial_balance=10000, transaction_fee=0.001, window_size=30):
        super().__init__()
        
        self.df = df
        self.initial_balance = initial_balance
        self.transaction_fee = transaction_fee
        self.window_size = window_size
        
        # Action space
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)
        
        # Observation space
        self.observation_space = spaces.Box(
            low=-np.inf, 
            high=np.inf, 
            shape=(10,),
            dtype=np.float32
        )
        
        self.reset()

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = self.window_size
        self.balance = self.initial_balance
        self.position = 0.0
        self.portfolio_history = [self.initial_balance] * self.window_size
        self.peak_portfolio = self.initial_balance
        self.max_drawdown = 0.0
        return self._next_observation(), {}

    def _next_observation(self):
        window_data = self.df.iloc[self.current_step - self.window_size:self.current_step]
        
        # Explicit scalar extraction
        close_price = float(window_data['Close'].iloc[-1])
        sma_20 = float(window_data['SMA_20'].iloc[-1])
        rsi = float(window_data['RSI_14'].iloc[-1])
        upper_bb = float(window_data['Upper_BB'].iloc[-1])
        lower_bb = float(window_data['Lower_BB'].iloc[-1])
        macd = float(window_data['MACD'].iloc[-1])
        macd_signal = float(window_data['MACD_Signal'].iloc[-1])
        volume = float(window_data['Volume'].iloc[-1])
        volume_ma5 = float(window_data['Volume_MA5'].iloc[-1])
        
        features = np.array([
            close_price / sma_20 - 1,
            (rsi - 50) / 50,
            (upper_bb - lower_bb) / sma_20,
            macd - macd_signal,
            (volume - volume_ma5) / (np.std(window_data['Volume'].values[-5:]) + 1e-10),
            self.balance / self.initial_balance,
            self.position * close_price / self.initial_balance,
            np.log(close_price / float(window_data['Close'].iloc[-5])),
            np.log(volume / float(window_data['Volume'].iloc[-5])),
            self.max_drawdown
        ], dtype=np.float32)
        
        return features

    def step(self, action):
        current_price = float(self.df['Close'].iloc[self.current_step])
        prev_portfolio = self.portfolio_history[-1]
        
        # Execute action
        trade_value = action[0] * self.balance
        fee = abs(trade_value) * self.transaction_fee
        
        if trade_value > 0:  # Buy
            max_affordable = (self.balance - fee) / current_price
            units_bought = min(trade_value / current_price, max_affordable)
            self.position += units_bought
            self.balance -= units_bought * current_price + fee
        elif trade_value < 0:  # Sell
            units_sold = min(abs(trade_value) / current_price, self.position)
            self.position -= units_sold
            self.balance += units_sold * current_price - fee

        # Update portfolio
        portfolio_value = self.balance + self.position * current_price
        self.portfolio_history.append(portfolio_value)
        
        # Risk metrics
        self.peak_portfolio = max(self.peak_portfolio, portfolio_value)
        current_drawdown = (self.peak_portfolio - portfolio_value) / self.peak_portfolio
        self.max_drawdown = max(self.max_drawdown, current_drawdown)
        
        # Calculate returns
        # Calculate returns CORRECTED VERSION
        hist_window = np.array(self.portfolio_history[-self.window_size:])
        if len(hist_window) >= 2:  # Ensure we have enough data
            returns = np.diff(hist_window) / hist_window[:-1]
        else:
            returns = np.array([0.0])
        sharpe_ratio = np.mean(returns) / (np.std(returns) + 1e-10) * np.sqrt(252)
        
        # Reward components
        reward = (portfolio_value - prev_portfolio) + \
                0.2 * sharpe_ratio - \
                0.5 * self.max_drawdown - \
                0.1 * fee
        
        # Move to next step
        self.current_step += 1
        done = self.current_step >= len(self.df) - 1
        
        return self._next_observation(), reward, done, False, {}

In [6]:
# ======================
# Data Preparation
# ======================
def prepare_data(symbol="BTC-USD", start="2018-01-01", end="2024-01-01"):
    df = yf.download(symbol, start=start, end=end)
    
    # Calculate indicators
    df['SMA_20'] = df['Close'].rolling(20).mean()
    df['RSI_14'] = compute_rsi(df['Close'])
    df['Upper_BB'], df['Lower_BB'] = compute_bollinger_bands(df['Close'])
    df['MACD'], df['MACD_Signal'] = compute_macd(df['Close'])
    df['Volume_MA5'] = df['Volume'].rolling(5).mean()
    
    # Validate data
    df = df.dropna()
    df = df.astype(np.float32)
    
    return df

In [7]:
# ======================
# Training & Evaluation
# ======================
def train_model(df, window_size=365, total_timesteps=100000):
    env = make_vec_env(lambda: AdvancedTradingEnv(df), n_envs=4)
    
    model = PPO(
        "MlpPolicy",
        env,
        verbose=1,
        learning_rate=3e-4,
        n_steps=2048,
        batch_size=64,
        gamma=0.99,
        device="auto"
    )
    
    # Rolling window training
    for i in range(0, len(df) - window_size, window_size//2):
        window_df = df.iloc[i:i+window_size]
        env = make_vec_env(lambda: AdvancedTradingEnv(window_df), n_envs=4)
        model.set_env(env)
        model.learn(total_timesteps=total_timesteps, reset_num_timesteps=False)
        
    return model

In [8]:
def evaluate_model(model, test_df):
    env = AdvancedTradingEnv(test_df)
    obs, _ = env.reset()
    done = False
    
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, _, done, _, _ = env.step(action)
    
    final_value = env.balance + env.position * test_df['Close'].iloc[-1]
    buy_hold_value = env.initial_balance * (test_df['Close'].iloc[-1] / test_df['Close'].iloc[0])
    
    print(f"\n{' Final Results ':=^40}")
    print(f"RL Portfolio Value: ${final_value.values[0]:,.2f}")  # Alternative method
    print(f"Buy & Hold Value:   ${buy_hold_value.values[0]:,.2f}")
    print(f"Max Drawdown:       {env.max_drawdown*100:.2f}%")
    print("="*40)

In [9]:
# ======================
# Main Execution
# ======================
if __name__ == "__main__":
    full_data = prepare_data()
    train_df = full_data.iloc[:-365]
    test_df = full_data.iloc[-365:]
    
    model = train_model(train_df)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


Using cuda device
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 334       |
|    ep_rew_mean     | -6.79e+03 |
| time/              |           |
|    fps             | 223       |
|    iterations      | 1         |
|    time_elapsed    | 36        |
|    total_timesteps | 8192      |
----------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 334          |
|    ep_rew_mean          | -6.82e+03    |
| time/                   |              |
|    fps                  | 208          |
|    iterations           | 2            |
|    time_elapsed         | 78           |
|    total_timesteps      | 16384        |
| train/                  |              |
|    approx_kl            | 0.0032131022 |
|    clip_fraction        | 0.0175       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | 0.

In [10]:
    evaluate_model(model, test_df)
    model.save("rl_trading_model_final")


RL Portfolio Value: $18,152.04
Buy & Hold Value:   $25,422.55
Max Drawdown:       20.06%
