In [2]:
!pip install gymnasium stable-baselines3 yfinance torch pandas numpy


Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting stable-baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl.metadata (5.1 kB)
Collecting yfinance
  Downloading yfinance-0.2.48-py2.py3-none-any.whl.metadata (13 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting lxml>=4.9.1 (from yfinance)
  Downloading lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.17.7.tar.gz (939 kB)
[2K     [90m━━━━━━━━━━━━━━━

In [3]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pandas as pd
import yfinance as yf
import torch as th
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy
import torch.nn as nn

# Download BTC data from Yahoo Finance
data = yf.download('BTC-USD', start='2020-01-01', end='2023-01-01')
data = data[['Open', 'High', 'Low', 'Close', 'Volume']]
data.ffill(inplace=True)  # Forward fill using ffill to avoid FutureWarning

class BTCTradingEnv(gym.Env):
    def __init__(self, data, sequence_length=10):
        super(BTCTradingEnv, self).__init__()

        self.data = data
        self.sequence_length = sequence_length
        self.current_step = 0
        self.cash = 10000  # Starting cash
        self.shares_held = 0
        self.total_assets = self.cash

        # Define observation and action spaces
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(sequence_length, len(data.columns)), dtype=np.float32
        )
        self.action_space = spaces.Discrete(3)  # 0: Sell, 1: Hold, 2: Buy

    def reset(self, seed=None):
        # Set the seed for reproducibility
        self.np_random, seed = gym.utils.seeding.np_random(seed)

        # Reset state variables
        self.current_step = 0
        self.cash = 10000
        self.shares_held = 0
        self.total_assets = self.cash
        return self._get_observation(), {}

    def _get_observation(self):
        # Get a sequence of past data and cast it to float32
        end = self.current_step + self.sequence_length
        obs = self.data.iloc[self.current_step:end].values.astype(np.float32)
        return obs

    def step(self, action):
        # Use .item() to get a native float from a single element Series
        current_price = float(self.data.iloc[self.current_step]['Close'].item())
        reward = 0.0  # Initialize reward as a native Python float

        # Execute trade logic
        if action == 0:  # Sell
            reward = float(self.shares_held * current_price)  # Ensure reward is a native float
            self.cash += reward
            self.shares_held = 0
        elif action == 2:  # Buy
            max_shares = self.cash // current_price
            self.shares_held += max_shares
            self.cash -= max_shares * current_price

        # Update portfolio value and ensure reward is a float
        self.total_assets = self.cash + self.shares_held * current_price
        reward = float(self.total_assets - self.cash)  # Convert reward to native float

        self.current_step += 1
        done = self.current_step >= len(self.data) - self.sequence_length
        obs = self._get_observation()

        return obs, reward, done, False, {}

# Initialize the environment with sequential data
env = BTCTradingEnv(data)

# Check if the environment is valid
from stable_baselines3.common.env_checker import check_env
check_env(env)

# Define the LSTM-based policy network for PPO
class LSTMFeatureExtractor(BaseFeaturesExtractor):
    def __init__(self, observation_space, features_dim=256):
        super(LSTMFeatureExtractor, self).__init__(observation_space, features_dim)

        n_input_channels = observation_space.shape[1]
        self.lstm = nn.LSTM(input_size=n_input_channels, hidden_size=features_dim, batch_first=True)

    def forward(self, observations):
        # Pass the observations through the LSTM layer
        lstm_out, _ = self.lstm(observations)
        return lstm_out[:, -1, :]  # Use the last LSTM output as the feature

# Custom LSTM policy using the feature extractor
class CustomLSTMPolicy(ActorCriticPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomLSTMPolicy, self).__init__(*args, **kwargs,
            features_extractor_class=LSTMFeatureExtractor,
            features_extractor_kwargs=dict(features_dim=256))

# Train the PPO model with the custom LSTM policy
model = PPO(CustomLSTMPolicy, env, verbose=1)
model.learn(total_timesteps=10000)


[*********************100%***********************]  1 of 1 completed


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.09e+03 |
|    ep_rew_mean     | 4.62e+05 |
| time/              |          |
|    fps             | 470      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.09e+03      |
|    ep_rew_mean          | 1.93e+06      |
| time/                   |               |
|    fps                  | 276           |
|    iterations           | 2             |
|    time_elapsed         | 14            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 7.4218115e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2       

<stable_baselines3.ppo.ppo.PPO at 0x7b2cd6a12140>

In [7]:
# Reset the environment for testing
obs, _ = env.reset()
done = False
cumulative_reward = 0
total_steps = 0

# Run a loop to simulate trading using the trained model
while not done:
    # Use the trained model to predict the action
    action, _ = model.predict(obs)
    obs, reward, done, _, _ = env.step(action)

    # Accumulate rewards for performance tracking
    cumulative_reward += reward
    total_steps += 1

# Output test results
final_cash = env.cash
final_assets = env.total_assets
final_shares_held = env.shares_held

print(f"Test Results after {total_steps} steps:")
print(f"Final Cash: ${final_cash:.2f}")
print(f"Final Total Assets: ${final_assets:.2f}")
print(f"Final Shares Held: {final_shares_held}")
print(f"Cumulative Reward: {cumulative_reward:.2f}")


Test Results after 1086 steps:
Final Cash: $30160.55
Final Total Assets: $30160.55
Final Shares Held: 0
Cumulative Reward: 4774247.82


In [8]:
# Reset the environment for testing
obs, _ = env.reset()
done = False
cumulative_reward = 0
total_steps = 0

# Run a loop to simulate trading using the trained model
print("Step | Action  | Portfolio Value | Reward | Total Cash | Shares Held")
print("---------------------------------------------------------------------")

while not done:
    # Use the trained model to predict the action
    action, _ = model.predict(obs)
    obs, reward, done, _, _ = env.step(action)

    # Determine action type
    action_name = "Hold"
    if action == 0:
        action_name = "Sell"
    elif action == 2:
        action_name = "Buy"

    # Accumulate rewards for performance tracking
    cumulative_reward += reward
    total_steps += 1

    # Print step-by-step action log
    print(f"{total_steps:<5} | {action_name:<6} | ${env.total_assets:.2f}       | {reward:.2f}   | ${env.cash:.2f}    | {env.shares_held}")

# Output final test results
final_cash = env.cash
final_assets = env.total_assets
final_shares_held = env.shares_held

print("\nTest Results:")
print(f"Final Cash: ${final_cash:.2f}")
print(f"Final Total Assets: ${final_assets:.2f}")
print(f"Final Shares Held: {final_shares_held}")
print(f"Cumulative Reward: {cumulative_reward:.2f}")


Step | Action  | Portfolio Value | Reward | Total Cash | Shares Held
---------------------------------------------------------------------
1     | Hold   | $10000.00       | 0.00   | $10000.00    | 0
2     | Hold   | $10000.00       | 0.00   | $10000.00    | 0
3     | Sell   | $10000.00       | 0.00   | $10000.00    | 0
4     | Hold   | $10000.00       | 0.00   | $10000.00    | 0
5     | Buy    | $10000.00       | 7411.32   | $2588.68    | 1.0
6     | Sell   | $10357.90       | 0.00   | $10357.90    | 0
7     | Sell   | $10357.90       | 0.00   | $10357.90    | 0
8     | Buy    | $10357.90       | 8079.86   | $2278.04    | 1.0
9     | Buy    | $10157.11       | 7879.07   | $2278.04    | 1.0
10    | Sell   | $10444.59       | 0.00   | $10444.59    | 0
11    | Buy    | $10444.59       | 8037.54   | $2407.06    | 1.0
12    | Sell   | $10599.55       | 0.00   | $10599.55    | 0
13    | Buy    | $10599.55       | 8144.19   | $2455.36    | 1.0
14    | Buy    | $11283.12       | 8827.76   | $