In [1]:
import gym
from gym import spaces
import numpy as np
import pandas as pd
import random
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv





In [2]:


# Constants for the environment
MAX_SHARE_PRICE = 500  # Example max price can be adjusted as needed
MAX_NUM_SHARES = 1000
MAX_ACCOUNT_BALANCE = 10000
INITIAL_ACCOUNT_BALANCE = 1000
MAX_STEPS = 10000  



In [23]:


class StockTradingEnv(gym.Env):
    """A stock trading environment for OpenAI gym"""

    def __init__(self, df,render_mode=None):
        super(StockTradingEnv, self).__init__()

        self.df = df
        self.reward_range = (0, MAX_ACCOUNT_BALANCE)
        self.render_mode = render_mode

        # Actions of the format Buy x%, Sell x%, Hold, etc.
        self.action_space = spaces.Box(low=np.array([0, 0]), high=np.array([3, 1]), dtype=np.float32)

        # Prices contains the OHCL values for the last five prices
        self.observation_space = spaces.Box(low=0, high=1, shape=(7, 7), dtype=np.float32)

    def _next_observation(self):
        start = max(self.current_step - 5, 0)
        end = max(self.current_step, 1)
        frame = np.array([
            self.df.loc[start:end, 'Open'].values / MAX_SHARE_PRICE,
            self.df.loc[start:end, 'High'].values / MAX_SHARE_PRICE,
            self.df.loc[start:end, 'Low'].values / MAX_SHARE_PRICE,
            self.df.loc[start:end, 'Close'].values / MAX_SHARE_PRICE,
            self.df.loc[start:end, 'Volume'].values / MAX_NUM_SHARES,
            self.df.loc[start:end, 'VWAP'].values / MAX_SHARE_PRICE,
            self.df.loc[start:end, 'RSI'].values / 100
        ]).T  # Keeping as is, (6, 7)

        if frame.shape[0] < 6:
            padding = np.zeros((6 - frame.shape[0], 7))  # Match the number of columns
            frame = np.vstack([padding, frame])

        financial_metrics = np.array([
            self.balance / MAX_ACCOUNT_BALANCE,
            self.max_net_worth / MAX_ACCOUNT_BALANCE,
            self.shares_held / MAX_NUM_SHARES,
            self.cost_basis / MAX_SHARE_PRICE,
            self.total_shares_sold / MAX_NUM_SHARES,
            self.total_sales_value / (MAX_NUM_SHARES * MAX_SHARE_PRICE),
            0  # Placeholder or another meaningful metric
        ]).reshape(1, 7)  # Now (1, 7)

        return np.vstack([frame, financial_metrics])


    def _take_action(self, action):
        # Setting the current price to a random price within the time step
        current_price = random.uniform(
            self.df.loc[self.current_step, "Open"], self.df.loc[self.current_step, "Close"])

        action_type = action[0]
        amount = action[1]

        if action_type < 1:
            # Buy amount % of balance in shares
            total_possible = int(self.balance / current_price)
            shares_bought = int(total_possible * amount)
            prev_cost = self.cost_basis * self.shares_held
            additional_cost = shares_bought * current_price

            self.balance -= additional_cost
            if self.shares_held + shares_bought > 0:
                self.cost_basis = (prev_cost + additional_cost) / (self.shares_held + shares_bought)
            else:
                self.cost_basis = 0

            self.shares_held += shares_bought

        elif action_type < 2:
            # Sell amount % of shares held
            shares_sold = int(self.shares_held * amount)
            self.balance += shares_sold * current_price
            self.shares_held -= shares_sold
            self.total_shares_sold += shares_sold
            self.total_sales_value += shares_sold * current_price

        self.net_worth = self.balance + self.shares_held * current_price

        if self.net_worth > self.max_net_worth:
            self.max_net_worth = self.net_worth

        if self.shares_held == 0:
            self.cost_basis = 0

    def step(self, action):
        self._take_action(action)
        self.current_step += 1

        if self.current_step > len(self.df) - 6:
            self.current_step = 0  

        reward = self.balance * (self.current_step / MAX_STEPS)
        done = self.net_worth <= 0 or self.current_step >= len(self.df) - 6

        obs = self._next_observation()

        return obs, reward, done, {}

    def reset(self):
        self.balance = INITIAL_ACCOUNT_BALANCE
        self.net_worth = INITIAL_ACCOUNT_BALANCE
        self.max_net_worth = INITIAL_ACCOUNT_BALANCE
        self.shares_held = 0
        self.cost_basis = 0
        self.total_shares_sold = 0
        self.total_sales_value = 0
        self.current_step = random.randint(0, max(0, len(self.df) - 6))
        return self._next_observation()

    def render(self, mode='human', close=False):
        if close:
            return
        if self.render_mode == 'human' or mode == 'human':
            profit = self.net_worth - INITIAL_ACCOUNT_BALANCE
            print(f'Step: {self.current_step} Balance: {self.balance} Shares held: {self.shares_held} '
                  f'Avg cost for held shares: {self.cost_basis} Total sales value: {self.total_sales_value} '
                  f'Net worth: {self.net_worth} (Max net worth: {self.max_net_worth}) Profit: {profit}')
        else:
            raise NotImplementedError(f"Render mode {self.render_mode} not available")



In [24]:

def rsi_calculation(prices, interval=14):
    """
    Compute the Relative Strength Index (RSI) for a given set of prices.

    Args:
    prices (pd.Series): Series of prices.
    interval (int): The period over which to calculate RSI values.

    Returns:
    pd.Series: The RSI values.
    """
    delta = prices.diff()
    gain = (delta.where(delta > 0, 0)).fillna(0)
    loss = (-delta.where(delta < 0, 0)).fillna(0)

    # Calculate the Exponential Moving Averages (EMA) for gains and losses
    avg_gain = gain.ewm(com=interval - 1, min_periods=interval).mean()
    avg_loss = loss.ewm(com=interval - 1, min_periods=interval).mean()

    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    rsi = rsi.fillna(0)  # Initially, before enough data points, RSI is undefined

    return rsi

In [25]:


# Load and preprocess data
df = pd.read_csv('./data/AAPL.csv')
df.dropna(inplace=True)
df['VWAP'] = (df['Close'] * df['Volume']).cumsum() / df['Volume'].cumsum()
df['RSI'] = rsi_calculation(df['Close'])



In [26]:

# Create and use environment
env = DummyVecEnv([lambda: StockTradingEnv(df,render_mode='human')])
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=20000)


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Using cuda device
-----------------------------
| time/              |      |
|    fps             | 243  |
|    iterations      | 1    |
|    time_elapsed    | 8    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 211          |
|    iterations           | 2            |
|    time_elapsed         | 19           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0054928446 |
|    clip_fraction        | 0.0228       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.84        |
|    explained_variance   | 2.92e-06     |
|    learning_rate        | 0.0003       |
|    loss                 | 9.64e+06     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00401     |
|    std                  | 0.998        |
|    value_loss           | 2.51e+07     

<stable_baselines3.ppo.ppo.PPO at 0x2b33c23db10>

In [27]:
obs = env.reset()
for i in range(2000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)                                                                                                                                                                                                  
    env.render()



In [28]:
obs = env.reset()
for i in range(2000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    # Manually print or log the information we want
    env.get_attr('current_step')[0]  # Using get_attr to access attributes from environments within VecEnv
    print(f"Step: {env.get_attr('current_step')[0]}")
    print(f"Balance: {env.get_attr('balance')[0]}")
    print(f"Net Worth: {env.get_attr('net_worth')[0]}")
    if done:
        break


Step: 184
Balance: 1000.0
Net Worth: 1000.0
Step: 185
Balance: 1000.0
Net Worth: 1000.0
Step: 186
Balance: 1000.0
Net Worth: 1000.0
Step: 187
Balance: 1000.0
Net Worth: 1000.0
Step: 188
Balance: 1000.0
Net Worth: 1000.0
Step: 189
Balance: 1000.0
Net Worth: 1000.0
Step: 190
Balance: 1000.0
Net Worth: 1000.0
Step: 191
Balance: 1000.0
Net Worth: 1000.0
Step: 192
Balance: 1000.0
Net Worth: 1000.0
Step: 193
Balance: 1000.0
Net Worth: 1000.0
Step: 194
Balance: 228.9509683783058
Net Worth: 1000.0
Step: 195
Balance: 228.9509683783058
Net Worth: 1104.5784818094337
Step: 196
Balance: 228.9509683783058
Net Worth: 1165.4614995717948
Step: 197
Balance: 228.9509683783058
Net Worth: 1193.0054226742654
Step: 198
Balance: 113.27515981018365
Net Worth: 1192.9160397793237
Step: 199
Balance: 76.71163035820862
Net Worth: 1137.0539844654843
Step: 200
Balance: 1147.9472774002581
Net Worth: 1147.9472774002581
Step: 201
Balance: 23.127567734954255
Net Worth: 1147.9472774002581
Step: 202
Balance: 23.12756773495

Render method was not working so printed the results by directly calling them
