<a href="https://colab.research.google.com/github/ccasanoval/RLtests/blob/master/stock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium
!pip install stable_baselines3
!pip install yfinance



In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 30 11:50:19 2024

@author: cesar.casanova
"""

#!pip install gym pandas yfinance stable-baselines3 shimmy

# When True, the training cell below downloads data and trains a model.
TRAIN = True

# Ticker symbols that can be passed to yf.download.
itx, sacyr, ohl, san = 'ITX.MC', 'SCYR.MC', 'OHLA.MC', 'SAN.MC'

# Symbol used for training, and symbol used by the evaluation cells.
TICKER = itx
TICKER_TEST = TICKER

# Date range requested from yf.download (ISO dates).
START, END = "2020-01-01", "2024-09-02"

# Starting cash of the trading environment.
INITIAL_BALANCE = 10_000
# Total number of timesteps requested from PPO across all training chunks.
TRAIN_STEPS = 100_000_000

# File name the evaluation cells pass to PPO.load.
MODEL_NAME = "stock_" + TICKER

#------------------------------------------------------------------------------
# TRAIN ENVIRONMENT

import gymnasium as gym
from gymnasium import spaces
import numpy as np
import yfinance as yf
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

class StockTradingEnv(gym.Env):
    """Gymnasium environment that trades a single stock on daily closing prices.

    Every step advances one row in ``df``, executes the requested trade at that
    row's ``Close`` and rewards the agent with the change in total asset value
    (cash plus shares), valuing the shares at the following row's ``Close``.

    Action (two floats):
        ``action[0]`` selects the operation: ``< 1`` hold, ``< 2`` buy,
        ``< 3`` sell.  ``action[1]`` is the fraction of the affordable shares
        to buy, or of the held shares to sell.

    Observation:
        The five consecutive rows of ``pct_df`` ending at row
        ``current_step + 1``.  The declared ``(5, 5)`` shape assumes ``pct_df``
        has exactly five columns.

    Args:
        df: Price table with a 0-based integer index and a ``Close`` column
            (``render`` also reads ``Open``).
        pct_df: Table the observations are sliced from; expected to be
            row-aligned with ``df``.
        max_steps: Row index at which an episode is truncated.
    """

    # Gymnasium reads the supported render modes from the 'render_modes' key.
    metadata = {'render_modes': ['human']}

    def __init__(self, df, pct_df, max_steps=1000):
        super().__init__()

        self.df = df
        self.pct_df = pct_df
        self.reward_range = (-np.inf, np.inf)
        self.action_space = spaces.Box(low=np.array([0, 0]), high=np.array([3, 1]), dtype=np.float16)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(5, 5), dtype=np.float16)
        self.initial_balance = INITIAL_BALANCE
        self.balance = self.initial_balance
        self.shares_held = 0
        # Episodes start at row 6, which leaves enough earlier rows for the
        # 5-row observation window.
        self.current_step = 6
        self.max_steps = max_steps
        # Row index at which the data pointer wraps back to the start; the
        # margin keeps ``current_step + 1`` inside the table.
        self.train_cnt_epoch = len(self.df.loc[:, 'Close'].values) - 2 - 6

    def step(self, action):
        """Advance one row, apply ``action`` and return the Gymnasium 5-tuple.

        Returns:
            ``(obs, reward, terminated, truncated, info)`` where ``terminated``
            is True when the cash balance is exhausted and ``truncated`` is
            True when ``current_step`` has reached ``max_steps``.
        """
        self.current_step += 1

        action_type = action[0]
        amount = action[1]

        close_price = self.df.loc[self.current_step, 'Close']
        next_day_close_price = self.df.loc[self.current_step + 1, 'Close']

        asset_value_before_action = self.balance + self.shares_held * close_price

        if action_type < 1:
            # Hold
            pass
        elif action_type < 2:
            # Buy: `amount` of the whole shares the cash balance can afford.
            total_possible = int(self.balance / close_price)
            shares_bought = int(total_possible * amount)
            self.balance -= shares_bought * close_price
            self.shares_held += shares_bought
        elif action_type < 3:
            # Sell: `amount` of the shares currently held.
            shares_sold = int(self.shares_held * amount)
            self.balance += shares_sold * close_price
            self.shares_held -= shares_sold

        # Wrap the data pointer once the end of the table is reached.
        # NOTE(review): if the table is shorter than max_steps + 8 rows the
        # pointer wraps before max_steps is reached and the episode can only
        # end through the balance check - confirm this is intended.
        if self.current_step >= self.train_cnt_epoch:
            self.current_step = 6

        obs = self._next_observation()

        asset_value_after_action = self.balance + self.shares_held * next_day_close_price
        reward = asset_value_after_action - asset_value_before_action

        # Report the two ways an episode can end separately: running out of
        # cash is a real terminal state, reaching max_steps is only a time
        # limit (the previous code returned the same flag for both).
        terminated = bool(self.balance <= 0)
        truncated = bool(self.current_step >= self.max_steps)

        return obs, reward, terminated, truncated, {}

    def reset(self, seed=None, options=None):
        """Restore the initial balance/position and return ``(obs, info)``.

        Args:
            seed: Forwarded to ``gym.Env.reset`` to seed the env's RNG.
            options: Accepted for Gymnasium API compatibility; unused.
        """
        super().reset(seed=seed)
        self.balance = self.initial_balance
        self.shares_held = 0
        self.current_step = 6
        obs = self._next_observation()
        return obs, {}

    def _next_observation(self):
        """Return the 5-row window of ``pct_df`` ending at ``current_step + 1``."""
        end_slice = self.current_step + 1
        start_slice = end_slice - 4
        # iloc excludes the stop index, hence end_slice + 1.
        window = self.pct_df.iloc[start_slice:end_slice + 1].values
        # Cast to the dtype declared in observation_space so the returned
        # array actually matches the space.
        return window.astype(self.observation_space.dtype)

    def render(self, mode='human'):
        """Return the ``Open`` price of the current row."""
        return self.df.loc[self.current_step, 'Open']

    def close(self):
        """Nothing to release."""
        return


#------------------------------------------------------------------------------
# TRAIN
if TRAIN:
    # Columns fed to the environment, in the order the observation uses them.
    features = ['Open', 'High', 'Low', 'Close', 'Volume']

    # auto_adjust=False requests the raw (unadjusted) OHLC prices explicitly
    # instead of depending on the library default.
    df = yf.download(TICKER, start=START, end=END, auto_adjust=False)
    # Newer yfinance releases may return (field, ticker) column pairs even for
    # a single ticker; flatten them so df.loc[row, 'Close'] yields a scalar.
    if df.columns.nlevels > 1:
        df.columns = df.columns.get_level_values(0)
    # Sort by date, keep only the feature columns (this also discards
    # 'Adj Close' when present) and switch to a 0-based integer index,
    # which StockTradingEnv uses to address rows.
    df = df.sort_index()[features].reset_index(drop=True)

    # Observations are built from day-over-day relative changes.
    pct_df = df.copy()
    for feature in features:
        pct_df[feature] = pct_df[feature].pct_change()

    print("df: ", df.tail(20))
    print("pct_df: ", pct_df.tail(20))

    env = DummyVecEnv([lambda: StockTradingEnv(df, pct_df)])
    model = PPO("MlpPolicy", env, verbose=1)

    # Train in `divide` chunks and checkpoint after each one.
    divide = 100
    timesteps = TRAIN_STEPS // divide  # learn() expects an integer step count
    for i in range(divide):
        # Only the first chunk resets the timestep counter; later chunks
        # continue the same run instead of starting the count over.
        model.learn(total_timesteps=timesteps, reset_num_timesteps=(i == 0))
        n = timesteps * (i + 1)
        print(f"Saving #{i + 1} / {divide}: {n} timesteps --------------------------------------")
        file_name = f"stock_{TICKER}_{n // 1000}k"
        model.save(file_name)
        # Also keep the latest checkpoint under MODEL_NAME, the file the
        # evaluation cells load with PPO.load(MODEL_NAME).
        model.save(MODEL_NAME)

print("--------------------------------")
print("------------------------------------------------------------------------")
print("--------------------------------")



[*********************100%***********************]  1 of 1 completed


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    total_timesteps      | 335872       |
| train/                  |              |
|    approx_kl            | 0.0068636946 |
|    clip_fraction        | 0.0779       |
|    clip_range           | 0.2          |
|    entropy_loss         | 0.58         |
|    explained_variance   | 0.074        |
|    learning_rate        | 0.0003       |
|    loss                 | 2.21e+05     |
|    n_updates            | 11430        |
|    policy_gradient_loss | -0.0052      |
|    std                  | 0.181        |
|    value_loss           | 5.78e+05     |
------------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 536         |
|    iterations           | 165         |
|    time_elapsed         | 629         |
|    total_timesteps      | 337920      |
| train/                  |             |
|    approx_kl            | 0.008302495

In [None]:

#------------------------------------------------------------------------------
# TEST ENVIRONMENT

class StockTradingTestEnv(StockTradingEnv):
    """Evaluation variant of StockTradingEnv with a configurable starting balance.

    The trading rules are the same as in the parent's ``step``; the episode is
    flagged as finished once ``current_step`` reaches ``train_cnt_epoch``.
    """

    def __init__(self, df, pct_df, initial_balance=INITIAL_BALANCE):
        super().__init__(df, pct_df)
        self.balance = self.initial_balance = initial_balance
        closes = self.df.loc[:, 'Close'].values
        self.train_cnt_epoch = len(closes) - 8

    def step(self, action):
        """Advance one row, apply ``action`` and return the Gymnasium 5-tuple.

        ``action[0]`` picks the operation (``< 1`` hold, ``< 2`` buy, ``< 3``
        sell) and ``action[1]`` the fraction to trade.  The same flag is
        returned for both ``terminated`` and ``truncated``.
        """
        self.current_step += 1
        kind, fraction = action[0], action[1]

        price_today = self.df.loc[self.current_step, 'Close']
        price_tomorrow = self.df.loc[self.current_step + 1, 'Close']

        worth_before = self.balance + self.shares_held * price_today

        if 1 <= kind < 2:
            # Buy a fraction of the whole shares the cash balance can afford.
            affordable = int(self.balance / price_today)
            bought = int(affordable * fraction)
            self.balance -= bought * price_today
            self.shares_held += bought
        elif 2 <= kind < 3:
            # Sell a fraction of the shares currently held.
            sold = int(self.shares_held * fraction)
            self.balance += sold * price_today
            self.shares_held -= sold
        # Any other value (including kind < 1) means: hold.

        # Wrap the data pointer near the end of the table.
        row_count = len(self.df.loc[:, 'Close'].values)
        if self.current_step >= row_count - 6:
            self.current_step = 6

        obs = self._next_observation()

        worth_after = self.balance + self.shares_held * price_tomorrow
        reward = worth_after - worth_before

        done = self.current_step >= self.train_cnt_epoch

        return obs, reward, done, done, {}


#------------------------------------------------------------------------------
# TEST

import matplotlib.pyplot as plt
from stable_baselines3 import PPO

# Load the saved model
model = PPO.load(MODEL_NAME)

# Load the evaluation dataset.
# NOTE(review): with TICKER_TEST == TICKER and the same START/END this is the
# data the model was trained on, so the curve below is an in-sample result.
features = ['Open', 'High', 'Low', 'Close', 'Volume']
# auto_adjust=False requests the raw (unadjusted) OHLC prices explicitly
# instead of depending on the library default.
df_ = yf.download(TICKER_TEST, start=START, end=END, auto_adjust=False)
# Newer yfinance releases may return (field, ticker) column pairs even for a
# single ticker; flatten them so df_.loc[row, 'Close'] yields a scalar.
if df_.columns.nlevels > 1:
    df_.columns = df_.columns.get_level_values(0)
# Sort by date, keep only the feature columns (this also discards 'Adj Close'
# when present) and switch to the 0-based integer index the env addresses.
df_ = df_.sort_index()[features].reset_index(drop=True)

# Create a new percentage change dataframe
pct_df_ = df_.copy()
for feature in features:
    pct_df_[feature] = pct_df_[feature].pct_change()

# Evaluate with three times the training starting balance.
final_training_balance = INITIAL_BALANCE * 3
env = DummyVecEnv([lambda: StockTradingTestEnv(df_, pct_df_, initial_balance=final_training_balance)])
trading_env = env.envs[0]

# `initial_state` holds the most recent observation throughout the loop.
initial_state = env.reset()
done = False

# This list will hold the value of the portfolio at each step
portfolio_values = []

while not done:
    # Get the action from the model
    # NOTE(review): predict() samples from the policy by default; pass
    # deterministic=True if a reproducible evaluation is wanted.
    action, _ = model.predict(initial_state)

    # Take a step in the environment and get the new state and reward
    initial_state, reward, done, info = env.step(action)

    if not done:
        # Portfolio value = cash + shares valued at the current row's close.
        # Skipped on the final step because DummyVecEnv has already reset the
        # environment by then, so its balance/holdings are the fresh ones.
        current_close = trading_env.df.loc[trading_env.current_step, 'Close']
        portfolio_value = trading_env.balance + (trading_env.shares_held * current_close)
        portfolio_values.append(portfolio_value)
    else:
        print("Reached the end of the data.")


# Plot the portfolio value over time
plt.figure(figsize=(10,6))
plt.plot(portfolio_values)
plt.title(f'Portfolio Value {TICKER_TEST} Over Time')
plt.xlabel('Step')
plt.ylabel('Value')
plt.show()


In [None]:

#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
import datetime
from stable_baselines3 import PPO

def date_by_adding_business_days(from_date, add_days):
    """Return ``from_date`` moved forward by ``add_days`` Monday-Friday days.

    Saturdays and Sundays are stepped over without being counted; public
    holidays are not taken into account.  When ``add_days`` is zero or
    negative, ``from_date`` is returned unchanged.
    """
    one_day = datetime.timedelta(days=1)
    result = from_date
    remaining = add_days
    while remaining > 0:
        result = result + one_day
        # weekday(): Monday is 0 ... Saturday is 5, Sunday is 6.
        if result.weekday() < 5:
            remaining -= 1
    return result


model = PPO.load(MODEL_NAME)

START = "2024-07-01"
END = "2024-09-02"
features = ['Open', 'High', 'Low', 'Close', 'Volume']
# auto_adjust=False requests the raw (unadjusted) OHLC prices explicitly
# instead of depending on the library default.
df_ = yf.download(TICKER_TEST, start=START, end=END, auto_adjust=False)
# Newer yfinance releases may return (field, ticker) column pairs even for a
# single ticker; flatten them so df_.loc[row, 'Close'] yields a scalar.
if df_.columns.nlevels > 1:
    df_.columns = df_.columns.get_level_values(0)
# Sort by date and keep only the feature columns (this also discards
# 'Adj Close' when present).
df_ = df_.sort_index()[features]
# Keep the real session dates before the index is replaced: row i of the
# integer-indexed frame was traded on trading_dates[i].  This avoids guessing
# dates from business days, which is wrong whenever the market is closed on
# a holiday.
trading_dates = df_.index
df_ = df_.reset_index(drop=True)
pct_df_ = df_.copy()
for feature in features:
    pct_df_[feature] = pct_df_[feature].pct_change()

final_training_balance = INITIAL_BALANCE
env = DummyVecEnv([lambda: StockTradingTestEnv(df_, pct_df_, initial_balance=final_training_balance)])
trading_env = env.envs[0]

# `initial_state` holds the most recent observation throughout the loop.
initial_state = env.reset()
done = False

print(f"\n ---------------------- Actions {START}-{END}------------------------------ \n")
while not done:
    # Capture the row the upcoming step will trade on and the position before
    # the trade; after the final step DummyVecEnv resets the environment, so
    # neither can be read back from it afterwards.
    step_index = trading_env.current_step + 1
    held_before = trading_env.shares_held

    action, _ = model.predict(initial_state)
    initial_state, reward, done, info = env.step(action)
    action_type = (action[0])[0]
    amount = (action[0])[1]
    date = trading_dates[step_index]

    # action_type < 1 is HOLD and is not reported.
    if 1 <= action_type < 3:
        print(f'\n#{step_index} :::::: {date} :::::::::::::::::  action: {action} / reward: {reward}')
        if done:
            # Balance and holdings already belong to the freshly reset episode.
            print("Action on the last step: holdings not reported (environment already reset)")
        elif action_type < 2:
            print(f"Action: BUY   {amount} (HELD: {trading_env.shares_held} / VAL: {trading_env.balance})")
        else:
            # Shares actually sold = position before the step minus position after.
            shares_sold = held_before - trading_env.shares_held
            print(f"Action: SALE  {amount} = {shares_sold} (HELD: {trading_env.shares_held} / VAL: {trading_env.balance})")

    if done:
        print("Reached the end of the data.")


In [None]:
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
import datetime
from stable_baselines3 import PPO

def date_by_adding_business_days(from_date, add_days):
    """Advance ``from_date`` by ``add_days`` weekdays and return the new date.

    Weekend days (Saturday/Sunday) never count towards ``add_days``; holidays
    are ignored.  A zero or negative ``add_days`` returns ``from_date`` as is.
    """
    step = datetime.timedelta(days=1)
    day = from_date
    left = add_days
    while left > 0:
        day += step
        # Skip over Saturday (5) and Sunday (6) before counting the day.
        while day.weekday() >= 5:
            day += step
        left -= 1
    return day

# `iag` is referenced in the model list but is not among the ticker constants
# defined in the configuration cell, which raised NameError.
# NOTE(review): 'IAG.MC' is assumed to be the intended symbol - confirm.
iag = 'IAG.MC'

models = [
    (san, PPO.load(f"stock_{san}")),
    (itx, PPO.load(f"stock_{itx}")),
    (sacyr, PPO.load(f"stock_{sacyr}")),
    (iag, PPO.load(f"stock_{iag}")),
    ]

START = "2024-08-01"
END = "2024-09-03"
features = ['Open', 'High', 'Low', 'Close', 'Volume']
final_training_balance = INITIAL_BALANCE

for ticker, model in models:
    # Each model is evaluated on its own ticker's prices (previously every
    # model was run on the TICKER_TEST data while being labelled with its
    # own ticker).
    # auto_adjust=False requests the raw (unadjusted) OHLC prices explicitly
    # instead of depending on the library default.
    df_ = yf.download(ticker, start=START, end=END, auto_adjust=False)
    # Newer yfinance releases may return (field, ticker) column pairs even
    # for a single ticker; flatten them so .loc[row, 'Close'] is a scalar.
    if df_.columns.nlevels > 1:
        df_.columns = df_.columns.get_level_values(0)
    # Sort by date and keep only the feature columns (this also discards
    # 'Adj Close' when present).
    df_ = df_.sort_index()[features]
    # Real session dates: row i of the integer-indexed frame was traded on
    # trading_dates[i], which stays correct across market holidays.
    trading_dates = df_.index
    df_ = df_.reset_index(drop=True)
    pct_df_ = df_.copy()
    for feature in features:
        pct_df_[feature] = pct_df_[feature].pct_change()

    env = DummyVecEnv([lambda: StockTradingTestEnv(df_, pct_df_, initial_balance=final_training_balance)])
    trading_env = env.envs[0]
    # `initial_state` holds the most recent observation throughout the loop.
    initial_state = env.reset()

    print(f"\n ---------------------- Actions {ticker} {START}-{END}------------------------------ \n")

    done = False
    while not done:
        # Capture the row the upcoming step will trade on and the position
        # before the trade; after the final step DummyVecEnv resets the
        # environment, so neither can be read back from it afterwards.
        step_index = trading_env.current_step + 1
        held_before = trading_env.shares_held

        action, _ = model.predict(initial_state)
        initial_state, reward, done, info = env.step(action)
        action_type = (action[0])[0]
        amount = (action[0])[1]
        date = trading_dates[step_index]

        # action_type < 1 is HOLD and is not reported.
        if 1 <= action_type < 3:
            print(f'\n#{step_index} :: {ticker} {date.date()} :: action: {action} / reward: {reward}')
            if done:
                # Balance and holdings already belong to the reset episode.
                print("Action on the last step: holdings not reported (environment already reset)")
            elif action_type < 2:
                print(f"Action: BUY   {amount} (HELD: {trading_env.shares_held} / VAL: {trading_env.balance})")
            else:
                # Shares actually sold = position before minus position after.
                shares_sold = held_before - trading_env.shares_held
                print(f"Action: SALE  {amount} = {shares_sold} (HELD: {trading_env.shares_held} / VAL: {trading_env.balance})")

        if done:
            print("--- END ---")
