In [29]:
import gymnasium as gym
from gymnasium import spaces

import pandas as pd
import numpy as np

import yfinance as yf

import torch
from torch.distributions import Categorical

import matplotlib.pyplot as plt

from datetime import datetime as dt
from tqdm.auto import tqdm

from warnings import filterwarnings
filterwarnings("ignore")

In [30]:
start = "2006-01-01"
end = dt.now()
sec = "AAPL"

prices = yf.download(sec, start, end)[["Open", "Close"]]
prices.columns = [f"{sec}_open", f"{sec}_close"]
features = pd.read_csv("workflow/data/features.csv", index_col=0, parse_dates=True)

data = pd.concat([prices, features], axis=1).dropna()

data.head()

In [None]:
class TradeEnv(gym.Env):
    def __init__(self, sec, X, lookback=50, position_size=10_000, balance=1_000_000):
        
        self.prices = X[X.columns[X.columns.str.contains("open|close")]].iloc[lookback:]
        self.trades = pd.DataFrame(index=self.prices.index)
        self.X = np.array([features.iloc[i-lookback:i] for i in range(lookback, len(X))])
        self.features = X[X.columns[~X.columns.isin(self.prices.columns)]]
        self.sec = sec
        self.lookback = lookback

        self.pos = position_size
        self.I = balance
        self.M = balance
        self.done = False
        self.current_step = 0

        self.obs0 = torch.tensor(self.X[0, :, :], dtype=torch.float32).unsqueeze(0)

        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(lookback, len(features)), dtype=np.float32)

    def step(self, action):
        self.trades.loc[self.trades.index[self.current_step], "ACTION"] = action.item()
        self.current_step += 1
        observation = torch.tensor(self.X[self.current_step, :, :], dtype=torch.float32).unsqueeze(0)
        step_roi = 0  # Initialize ROI for this step as 0

        o = self.prices.iloc[self.current_step][f"{self.sec}_open"]
        c = self.prices.iloc[self.current_step][f"{self.sec}_close"]

        n_shares = round((self.pos / o), 2) if o != 0 else 0

        if self.current_step >= len(self.prices) - 1 or self.M <= self.pos or o == 0 or c == 0:
            self.done = True
            return observation, step_roi, self.done, {}

        if action == 1:  # Buy and sell
            cost = n_shares * o

            if self.M >= cost:
                self.M -= cost  # Deduct cost from balance to buy
                roi = (c - o) / o  # Calculate ROI
                step_roi = roi  # Assign ROI to step reward
                profit = n_shares * (c - o)  # Profit/Loss
                self.M += (cost + profit)  # Add ending value back to balance

            else:  # Not enough balance to buy
                step_roi = -0.1  # Negative ROI as a penalty

        elif action == 0:  # Do nothing
            step_roi = 0  # ROI remains 0

        return observation, step_roi, self.done, {}

    def reset(self):
        self.current_step = 0
        self.done = False
        self.M = self.I
        return self.obs0

    def render(self, mode='human'):
        print(f"Step: {self.current_step}")
        print(f"Balance: {self.M}")
        print(f"Done: {self.done}")

In [None]:
class LSTMPolicy(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim,  num_layers=5):
        super().__init__()
        self.lstm = torch.nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, 2, bias=True)
        self.softmax = torch.nn.Softmax(dim=-1)
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out_last = lstm_out[:, -1, :]
        fc_out = self.fc(lstm_out_last)
        return self.softmax(fc_out).squeeze(0)

In [None]:
from sklearn.preprocessing import MinMaxScaler

X1 = data.iloc[:252*4]

env = TradeEnv("AAPL", X1)

input_shape = env.obs0.shape[-1]
hidden_dim = 31

model = LSTMPolicy(input_shape, hidden_dim)
optimizer = torch.optim.Adam(model.parameters(), maximize=True)

policy_gradients = []
cum_rets = []

for i in tqdm(range(1000)):
    state = env.reset()
    done = False
    rewards = []
    log_probs = []

    while not done:
        action_probs = model(state)
        action_dist = Categorical(action_probs)

        action = action_dist.sample()
        log_prob = action_dist.log_prob(action)

        state, reward, done, info = env.step(action)
        rewards.append(reward)
        print(reward)
        log_probs.append(log_prob)

    policy_gradient = []
    for l, r in zip(log_probs, rewards):
        policy_gradient.append(l * (r - np.mean(rewards)))
    policy_gradient = torch.stack(policy_gradient).sum()
    policy_gradients.append(policy_gradient)

    cum_ret = np.cumsum(rewards)[-1]
    cum_rets.append(cum_ret)
    
    optimizer.zero_grad()
    policy_gradient.backward()
    optimizer.step()
    print(f"R: {cum_ret:,.3f} | G: {policy_gradient.item():,.3f}")

  0%|          | 0/1000 [00:00<?, ?it/s]

-0.13527053669183953
-0.15956270869738232
-0.15131359237244732
0
0
-0.15542194354072417
0
-0.14778697173321487
-0.15348443557777008
0
-0.1595164787019549
0
-0.13217779269936944
-0.15804917825429052
0
0
-0.1537015103827764
0
0
-0.13560473598025877
0
-0.1477603305446026
0
-0.1651164025095808
0
-0.155059207357339
-0.17150778400369296
-0.15055814847371973
-0.16264807021147715
0
-0.150570323968709
-0.12542746260632912
0
-0.15557222820813063
-0.139678091036061
-0.14920190097464744
0
-0.14380252450340522
-0.14412492269877578
0
-0.13974636693687473
0
0
-0.17845407489837947
0
-0.1621558783333643
-0.15626637754229952
-0.15564146082526023
0
0
-0.15878889008203387
-0.17939313452989114
0
-0.15263572681811066
0
-0.1405874953233899
-0.15596912121279363
0
0
-0.13537531765417385
-0.16798884080702187
-0.16924140333565316
0
-0.15781611745365393
0
-0.14432612465954617
0
0
-0.13494429050444215
0
-0.15604193464488406
-0.09111550856631172
0
-0.15244221456585733
-0.15093410691429388
-0.1386063751309346
0
-0.1

KeyboardInterrupt: 