In [7]:
from collections import deque

# data
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# torch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

# gym
import gymnasium as gym

from forexgym.envs import DiscreteActionEnvironment
from forexgym.utils import Query, CurrencyPair

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
class Policy(nn.Module):
    """Feed-forward policy network producing a categorical distribution over actions.

    The network is a three-layer MLP (two ReLU hidden layers) followed by a
    softmax. The instance also carries two per-episode buffers:

    * ``log_probs`` -- ``act`` appends the log-probability of every sampled action.
    * ``rewards``   -- not written by this class; the caller appends rewards to it.

    Args:
        n_inputs: Length of the flat state vector fed to the first layer.
        n_outputs: Number of discrete actions (size of the output layer).
        hidden_size: Width of both hidden layers.
        continuous_actions: Stored on the instance; no method of this class reads it.
    """

    def __init__(self, n_inputs: int, n_outputs: int, hidden_size: int = 128, continuous_actions: bool = False):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_outputs)
        )
        self.continuous_actions = continuous_actions
        self.log_probs = []
        self.rewards = []

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return action probabilities for ``x``.

        The softmax is taken over the last dimension, so both batched
        ``(batch, n_inputs)`` and unbatched ``(n_inputs,)`` inputs are accepted.
        """
        return F.softmax(self.layers(x), dim=-1)

    def _to_batch(self, state: np.ndarray) -> torch.Tensor:
        """Convert one state into a float32 batch of size 1 on the network's own device."""
        # Use the device the weights live on rather than a notebook-level global,
        # so the module keeps working after ``.to(...)`` or in a fresh kernel.
        target_device = next(self.parameters()).device
        return torch.as_tensor(state, dtype=torch.float32, device=target_device).unsqueeze(0)

    def act(self, state: np.ndarray) -> int:
        """Sample an action for ``state`` and record its log-probability.

        Returns:
            The sampled action index as a Python ``int``.
        """
        distribution = Categorical(self.forward(self._to_batch(state)))
        action = distribution.sample()
        # Kept with shape (1,) and with its autograd graph: the policy-gradient
        # update concatenates these entries and back-propagates through them.
        self.log_probs.append(distribution.log_prob(action))

        return action.item()

    def act_secure(self, state: np.ndarray) -> int:
        """Return the most probable action for ``state`` (no sampling, nothing recorded)."""
        # Pure inference: no gradient is needed for a greedy choice.
        with torch.no_grad():
            probs = self.forward(self._to_batch(state))
        return torch.argmax(probs).item()

In [8]:
def article_processor(df: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
    """Turn raw OHLC candles into five relative-price features.

    Features (row ``t``):
        x1: (Close[t] - Close[t-1]) / Close[t-1]
        x2: (High[t]  - High[t-1])  / High[t-1]
        x3: (Low[t]   - Low[t-1])   / Low[t-1]
        x4: (High[t]  - Close[t])   / Close[t]
        x5: (Close[t] - Low[t])     / Close[t]

    The first row of x1..x3 is NaN because it has no previous candle.

    Args:
        df: Frame containing at least ``Date``, ``Open``, ``High``, ``Low`` and
            ``Close`` columns. It is NOT modified.
        *args, **kwargs: Accepted and ignored.

    Returns:
        A new frame with the five raw columns removed and ``x1``..``x5``
        appended after any remaining columns.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    close, high, low = df["Close"], df["High"], df["Low"]

    def _backward_return(series: pd.Series) -> pd.Series:
        # One-step relative change with respect to the previous row.
        previous = series.shift(1)
        return (series - previous) / previous

    # ``assign`` builds a new frame, so the caller's DataFrame keeps its
    # original columns instead of silently gaining x1..x5.
    features = df.assign(
        x1=_backward_return(close),
        x2=_backward_return(high),
        x3=_backward_return(low),
        x4=(high - close) / close,
        x5=(close - low) / close,
    )
    return features.drop(columns=["Date", "Open", "High", "Low", "Close"])

In [9]:
# Instrument and the candle resolutions requested for it.
ticker = "EURUSD"
# timeframes = ["1m", "5m", "15m", "30m", "1H", "4H", "1D"]
timeframes = ["4H", "1H", "15m"]

# Observation query: a 16-step window of 1H candles run through article_processor.
query = Query(episode_length=256, trading_timeframe="1H", trading_column="Close")
query.add_query(timeframe="1H", window_size=16, data_processor=article_processor)
# dataset = pair.generate_dataset(query)

# Environment settings gathered in one place so they can be inspected or tweaked
# before the environment is constructed.
env_config = {
    "currency_tickers": {"EURUSD": timeframes},
    "query": query,
    "reward_type": "continuous",
    "reward_multiplier": 1e3,
    "episode_length": 256,
    "allow_holding": True,
}
env = DiscreteActionEnvironment(**env_config)

Genrating EURUSD dataset...


100%|██████████| 1/1 [00:00<00:00, 68.97it/s]


Generated EURUSD dataset.


In [12]:
# Reset the environment; as the cell's last expression, the value returned by
# reset() is what the notebook displays.
# NOTE(review): the recorded output of this cell is an AttributeError raised from
# this call -- confirm `env` is still usable at this point before relying on it.
env.reset()

AttributeError: 'NoneType' object has no attribute 'reset'

In [10]:
# --- Hyper-parameters --------------------------------------------------------
LEARNING_RATE = 0.001
DISCOUNT_FACTOR = 0.99
N_EPISODES = 10000

# Gymnasium environment id, used when an environment is built with gym.make(ENV).
ENV = "LunarLander-v2"

# --- Environment -------------------------------------------------------------
# `env` is the environment created earlier in the notebook and is reused as is.
# It must not be closed here: closing it first would leave the reset() below
# operating on an already-closed environment.
#env = gym.make(ENV)
state, info = env.reset()

# --- Model and optimiser -----------------------------------------------------
policy = Policy(env.observation_space.shape[0], env.action_space.n).to(device)
optimiser = optim.Adam(policy.parameters(), lr=LEARNING_RATE)

# Smallest float32 increment; added to denominators to avoid division by zero.
eps = np.finfo(np.float32).eps.item()

# --- Episode limits ----------------------------------------------------------
# Environments registered with Gymnasium expose these through `env.spec`; for an
# environment without a spec we fall back to explicit values so that the rollout
# and training functions never hit an undefined name.
_spec = getattr(env, "spec", None)
# Fallback of 256 matches the episode_length the environment was constructed with.
MAX_TIME_STEPS = getattr(_spec, "max_episode_steps", None) or 256
_spec_threshold = getattr(_spec, "reward_threshold", None)
# No published threshold -> infinity, i.e. training never stops early.
REWARD_THRESHOLD = float("inf") if _spec_threshold is None else _spec_threshold


AttributeError: 'NoneType' object has no attribute 'reset'

In [534]:
def sample_episode() -> None:
    """Roll out one episode with the current policy.

    Uses the module-level ``env`` and ``policy``. Every reward is appended to
    ``policy.rewards``; ``policy.act`` records the matching log-probabilities.
    The rollout stops after ``MAX_TIME_STEPS`` steps or as soon as the
    environment reports the episode as terminated or truncated.
    """
    state, _ = env.reset()
    for _ in range(MAX_TIME_STEPS):
        action = policy.act(state)
        state, reward, terminated, truncated, _ = env.step(action)
        policy.rewards.append(reward)

        # A truncated episode (e.g. a time limit) is over as well; stepping an
        # environment past that point is not valid until it has been reset.
        if terminated or truncated:
            break
        

In [610]:
def finish_episode():
    """Run one REINFORCE update from the episode stored on ``policy``.

    Reads ``policy.rewards`` and ``policy.log_probs`` (module-level ``policy``),
    computes discounted returns with ``DISCOUNT_FACTOR``, standardises them and
    takes one ``optimiser`` step on the loss ``-sum(log_prob * return)``.

    Returns:
        The scalar loss tensor that was back-propagated.
    """
    # Discounted return for every step, accumulated from the last reward backwards.
    running_return = 0.0
    discounted = deque()
    for reward in reversed(policy.rewards):
        running_return = reward + DISCOUNT_FACTOR * running_return
        discounted.appendleft(running_return)

    log_probs = torch.cat(policy.log_probs)
    # Build the returns directly on the log-probabilities' device with an explicit
    # dtype, so no cross-device arithmetic or implicit float64 promotion occurs.
    returns = torch.tensor(list(discounted), dtype=torch.float32, device=log_probs.device)

    # Standardise the returns. With a single step the sample standard deviation
    # is NaN, which would turn the loss (and then the weights) into NaN, so a
    # one-step episode keeps its raw return.
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)

    # Pair log-probabilities and returns position by position, up to the shorter
    # of the two sequences.
    n_steps = min(log_probs.numel(), returns.numel())
    policy_loss = -(log_probs[:n_steps] * returns[:n_steps]).sum()

    optimiser.zero_grad()
    policy_loss.backward()
    optimiser.step()

    return policy_loss

In [611]:
def train():
    """Train the module-level ``policy`` for up to ``N_EPISODES`` episodes.

    Each iteration clears the policy's episode buffers, samples one episode and
    applies one update. Every 25th episode the episode's total reward is
    printed and compared with ``REWARD_THRESHOLD``; training stops once five of
    those checkpoints have exceeded the threshold.
    """
    n_completions = 0

    for ep in range(N_EPISODES):
        # Start every episode with empty buffers.
        policy.rewards.clear()
        policy.log_probs.clear()

        sample_episode()
        finish_episode()

        # Only every 25th episode is reported and counted.
        if ep % 25 != 0:
            continue

        total_rewards = sum(policy.rewards)
        if total_rewards > REWARD_THRESHOLD:
            n_completions += 1

        print(f"Episode: {ep} | Reward: {total_rewards}")

        if n_completions >= 5:
            break

In [636]:
# Run the training loop; a progress line is printed every 25 episodes.
train()

Episode: 0 | Reward: 26.95846946250927
Episode: 25 | Reward: -54.45134080111597
Episode: 50 | Reward: 231.7256943072305
Episode: 75 | Reward: 141.33193941359775
Episode: 100 | Reward: 229.32840093026783
Episode: 125 | Reward: 146.9576331301402
Episode: 150 | Reward: 23.92448867696136
Episode: 175 | Reward: 111.80948949583285
Episode: 200 | Reward: 108.67486720736593
Episode: 225 | Reward: 52.47914284634612
Episode: 250 | Reward: 173.72561280570642
Episode: 275 | Reward: 23.948937827469678
Episode: 300 | Reward: 258.90483039639196
Episode: 325 | Reward: -119.35525067411466
Episode: 350 | Reward: -27.96291973655545
Episode: 375 | Reward: -79.49269247361633
Episode: 400 | Reward: -31.523365333090737
Episode: 425 | Reward: -26.757083332711943
Episode: 450 | Reward: 182.23914224650062
Episode: 475 | Reward: -51.33235046745686
Episode: 500 | Reward: -86.02395466355523
Episode: 525 | Reward: -253.55422953620823
Episode: 550 | Reward: -48.22400618094291
Episode: 575 | Reward: -75.7721873117026

In [637]:
# Release the environment used so far (if any) before opening a rendering one.
try:
    env.close() # type: ignore  # noqa: F821
except NameError:
    # No environment exists yet in this kernel -- nothing to close.
    pass
except Exception as exc:
    # Best-effort cleanup: report the problem but still create the new environment.
    # (KeyboardInterrupt / SystemExit are deliberately not swallowed.)
    print(f"Ignoring error while closing the previous environment: {exc!r}")

# NOTE(review): this rebinds `env` to the Gymnasium environment named by ENV; the
# policy must have been built for that environment's observation/action sizes.
env = gym.make(ENV, render_mode="human")


def benchmark():
    """Play one episode greedily and return its total reward.

    Uses the module-level ``env`` and ``policy``. Actions come from
    ``policy.act_secure`` (arg-max, no sampling). The episode ends after
    ``MAX_TIME_STEPS`` steps or when the environment reports it as terminated
    or truncated.

    Returns:
        Sum of the rewards collected during the episode.
    """
    state, _ = env.reset()
    total_reward = 0
    # Evaluation only: no gradients are needed for greedy action selection.
    with torch.no_grad():
        for _ in range(MAX_TIME_STEPS):
            action = policy.act_secure(state)
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            # Stop on truncation too; the environment must be reset before
            # it can be stepped again.
            if terminated or truncated:
                break
    return total_reward



In [640]:
# Run one greedy evaluation episode; its total reward is displayed as the cell output.
benchmark()

error: display Surface quit

In [641]:
# Persist the trained policy as a whole pickled module object.
# NOTE(review): loading this file requires the Policy class to be importable under
# the same name; saving `policy.state_dict()` instead would avoid that coupling --
# confirm what the consumers of this file expect before changing the format.
torch.save(obj=policy, f="lunar_lander_reinforce.pt")