In [None]:
# module import

import warnings
import gym_trading_env  # noqa
import gymnasium as gym
import numpy as np
import torch
from stable_baselines3 import DQN, PPO  # noqa
from sb3_contrib import RecurrentPPO
from sb3_contrib.ppo_recurrent import MlpLstmPolicy
from stable_baselines3.ppo import MlpPolicy  # noqa
from tqdm import TqdmExperimentalWarning
from preprocess import preprocess

In [None]:
# Report whether PyTorch's MPS backend is available in this runtime.
if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    # Allocate a one-element tensor on the MPS device and show it as a
    # quick sanity check.
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

In [None]:
def reward_only_position_changed(history, holding_fee=0.01):
    """Reward that pays out only on the step where the position changes.

    Behaviour, by case:

    * Position changed between the last two steps: return the one-step
      relative change of ``portfolio_valuation`` (``v[-1] / v[-2] - 1``).
    * Position unchanged and non-zero: return ``0``.
    * Position unchanged and zero (flat): return a negative penalty of
      ``holding_fee`` per consecutive trailing step spent at that position,
      so staying flat gets progressively more expensive.

    Args:
        history: Object that supports ``len(history)``, integer indexing
            returning a mapping with a ``"position"`` key, and tuple indexing
            ``history[column, t]`` for the ``"position"`` and
            ``"portfolio_valuation"`` columns. Must hold at least two steps.
        holding_fee: Penalty charged per counted step while flat. Defaults to
            the previously hard-coded value of ``0.01``.

    Returns:
        The reward for the latest step, as described above.
    """
    prev_position = history[-2]["position"]
    curr_position = history[-1]["position"]

    # Position changed: reward is the relative portfolio move of this step.
    if prev_position != curr_position:
        return history["portfolio_valuation", -1] / history["portfolio_valuation", -2] - 1

    # Holding a non-flat position is neither rewarded nor punished.
    if curr_position != 0:
        return 0

    # Flat and unchanged: only now is the backwards scan needed. Doing it
    # lazily avoids walking the history on every step when the result would
    # be discarded anyway.
    holding_cost = 0
    lookback = 1
    history_length = len(history)
    # The scan stops one short of the oldest entry (lookback < length), which
    # matches the original penalty cap of ``len(history) - 1`` steps.
    while lookback < history_length and history["position", -lookback] == prev_position:
        lookback += 1
        holding_cost -= holding_fee

    return holding_cost

In [None]:
# Environment

# Keyword arguments forwarded to the "MultiDatasetTradingEnv" constructor.
env_options = dict(
    dataset_dir="./data/train/month/**/*.pkl",
    preprocess=preprocess,
    reward_function=reward_only_position_changed,
    positions=[-10, 0, 10],
    trading_fees=0.0001 / 1000,
    borrow_interest_rate=0.000003,
    portfolio_initial_value=100,
    # num_envs=3,
)
env = gym.make("MultiDatasetTradingEnv", **env_options)

# Extra metrics computed from the episode history, as (name, function) pairs.
episode_metrics = [
    # Number of steps on which the position differs from the previous step.
    ("Position Changes", lambda history: np.sum(np.diff(history["position"]) != 0)),
    # ("value", lambda history: history["portfolio_valuation", -1]),
    ("Episode Length", lambda history: len(history["position"])),
    ("Episode Reward", lambda history: sum(history["reward"])),
]

# add_metric lives on the underlying environment, not on the gym wrappers.
trading_env = env.unwrapped
for metric_name, metric_fn in episode_metrics:
    trading_env.add_metric(metric_name, metric_fn)

In [None]:
# Train

# Silence tqdm's experimental warning (presumably raised by the progress bar
# requested below).
warnings.filterwarnings(action="ignore", category=TqdmExperimentalWarning)

# Constructor settings for the recurrent PPO agent.
rppo_settings = {
    "policy": MlpLstmPolicy,
    "env": env,
    # buffer_size=30000000,  # 1000000
    # batch_size=128,
    "verbose": 0,
    "tensorboard_log": "./tensorboard/",
    "device": "cpu",
    "seed": 2414411,
}
model = RecurrentPPO(**rppo_settings)

model.learn(total_timesteps=1_000_000, progress_bar=True)

In [None]:
# Roll out one evaluation episode with the trained policy and accumulate the
# total reward in `reward`.
reward = 0.0
done, truncated = False, False
observation, info = env.reset()

# A recurrent policy needs its LSTM state carried from step to step. When
# `state` is not passed, predict() starts from a fresh (zero) state on every
# call, so the agent would be evaluated without any memory.
lstm_states = None
# Flags the first step of the episode so the LSTM state is reset there only.
episode_start = np.ones((1,), dtype=bool)

warnings.filterwarnings("ignore", category=UserWarning)
while not done and not truncated:
    action, lstm_states = model.predict(
        observation,
        state=lstm_states,
        episode_start=episode_start,
        deterministic=True,
    )
    observation, r, done, truncated, info = env.step(action)
    reward += r
    # Every later step continues the same episode.
    episode_start = np.zeros((1,), dtype=bool)


In [None]:
# Persist the trained agent to disk.
model_path = "./model/RPPO/1.zip"
model.save(model_path)