In [1]:
import warnings
import time
import math

import gymnasium as gym
import torch
import torch.nn as nn
from gym_trading_env.renderer import Renderer
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.envs.wrappers.torch import wrap_env
from skrl.memories.torch import RandomMemory
from skrl.models.torch import DeterministicMixin, Model, GaussianMixin, MultivariateGaussianMixin
from skrl.resources.schedulers.torch import KLAdaptiveLR
from skrl.trainers.torch import ParallelTrainer
from skrl.utils import set_seed
from skrl.utils.spaces.torch import unflatten_tensorized_space
from torch.nn import TransformerEncoder, TransformerEncoderLayer

from preprocess import preprocess


In [2]:
set_seed(42)

gym.register(
    id="DiscretedTradingEnv",
    entry_point="predict_live:DiscretedTradingEnv",
    disable_env_checker=True,
)

env_cfg = dict(
    id="DiscretedTradingEnv",
    preprocess=preprocess,
    max_episode_duration="max",
    verbose=1,
    leverage=10,
    stop_loss=-0.1,
    take_profit=0.1,
    window_size=30,
    # btc_index=True,
)
env = gym.make(**env_cfg)
obs = env.observation_space
env = wrap_env(env, wrapper="gymnasium")

device = env.device
replay_buffer_size = 1024 * 1 * env.num_envs
memory_size = int(replay_buffer_size / env.num_envs)

[38;20m[skrl:INFO] Seed: 42[0m
binanceusdm requires to release all resources with an explicit call to the .close() coroutine. If you are using the exchange instance with async coroutines, add `await exchange.close()` to your code into a place when you're done with the exchange and don't need the exchange instance anymore (at the end of your async coroutine).
Exception ignored in: <function ClientSession.__del__ at 0x7aaf92995e10>
Traceback (most recent call last):
  File "/home/pitin/Desktop/hp/.venv/lib/python3.10/site-packages/aiohttp/client.py", line 437, in __del__
Exception ignored in: <function BaseConnector.__del__ at 0x7aaf929455a0>
Traceback (most recent call last):
  File "/home/pitin/Desktop/hp/.venv/lib/python3.10/site-packages/aiohttp/connector.py", line 321, in __del__
[38;20m[skrl:INFO] Environment wrapper: gymnasium[0m


In [3]:
class LearnablePositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=524288):
        super().__init__()
        self.position_embedding = nn.Embedding(max_len, d_model)

    def forward(self, x):
        batch_size, seq_len, _ = x.size()
        positions = torch.arange(0, seq_len, device=x.device).unsqueeze(0)
        positions = positions.expand(batch_size, seq_len)
        position_encoded = self.position_embedding(positions)
        return x + position_encoded


class SharedNoFC(MultivariateGaussianMixin, DeterministicMixin, Model):
    def __init__(
        self,
        observation_space,
        action_space,
        device,
        clip_actions=False,
        clip_log_std=True,
        min_log_std=-20,
        max_log_std=2,
    ):
        Model.__init__(self, observation_space, action_space, device)
        MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)
        DeterministicMixin.__init__(self, clip_actions)

        self._shared_features = None
        self.num_features = 7
        self.net_projection = nn.Sequential(
            nn.Conv1d(self.num_features, 8, kernel_size=1, padding=1),
        )

        # Transformer Encoder for self-attention
        transformer_layer = TransformerEncoderLayer(
            d_model=8,  # The size of the input feature vector
            nhead=4,  # Number of attention heads
            dim_feedforward=256,  # The size of the feedforward network in the encoder
            # dropout=0.1,
            batch_first=True,  # Use batch_first for better inference performance
        )
        self.positional_encoding = LearnablePositionalEncoding(d_model=8)
        self.transformer_encoder = TransformerEncoder(transformer_layer, num_layers=2)

        # Output heads using Conv1d with additional layers to capture interactions
        self.policy_head = nn.Sequential(
            nn.Conv1d(8, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(8, self.num_actions, kernel_size=1),  # 7 = seq_length
            nn.AdaptiveAvgPool1d(1),  # 마지막에 시퀀스 길이를 1로 줄임\
        )
        self.value_head = nn.Sequential(
            nn.Conv1d(8, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(8, 1, kernel_size=1),
            nn.AdaptiveAvgPool1d(1),
        )
        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

    def act(self, inputs, role):
        if role == "policy":
            return MultivariateGaussianMixin.act(self, inputs, role)
        # elif role == "value":
        #     return DeterministicMixin.act(self, inputs, role)

    def compute(self, inputs, role):
        states = unflatten_tensorized_space(obs, inputs["states"])

        if role == "policy":
            features = states.permute(0, 2, 1)
            features = self.net_projection(features)
            features = self.positional_encoding(
                features.permute(0, 2, 1)
            )  # torch.Size([32, 7, 128])
            features = features + self.transformer_encoder(
                features
            )  # torch.Size([32, 7, 128])

            self._shared_features = features.permute(
                0, 2, 1
            )  # torch.Size([32, 128, 7])
            actions = self.policy_head(self._shared_features)  # torch.Size([32, 5, 1])
            print(self.log_std_parameter)
            return actions.squeeze(-1), self.log_std_parameter, {}  # torch.Size([32, 5])

        # elif role == "value":
        #     if self._shared_features is None:
        #         features = states.permute(0, 2, 1)
        #         features = self.net_projection(features)
        #         features = self.positional_encoding(features.permute(0, 2, 1))
        #         features = features + self.transformer_encoder(features)

        #         shared_features = features.permute(0, 2, 1)
        #     else:
        #         shared_features = self._shared_features

        #     self._shared_output = None

        #     value = self.value_head(shared_features)
        #     return value.squeeze(-1), {}

In [4]:
models = {}
models["policy"] = SharedNoFC(env.observation_space, env.action_space, device)
# models["value"] = models["policy"]

# for model in models.values():
#     model.init_parameters(method_name="normal_", mean=0.0, std=0.1)

cfg = PPO_DEFAULT_CONFIG.copy()
cfg["rollouts"] = memory_size
cfg["learning_epochs"] = 32
cfg["mini_batches"] = 16
cfg["discount_factor"] = 0.99
cfg["learning_rate"] = 5e-4
cfg["learning_rate_scheduler"] = KLAdaptiveLR
cfg["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.01, "min_lr": 1e-7}

cfg["experiment"]["write_interval"] = 5000
cfg["experiment"]["checkpoint_interval"] = 100000
cfg["experiment"]["directory"] = "runs/torch/mddt"

In [5]:
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

agent = PPO(
    models=models,
    memory=None,
    cfg=cfg,
    observation_space=env.observation_space,
    action_space=env.action_space,
    device=device,
)
path = "24-12-10_14-56-44-982474_PPO"
# path = "24-12-10_11-34-51-118959_PPO"
# path = "24-12-09_08-17-35-656338_PPO"
# path = "24-12-06_13-42-35-434197_PPO"
agent.load(f"/home/pitin/Desktop/hp/runs/torch/mddt/{path}/checkpoints/best_agent.pt")
cfg_trainer = {"timesteps": 10000000, "headless": True, "environment_info": "pc_counter"}
trainer = ParallelTrainer(cfg=cfg_trainer, env=env, agents=[agent])



In [6]:
trainer.eval()

  0%|          | 0/10000000 [00:00<?, ?it/s]Parameter containing:
tensor([-2.9427, -3.3000, -3.3176, -3.1206], device='cuda:0',
       requires_grad=True)
[3597.01 3614.54 3583.35 3583.5 ]
[3574.9456 3588.7498 3576.5095 3587.6833]
  0%|          | 0/10000000 [00:00<?, ?it/s]


IndexError: index 30 is out of bounds for axis 0 with size 30

In [None]:
states, infos = env.reset()
timestep = 0
timesteps = 3000
terminated = torch.tensor([[False] * env.num_envs])

agent.pre_interaction(timestep=timestep, timesteps=timesteps)

with torch.no_grad():
    actions = agent.act(states, timestep=timestep, timesteps=timesteps)[0]
    next_states, rewards, terminated, truncated, infos = env.step(actions)
    env.render()

super(type(agent), agent).post_interaction(timestep=timestep, timesteps=timesteps)

if env.num_envs > 1:
    states = next_states
else:
    if terminated.any() or truncated.any():
        with torch.no_grad():
            states, infos = env.reset()
    else:
        states = next_states


In [None]:
# AVAX
# ADA
# SOL
# ETH
# BNB
# XLM

In [None]:
# env.save_for_render()

In [None]:
# renderer = Renderer(render_logs_dir="render_logs")
# renderer.run()