In [1]:
import gymnasium as gym
import torch
from skrl.envs.wrappers.torch import wrap_env
from skrl.memories.torch import RandomMemory
from skrl.trainers.torch import SequentialTrainer
from skrl.utils import set_seed
from reward import refined_pnl, simple_reward
from preprocess import preprocess
from lr_schedulers import CosineAnnealingWarmUpRestarts

# Seed the run once, before any environment or model is created.
SEED = 42
set_seed(SEED)

# Register the custom trading environment under its class name so that
# gym.make / gym.make_vec can build it from an id. The class is looked up
# in the local `environment` module.
_ENV_NAME = "MultiDatasetDiscretedTradingEnv"
gym.register(
    id=_ENV_NAME,
    entry_point=f"environment:{_ENV_NAME}",
    disable_env_checker=True,
)

[38;20m[skrl:INFO] Seed: 42[0m


In [2]:
# Keyword arguments forwarded verbatim to gym.make / gym.make_vec.
env_cfg = {
    "id": "MultiDatasetDiscretedTradingEnv",
    # Recursive glob over the pickled training datasets.
    "dataset_dir": "./data/train/month_15m/**/**/*.pkl",
    "preprocess": preprocess,
    "reward_function": simple_reward,
    # Discrete set of positions the agent can choose between.
    "positions": [-20, -5, -2, 2, 5, 20],
    "trading_fees": 0.0001,
    "borrow_interest_rate": 0.0003,
    "portfolio_initial_value": 100,
    "max_episode_duration": "max",  # 24 * 60,
    "verbose": 0,
    # Also used as the input-channel count of the model's first Conv1d.
    # NOTE(review): presumably 3 days of 15-minute bars (4 per hour) — confirm.
    "window_size": 4 * 24 * 3,
}

In [3]:
# Build one throw-away environment only to read the structured (un-flattened)
# observation space; the model needs it to undo FlattenObservation.
# The probe is closed afterwards so it does not stay alive for the whole
# session next to the 32 training environments.
_probe_env = gym.make(**env_cfg)
try:
    obs = _probe_env.observation_space
finally:
    _probe_env.close()

# Vectorized training environment: 32 synchronous copies, each wrapped so
# that observations reach the agent as a flat vector.
env = gym.make_vec(
    vectorization_mode="sync",
    num_envs=32,
    wrappers=[gym.wrappers.FlattenObservation],
    **env_cfg,
)
# Adapt the gymnasium vector env to the interface skrl agents/trainers expect.
env = wrap_env(env, wrapper="gymnasium")

[38;20m[skrl:INFO] Environment wrapper: gymnasium[0m


In [4]:
# Run everything on the device the wrapped environment reports.
device = env.device

# Total number of transitions to hold across all parallel environments ...
replay_buffer_size = 1024 * 24 * env.num_envs
# ... which gives the number of rows each environment contributes.
memory_size = replay_buffer_size // env.num_envs

# Rollout storage; mini-batches are sampled without replacement.
memory = RandomMemory(
    memory_size=memory_size,
    num_envs=env.num_envs,
    device=device,
    replacement=False,
)

In [5]:
import torch.nn as nn
from skrl.models.torch import DeterministicMixin, CategoricalMixin, Model
from skrl.utils.spaces.torch import unflatten_tensorized_space


class Shared(CategoricalMixin, DeterministicMixin, Model):
    """Policy and value heads on top of one shared feature trunk.

    The same instance serves both roles: ``role="policy"`` produces the
    parameters of a categorical distribution over ``self.num_actions``
    discrete actions, ``role="value"`` produces a scalar state value.

    Notebook-level dependencies (must exist before the class is instantiated
    or called): ``env_cfg["window_size"]`` sizes the first convolution, and
    ``obs`` (the un-flattened observation space) is used to split the flat
    state tensor back into its ``"features"`` and ``"position"`` entries.

    Args:
        observation_space: Observation space handed to ``Model``.
        action_space: Action space handed to ``Model``.
        device: Device handed to ``Model``.
        clip_actions: Forwarded to ``DeterministicMixin``.
        unnormalized_log_prob: Forwarded to ``CategoricalMixin``. When true the
            policy head emits raw logits; when false it emits probabilities
            (a softmax is appended to the head).
    """

    def __init__(
        self,
        observation_space,
        action_space,
        device,
        clip_actions=False,
        unnormalized_log_prob=True,
    ):
        Model.__init__(self, observation_space, action_space, device)
        CategoricalMixin.__init__(self, unnormalized_log_prob)
        DeterministicMixin.__init__(self, clip_actions)

        # Trunk output cached by a "policy" pass for the "value" pass that
        # follows it. Initialised here so a value-first call does not fail
        # with AttributeError.
        self._shared_output = None

        # Dense trunk applied to [flattened conv features, position].
        # The input width (97) is hard-coded: it must equal the flattened
        # width of `net_features` plus the width of states["position"].
        self.net = nn.Sequential(
            nn.Linear(97, 128),
            nn.BatchNorm1d(128),
            nn.ELU(),
            nn.Dropout(0.2),
            nn.Linear(128, 32),
            nn.BatchNorm1d(32),
            nn.ELU(),
            nn.Dropout(0.2),
        )

        # Convolutional encoder. The window axis is used as the Conv1d channel
        # axis; the last convolution has no padding, so it shortens the
        # remaining axis by 2 before flattening.
        self.net_features = nn.Sequential(
            nn.Conv1d(env_cfg["window_size"], 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Conv1d(128, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Conv1d(64, 32, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        )

        # Policy head. With unnormalized_log_prob=True the categorical mixin
        # treats the output as logits, so no softmax may be applied here:
        # squashing the logits into [0, 1] first would keep the resulting
        # distribution close to uniform. Only the probability mode needs the
        # explicit softmax. The linear layer stays at index 0 of the
        # Sequential so parameter names are unchanged.
        policy_head = [nn.Linear(32, self.num_actions)]
        if not unnormalized_log_prob:
            policy_head.append(nn.Softmax(dim=1))
        self.mean_layer = nn.Sequential(*policy_head)

        # Not read anywhere in this class; kept so the set of registered
        # parameters stays the same as before.
        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

        # Value head: one scalar per sample.
        self.value_layer = nn.Linear(32, 1)

    def act(self, inputs, role):
        """Dispatch to the mixin that matches ``role``.

        Raises:
            ValueError: If ``role`` is neither ``"policy"`` nor ``"value"``.
        """
        if role == "policy":
            return CategoricalMixin.act(self, inputs, role)
        if role == "value":
            return DeterministicMixin.act(self, inputs, role)
        raise ValueError(f"Unsupported role: {role!r} (expected 'policy' or 'value')")

    def _encode(self, inputs):
        """Run the conv encoder and dense trunk on a batch of flat states."""
        states = unflatten_tensorized_space(obs, inputs["states"])
        # assumes states["features"] is (batch, window_size, n_features) — TODO confirm
        features = self.net_features(states["features"])
        merged = torch.cat((features, states["position"]), dim=1)
        return self.net(merged)

    def compute(self, inputs, role):
        """Forward pass for one role.

        A "policy" pass caches the trunk output; the next "value" pass reuses
        and then clears it, so the encoder runs once per policy/value pair.
        This is only valid when the value pass is called on the same batch as
        the preceding policy pass.

        Returns:
            ``(output, {})`` where ``output`` is the policy head output for
            ``role="policy"`` or the state value for ``role="value"``.

        Raises:
            ValueError: If ``role`` is neither ``"policy"`` nor ``"value"``.
        """
        if role == "policy":
            self._shared_output = self._encode(inputs)
            return self.mean_layer(self._shared_output), {}

        if role == "value":
            shared_output = self._shared_output
            if shared_output is None:
                # No cached trunk output: compute it for this batch.
                shared_output = self._encode(inputs)
            self._shared_output = None
            return self.value_layer(shared_output), {}

        raise ValueError(f"Unsupported role: {role!r} (expected 'policy' or 'value')")

In [6]:
# One network instance plays both roles, so policy and value share weights.
shared_model = Shared(env.observation_space, env.action_space, device)
models = {"policy": shared_model, "value": shared_model}

# Initialize the models' parameters from N(0, 0.1).
# Both dictionary entries point at the same object, so it is initialized
# twice and the second draw is the one that sticks.
# NOTE(review): presumably this also overwrites the BatchNorm scale/shift
# parameters — confirm that is intended.
for role_name in models:
    models[role_name].init_parameters(method_name="normal_", mean=0.0, std=0.1)

In [None]:
import copy

from skrl.agents.torch.ppo import PPO_DEFAULT_CONFIG

timesteps = 1000000

# Deep copy: the defaults contain a nested "experiment" dict, and a shallow
# .copy() would make the assignments below modify the library-level defaults
# (and leak into any other config built from them in this session).
cfg = copy.deepcopy(PPO_DEFAULT_CONFIG)
cfg["rollouts"] = memory_size
cfg["learning_epochs"] = 64
cfg["mini_batches"] = 4
cfg["discount_factor"] = 0.99
# cfg["lambda"] = 0.95
# Base learning rate is 0; the effective rate is driven by the scheduler
# configured below.
cfg["learning_rate"] = 0
# cfg["grad_norm_clip"] = 1.0
# cfg["ratio_clip"] = 0.2
# cfg["value_clip"] = 0.2
# cfg["clip_predicted_values"] = False
# cfg["entropy_loss_scale"] = 0.0
# cfg["value_loss_scale"] = 0.5
# cfg["optimizer"] = torch.optim.AdamW(models["policy"].parameters(), lr=1e-5)
cfg["learning_starts"] = 0
cfg["learning_rate_scheduler"] = CosineAnnealingWarmUpRestarts
cfg["learning_rate_scheduler_kwargs"] = {
    "T_0": 16 * cfg["learning_epochs"],  # length of the first cycle
    "T_mult": 2,  # double the cycle length after every cycle
    "T_up": cfg["learning_epochs"],  # warm-up period
    "eta_max": 1e-3,  # maximum learning rate
    "gamma": 0.6,  # learning-rate decay factor
}

# logging to TensorBoard and write checkpoints (in timesteps)
cfg["experiment"]["write_interval"] = 1024 * 16
cfg["experiment"]["checkpoint_interval"] = 1000000
cfg["experiment"]["directory"] = "runs/torch/mddt"

In [8]:
import warnings
from skrl.agents.torch.ppo import PPO

# Silence FutureWarning / DeprecationWarning noise for the rest of the session.
for _ignored_category in (FutureWarning, DeprecationWarning):
    warnings.simplefilter(action="ignore", category=_ignored_category)

# PPO agent built from the shared policy/value model, the rollout memory and
# the configuration assembled above.
agent = PPO(
    models=models,
    memory=memory,
    cfg=cfg,
    observation_space=env.observation_space,
    action_space=env.action_space,
    device=device,
)

# Trainer settings: total step budget, no rendering, and the key of the
# environment info entry to pick up ("pc_counter").
cfg_trainer = dict(
    timesteps=timesteps,
    headless=True,
    environment_info="pc_counter",
)
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=[agent])
# agent.track_data("Episode/Position changed")

In [None]:
# Start training with the trainer built above; its step budget comes from
# cfg_trainer["timesteps"].
trainer.train()

 37%|███▋      | 368120/1000000 [23:40<24:15, 434.13it/s]  

In [None]:
# Roll out the trained agent for a fixed number of environment steps.
# A bounded loop keeps this cell finite when the notebook is run end to end.
EVAL_TIMESTEPS = 1000

# The model contains BatchNorm and Dropout layers, so it must be switched to
# evaluation mode before it is used for inference.
agent.set_running_mode("eval")

states, infos = env.reset()

for _ in range(EVAL_TIMESTEPS):
    # Let the agent pick actions (it applies its own state preprocessing);
    # no gradients are needed during evaluation.
    with torch.no_grad():
        actions = agent.act(states, timestep=0, timesteps=0)[0]

    # step the environment
    next_states, rewards, terminated, truncated, infos = env.step(actions)

    # render the environment
    env.render()

    # check for termination/truncation
    if terminated.any() or truncated.any():
        states, infos = env.reset()
    else:
        states = next_states