In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import gymnasium as gym
import random

In [2]:
class CustomEnv(gym.Env):
  def __init__(self):
    super(CustomEnv, self).__init__()
    self.action_space = gym.spaces.Box(low=-1, high=1, shape=(5,))
    self.observation_space = gym.spaces.Dict(
      data=gym.spaces.Box(low=-np.inf, high=np.inf, shape=(4, 6), dtype=np.float32),
      weights=gym.spaces.Box(low=0, high=1, shape=(5,), dtype=np.float32)
    )
    # self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(5, 3), dtype=np.float32)
  
  def reset(self, *args, **kwargs):
    x = np.random.rand(4, 6)
    x[:, 0] = np.linspace(0, 1, 4)
    w0 = np.zeros(5)
    w0[-1] = 1.0
    return {
      'data': x,
      'weights': w0
    }, {}
    # return x, {}
  
  def step(self, action):
    action = (action + 1) / 2
    t = np.linspace(0, 1, 5)
    t /= t.sum()
    reward = -100 * ((action - t)**2).sum()
    reward -= (action.sum() - 1)**2

    x = np.random.rand(4, 6)
    x[:, 0] = np.linspace(0, 1, 4)

    finished = reward < -10
    finished = False

    # print(reward)
    return {
      'data': x,
      'weights': action
    }, reward, finished, False, {}
    # return x, reward, finished, False, {}

In [3]:
from stable_baselines3 import DDPG, SAC, TQC, DQN
from stable_baselines3.common.noise import NormalActionNoise


from gymnasium.wrappers import TimeLimit
train_env = CustomEnv()

train_env = TimeLimit(train_env, max_episode_steps=1000)

# Set seeds
random.seed(42)
np.random.seed(42)
# train_env.seed(42)
train_env.action_space.seed(43)
torch.manual_seed(42)

model = SAC('MultiInputPolicy', 
             train_env, 
             buffer_size=10**8, 
             verbose=1, 
             action_noise=NormalActionNoise(mean=0, sigma=0.02*np.ones(5))
            )
            #  policy_kwargs={'net_arch': [400, 400],}
model.learn(total_timesteps=10**12, log_interval=1)

ImportError: cannot import name 'TQC' from 'stable_baselines3' (c:\Users\sumit\GitRepos\portfolio-rl\venv\Lib\site-packages\stable_baselines3\__init__.py)

In [None]:
x = np.random.rand(5, 3)
x[:, 0] = np.linspace(0, 1, 5)
model.predict({
  'data': x.flatten(),
  'weights': np.array([0, 0, 0, 0, 1.0])
})

(array([0.80157447, 0.6536939 , 0.7087927 , 0.7253792 , 0.9570687 ],
       dtype=float32),
 None)

In [8]:
from stable_baselines3 import DDPG, SAC, DQN
from stable_baselines3.common.noise import NormalActionNoise


from gymnasium.wrappers import TimeLimit
train_env = CustomEnv()

train_env = TimeLimit(train_env, max_episode_steps=1000)

# Set seeds
random.seed(42)
np.random.seed(42)
# train_env.seed(42)
train_env.action_space.seed(43)
torch.manual_seed(42)

model = DDPG('MultiInputPolicy', 
             train_env, 
             buffer_size=10**8, 
             verbose=1,
            )
            #  policy_kwargs={'net_arch': [400, 400],}
model.learn(total_timesteps=10**12, log_interval=1)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -3.65e+05 |
| time/              |           |
|    episodes        | 1         |
|    fps             | 171       |
|    time_elapsed    | 5         |
|    total_timesteps | 1000      |
| train/             |           |
|    actor_loss      | 638       |
|    critic_loss     | 8.02e+03  |
|    learning_rate   | 0.001     |
|    n_updates       | 899       |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -3.05e+05 |
| time/              |           |
|    episodes        | 2         |
|    fps             | 169       |
|    time_elapsed    | 11        |
|    total_timesteps | 2000      |
| train/             |           |
|    actor_loss      | 1.44e+03  |
|    critic_loss     | 3.36e+03  |
|    learning_rate   | 0.001     |
|    n_updates      

KeyboardInterrupt: 

In [9]:
x = np.random.rand(5, 3)
x[:, 0] = np.linspace(0, 1, 5)
model.predict({
  'data': x.flatten(),
  'weights': np.array([0, 0, 0, 0, 1.0])
})

(array([1., 1., 1., 1., 1.], dtype=float32), None)

In [22]:
from stable_baselines3 import DDPG, SAC, DQN
from stable_baselines3.common.noise import NormalActionNoise


from gymnasium.wrappers import TimeLimit
train_env = CustomEnv()

train_env = TimeLimit(train_env, max_episode_steps=1000)

# Set seeds
random.seed(42)
np.random.seed(42)
# train_env.seed(42)
train_env.action_space.seed(43)
torch.manual_seed(42)

model = DDPG('MultiInputPolicy', 
             train_env, 
             buffer_size=10**8, 
             verbose=1,
             action_noise=NormalActionNoise(mean=0, sigma=0.02*np.ones(5))
            )
            #  policy_kwargs={'net_arch': [400, 400],}
model.learn(total_timesteps=10**12, log_interval=1)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -2.37e+04 |
| time/              |           |
|    episodes        | 1         |
|    fps             | 143       |
|    time_elapsed    | 6         |
|    total_timesteps | 1000      |
| train/             |           |
|    actor_loss      | 44.1      |
|    critic_loss     | 4.3       |
|    learning_rate   | 0.001     |
|    n_updates       | 899       |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -1.21e+04 |
| time/              |           |
|    episodes        | 2         |
|    fps             | 145       |
|    time_elapsed    | 13        |
|    total_timesteps | 2000      |
| train/             |           |
|    actor_loss      | 42.5      |
|    critic_loss     | 0.902     |
|    learning_rate   | 0.001     |
|    n_updates      

KeyboardInterrupt: 

In [25]:
x = np.random.rand(5, 3)
x[:, 0] = np.linspace(0, 1, 5)
a = model.predict({
  'data': x.flatten(),
  'weights': np.array([0, 0, 0, 0, 1.0])
})[0]
(a + 1) / 2

array([1.1920929e-07, 2.6921946e-01, 5.1675427e-01, 7.3541683e-01,
       1.0000000e+00], dtype=float32)

In [3]:
# from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

# class Custom_EIEE_CNN_Extractor(BaseFeaturesExtractor):
#     def __init__(self, observation_space: gym.spaces.Dict, features_dim: int = 21):
#         super(Custom_EIEE_CNN_Extractor, self).__init__(observation_space, features_dim)
#         self.cnn = nn.Sequential(
#             nn.Conv2d(1, 2, kernel_size=(1, 4)),
#             nn.ReLU(),
#             nn.Conv2d(2, 4, kernel_size=(1, 3)),
#         ).cuda()

#     def forward(self, observations: dict[str, torch.Tensor]) -> torch.Tensor:
#         data = observations['data']
#         if len(data.shape) == 2:
#             data = data[:, None, None, :]
#         else:
#             data = data[:, None, :, :]
#         # print(f"{data.shape=}")
#         x = self.cnn(data)
#         w = observations['weights'][:, None, :, None]
#         # print(f"{x.shape=}, {w.shape=}")
#         return torch.cat((x.flatten(start_dim=1), observations['weights'].flatten(start_dim=1)), dim=1)
#         y = torch.cat((x, w), dim=1).flatten(start_dim=1)
#         # print(f"{y.shape=}")
#         return y


from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class Custom_EIEE_CNN_Extractor(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.spaces.Dict, features_dim: int = 37):
        super(Custom_EIEE_CNN_Extractor, self).__init__(observation_space, features_dim)
        self.universe_size, data_len = observation_space['data'].shape
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 2, kernel_size=(1, 3)),
            nn.ReLU(),
            nn.Conv2d(2, 4, kernel_size=(1, 3))
        ).cuda()

    def forward(self, observations: dict[str, torch.Tensor]) -> torch.Tensor:
        # print(f"{observations['data'][:, None, :, :].shape=}")
        x = self.cnn(observations['data'][:, None, :, :])
        # print(f"post cnn {x.shape=}, {observations['weights'].shape=}")
        # x = torch.hstack([x.flatten(start_dim=1), observations['weights']])
        # return x.flatten(start_dim=1)
        # print(f"{x.flatten(start_dim=1).shape=}, {observations['weights'].flatten(start_dim=1).shape=}")
        return torch.cat((x.flatten(start_dim=1), observations['weights'].flatten(start_dim=1)), dim=1)

In [4]:
from stable_baselines3 import DDPG, SAC, DQN
from stable_baselines3.common.noise import NormalActionNoise


from gymnasium.wrappers import TimeLimit
train_env = CustomEnv()

train_env = TimeLimit(train_env, max_episode_steps=1000)

# Set seeds
random.seed(42)
np.random.seed(42)
# train_env.seed(42)
train_env.action_space.seed(43)
torch.manual_seed(42)

model = DDPG(
  'MultiInputPolicy', 
  train_env, 
  buffer_size=10**6, 
  verbose=1,
  action_noise=NormalActionNoise(mean=0, sigma=0.02*np.ones(5)),
  policy_kwargs={
    'features_extractor_class': Custom_EIEE_CNN_Extractor
  }
)
model.learn(total_timesteps=10**12, log_interval=1)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -2.2e+04 |
| time/              |          |
|    episodes        | 1        |
|    fps             | 90       |
|    time_elapsed    | 11       |
|    total_timesteps | 1000     |
| train/             |          |
|    actor_loss      | 27.8     |
|    critic_loss     | 4.43     |
|    learning_rate   | 0.001    |
|    n_updates       | 899      |
---------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -1.16e+04 |
| time/              |           |
|    episodes        | 2         |
|    fps             | 91        |
|    time_elapsed    | 21        |
|    total_timesteps | 2000      |
| train/             |           |
|    actor_loss      | 29.8      |
|    criti

In [13]:
from stable_baselines3 import DDPG, SAC, DQN
from stable_baselines3.common.noise import NormalActionNoise


from gymnasium.wrappers import TimeLimit
train_env = CustomEnv()

train_env = TimeLimit(train_env, max_episode_steps=1000)

# Set seeds
random.seed(42)
np.random.seed(42)
# train_env.seed(42)
train_env.action_space.seed(43)
torch.manual_seed(42)

model = DDPG(
  'MultiInputPolicy', 
  train_env, 
  buffer_size=10**6, 
  verbose=1,
  action_noise=NormalActionNoise(mean=0, sigma=0.02*np.ones(5)),
  policy_kwargs={
    'features_extractor_class': Custom_EIEE_CNN_Extractor
  }
)
model.learn(total_timesteps=10**12, log_interval=1)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -2.2e+04 |
| time/              |          |
|    episodes        | 1        |
|    fps             | 123      |
|    time_elapsed    | 8        |
|    total_timesteps | 1000     |
| train/             |          |
|    actor_loss      | 27.8     |
|    critic_loss     | 4.42     |
|    learning_rate   | 0.001    |
|    n_updates       | 899      |
---------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -1.15e+04 |
| time/              |           |
|    episodes        | 2         |
|    fps             | 111       |
|    time_elapsed    | 17        |
|    total_timesteps | 2000      |
| train/             |           |
|    actor_loss      | 29.7      |
|    criti

KeyboardInterrupt: 