In [1]:
from typing import SupportsFloat, Any

from gymnasium.core import ActType, ObsType

from src.reinforcement_learning.core.infos import InfoDict
from tmp_mp import main

In [1]:
# Invoke the `main` function imported from `tmp_mp` in the previous cell.
main()

In [5]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Train PPO (with use_sde=True) on four parallel HalfCheetah environments, then save the
# model, discard it and load it back from disk.
ENV_ID = "HalfCheetah-v4"
# NOTE(review): the checkpoint is named "cartpole" although the environment is HalfCheetah.
CHECKPOINT_PATH = "ppo_cartpole"

vec_env = make_vec_env(ENV_ID, n_envs=4)

ppo_options = dict(use_sde=True, sde_sample_freq=100, verbose=2)
model = PPO("MlpPolicy", vec_env, **ppo_options)
model.learn(total_timesteps=250_000)
model.save(CHECKPOINT_PATH)

# Drop the in-memory model so that the load below is what provides `model` from here on.
del model
model = PPO.load(CHECKPOINT_PATH)

obs = vec_env.reset()
# Playback loop, left disabled:
# while True:
#     action, _states = model.predict(obs)
#     obs, rewards, dones, info = vec_env.step(action)
#     vec_env.render("human")

In [14]:
import torch
import numpy as np

# Synthetic benchmark input: random array of shape (2500, 32, 17).
# NOTE(review): the axes presumably stand for (steps, envs, obs_dim) — confirm.
rollout_data = np.random.random((2500, 32, 17))

In [15]:
%%timeit
# Timed: build a CUDA tensor from each slice along the first axis, one slice per iteration.
# The created tensor is discarded; only the conversion/transfer cost is of interest.
for i in range(len(rollout_data)):
    tensor = torch.tensor(rollout_data[i], device='cuda')

In [6]:
import types
import numpy as np
import torch
from torch import nn


class A(nn.Module):
    """Tiny wrapper module used to time forwarding through an aliased ``forward``.

    The instance attribute ``forward`` is bound to the inner linear layer's ``forward``
    method, so calling an instance dispatches to ``linear.forward`` directly.
    """

    def __init__(self):
        super().__init__()
        inner = nn.Linear(17, 17)
        self.linear = inner

        # Alias instead of defining a forward() method on the class.
        self.forward = inner.forward

    # Conventional definition, kept for comparison:
    # def forward(self, x):
    #     return self.linear(x)


# Benchmark input: random values of shape (2500, 1024, 17), converted to float32 via .float().
test_data = torch.tensor(np.random.random((2500, 1024, 17))).float()

# Module instance exercised by the timing cells that follow.
a = A()

In [7]:
%%timeit

# Timed: one call of `a` per slice along the first axis of test_data; results are discarded.
for i in range(len(test_data)):
    a(test_data[i])

In [8]:
%%timeit

# Timed: same loop as the previous cell, plus a .detach() on every output, to see what
# the extra call costs.
for i in range(len(test_data)):
    a(test_data[i]).detach()

In [1]:
from src.torch_device import get_torch_device

# Display the value returned by the project's get_torch_device helper.
get_torch_device()

In [None]:
import gymnasium

# Instantiate the 'Ant-v4' environment; the result is only displayed, not stored in a variable.
gymnasium.make('Ant-v4', )

In [None]:
%load_ext autoreload
%autoreload 2

def record_video(
        env_id: str = 'Ant-v4',
        policy_db_path: str = 'E:/saved_models/rl/Ant-v4/mitosis-2024-06-10_19.43.13',
        video_root: str = r'C:\Users\domin\Videos\rl',
        max_steps: int = 50_000,
        policy_rank: int = -2,
        render_fps: int = 30,
):
    """Load a stored policy and record its rollouts to video.

    The policy is picked from the model database by sorting all entries by
    ``entry['model_info']['score']`` in ascending order and indexing the result with
    ``policy_rank``. Every episode is recorded into a fresh, timestamped sub-folder of
    ``video_root``.

    Args:
        env_id: Gymnasium environment id, created with ``render_mode='rgb_array'``.
        policy_db_path: Base path handed to ``TinyModelDB``.
        video_root: Directory under which the timestamped video folder is created.
        max_steps: Number of environment steps to roll out.
        policy_rank: Index into the ascending score ranking (``-1`` is the highest score,
            the default ``-2`` the second highest).
        render_fps: Value written to ``record_env.metadata['render_fps']``.

    Raises:
        IndexError: If the database has too few entries for ``policy_rank``.

    Called without arguments, this uses the same environment, database, ranking, step
    count and fps as the original hard-coded version.
    """
    import os

    import torch
    from tqdm import tqdm
    from src.reinforcement_learning.gym.singleton_vector_env import as_vec_env
    import gymnasium
    from gymnasium.wrappers import AutoResetWrapper, RecordVideo
    from src.model_db.tiny_model_db import TinyModelDB
    from src.reinforcement_learning.algorithms.policy_mitosis.mitosis_policy_info import MitosisPolicyInfo
    from src.reinforcement_learning.core.policy_construction import PolicyConstruction
    from src.datetime import get_current_timestamp

    record_env, _ = as_vec_env(gymnasium.make(env_id, render_mode='rgb_array'))

    # Everything after the environment exists runs inside try/finally, so the environment
    # is closed even when the database lookup or the policy construction fails.
    try:
        policy_db = TinyModelDB[MitosisPolicyInfo](base_path=policy_db_path)
        print(policy_db)

        entries_by_score = sorted(policy_db.all_entries(), key=lambda entry: entry['model_info']['score'])
        policy_entry = entries_by_score[policy_rank]
        # policy_entry = policy_db.fetch_entry('2024-06-10_22.13.57~PJHPLG')
        policy_info: MitosisPolicyInfo = policy_entry['model_info']
        print(policy_entry)

        policy, _, record_env = PolicyConstruction.init_from_info(policy_info['initialization_info'], record_env)

        policy_db.load_model_state_dict(policy_entry['model_id'], policy)

        record_env.metadata['render_fps'] = render_fps
        record_env = AutoResetWrapper(
            RecordVideo(record_env, video_folder=os.path.join(video_root, get_current_timestamp()),
                        episode_trigger=lambda ep_nr: True)  # record every episode
        )

        # Pure inference: no gradients are needed while rolling out.
        with torch.no_grad():
            obs, info = record_env.reset()
            for _step in tqdm(range(max_steps)):
                actions_dist, _ = policy.process_obs(torch.tensor(obs, device='cpu'))
                actions = actions_dist.get_actions(deterministic=True).cpu().numpy()
                obs, reward, terminated, truncated, info = record_env.step(actions)
    except KeyboardInterrupt:
        # Stopping by hand is an expected way to end a recording.
        print('keyboard interrupt')
    finally:
        print('closing record_env')
        record_env.close()
        print('record_env closed')


# Run the recording defined above.
record_video()

In [14]:
from torch import nn

from src.networks.core.net import Net
from src.networks.multihead_self_attention import MultiheadSelfAttention
from src.networks.core.seq_net import SeqNet
from src.networks.skip_nets.additive_skip_connection import AdditiveSkipConnection

device = 'cuda:0'


def _make_encoder_layer(layer_nr, is_last_layer, in_features, out_features):
    """Build one encoder layer: residual self-attention, LayerNorm, residual MLP, LayerNorm.

    NOTE(review): the MLP branch ends with a ReLU after its second linear layer; check
    whether that is intended when comparing against torch's built-in encoder layer.
    """
    attention = AdditiveSkipConnection(MultiheadSelfAttention(
        embed_dim=in_features,
        num_heads=8,
    ))
    attention_norm = nn.LayerNorm(in_features)
    feed_forward = AdditiveSkipConnection(Net.seq_as_net(
        nn.Linear(in_features, 2048),
        nn.ReLU(),
        nn.Linear(2048, out_features),
        nn.ReLU(),
    ))
    feed_forward_norm = nn.LayerNorm(out_features)
    return nn.Sequential(attention, attention_norm, feed_forward, feed_forward_norm)


# Six layers from the builder above with 512 features, moved to `device`.
transformer_encoder_net = SeqNet.from_layer_provider(
    layer_provider=_make_encoder_layer,
    num_layers=6,
    num_features=512,
).to(device)

transformer_encoder_net.out_shape
# Smoke test: push one random (7, 5, 512) input through the net and display the output shape.
sample_input = torch.tensor(np.random.random((7, 5, 512))).to(device).float()
transformer_encoder_net(sample_input).shape

In [2]:
# Reference model: PyTorch's built-in encoder with the same width (512), head count (8) and
# depth (6) as transformer_encoder_net, with dropout disabled.
# NOTE(review): batch_first is left at its default, so the benchmark inputs are presumably
# read as (seq, batch, feature) — confirm this matches how transformer_encoder_net reads them.
transformer_encoder = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(
        d_model=512,
        nhead=8,
        dropout=0,
    ),
    num_layers=6
).to(device)

In [3]:
import numpy as np
import torch
# Small benchmark input of shape (100, 4, 512), moved to `device`.
data1 = torch.Tensor(np.random.random((100, 4, 512))).to(device)

In [6]:
%%timeit -r 15 -n 500
# Timed: forward pass of the custom encoder on data1 (autograd is not disabled here).
# NOTE(review): there is no torch.cuda.synchronize(), so asynchronously queued CUDA work
# may not be fully counted — confirm before trusting absolute numbers.
transformer_encoder_net(data1)

In [7]:
%%timeit -r 15 -n 500
# Timed: forward pass of torch's built-in encoder on data1 (autograd is not disabled here).
# NOTE(review): there is no torch.cuda.synchronize(), so asynchronously queued CUDA work
# may not be fully counted — confirm before trusting absolute numbers.
transformer_encoder(data1)

In [4]:
# Medium benchmark input of shape (100, 64, 512), moved to `device`.
data2 = torch.Tensor(np.random.random((100, 64, 512))).to(device)

In [5]:
%%timeit -r 15 -n 200
# Timed: forward pass of the custom encoder on data2 (autograd is not disabled here).
# NOTE(review): there is no torch.cuda.synchronize(), so asynchronously queued CUDA work
# may not be fully counted — confirm before trusting absolute numbers.
transformer_encoder_net(data2)

In [6]:
%%timeit -r 15 -n 200
# Timed: forward pass of torch's built-in encoder on data2 (autograd is not disabled here).
# NOTE(review): there is no torch.cuda.synchronize(), so asynchronously queued CUDA work
# may not be fully counted — confirm before trusting absolute numbers.
transformer_encoder(data2)

In [4]:
# Large benchmark input of shape (100, 512, 512), moved to `device`.
data3 = torch.Tensor(np.random.random((100, 512, 512))).to(device)

In [None]:
%%timeit -r 5 -n 50
# Timed: forward pass of the custom encoder on data3 (autograd is not disabled here).
# NOTE(review): there is no torch.cuda.synchronize(), so asynchronously queued CUDA work
# may not be fully counted — confirm before trusting absolute numbers.
transformer_encoder_net(data3)

In [None]:
%%timeit -r 5 -n 50
# Timed: forward pass of torch's built-in encoder on data3 (autograd is not disabled here).
# NOTE(review): there is no torch.cuda.synchronize(), so asynchronously queued CUDA work
# may not be fully counted — confirm before trusting absolute numbers.
transformer_encoder(data3)

In [20]:
import itertools
import numpy as np

def relu(x):
    """Rectified linear unit: element-wise ``max(x, 0)``.

    Implemented with ``np.maximum`` instead of ``x * (x > 0)``. The product form evaluates
    ``-inf * 0`` for ``-inf`` inputs (NaN plus a RuntimeWarning) and produces ``-0.0`` for
    negative inputs; ``np.maximum`` returns a clean ``0`` in both cases.

    Args:
        x: Scalar or array-like of numbers.

    Returns:
        NumPy scalar or array with negative entries replaced by zero. NaN entries stay NaN.
    """
    return np.maximum(x, 0)

def elu(x, alpha=1.0):
    """Exponential linear unit: ``x`` for ``x > 0``, else ``alpha * (exp(x) - 1)``.

    ``np.where`` evaluates both branches for every element, so the exponential is computed
    on ``min(x, 0)``: large positive inputs then no longer overflow in the branch that is
    discarded anyway. ``np.expm1`` keeps precision for inputs close to zero, where
    ``exp(x) - 1`` cancels to 0.

    Args:
        x: Scalar or NumPy array.
        alpha: Scale of the negative branch.

    Returns:
        NumPy array (0-d for scalar input) with the activation applied element-wise.
    """
    return np.where(x > 0, x, alpha * np.expm1(np.minimum(x, 0)))

def leaky_relu(x, alpha=0.1):
    """Leaky ReLU: keep positive entries unchanged and scale all other entries by ``alpha``."""
    is_positive = x > 0
    scaled = alpha * x
    return np.where(is_positive, x, scaled)

# Monte-Carlo estimate, per dimension d, of the mean |<v1, v2>| for two random vectors that
# are drawn uniformly from [-1, 1)^d, scaled to unit length and then passed through leaky_relu.
factors = np.array([1, 2, 5])
num_iterations = 10000
dimensions = np.concatenate((factors, factors * 10, factors * 100, factors * 1000, np.array([10_000])))


def _random_activated_direction(d):
    """Draw one random vector of length d, normalise it to unit length, apply leaky_relu."""
    v = (np.random.rand(d) - 0.5) * 2
    v = v / np.linalg.norm(v)
    return leaky_relu(v)


for d in dimensions:
    sum_ = 0
    for _ in range(num_iterations):
        # v1 is drawn before v2, matching the order in which random numbers are consumed.
        v1 = _random_activated_direction(d)
        v2 = _random_activated_direction(d)
        sum_ += np.abs(np.inner(v1, v2))

    print(f'd={d:>5}: {sum_ / num_iterations}')

In [3]:
from src.networks.global_pooling import GlobalAveragePooling
from src.networks.core.tensor_shape import TensorShape
%load_ext autoreload
%autoreload 2

from src.networks.core.net import Net
from src.networks.skip_nets.dense_skip_net import DenseSkipNet
from torch import nn
import torch
import numpy as np


    
def make_dense_block(in_features: int, num_layers: int, growth_rate: int = 32):
    """Build a dense block of ``num_layers`` bottleneck layers.

    Each layer is BatchNorm -> ReLU -> 1x1 conv to ``4 * growth_rate`` channels ->
    BatchNorm -> ReLU -> 3x3 'same'-padded conv to ``growth_rate`` channels. The layers
    are combined by ``DenseSkipNet`` along feature dimension 1.
    """
    bottleneck_channels = 4 * growth_rate

    def bottleneck_layer(layer_nr, is_last_layer, in_channels, out_channels):
        # out_channels is not used: every layer emits growth_rate channels.
        return Net.seq_as_net(
            nn.BatchNorm2d(in_channels),
            nn.ReLU(),
            nn.Conv2d(in_channels, bottleneck_channels, 1),
            nn.BatchNorm2d(bottleneck_channels),
            nn.ReLU(),
            nn.Conv2d(bottleneck_channels, growth_rate, 3, padding='same'),
        )

    return DenseSkipNet.from_layer_provider(
        bottleneck_layer,
        in_size=in_features,
        out_sizes=[growth_rate for _ in range(num_layers)],
        feature_dim_index=1,
    )

def make_transition_layer(in_features: int):
    """Build a transition layer that halves both the channel count and the spatial size.

    A 1x1 convolution reduces ``in_features`` channels to ``in_features // 2`` (rounded
    down for odd counts), followed by 2x2 average pooling with stride 2.

    Args:
        in_features: Number of input channels.

    Returns:
        ``nn.Sequential`` containing the convolution and the pooling layer.
    """
    # Integer floor division keeps the channel count an exact int without going through float.
    out_features = in_features // 2
    return nn.Sequential(
        nn.Conv2d(in_features, out_features, 1),
        nn.AvgPool2d(2, 2),
    )

# DenseNet-style classifier assembled from layer providers. Each lambda receives `in_f`,
# presumably the feature count produced by the preceding layer (tracked by
# Net.provider_seq_as_net — confirm).
dense_net_121 = Net.provider_seq_as_net(
    3,  # initial feature count handed to the first provider (presumably RGB channels)
    lambda in_f: nn.Conv2d(in_f, 64, 7, 2),  # stem: 7x7 convolution, stride 2, 64 output channels
    lambda in_f: nn.MaxPool2d(3, 2),  # 3x3 max-pooling, stride 2
    lambda in_f: make_dense_block(in_f, 6),  # dense blocks of 6 / 12 / 24 / 16 layers ...
    lambda in_f: make_transition_layer(in_f),  # ... separated by transition layers
    lambda in_f: make_dense_block(in_f, 12),
    lambda in_f: make_transition_layer(in_f),
    lambda in_f: make_dense_block(in_f, 24),
    lambda in_f: make_transition_layer(in_f),
    lambda in_f: make_dense_block(in_f, 16),
    lambda in_f: GlobalAveragePooling((2, 3)),  # pools over dims 2 and 3 (presumably height and width)
    lambda in_f: nn.Linear(in_f, 1000)  # final linear layer with 1000 outputs
)


In [30]:
import sympy as sp
# Symbolic expression `features + 1` built from a sympy Symbol.
expr = sp.Symbol('features') + 1



In [1]:
# Scratch check: tuple() applied to something that is already a tuple.
tuple((1, 2))

In [1]:
import torch
import numpy as np

In [2]:
# Benchmark input: random array of shape (5000, 128, 255), i.e. 163,200,000 elements
# (roughly 1.3 GB if stored as 8-byte floats).
data = np.random.random((5000, 128, 255))

In [6]:
%%timeit

# Timed: convert the whole array to a tensor, then take the mean of ten chunks of 500 rows.
# Note that the torch.tensor(data) conversion is inside the timed region.
torch_data = torch.tensor(data)
results = []

for i in range(0, 5000, 500):
    result = torch_data[i:i+500].mean()
    results.append(result)

In [7]:
%%timeit

# Timed: the same conversion as the previous cell, but each chunk of 500 rows is only passed
# through torch.as_tensor (the slice is already a tensor) instead of being reduced.
torch_data = torch.tensor(data)
results = []

for i in range(0, 5000, 500):
    result = torch.as_tensor(torch_data[i:i+500])
    results.append(result)

In [17]:
%%timeit

# Timed: build a CUDA tensor from each 500-row chunk of the NumPy array.
# All ten chunk tensors stay referenced in `results` for the duration of one run.
results = []

for i in range(0, 5000, 500):
    result = torch.tensor(data[i:i+500], device='cuda')
    results.append(result)

In [8]:

# Convert once, outside any timed cell, so the next benchmark starts from an existing tensor.
torch_data = torch.tensor(data)



In [10]:
%%timeit

# Timed: move each 500-row chunk of the pre-built tensor to the GPU and reduce it there.
# NOTE(review): the means are never read back on the host and there is no
# torch.cuda.synchronize(), so queued GPU work may not be fully counted — confirm.
results = []

for i in range(0, 5000, 500):
    result = torch_data[i:i+500].to('cuda').mean()
    results.append(result)

In [8]:
from stable_baselines3.common.vec_env import SubprocVecEnv
import gymnasium as gym

from stable_baselines3 import PPO

# Sixteen HalfCheetah environments behind a SubprocVecEnv, with custom reward weights.
# The list holds the same factory sixteen times; it creates a new environment on every call.
env = SubprocVecEnv([
    lambda: gym.make("HalfCheetah-v4", render_mode=None, forward_reward_weight=1.25, ctrl_cost_weight=0.001)
] * 16)

model = PPO("MlpPolicy", env, verbose=10, target_kl=0.025, batch_size=500, n_steps=2500)

import cProfile

# Profile the training run and write the statistics to 'profile_stats_sb3.pstat'.
pr = cProfile.Profile()
pr.enable()
try:
    model.learn(total_timesteps=10_000 * 16)
finally:
    # Stop profiling and dump the stats even if training fails or is interrupted;
    # otherwise the profiler would stay enabled in the kernel and the data would be lost.
    pr.disable()
    pr.dump_stats('profile_stats_sb3.pstat')

In [6]:
from src.module_analysis import count_parameters

# Display the project helper's result for the current `model`'s policy.
count_parameters(model.policy)

In [1]:
from gymnasium.vector import AsyncVectorEnv, SyncVectorEnv
%load_ext autoreload
%autoreload 2

import numpy as np
from src.reinforcement_learning.gym.envs.test_env import TestEnv


def _test_env_factory(last_arg):
    """Return a zero-argument factory for ``TestEnv(1, False, False, last_arg)``."""
    return lambda: TestEnv(1, False, False, last_arg)


# One sub-environment per value. NOTE(review): the value is presumably the episode length
# (in the recorded output env 0 resets every 2 steps, env 1 every 3) — confirm against TestEnv.
env_fns = [_test_env_factory(value) for value in (2, 3, 5, 7)]
# Disabled variants:
#     lambda: TestEnv(1, False, True, 3),
#     lambda: TestEnv(1, False, True, 3),
#     lambda: TestEnv(1, False, True, 3),
#     lambda: TestEnv(1, False, True, 3),
env = SyncVectorEnv(env_fns)

# print(env.env_fns[0]().env.episode_length)

env.reset()
zero_actions_shape = (len(env_fns), 2)
for _ in range(31):
    # Step all sub-environments with all-zero actions and dump what the vector env returns.
    obs, reward, term, trunc, info = env.step(np.zeros(zero_actions_shape))
    done = np.logical_or(term, trunc)
    print(obs, done, info, sep='\n', end='\n\n')


pygame 2.5.2 (SDL 2.28.3, Python 3.11.7)
Hello from the pygame community. https://www.pygame.org/contribute.html
[[1.]
 [1.]
 [1.]
 [1.]]
[False False False False]
{}

[[0.]
 [2.]
 [2.]
 [2.]]
[ True False False False]
{'final_observation': array([array([2.]), None, None, None], dtype=object), '_final_observation': array([ True, False, False, False]), 'final_info': array([{}, None, None, None], dtype=object), '_final_info': array([ True, False, False, False])}

[[1.]
 [0.]
 [3.]
 [3.]]
[False  True False False]
{'final_observation': array([None, array([3.]), None, None], dtype=object), '_final_observation': array([False,  True, False, False]), 'final_info': array([None, {}, None, None], dtype=object), '_final_info': array([False,  True, False, False])}

[[0.]
 [1.]
 [4.]
 [4.]]
[ True False False False]
{'final_observation': array([array([2.]), None, None, None], dtype=object), '_final_observation': array([ True, False, False, False]), 'final_info': array([{}, None, None, None], dtype=

In [3]:
from src.reinforcement_learning.core.policies.actor_policy import ActorPolicy
from src.reinforcement_learning.core.action_noise import NormalActionNoise
from src.reinforcement_learning.algorithms.base.logging_config import LoggingConfig
from src.reinforcement_learning.core.callback import Callback
from src.reinforcement_learning.core.buffers.replay.replay_buffer import ReplayBuffer
from src.reinforcement_learning.core.action_selectors.diag_gaussian_action_selector import DiagGaussianActionSelector
from src.reinforcement_learning.core.policies.components.actor import Actor
from src.reinforcement_learning.core.policies.base_policy import BasePolicy
from src.reinforcement_learning.algorithms.base.off_policy_algorithm import OffPolicyAlgorithm
from torch import nn
import torch


class TestOffPolicyAlgo(OffPolicyAlgorithm):
    """Debugging subclass whose ``optimize`` only prints the observations stored in the buffer."""

    def optimize(self, last_obs: np.ndarray, last_episode_starts: np.ndarray, info: dict) -> None:
        # No learning happens here; the arguments are ignored and the buffer contents are shown.
        print(self.buffer.observations)


# Wire the debugging algorithm to the vectorised test environment `env` and run it, so that
# the buffer contents can be compared against the environment output printed earlier.
algo = TestOffPolicyAlgo(
    env=env,
    policy=ActorPolicy(Actor(nn.Linear(1, 8), DiagGaussianActionSelector(8, 2, 0.0001, False))),
    # second argument is presumably the buffer capacity (the printout shows 10 stored steps) — confirm
    buffer=ReplayBuffer.for_env(env, 10, 'cuda', optimize_memory_usage=False),
    gamma=0.99,
    tau=0.1,
    rollout_steps=10,
    gradient_steps=1,
    optimization_batch_size=256,
    # 4 x 2 array as first argument: presumably one row of noise means per environment — confirm
    action_noise=NormalActionNoise(np.array([[-5, 5], [5, -5], [0, 0], [-0.5, 0.5]]), np.array([0.1])),
    warmup_steps=5,
    learning_starts=0,
    sde_noise_sample_freq=None,
    callback=Callback(),
    logging_config=LoggingConfig(),
    torch_device='cuda',
    torch_dtype=torch.float32,
)
algo.learn(10)

[[[0.]
  [0.]
  [0.]
  [0.]]

 [[1.]
  [1.]
  [1.]
  [1.]]

 [[0.]
  [2.]
  [2.]
  [2.]]

 [[1.]
  [0.]
  [3.]
  [3.]]

 [[0.]
  [1.]
  [4.]
  [4.]]

 [[1.]
  [2.]
  [0.]
  [5.]]

 [[0.]
  [0.]
  [1.]
  [6.]]

 [[1.]
  [1.]
  [2.]
  [0.]]

 [[0.]
  [2.]
  [3.]
  [1.]]

 [[1.]
  [0.]
  [4.]
  [2.]]]


<__main__.TestOffPolicyAlgo at 0x1bc07d817d0>

In [10]:
# Put observation, next observation, done flag and reward side by side (one column each in
# the output below) and show the stored transitions at index 1 of the second axis
# (presumably the environment axis — confirm).
np.concatenate((
    algo.buffer.observations, 
    algo.buffer.next_observations, 
    algo.buffer.dones[:, :, np.newaxis],  # add a trailing axis so the arrays can be concatenated
    algo.buffer.rewards[:, :, np.newaxis]
), axis=-1)[:, 1, :]

array([[0., 1., 0., 1.],
       [1., 2., 0., 1.],
       [2., 3., 1., 5.],
       [0., 1., 0., 1.],
       [1., 2., 0., 1.],
       [2., 3., 1., 5.],
       [0., 1., 0., 1.],
       [1., 2., 0., 1.],
       [2., 3., 1., 5.],
       [0., 1., 0., 1.]], dtype=float32)

In [7]:
# Request 40 samples from the replay buffer and lay their fields out as columns.
samples = algo.buffer.sample(40)

# Same column order as in the buffer inspection above: observation, next observation, done, reward.
np.concatenate((
    samples.observations.cpu().numpy(), 
    # samples.actions.cpu().numpy(),
    samples.next_observations.cpu().numpy(), 
    samples.dones.cpu().numpy(),
    samples.rewards.cpu().numpy()
), axis=-1)

(40, 4)

In [48]:
# Demonstration: assigning an array into a row of another array copies the values.
a = np.zeros((4,))
b = np.ones((3, 4))

# Row 1 of b receives a's current values.
b[1] = a

# Later changes to either array do not show up in the other, as the printed output confirms.
a[0] = 2
b[1, 1] = 3 

print(a)
print(b)

[2. 0. 0. 0.]
[[1. 1. 1. 1.]
 [0. 3. 0. 0.]
 [1. 1. 1. 1.]]


In [35]:

import numpy as np

# Sparse boolean mask of length 32 (4 + 27 + 1 entries) that is True at indices 0, 3 and 31.
bool_arr = np.array([1, 0, 0, 1] + [0] * (32 - 5) + [1], dtype=bool)
print(bool_arr.shape)

(32,)


In [36]:
%%timeit
# Timed variant 1: walk every element in Python and add up the indices where the mask is True.
s = 0

for i, b in enumerate(bool_arr):
    if b:
        s += i

1.81 µs ± 4.63 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [37]:
%%timeit

# Timed variant 2: let np.where find the True positions, then add up only those indices.
s = 0

for i in np.where(bool_arr)[0]:
    s += i

1.8 µs ± 9.28 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [41]:
# The slice 10:20 contains no True entries, so this yields an empty index array (see output).
np.where(bool_arr[10:20])

(array([], dtype=int64),)

In [5]:
import gymnasium as gym

from stable_baselines3 import SAC
from stable_baselines3.common.env_util import make_vec_env

# Stable-Baselines3 SAC on 16 HalfCheetah environments.
# Note: this rebinds the notebook-level name `env`, which earlier cells use for other environments.
env = make_vec_env("HalfCheetah-v4", n_envs=16)

model = SAC("MlpPolicy", env, verbose=1)
# The recorded output ends in a KeyboardInterrupt: this run was stopped by hand.
model.learn(total_timesteps=1_000_000, log_interval=16)


Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -283     |
| time/              |          |
|    episodes        | 16       |
|    fps             | 1275     |
|    time_elapsed    | 12       |
|    total_timesteps | 16000    |
| train/             |          |
|    actor_loss      | -17.6    |
|    critic_loss     | 0.915    |
|    ent_coef        | 0.742    |
|    ent_coef_loss   | -3.01    |
|    learning_rate   | 0.0003   |
|    n_updates       | 993      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -262     |
| time/              |          |
|    episodes        | 32       |
|    fps             | 1272     |
|    time_elapsed    | 25       |
|    total_timesteps | 32000    |
| train/             |          |
|    actor_loss      | -26      |
|    critic_loss     | 1.81   

KeyboardInterrupt: 