In [1]:
from typing import SupportsFloat, Any

from gymnasium.core import ActType, ObsType

from src.reinforcement_learning.core.infos import InfoDict
from tmp_mp import main

In [1]:
# Run the `main` entry point imported from the scratch module `tmp_mp`.
main()

In [5]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Parallel environments
vec_env = make_vec_env("HalfCheetah-v4", n_envs=4)

# Single source of truth for the checkpoint location; the name matches the
# environment that is actually being trained.
model_path = "ppo_halfcheetah"

# use_sde / sde_sample_freq enable SB3's state-dependent exploration and
# control how often its noise is resampled (see the SB3 PPO documentation).
model = PPO("MlpPolicy", vec_env, use_sde=True, sde_sample_freq=100, verbose=2)
model.learn(total_timesteps=250000)
model.save(model_path)

del model  # remove to demonstrate saving and loading

model = PPO.load(model_path)

obs = vec_env.reset()
# while True:
#     action, _states = model.predict(obs)
#     obs, rewards, dones, info = vec_env.step(action)
#     vec_env.render("human")

In [14]:
import torch
import numpy as np

# Random float64 array of shape (2500, 32, 17) used as input for the
# host-to-GPU transfer benchmark in the next cell.
rollout_data = np.random.random((2500, 32, 17))

In [15]:
%%timeit
# Benchmark body: create one CUDA tensor per leading-axis slice of
# `rollout_data` (2500 separate copies). Requires a CUDA device.
for i in range(len(rollout_data)):
    tensor = torch.tensor(rollout_data[i], device='cuda')

In [6]:
import types
import numpy as np
import torch
from torch import nn


class A(nn.Module):
    """Minimal module wrapping a single ``nn.Linear(17, 17)`` layer.

    Used by the timing cells below to measure the cost of calling a module
    that simply delegates to one linear layer.
    """

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(17, 17)

    def forward(self, x):
        """Apply the wrapped linear layer to ``x`` and return the result.

        The layer is invoked as ``self.linear(x)`` instead of aliasing
        ``self.linear.forward`` onto this instance: calling the submodule
        runs the hooks registered on it, whereas calling its ``forward``
        directly silently skips them.
        """
        return self.linear(x)


# Module under test for the timing cells below.
a = A()

# Random benchmark input: sampled as float64 by NumPy, then cast to float32.
test_data_shape = (2500, 1024, 17)
test_data = torch.tensor(np.random.random(test_data_shape)).float()

In [7]:
%%timeit

# Benchmark body: run `a` on every leading-axis slice of `test_data`.
# The outputs are discarded; only the call cost is of interest.
for i in range(len(test_data)):
    a(test_data[i])

In [8]:
%%timeit

# Benchmark body: same forward loop over `test_data`, but every output is
# additionally detached from the autograd graph before being discarded.
for i in range(len(test_data)):
    a(test_data[i]).detach()

In [1]:
from src.torch_device import get_torch_device

# Display whatever the project helper returns (presumably the torch device
# the project will run on — confirm in src.torch_device).
get_torch_device()

In [None]:
import gymnasium

# Instantiate the Ant-v4 environment; as the cell's last expression the
# resulting env object is displayed.
gymnasium.make('Ant-v4')

In [None]:
%load_ext autoreload
%autoreload 2

def record_video(
        policy_db_path: str = 'E:/saved_models/rl/Ant-v4/mitosis-2024-06-10_19.43.13',
        video_root: str = r'C:\Users\domin\Videos\rl',
        env_id: str = 'Ant-v4',
        max_steps: int = 50_000,
        render_fps: int = 30,
        policy_rank: int = -2,
):
    """Load a stored policy from a ``TinyModelDB`` and record videos of it acting.

    The environment is created in ``rgb_array`` render mode, the policy entry
    is picked from the database by score, its weights are loaded, and the
    environment is wrapped in ``RecordVideo`` (every episode is recorded) and
    ``AutoResetWrapper`` before being stepped ``max_steps`` times with
    deterministic actions.

    Args:
        policy_db_path: Base path of the ``TinyModelDB`` holding the policies.
        video_root: Directory in which a timestamped video folder is created.
        env_id: Gymnasium environment id to record.
        max_steps: Number of environment steps to run.
        render_fps: Value written to ``record_env.metadata['render_fps']``.
        policy_rank: Index into the entries sorted by ascending score
            (``-1`` is the best entry, the default ``-2`` the second best).

    A ``KeyboardInterrupt`` stops the run early and is reported instead of
    propagated. The environment is always closed, including when loading the
    policy fails.
    """
    import torch
    from tqdm import tqdm
    from src.reinforcement_learning.gym.singleton_vector_env import as_vec_env
    import gymnasium
    from gymnasium.wrappers import AutoResetWrapper, RecordVideo
    from src.model_db.tiny_model_db import TinyModelDB
    from src.reinforcement_learning.algorithms.policy_mitosis.mitosis_policy_info import MitosisPolicyInfo
    from src.reinforcement_learning.core.policy_construction import PolicyConstruction
    from src.datetime import get_current_timestamp

    record_env, _ = as_vec_env(gymnasium.make(env_id, render_mode='rgb_array'))

    # Everything after the env exists runs inside the try block so that the
    # env is closed even if the database lookup or policy construction raises.
    try:
        policy_db = TinyModelDB[MitosisPolicyInfo](base_path=policy_db_path)
        print(policy_db)

        entries_by_score = sorted(policy_db.all_entries(), key=lambda entry: entry['model_info']['score'])
        policy_entry = entries_by_score[policy_rank]
        # policy_entry = policy_db.fetch_entry('2024-06-10_22.13.57~PJHPLG')
        policy_info: MitosisPolicyInfo = policy_entry['model_info']
        print(policy_entry)

        # init_from_info may hand back a different (e.g. wrapped) env; keep using that one.
        policy, _, record_env = PolicyConstruction.init_from_info(policy_info['initialization_info'], record_env)

        policy_db.load_model_state_dict(policy_entry['model_id'], policy)

        record_env.metadata['render_fps'] = render_fps
        record_env = AutoResetWrapper(
            RecordVideo(record_env, video_folder=rf'{video_root}\{get_current_timestamp()}',
                        episode_trigger=lambda ep_nr: True)
        )

        # Pure inference: no gradients are needed while rolling out the policy.
        with torch.no_grad():
            obs, _info = record_env.reset()
            for _step in tqdm(range(max_steps)):
                actions_dist, _ = policy.process_obs(torch.tensor(obs, device='cpu'))
                actions = actions_dist.get_actions(deterministic=True).cpu().numpy()
                obs, _reward, _terminated, _truncated, _info = record_env.step(actions)
    except KeyboardInterrupt:
        print('keyboard interrupt')
    finally:
        print('closing record_env')
        record_env.close()
        print('record_env closed')


# Kick off the recording defined above.
record_video()

In [14]:
from torch import nn

from src.networks.core.net import Net
from src.networks.multihead_self_attention import MultiheadSelfAttention
from src.networks.core.seq_net import SeqNet
from src.networks.skip_nets.additive_skip_connection import AdditiveSkipConnection

device = 'cuda:0'


def _make_encoder_layer(layer_nr, is_last_layer, in_features, out_features):
    """Build one encoder layer for `SeqNet.from_layer_provider`.

    The layer is a self-attention sub-block and a feed-forward sub-block, each
    wrapped in `AdditiveSkipConnection` and followed by a `LayerNorm`.
    `layer_nr` and `is_last_layer` are accepted but not used.
    """
    return nn.Sequential(
        AdditiveSkipConnection(
            MultiheadSelfAttention(embed_dim=in_features, num_heads=8)
        ),
        nn.LayerNorm(in_features),
        AdditiveSkipConnection(
            Net.seq_as_net(
                nn.Linear(in_features, 2048),
                nn.ReLU(),
                nn.Linear(2048, out_features),
                nn.ReLU(),
            )
        ),
        nn.LayerNorm(out_features),
    )


# Six stacked encoder layers with 512 features, moved to the GPU.
transformer_encoder_net = SeqNet.from_layer_provider(
    layer_provider=_make_encoder_layer,
    num_layers=6,
    num_features=512,
).to(device)

transformer_encoder_net.out_shape

# Smoke test: push a random (7, 5, 512) float32 batch through the net and show the output shape.
sample_input = torch.tensor(np.random.random((7, 5, 512))).to(device).float()
transformer_encoder_net(sample_input).shape

In [2]:
# Reference implementation from torch with the same width (512), head count (8)
# and depth (6) as `transformer_encoder_net`; dropout is disabled.
reference_encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, dropout=0)
transformer_encoder = nn.TransformerEncoder(reference_encoder_layer, num_layers=6).to(device)

In [3]:
import numpy as np
import torch
# Small benchmark input: random values of shape (100, 4, 512) on `device`.
data1_shape = (100, 4, 512)
data1 = torch.Tensor(np.random.random(data1_shape)).to(device)

In [6]:
%%timeit -r 15 -n 500
# Forward pass of the custom SeqNet encoder on data1.
# CUDA kernels are launched asynchronously, so wait for the GPU to finish;
# otherwise the timing may reflect little more than launch overhead.
transformer_encoder_net(data1)
torch.cuda.synchronize()

In [7]:
%%timeit -r 15 -n 500
# Forward pass of torch's nn.TransformerEncoder on data1.
# CUDA kernels are launched asynchronously, so wait for the GPU to finish;
# otherwise the timing may reflect little more than launch overhead.
transformer_encoder(data1)
torch.cuda.synchronize()

In [4]:
# Medium benchmark input: random values of shape (100, 64, 512) on `device`.
data2_shape = (100, 64, 512)
data2 = torch.Tensor(np.random.random(data2_shape)).to(device)

In [5]:
%%timeit -r 15 -n 200
# Forward pass of the custom SeqNet encoder on data2.
# CUDA kernels are launched asynchronously, so wait for the GPU to finish;
# otherwise the timing may reflect little more than launch overhead.
transformer_encoder_net(data2)
torch.cuda.synchronize()

In [6]:
%%timeit -r 15 -n 200
# Forward pass of torch's nn.TransformerEncoder on data2.
# CUDA kernels are launched asynchronously, so wait for the GPU to finish;
# otherwise the timing may reflect little more than launch overhead.
transformer_encoder(data2)
torch.cuda.synchronize()

In [4]:
# Large benchmark input: random values of shape (100, 512, 512) on `device`.
data3_shape = (100, 512, 512)
data3 = torch.Tensor(np.random.random(data3_shape)).to(device)

In [None]:
%%timeit -r 5 -n 50
# Forward pass of the custom SeqNet encoder on data3.
# CUDA kernels are launched asynchronously, so wait for the GPU to finish;
# otherwise the timing may reflect little more than launch overhead.
transformer_encoder_net(data3)
torch.cuda.synchronize()

In [None]:
%%timeit -r 5 -n 50
# Forward pass of torch's nn.TransformerEncoder on data3.
# CUDA kernels are launched asynchronously, so wait for the GPU to finish;
# otherwise the timing may reflect little more than launch overhead.
transformer_encoder(data3)
torch.cuda.synchronize()

In [20]:
import itertools
import numpy as np

def relu(x):
    """Rectified linear unit: element-wise ``max(x, 0)``.

    Implemented with ``np.maximum`` rather than ``x * (x > 0)``: multiplying
    by the boolean mask turns ``-inf`` into ``nan`` (``-inf * 0``) and yields
    ``-0.0`` for negative inputs.

    Args:
        x: Scalar or array-like of numbers.

    Returns:
        ``x`` with every negative entry replaced by zero.
    """
    return np.maximum(x, 0)

def elu(x, alpha=1.0):
    """Exponential linear unit.

    Returns ``x`` where ``x > 0`` and ``alpha * (exp(x) - 1)`` elsewhere.

    ``np.where`` evaluates both branches for every element, so the exponential
    branch is fed ``min(x, 0)``: this avoids overflow warnings for large
    positive inputs whose exponential would be discarded anyway. ``np.expm1``
    keeps precision for inputs close to zero, where ``exp(x) - 1`` cancels.

    Args:
        x: Scalar or array-like of numbers.
        alpha: Scale of the negative branch.

    Returns:
        Array with the ELU applied element-wise.
    """
    return np.where(x > 0, x, alpha * np.expm1(np.minimum(x, 0)))

def leaky_relu(x, alpha=0.1):
    """Leaky ReLU: keep positive entries, scale all other entries by ``alpha``.

    Args:
        x: Scalar or array of numbers.
        alpha: Slope applied where ``x`` is not positive.

    Returns:
        Array with the activation applied element-wise.
    """
    is_positive = x > 0
    scaled = alpha * x
    return np.where(is_positive, x, scaled)

def _sample_unit_vector(dim):
    """Draw a vector with entries uniform in [-1, 1) and scale it to unit L2 norm."""
    vec = (np.random.rand(dim) - 0.5) * 2
    vec /= np.linalg.norm(vec)
    return vec


# Dimensions to probe: 1, 2, 5, 10, 20, 50, ..., 5000, 10000.
factors = np.array([1, 2, 5])
num_iterations = 10000
dimensions = np.concatenate((factors, factors * 10, factors * 100, factors * 1000, np.array([10_000])))

# For each dimension, estimate the mean absolute inner product of two random
# unit vectors after both were passed through leaky_relu.
for d in dimensions:
    total = 0
    for _ in range(num_iterations):
        v1 = _sample_unit_vector(d)
        v2 = _sample_unit_vector(d)
        total += np.abs(np.inner(leaky_relu(v1), leaky_relu(v2)))

    print(f'd={d:>5}: {total / num_iterations}')

In [3]:
from src.networks.global_pooling import GlobalAveragePooling
from src.networks.core.tensor_shape import TensorShape
%load_ext autoreload
%autoreload 2

from src.networks.core.net import Net
from src.networks.skip_nets.dense_skip_net import DenseSkipNet
from torch import nn
import torch
import numpy as np


    
def make_dense_block(in_features: int, num_layers: int, growth_rate: int = 32):
    """Build a `DenseSkipNet` of `num_layers` bottleneck layers.

    Every layer is BatchNorm -> ReLU -> 1x1 conv (to ``4 * growth_rate``
    channels) -> BatchNorm -> ReLU -> 3x3 'same' conv (to ``growth_rate``
    channels).

    Args:
        in_features: Passed to `DenseSkipNet.from_layer_provider` as `in_size`.
        num_layers: Number of layers in the block.
        growth_rate: Output channels of every layer.
    """
    bottleneck_channels = 4 * growth_rate

    def dense_layer(layer_nr, is_last_layer, in_channels, out_channels):
        # `layer_nr`, `is_last_layer` and `out_channels` are accepted but unused;
        # the output width is fixed to `growth_rate`.
        return Net.seq_as_net(
            nn.BatchNorm2d(in_channels),
            nn.ReLU(),
            nn.Conv2d(in_channels, bottleneck_channels, 1),
            nn.BatchNorm2d(bottleneck_channels),
            nn.ReLU(),
            nn.Conv2d(bottleneck_channels, growth_rate, 3, padding='same'),
        )

    return DenseSkipNet.from_layer_provider(
        dense_layer,
        in_size=in_features,
        out_sizes=[growth_rate] * num_layers,
        feature_dim_index=1,
    )

def make_transition_layer(in_features: int):
    """Build a transition layer that halves both channel count and resolution.

    Args:
        in_features: Number of input channels.

    Returns:
        ``nn.Sequential`` of a 1x1 convolution to ``in_features // 2``
        channels followed by 2x2 average pooling with stride 2.
    """
    halved_features = in_features // 2
    channel_reduction = nn.Conv2d(in_features, halved_features, 1)
    downsampling = nn.AvgPool2d(2, 2)
    return nn.Sequential(channel_reduction, downsampling)

# Layer providers in network order; each one is called with a single argument
# `in_f` (presumably the feature count produced by the preceding layer —
# confirm in Net.provider_seq_as_net).
_dense_net_121_layer_providers = [
    # Initial 7x7 stride-2 convolution followed by 3x3 stride-2 max pooling.
    lambda in_f: nn.Conv2d(in_f, 64, 7, 2),
    lambda in_f: nn.MaxPool2d(3, 2),
    # Dense blocks with 6, 12, 24 and 16 layers, separated by transition layers.
    lambda in_f: make_dense_block(in_f, 6),
    lambda in_f: make_transition_layer(in_f),
    lambda in_f: make_dense_block(in_f, 12),
    lambda in_f: make_transition_layer(in_f),
    lambda in_f: make_dense_block(in_f, 24),
    lambda in_f: make_transition_layer(in_f),
    lambda in_f: make_dense_block(in_f, 16),
    # Pooling configured with (2, 3), then a 1000-way linear layer.
    lambda in_f: GlobalAveragePooling((2, 3)),
    lambda in_f: nn.Linear(in_f, 1000),
]

# The leading 3 is the first positional argument (presumably the number of input channels).
dense_net_121 = Net.provider_seq_as_net(3, *_dense_net_121_layer_providers)


In [30]:
import sympy as sp
# Symbolic expression `features + 1` built from a single sympy symbol.
expr = sp.Symbol('features') + 1



In [1]:
# Quick check: calling tuple() on a tuple gives back an equal tuple, (1, 2).
tuple((1, 2))

In [1]:
import torch
import numpy as np

In [2]:
# Random float64 array of shape (5000, 128, 255), i.e. roughly 1.3 GB,
# used as input for the slicing / transfer benchmarks below.
data = np.random.random((5000, 128, 255))

In [6]:
%%timeit

# Benchmark body: copy `data` into a CPU tensor (this copy is part of the
# timed region), then take the mean of ten consecutive 500-row slices.
torch_data = torch.tensor(data)
results = []

for i in range(0, 5000, 500):
    result = torch_data[i:i+500].mean()
    results.append(result)

In [7]:
%%timeit

# Benchmark body: copy `data` into a CPU tensor (part of the timed region),
# then pass ten consecutive 500-row slices through torch.as_tensor.
# NOTE(review): the slices are taken from `torch_data`, which is already a
# tensor — if the intent was to time converting NumPy slices, this should
# probably slice `data` instead; confirm.
torch_data = torch.tensor(data)
results = []

for i in range(0, 5000, 500):
    result = torch.as_tensor(torch_data[i:i+500])
    results.append(result)

In [17]:
%%timeit

# Benchmark body: build a CUDA tensor directly from each 500-row NumPy slice
# of `data` (ten host-to-device copies). Requires a CUDA device.
results = []

for i in range(0, 5000, 500):
    result = torch.tensor(data[i:i+500], device='cuda')
    results.append(result)

In [8]:

# Copy `data` into a CPU tensor once, outside of any timed cell, so the next
# benchmark does not include this conversion.
torch_data = torch.tensor(data)



In [10]:
%%timeit

# Benchmark body: move each 500-row slice of the pre-built CPU tensor to the
# GPU and reduce it with mean() there. Requires a CUDA device.
# NOTE(review): there is no torch.cuda.synchronize() here, so part of the GPU
# work may still be pending when the timer stops — confirm whether that matters.
results = []

for i in range(0, 5000, 500):
    result = torch_data[i:i+500].to('cuda').mean()
    results.append(result)

In [8]:
from stable_baselines3.common.vec_env import SubprocVecEnv
import gymnasium as gym

from stable_baselines3 import PPO

# 16 HalfCheetah workers in subprocesses; the same factory is reused for every
# worker, with custom reward weights passed as plain keyword arguments.
env = SubprocVecEnv(
    [lambda: gym.make("HalfCheetah-v4", render_mode=None, forward_reward_weight=1.25, ctrl_cost_weight=0.001)] * 16
)

model = PPO("MlpPolicy", env, verbose=10, target_kl=0.025, batch_size=500, n_steps=2500)

import cProfile

# The context manager enables the profiler on entry and always disables it on
# exit, so a failing learn() call cannot leave profiling switched on for the
# rest of the kernel session. Only this (main) process is profiled.
with cProfile.Profile() as pr:
    model.learn(total_timesteps=10_000 * 16)
pr.dump_stats('profile_stats_sb3.pstat')

In [6]:
from src.module_analysis import count_parameters

# Display the result of the project helper for the policy network of `model`
# (defined in an earlier cell).
count_parameters(model.policy)

In [2]:
from gymnasium.vector import AsyncVectorEnv, SyncVectorEnv
%load_ext autoreload
%autoreload 2

import numpy as np
from src.reinforcement_learning.gym.envs.test_env import TestEnv


# Four TestEnv factories that differ only in their third constructor argument
# (presumably the episode length — confirm in TestEnv). The default argument
# binds each value at definition time.
env_fns = [
    lambda third_arg=third_arg: TestEnv(4, False, third_arg)
    for third_arg in (2, 3, 5, 7)
]
env = SyncVectorEnv(env_fns)

# print(env.env_fns[0]().env.episode_length)

# Step all envs 31 times with action 0 and print, per step, the observations,
# the combined terminated/truncated flags and the info dict, then a blank line.
env.reset()
for _ in range(31):
    obs, reward, term, trunc, info = env.step([0] * 4)
    print(obs, np.logical_or(term, trunc), info, sep='\n', end='\n\n')


[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]
[False False False False]
{}

[[0. 0. 0. 0.]
 [2. 2. 2. 2.]
 [2. 2. 2. 2.]
 [2. 2. 2. 2.]]
[ True False False False]
{'final_observation': array([array([2., 2., 2., 2.]), None, None, None], dtype=object), '_final_observation': array([ True, False, False, False]), 'final_info': array([{}, None, None, None], dtype=object), '_final_info': array([ True, False, False, False])}

[[1. 1. 1. 1.]
 [0. 0. 0. 0.]
 [3. 3. 3. 3.]
 [3. 3. 3. 3.]]
[False  True False False]
{'final_observation': array([None, array([3., 3., 3., 3.]), None, None], dtype=object), '_final_observation': array([False,  True, False, False]), 'final_info': array([None, {}, None, None], dtype=object), '_final_info': array([False,  True, False, False])}

[[0. 0. 0. 0.]
 [1. 1. 1. 1.]
 [4. 4. 4. 4.]
 [4. 4. 4. 4.]]
[ True False False False]
{'final_observation': array([array([2., 2., 2., 2.]), None, None, None], dtype=object), '_final_observation': array([ True, False,

In [11]:
from src.reinforcement_learning.algorithms.base.logging_config import LoggingConfig
from src.reinforcement_learning.core.callback import Callback
from src.reinforcement_learning.core.buffers.replay.replay_buffer import ReplayBuffer
from src.reinforcement_learning.core.action_selectors.diag_gaussian_action_selector import DiagGaussianActionSelector
from src.reinforcement_learning.core.policies.components.actor import Actor
from src.reinforcement_learning.core.policies.base_policy import BasePolicy
from src.reinforcement_learning.algorithms.base.off_policy_algorithm import OffPolicyAlgorithm
from torch import nn
import torch


class TestOffPolicyAlgo(OffPolicyAlgorithm):
    """`OffPolicyAlgorithm` subclass for manual inspection: its optimisation
    step only prints the observations stored in the buffer."""

    def optimize(self, last_obs: np.ndarray, last_episode_starts: np.ndarray, info: dict) -> None:
        """Print ``self.buffer.observations``; all arguments are ignored."""
        stored_observations = self.buffer.observations
        print(stored_observations)


# Policy: a single linear layer (4 -> 8) feeding a diagonal-Gaussian action selector.
test_actor = Actor(nn.Linear(4, 8), DiagGaussianActionSelector(8, 2, 0.0001, False))
test_policy = BasePolicy(test_actor)

# Replay buffer created for `env` (from the earlier cell) with arguments 1000 and 'cuda'.
test_buffer = ReplayBuffer.for_env(env, 1000, 'cuda')

algo = TestOffPolicyAlgo(
    env=env,
    policy=test_policy,
    buffer=test_buffer,
    gamma=0.99,
    tau=0.1,
    rollout_steps=100,
    gradient_steps=1,
    action_noise=None,
    warmup_steps=50,
    sde_noise_sample_freq=None,
    callback=Callback(),
    logging_config=LoggingConfig(),
    torch_device='cuda',
    torch_dtype=torch.float32,
)
algo.learn(200)

[[-0.8693282   0.58624583]
 [-0.31176484 -0.94757736]
 [ 0.0834689   0.60659134]
 [ 0.77021277 -0.831201  ]]
[[ 0.52057505  0.41270298]
 [ 0.6694224   0.80718195]
 [-0.6913605  -0.5399516 ]
 [ 0.42518348  0.3773016 ]]
[[-0.39520526  0.49182367]
 [-0.78358936 -0.68001443]
 [-0.06645925  0.6082146 ]
 [-0.93713784 -0.903293  ]]
[[-0.6898142   0.7617745 ]
 [-0.56910473  0.25737107]
 [-0.08667714  0.9532684 ]
 [-0.7785256   0.4517108 ]]
[[ 0.6491212   0.8006077 ]
 [ 0.34740534 -0.25895944]
 [ 0.5966549   0.6526357 ]
 [-0.4833214  -0.7437691 ]]
[[-0.05704025 -0.26883358]
 [-0.21535401  0.04536812]
 [ 0.28586397  0.80168205]
 [-0.8133532   0.39490643]]
[[-0.85029477 -0.9289888 ]
 [-0.98563105  0.2966846 ]
 [ 0.19664985 -0.11767972]
 [ 0.36577854 -0.03406118]]
[[ 0.04866572  0.4727367 ]
 [-0.9000028   0.9328065 ]
 [ 0.08475711  0.04237787]
 [-0.08803723  0.36322376]]
[[-0.8188476  -0.09751792]
 [ 0.8404389   0.0824685 ]
 [ 0.9587921  -0.16711509]
 [ 0.1557264  -0.52093506]]
[[-0.8788543   0.93

<__main__.TestOffPolicyAlgo at 0x260f5e2b1d0>

In [10]:
# Display the actions currently stored in the algorithm's buffer.
algo.buffer.actions

array([[[-0.8388518 ,  0.6058575 ],
        [-0.48365894, -0.41857854],
        [ 0.25024652,  0.6455046 ],
        [ 0.28540504, -0.17151779]],

       [[ 0.1705378 , -0.6679885 ],
        [-0.5365215 ,  0.5805764 ],
        [-0.9393634 ,  0.2365116 ],
        [-0.9840996 ,  0.21031807]],

       [[ 0.706734  ,  0.79460365],
        [-0.45994848, -0.31456307],
        [ 0.4815851 ,  0.3064709 ],
        [ 0.7127489 ,  0.68381625]],

       ...,

       [[ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]],

       [[ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]],

       [[ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]]], dtype=float32)