In [1]:
from typing import SupportsFloat, Any

from gymnasium.core import ActType, ObsType

from tmp_mp import main

In [2]:
# Invoke the `main` entry point imported from `tmp_mp` in the cell above.
# NOTE(review): what `main` actually runs is not visible from this notebook — confirm in tmp_mp.
main()

In [5]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Train PPO on HalfCheetah with several parallel environments, then round-trip
# the model through disk to demonstrate saving and loading.
ENV_ID = "HalfCheetah-v4"
# The save path is named after the environment that is actually trained; the
# previous name ("ppo_cartpole") did not match the HalfCheetah model stored in it.
MODEL_PATH = "ppo_halfcheetah"

# Parallel environments
vec_env = make_vec_env(ENV_ID, n_envs=4)

model = PPO("MlpPolicy", vec_env, use_sde=True, sde_sample_freq=100, verbose=2)
model.learn(total_timesteps=250000)
model.save(MODEL_PATH)

del model # remove to demonstrate saving and loading

# Reload from the same path that was just written.
model = PPO.load(MODEL_PATH)

obs = vec_env.reset()
# Playback loop (disabled). Note that it has no exit condition:
# while True:
#     action, _states = model.predict(obs)
#     obs, rewards, dones, info = vec_env.step(action)
#     vec_env.render("human")

In [14]:
import torch
import numpy as np

# Random stand-in data for the timing cell below: an array of shape (2500, 32, 17)
# that is consumed one (32, 17) slice at a time.
# NOTE(review): the axes presumably mean (steps, envs, obs_dim) — confirm.
rollout_data = np.random.random((2500, 32, 17))

In [15]:
%%timeit
# Benchmark: build a new tensor on the 'cuda' device from each slice of
# rollout_data, one slice per iteration. The result is overwritten every
# iteration, so only the conversion itself is being timed.
for i in range(len(rollout_data)):
    tensor = torch.tensor(rollout_data[i], device='cuda')

In [6]:
import types
import numpy as np
import torch
from torch import nn
class A(nn.Module):
    """Wrapper around a single 17 -> 17 linear layer whose ``forward`` is aliased.

    Rather than defining a ``forward`` method, the instance attribute
    ``forward`` is bound directly to the linear layer's own ``forward``, so
    calling the module dispatches straight into ``nn.Linear.forward``.
    """

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(in_features=17, out_features=17)

        # Instance-level alias: shadows the class-level forward lookup.
        self.forward = self.linear.forward

    # Conventional alternative to the alias above (kept disabled):
    # def forward(self, x):
    #     return self.linear(x)
    
# Benchmark input: random values drawn with NumPy, then converted to a float32
# torch tensor. The timing cells below iterate over the first axis.
test_data_shape = (2500, 1024, 17)
test_data = torch.tensor(np.random.random(test_data_shape)).float()

# Module instance exercised by the timing cells below.
a = A()

In [7]:
%%timeit

# Benchmark: call the module `a` on each slice of test_data along the first
# axis. No torch.no_grad() context is used and the output is discarded.
for i in range(len(test_data)):
    a(test_data[i])

In [8]:
%%timeit

# Benchmark: same per-slice call of `a` as the previous timing cell, but each
# output is additionally passed through .detach() before being discarded.
for i in range(len(test_data)):
    a(test_data[i]).detach()

In [1]:
from src.torch_device import get_torch_device

# Evaluate the project helper; as the cell's last expression, its return value is displayed.
# NOTE(review): presumably reports which torch device the project will use — confirm in src.torch_device.
get_torch_device()

In [None]:
import gymnasium

# Construct the Ant-v4 environment; the created env is the cell's displayed result.
gymnasium.make('Ant-v4')

In [None]:
%load_ext autoreload
%autoreload 2

def record_video(
    model_db_path: str = 'E:/saved_models/rl/Ant-v4/mitosis-2024-06-10_19.43.13',
    video_root: str = r'C:\Users\domin\Videos\rl',
    env_id: str = 'Ant-v4',
    max_steps: int = 50_000,
    render_fps: int = 30,
):
    """Record videos of a stored policy acting deterministically in an environment.

    The policy with the second-highest ``score`` in the model database is
    rebuilt from its stored initialization info, its weights are loaded, and it
    is stepped through an auto-resetting, video-recording environment.

    All parameters default to the values that used to be hard-coded, so
    calling ``record_video()`` with no arguments behaves as before.

    Args:
        model_db_path: Base path of the ``TinyModelDB`` holding the policies.
        video_root: Directory under which a timestamped sub-folder is created
            for the videos. The sub-folder is appended with a backslash
            separator, as in the original Windows-only path.
        env_id: Gymnasium environment id to create with ``render_mode='rgb_array'``.
        max_steps: Total number of environment steps to record.
        render_fps: Value written to ``record_env.metadata['render_fps']``
            before the recording wrapper is applied.

    Raises:
        IndexError: If the database yields fewer than two entries, because the
            second-best entry is selected with index ``-2``.
    """
    import torch
    from tqdm import tqdm
    from src.reinforcement_learning.gym.singleton_vector_env import as_vec_env
    import gymnasium
    from gymnasium.wrappers import AutoResetWrapper, RecordVideo
    from src.model_db.tiny_model_db import TinyModelDB
    from src.reinforcement_learning.algorithms.policy_mitosis.mitosis_policy_info import MitosisPolicyInfo
    from src.reinforcement_learning.core.policy_construction import PolicyConstruction
    from src.datetime import get_current_timestamp

    record_env, _ = as_vec_env(gymnasium.make(env_id, render_mode='rgb_array'))

    policy_db = TinyModelDB[MitosisPolicyInfo](base_path=model_db_path)
    print(policy_db)

    # Ascending sort by score, so index -2 is the entry with the second-highest score.
    entries_by_score = sorted(policy_db.all_entries(), key=lambda entry: entry['model_info']['score'])
    policy_entry = entries_by_score[-2]
    # Alternative: select one specific entry by id instead of by rank.
    # policy_entry = policy_db.fetch_entry('2024-06-10_22.13.57~PJHPLG')
    policy_info: MitosisPolicyInfo = policy_entry['model_info']
    print(policy_entry)

    # init_from_info also returns an environment, which replaces record_env from here on.
    policy, _, record_env = PolicyConstruction.init_from_info(policy_info['initialization_info'], record_env)

    policy_db.load_model_state_dict(policy_entry['model_id'], policy)

    try:
        record_env.metadata['render_fps'] = render_fps
        record_env = AutoResetWrapper(
            RecordVideo(
                record_env,
                video_folder=rf'{video_root}\{get_current_timestamp()}',
                episode_trigger=lambda ep_nr: True,  # record every episode
            )
        )

        # Inference only: no gradients are needed while recording.
        with torch.no_grad():
            obs, _info = record_env.reset()
            for _step in tqdm(range(max_steps)):
                actions_dist, _ = policy.process_obs(torch.tensor(obs, device='cpu'))
                actions = actions_dist.get_actions(deterministic=True).cpu().numpy()
                obs, _reward, _terminated, _truncated, _info = record_env.step(actions)
    except KeyboardInterrupt:
        # Allow stopping a long recording by hand; cleanup still runs below.
        print('keyboard interrupt')
    finally:
        print('closing record_env')
        record_env.close()
        print('record_env closed')

# Start the recording as soon as this cell is executed.
record_video()

In [10]:
from gymnasium.wrappers import AutoResetWrapper, TimeLimit
import torch
import numpy as np
import gymnasium
from gymnasium.core import ActType, ObsType
from typing import Any, SupportsFloat
from src.reinforcement_learning.core.generalized_advantage_estimate import compute_episode_returns, compute_returns


class TestEnv(gymnasium.Env):
    """Deterministic counting environment for sanity-checking return computation.

    The observation is a one-element array holding the number of steps taken
    in the current episode. Every step yields reward 0.0 except the step on
    which the counter reaches ``episode_length``; that step yields
    ``final_reward`` and terminates the episode. The action is ignored.

    Args:
        episode_length: Number of steps after which an episode terminates.
        final_reward: Reward returned on the terminating step.
    """

    # Steps taken in the current episode; set by reset().
    counter: int

    def __init__(self, episode_length: int = 10, final_reward: float = 5.0):
        super().__init__()
        self.episode_length = episode_length
        self.final_reward = final_reward

    def reset(
        self,
        *,
        seed: int | None = None,
        options: dict[str, Any] | None = None,
    ) -> tuple[ObsType, dict[str, Any]]:
        """Start a new episode and return ``(observation, info)`` with the counter at 0."""
        # Forward the seed to the base class instead of silently dropping it.
        super().reset(seed=seed)
        self.counter = 0
        return np.array([self.counter]), {}

    def step(
        self, action: ActType
    ) -> tuple[ObsType, SupportsFloat, bool, bool, dict[str, Any]]:
        """Advance the counter by one.

        Returns ``(observation, reward, terminated, truncated, info)``.
        ``truncated`` is always ``False`` and ``info`` is always empty.
        """
        self.counter += 1
        terminated = self.counter >= self.episode_length
        reward = self.final_reward if terminated else 0.0
        return np.array([self.counter]), reward, terminated, False, {}



# Wrap the toy environment so that episodes restart automatically, with a
# 100-step time limit on top.
env = AutoResetWrapper(TimeLimit(TestEnv(), 100))
obs, _ = env.reset()
episode_starts = True

# Note: `dones` holds the episode-start flag of each step, despite its name.
obs_l, rewards, dones = [], [], []

for _step in range(50):
    # Store the observation and flag that were current before taking the step.
    obs_l.append(obs)
    dones.append(episode_starts)

    obs, reward, terminated, truncated, _info = env.step(None)
    rewards.append(reward)
    episode_starts = np.logical_or(terminated, truncated)

# Add a trailing axis of size 1 to each per-step sequence.
rewards_column = np.array(rewards)[:, np.newaxis]
episode_starts_column = np.array(dones)[:, np.newaxis]

returns, episode_returns = compute_episode_returns(
    rewards=rewards_column,
    episode_starts=episode_starts_column,
    last_episode_starts=np.array([episode_starts]),
    gamma=1.0,
    gae_lambda=1.0,
    normalize_rewards=None,
    remove_unfinished_episodes=True,
)

# One line per recorded step: index; observation; reward; return; episode-start flag.
step_rows = zip(obs_l, rewards, returns, dones)
for i, step_row in enumerate(step_rows):
    obs_, reward_, return_, done = step_row
    print(f'{i}; {obs_}; {reward_}; {return_}; {done}')
# Observation and flag left over after the final step.
print(obs, episode_starts)
print()

print(len(episode_returns), np.mean(episode_returns), np.std(episode_returns))

