In [1]:
import sys
import time
from typing import Any, Iterable

import gymnasium as gym
import numpy as np
import torch
from torch import optim

from src.datetime import get_current_timestamp
from src.model_db.tiny_model_db import TinyModelDB
from src.module_analysis import count_parameters
from src.moving_averages import ExponentialMovingAverage, AsymmetricExponentialMovingAverage
from src.np_functions import softmax
from src.reinforcement_learning.algorithms.policy_mitosis.async_policy_mitosis import AsyncPolicyMitosis
from src.reinforcement_learning.algorithms.policy_mitosis.mitosis_policy_info import MitosisPolicyInfo
from src.reinforcement_learning.algorithms.policy_mitosis.policy_mitosis_base import PolicyWithEnvAndInfo, TrainInfo
from src.reinforcement_learning.algorithms.ppo.ppo import PPOLoggingConfig, PPO
from src.reinforcement_learning.core.callback import Callback
from src.reinforcement_learning.core.generalized_advantage_estimate import compute_gae_and_returns
from src.reinforcement_learning.core.normalization import NormalizationType
from src.reinforcement_learning.core.objectives import ObjectiveLoggingConfig
from src.reinforcement_learning.gym.parallelize_env import parallelize_env_async
from src.stopwatch import Stopwatch
from src.torch_device import get_torch_device
from src.torch_functions import antisymmetric_power
from src.trees import Forest

%load_ext autoreload
%autoreload 2

pygame 2.5.2 (SDL 2.28.3, Python 3.11.7)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:


nr_carts = 6


def make_multi_agent_cart_pole_env(render_mode: str | None = None, time_limit: float | None = None):
    """
    Build a `MultiAgentCartPole3D` environment with `nr_carts` carts.

    :param render_mode: forwarded unchanged to the environment constructor.
    :param time_limit: episode time limit; any falsy value (``None`` or ``0``)
        is replaced by the default of ``60.0``.
    :return: the newly constructed environment.
    """
    # Imported lazily inside the function, as in the rest of this notebook's factories.
    from src.reinforcement_learning.gym.envs.multi_agent_cartpole3d import MultiAgentCartPole3D

    def constant_step_reward(time_, action, state, prev_state):
        # Small constant reward handed out on every step.
        return 0.01

    def out_of_range_reward(time_, action, state):
        # No penalty at the moment; a previous variant used: -10 + time_ * 3
        return 0.0

    def time_limit_reward(time_, action, state):
        # Bonus for reaching the time limit.
        return 10

    effective_time_limit = time_limit if time_limit else 60.0

    env_kwargs = dict(
        nr_carts=nr_carts,
        cart_size=0.25,
        force_magnitude=500,
        physics_steps_per_step=10,
        reset_position_radius=1.25,
        reset_randomize_position_angle_offset=True,
        reset_position_randomization_magnitude=0.1,
        reset_hinge_randomization_magnitude=0.05,
        slide_range=2,
        hinge_range=1.2,
        time_limit=effective_time_limit,
        step_reward_function=constant_step_reward,
        # The keyword is spelled this way in the environment's constructor.
        out_ouf_range_reward_function=out_of_range_reward,
        time_limit_reward_function=time_limit_reward,
        render_mode=render_mode,
    )
    return MultiAgentCartPole3D(**env_kwargs)

In [None]:



def init_policy():
    """
    Build a fresh actor-critic policy for the multi-agent cart-pole task.

    The network embeds each agent's observation (``in_size`` features), runs it
    through stacked self-attention + MLP blocks (separately for actor and
    critic), and hands the actor latent to a squashed diagonal Gaussian action
    selector with a fixed (non-learnable) standard deviation of 0.15.

    All imports live inside the function body. NOTE(review): this looks
    deliberate - elsewhere in this notebook a policy is rebuilt from a stored
    ``init_policy_source_code`` string via ``init_policy_using_source``, so the
    function presumably has to be self-contained; confirm before hoisting them.

    :return: an ``ActorCriticPolicy`` wrapping a newly initialised ``A2CNetwork``.
    """
    import numpy as np
    import torch
    from torch import nn

    from src.networks.core.net import Net
    from src.networks.core.seq_net import SeqNet
    from src.reinforcement_learning.core.action_selectors.squashed_diag_gaussian_action_selector import \
        SquashedDiagGaussianActionSelector
    from src.reinforcement_learning.core.policies.actor_critic_policy import ActorCriticPolicy
    from src.networks.skip_nets.additive_skip_connection import AdditiveSkipConnection
    from src.weight_initialization import orthogonal_initialization
    from src.networks.multihead_self_attention import MultiheadSelfAttention
    
    # Per-agent observation width and per-agent action width.
    in_size = 8
    action_size = 2
    
    # Depth (number of attention+MLP blocks) and width of the actor trunk.
    actor_layers = 3
    actor_features = 48
    
    # Depth and width of the critic trunk.
    critic_layers = 2
    critic_features = 48

    actor_hidden_activation_function = nn.ELU
    critic_hidden_activation_function = nn.ELU
    
    # Hidden linear layers are orthogonally initialised with gain sqrt(2).
    actor_hidden_initialization = lambda module: orthogonal_initialization(module, gain=np.sqrt(2))
    critic_hidden_initialization = lambda module: orthogonal_initialization(module, gain=np.sqrt(2))

    class A2CNetwork(nn.Module):
        """Actor and critic trunks that share the input but no parameters."""

        def __init__(self):
            """Create the embedding layers, both trunks and the critic's value head."""
            super().__init__()

            self.actor_embedding = nn.Sequential(nn.Linear(in_size, actor_features), actor_hidden_activation_function())
            # Each actor layer: skip(self-attention) -> LayerNorm -> skip(2-layer MLP) -> LayerNorm.
            # NOTE(review): the second Linear of the MLP is sized (in_features, out_features) although it
            # follows a layer producing out_features; this only lines up when in_features == out_features,
            # which presumably always holds since a single num_features is passed - confirm.
            self.actor = SeqNet.from_layer_provider(
                layer_provider=lambda layer_nr, is_last_layer, in_features, out_features: nn.Sequential(
                    AdditiveSkipConnection(MultiheadSelfAttention(
                        embed_dim=in_features,
                        num_heads=4,
                        batch_first=True,
                    )),
                    nn.LayerNorm(in_features),
                    AdditiveSkipConnection(Net.sequential_net(
                        actor_hidden_initialization(nn.Linear(in_features, out_features)),
                        actor_hidden_activation_function(),
                        actor_hidden_initialization(nn.Linear(in_features, out_features)),
                        # The MLP of the final actor layer ends in Tanh instead of the hidden activation.
                        nn.Tanh() if is_last_layer else actor_hidden_activation_function(),
                    )),
                    nn.LayerNorm(in_features),
                ),
                num_layers=actor_layers,
                num_features=actor_features,
            )

            self.critic_embedding = nn.Sequential(nn.Linear(in_size, critic_features), critic_hidden_activation_function())
            # Same block structure as the actor, but without the Tanh on the last layer.
            self.critic = SeqNet.from_layer_provider(
                layer_provider=lambda layer_nr, is_last_layer, in_features, out_features: nn.Sequential(
                    AdditiveSkipConnection(MultiheadSelfAttention(
                        embed_dim=in_features,
                        num_heads=4,
                        batch_first=True,
                    )),
                    nn.LayerNorm(in_features),
                    AdditiveSkipConnection(Net.sequential_net(
                        critic_hidden_initialization(nn.Linear(in_features, out_features)),
                        critic_hidden_activation_function(),
                        critic_hidden_initialization(nn.Linear(in_features, out_features)),
                        critic_hidden_activation_function(),
                    )),
                    nn.LayerNorm(in_features),
                ),
                num_layers=critic_layers,
                num_features=critic_features,
            )
            # Maps the summed critic features to a single value estimate.
            self.critic_regressor = nn.Linear(critic_features, 1)

        def forward(self, x: torch.Tensor):
            """
            Compute the actor latent and the critic value for a batch of observations.

            :param x: tensor whose last two dimensions are unpacked as
                (nr_actors, nr_features); all leading dimensions are treated as batch.
                NOTE(review): ``end_dim=-3`` assumes at least one leading batch
                dimension - TODO confirm callers never pass a 2-D tensor.
            :return: tuple ``(actor_out, critic_out)`` with the leading batch
                dimensions restored.
            """
            *batch_shape, nr_actors, nr_features = x.shape
            # Collapse all leading batch dimensions into one before running the trunks.
            x = torch.flatten(x, end_dim=-3)
            
            actor_out: torch.Tensor = self.actor(self.actor_embedding(x))
            # Critic features are summed over dim -2 (the actor axis of the input) before regression.
            critic_out: torch.Tensor = self.critic_regressor(self.critic(self.critic_embedding(x)).sum(dim=-2))
            
            # Restore the original leading batch dimensions.
            actor_out = actor_out.unflatten(dim=0, sizes=batch_shape)
            critic_out = critic_out.unflatten(dim=0, sizes=batch_shape)
            
            return actor_out, critic_out
        
    return ActorCriticPolicy(A2CNetwork(), SquashedDiagGaussianActionSelector(
        latent_dim=actor_features,
        action_dim=action_size,
        std=0.15,
        std_learnable=False,
        # The action head uses a much smaller orthogonal gain than the hidden layers.
        action_net_initialization=lambda module: orthogonal_initialization(module, gain=0.01),
    ))

def wrap_env(env_):
    """
    Environment wrapping hook; currently applies no wrappers.

    :param env_: the environment to wrap.
    :return: the very same object that was passed in.
    """
    wrapped_env = env_
    return wrapped_env

def train_func(policy_with_env_and_info: PolicyWithEnvAndInfo) -> TrainInfo:
    """
    Train one policy with PPO for ``steps_per_iteration`` steps and report its score.

    Reads the module-level globals ``steps_per_iteration`` and ``device``, which
    are assigned further down in the same cell and therefore must exist before
    this function is called.

    :param policy_with_env_and_info: mapping providing the keys ``'policy'``,
        ``'env'`` and ``'policy_info'``.
    :return: dict with ``'steps_trained'`` (always ``steps_per_iteration``),
        ``'optimizations_done'`` (number of ``on_optimization_done`` callbacks
        received) and ``'score'`` (current value of the asymmetric score EMA).
    """
    policy = policy_with_env_and_info['policy']
    env = policy_with_env_and_info['env']
    
    # `score` holds the mean episode score of the most recent rollout; the EMA uses
    # a different smoothing factor for upward (0.2) and downward (0.5) updates.
    score = 0.0
    score_ema = AsymmetricExponentialMovingAverage(up_alpha=0.2, down_alpha=0.5)
    rollout_stopwatch = Stopwatch()
    def on_rollout_done(rl: PPO, step: int, info: dict[str, Any], scheduler_values: dict[str, Any]):   
        """Callback run after every rollout: update the score statistics and log a status line."""
        
        # Compute returns with gamma = 1 and lambda = 1 and no normalization. If the rollout
        # info carries 'raw_rewards' those are used (with zero value estimates), otherwise
        # the buffer's own rewards are used.
        if 'raw_rewards' in info['rollout']:
            raw_rewards = info['rollout']['raw_rewards']
            _, gamma_1_returns = compute_gae_and_returns(
                value_estimates=np.zeros_like(rl.buffer.rewards[:len(raw_rewards)]),
                rewards=raw_rewards,
                episode_starts=rl.buffer.episode_starts[:len(raw_rewards)],
                last_values=np.zeros_like(rl.buffer.rewards[0], dtype=float),
                last_dones=np.zeros_like(rl.buffer.episode_starts[0], dtype=bool),
                gamma=1.0,
                gae_lambda=1.0,
                normalize_rewards=None,
                normalize_advantages=None,
            )
        else:
            _, gamma_1_returns = rl.buffer.compute_gae_and_returns(
                last_values=torch.zeros_like(rl.buffer.value_estimates[0]),
                last_dones=np.zeros_like(rl.buffer.episode_starts[0], dtype=bool),
                gamma=1.0,
                gae_lambda=1.0,
                normalize_advantages=None,
                normalize_rewards=None,
            )
        
        # Keep only the returns at positions flagged as episode starts.
        # NOTE(review): presumably these are the undiscounted per-episode totals - confirm
        # against compute_gae_and_returns.
        episode_scores = gamma_1_returns[
            rl.buffer.episode_starts[:rl.buffer.pos]
        ]
        
        nonlocal score, score_ema
        # The mean of an empty selection is NaN, which is handled just below.
        score = episode_scores.mean()
        
        current_score_ema = None
        if not np.isnan(score):
            current_score_ema = score_ema.update(score)
        else:
            print(f'================================= Warning ================================= \n'
                  f' Score is NaN! There was likely no episode start/end in the rollout buffer \n'
                  f'=========================================================================== \n\n\n')
        
        # NOTE(review): reset() is presumably returning the time elapsed since the previous reset - confirm.
        rollout_time = rollout_stopwatch.reset()
        
        # Count of episode-start flags, summed along axis 0 of the buffer.
        resets: np.ndarray = rl.buffer.episode_starts.astype(int).sum(axis=0)
        resets_mean = resets.mean()
        resets_min = resets.min()
        
        # `policy_info` is assigned in the enclosing function after this callback is defined
        # but before PPO starts invoking it.
        print(f'{policy_info["policy_id"]}  {step:>6}: '
              f'{score = :9.3f}, '
              f'score_ema = {current_score_ema or score_ema.get():9.3f}, '
              f'time = {rollout_time:5.2f}, '
              f'resets = {resets_mean:5.2f} >= {resets_min:5.2f}')
        sys.stdout.flush()
        
    optimizations_done = 0
    def on_optimization_done(rl: PPO, step: int, info: dict[str, Any], scheduler_values: dict[str, Any]):
        """Callback run after every optimization phase: count how many have happened."""
        nonlocal optimizations_done
        optimizations_done += 1
    
    policy_info = policy_with_env_and_info['policy_info']
    # Human-readable summary of the policy, reused in the start and end log lines.
    policy_info_str = ('('
          f'policy_id = {policy_info["policy_id"]}, '
          f'parent_id = {policy_info["parent_policy_id"]}, '
          f'num_parameters = {count_parameters(policy)}, '
          f'previous_steps = {policy_info["steps_trained"]}, '
          f'previous_score = {policy_info["score"]:9.3f}'
          ')')
    
    print(f'Starting PPO with policy {policy_info_str:s} for {steps_per_iteration:_} steps')
    mitosis_iteration_stopwatch = Stopwatch()
    PPO(
        env=env,
        policy=policy.to(device),
        policy_optimizer=lambda pol: optim.AdamW(pol.parameters(), lr=1e-5),
        buffer_size=5000,
        gamma=0.995,
        gae_lambda=1.0,
        normalize_rewards=None,
        normalize_advantages=NormalizationType.Std,
        weigh_and_reduce_actor_objective=lambda obj: antisymmetric_power(obj, 1.5).mean(),
        weigh_and_reduce_entropy_objective=None,  # lambda obj: 1.0 * obj.mean(),
        weigh_and_reduce_critic_objective=lambda obj: 0.5 * obj.mean(),
        ppo_max_epochs=10,
        ppo_kl_target=0.025,
        ppo_batch_size=500,
        action_ratio_clip_range=0.1,
        grad_norm_clip_value=1.0,
        callback=Callback(
            on_rollout_done=on_rollout_done,
            on_optimization_done=on_optimization_done,
        ),
        logging_config=PPOLoggingConfig(log_rollout_infos=True),
        torch_device=device,
    ).train(steps_per_iteration)
    
    
    # `score` here is the last rollout's mean score (possibly NaN); the returned score is the EMA.
    print(f'Training finished for policy {policy_info_str:s}, end score = {score:9.3f}, time = {mitosis_iteration_stopwatch.time_passed():6.2f}')
    
    return {
        'steps_trained': steps_per_iteration, 
        'optimizations_done': optimizations_done, 
        'score': score_ema.get(),
    }

def select_policy_selection_probs(policy_infos: Iterable[MitosisPolicyInfo]) -> np.ndarray:
    """
    Compute the probability of selecting each stored policy for further training.

    Three softmax terms are combined multiplicatively, each raised to its own
    weight, and the product is renormalised to sum to one:

    * a term over the policies' scores,
    * a term over the negated (discounted) number of descendants in the policy forest,
    * a term over the negated number of steps already trained.

    A table with the individual factors is printed for every policy.

    :param policy_infos: the policy infos to choose from; consumed once into a list.
    :return: array of selection probabilities, in the same order as ``policy_infos``.
    """
    # TODO introduce score change momentum factor, average child score
    policy_infos = list(policy_infos)
    policy_info_forest = Forest(
        policy_infos,
        get_id=lambda pi: pi['policy_id'],
        get_parent_id=lambda pi: pi['parent_policy_id']
    )

    scores = np.array([policy_info['score'] for policy_info in policy_infos], dtype=float)
    # The temperature shrinks with the logarithm of the number of policies. log(1) == 0
    # would turn this into a division by zero for a single policy (and log(0) is -inf for
    # none), so the count is clamped to 2; results for two or more policies are unchanged.
    score_temperature = 0.5 / np.log(max(len(scores), 2))
    score_probs = softmax(scores, temperature=score_temperature, normalize=True)

    num_descendants = np.array([
        policy_info_forest.compute_num_descendants(policy_info['policy_id'], discount_factor=0.5)
        for policy_info in policy_infos
    ], dtype=float)
    num_descendants_probs = softmax(-num_descendants, temperature=0.5)

    steps_trained = np.array([policy_info['steps_trained'] for policy_info in policy_infos], dtype=float)
    steps_trained_probs = softmax(-steps_trained, temperature=0.1, normalize=True)

    # Exponents controlling how strongly each term influences the product.
    score_weight = 1.0
    num_descendants_weight = 0.5
    steps_trained_weight = 0.5

    probs = (
        score_probs**score_weight *
        num_descendants_probs**num_descendants_weight *
        steps_trained_probs**steps_trained_weight
    )
    probs /= probs.sum()

    print('policy selection probs = \n\t' + '\n\t'.join(
        f'{(policy_id := policy_infos[i]["policy_id"])}: {p = :8.6f}, '
        f'score = {policy_infos[i]["score"]:7.3f}, '
        f'score_prob = {score_probs[i]**score_weight:7.5f}, '
        f'num_children = {len(policy_info_forest[policy_id].children)}, '
        f'num_descendants = {num_descendants[i]:7.3f}, '
        f'descendants_prob = {num_descendants_probs[i]**num_descendants_weight:7.5f}, '
        f'steps = {policy_infos[i]["steps_trained"]}, '
        f'steps_prob = {steps_trained_probs[i]**steps_trained_weight:7.5f}, '
        for i, p
        in enumerate(probs)
    ))

    return probs

# Manual device switch: change the literal below to False to train on the CPU.
if True:
    device = get_torch_device("cuda:0")
else:
    device = get_torch_device('cpu')
print(f'using device {device}')

# Training budget per mitosis iteration and number of parallel environments per worker.
steps_per_iteration = 100_000
num_envs = 16

# Fixed id so that repeated runs continue the same mitosis experiment.
# mitosis_id = get_current_timestamp()
mitosis_id = '2024-05-28_20.00.00'
policy_db = TinyModelDB[MitosisPolicyInfo](
    base_path=f'E:/saved_models/rl/MultiAgentCartPole/{nr_carts}/mitosis-{mitosis_id}',
)
# policy_db = TinyModelDB[PolicyInfo](base_path=f'C:/Users/domin/git/pytorch-starter/saved_models/rl/{env_name}/mitosis-{mitosis_id}')

try:
    print(f'Starting {nr_carts} agent cartpole mitosis with id {mitosis_id}')
    AsyncPolicyMitosis(
        num_workers=3,
        policy_db=policy_db,
        train_policy_function=train_func,
        # Every worker gets its own batch of `num_envs` non-rendering environments.
        create_env=lambda: parallelize_env_async(
            lambda: make_multi_agent_cart_pole_env(None),
            num_envs,
        ),
        new_init_policy_function=init_policy,
        new_wrap_env_function=wrap_env,
        # Constant 0.0 regardless of how many policies or primordial ancestors exist.
        new_policy_prob_function=lambda nr_policies, nr_primordial_ancestors: 0.0,
        select_policy_selection_probs=select_policy_selection_probs,
        min_primordial_ancestors=5,
        rng_seed=None,
        initialization_delay=5,
        delay_between_workers=20,
    ).train_with_mitosis(1000)
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to stop a run.
    print('keyboard interrupt')
finally:
    # Always release the model database, however training ended.
    policy_db.close()
    print('model db closed')


print('done')

using device cuda:0
Starting 6 agent cartpole mitosis with id 2024-05-28_20.00.00
Starting worker 0 with delay = 0
policy selection probs = 
	2024-05-28_23.10.15~6QVFyI: p = 0.014433, score =   0.028, score_prob = 0.00042, num_children = 0, num_descendants =   0.000, descendants_prob = 0.25844, steps = 500000, steps_prob = 0.28810, 
	2024-05-28_23.17.44~QlbUFa: p = 0.001557, score =   0.034, score_prob = 0.00042, num_children = 1, num_descendants =   3.000, descendants_prob = 0.01287, steps = 400000, steps_prob = 0.62174, 
	2024-05-28_23.18.28~rlRteB: p = 0.009864, score =   0.034, score_prob = 0.00042, num_children = 0, num_descendants =   0.000, descendants_prob = 0.25844, steps = 550000, steps_prob = 0.19611, 
	2024-05-28_23.25.50~AZeaFt: p = 0.005336, score =   0.036, score_prob = 0.00042, num_children = 1, num_descendants =   1.000, descendants_prob = 0.09507, steps = 500000, steps_prob = 0.28810, 
	2024-05-28_23.33.35~CUWu2J: p = 0.006719, score =   0.035, score_prob = 0.00042, n

In [None]:
def record_video():
    """
    Load one stored policy and record videos of it acting in the environment.

    A render-enabled environment with a 180 s time limit is created, the policy
    with a hard-coded id is rebuilt from the source code stored in its policy
    info and its weights are loaded from the model database. Every episode is
    then recorded to a timestamped folder until 50 000 steps have been taken
    or the kernel is interrupted.

    The model database is only needed while loading, so it is opened in a
    ``with`` block and closed before recording starts. The environment is
    closed in ``finally`` even if loading the policy fails.
    """
    import torch
    from tqdm import tqdm
    from src.reinforcement_learning.gym.singleton_vector_env import as_vec_env
    from gymnasium.wrappers import AutoResetWrapper, RecordVideo
    from src.reinforcement_learning.gym.env_wrapping import wrap_env_using_source
    from src.reinforcement_learning.core.policies.policy_initialization import init_policy_using_source

    record_env = make_multi_agent_cart_pole_env(render_mode='rgb_array', time_limit=180.0)

    try:
        # Context manager guarantees the database handle is released again.
        with TinyModelDB[MitosisPolicyInfo](base_path=f'E:/saved_models/rl/MultiAgentCartPole/6/mitosis-2024-05-28_20.00.00') as policy_db:
            print(policy_db)

            # policy_entry = max(policy_db.all_entries(), key=lambda entry: entry['model_info']['score'])
            policy_entry = policy_db.fetch_entry('2024-06-01_18.12.50~XFh5RF')
            policy_info: MitosisPolicyInfo = policy_entry['model_info']
            print(policy_entry)

            # Rebuild the network from the source code stored with the policy, then load its weights.
            policy = init_policy_using_source(policy_info['init_policy_source_code'])
            policy_db.load_model_state_dict(policy, policy_entry['model_id'])

        record_env = wrap_env_using_source(record_env, policy_info['wrap_env_source_code'])

        # Video frame rate is derived from the environment's physics sub-stepping.
        record_env.metadata['render_fps'] = 500 / record_env.physics_steps_per_step
        record_env = AutoResetWrapper(
            RecordVideo(record_env, video_folder=rf'C:\Users\domin\Videos\rl\{get_current_timestamp()}', episode_trigger=lambda ep_nr: True)
        )
        record_env, _ = as_vec_env(record_env)
        # Todo: wrap env

        policy.reset_sde_noise(1)

        def record(max_steps: int):
            """Step the environment with deterministic actions for `max_steps` steps."""
            with torch.no_grad():
                obs, info = record_env.reset()
                for step in tqdm(range(max_steps)):
                    actions_dist, _ = policy.process_obs(torch.tensor(obs, device='cpu'))
                    actions = actions_dist.get_actions(deterministic=True).cpu().numpy()
                    obs, reward, terminated, truncated, info = record_env.step(actions)

        record(50_000)
    except KeyboardInterrupt:
        print('keyboard interrupt')
    finally:
        print('closing record_env')
        record_env.close()
        print('record_env closed')

# Start recording as soon as this cell is executed.
record_video()

TinyModelDB(self.base_path = 'E:/saved_models/rl/MultiAgentCartPole/6/mitosis-2024-05-28_20.00.00', self.db_file_name = '_model_db.json')
{'model_id': '2024-06-01_18.12.50~XFh5RF', 'parent_model_id': '2024-06-01_16.10.14~ZZAb6N', 'model_info': {'policy_id': '2024-06-01_18.12.50~XFh5RF', 'parent_policy_id': '2024-06-01_16.10.14~ZZAb6N', 'score': 20.431178689379088, 'steps_trained': 1350000, 'env_steps_trained': 24800000, 'init_policy_source_code': 'def init_policy():\n    import numpy as np\n    import torch\n    from torch import nn\n\n    from src.networks.core.net import Net\n    from src.networks.core.seq_net import SeqNet\n    from src.reinforcement_learning.core.action_selectors.squashed_diag_gaussian_action_selector import \\\n        SquashedDiagGaussianActionSelector\n    from src.reinforcement_learning.core.policies.actor_critic_policy import ActorCriticPolicy\n    from src.networks.skip_nets.additive_skip_connection import AdditiveSkipConnection\n    from src.weight_initializ

 18%|█▊        | 8987/50000 [01:09<05:06, 133.71it/s]

Moviepy - Building video C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-0.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-0.mp4



t:   0%|          | 0/9002 [00:00<?, ?it/s, now=None][A
t:   0%|          | 3/9002 [00:00<05:57, 25.20it/s, now=None][A
t:   0%|          | 24/9002 [00:00<01:12, 123.30it/s, now=None][A
t:   0%|          | 44/9002 [00:00<00:57, 156.47it/s, now=None][A
t:   1%|          | 64/9002 [00:00<00:52, 171.12it/s, now=None][A
t:   1%|          | 84/9002 [00:00<00:49, 181.23it/s, now=None][A
t:   1%|          | 104/9002 [00:00<00:47, 186.23it/s, now=None][A
t:   1%|▏         | 124/9002 [00:00<00:47, 187.92it/s, now=None][A
t:   2%|▏         | 144/9002 [00:00<00:46, 191.67it/s, now=None][A
t:   2%|▏         | 164/9002 [00:00<00:45, 193.63it/s, now=None][A
t:   2%|▏         | 184/9002 [00:01<00:45, 195.55it/s, now=None][A
t:   2%|▏         | 204/9002 [00:01<00:45, 195.50it/s, now=None][A
t:   2%|▏         | 224/9002 [00:01<00:44, 196.25it/s, now=None][A
t:   3%|▎         | 244/9002 [00:01<00:44, 196.77it/s, now=None][A
t:   3%|▎         | 264/9002 [00:01<00:44, 195.96it/s, now=None]

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-0.mp4


 18%|█▊        | 9038/50000 [01:56<4:16:14,  2.66it/s]

Moviepy - Building video C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-1.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-1.mp4



t:   0%|          | 0/42 [00:00<?, ?it/s, now=None][A
t:  31%|███       | 13/42 [00:00<00:00, 128.68it/s, now=None][A
t:  81%|████████  | 34/42 [00:00<00:00, 175.75it/s, now=None][A
 18%|█▊        | 9064/50000 [01:56<2:12:37,  5.14it/s]       [A

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-1.mp4


 36%|███▌      | 18033/50000 [03:04<04:21, 122.12it/s]

Moviepy - Building video C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-2.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-2.mp4



t:   0%|          | 0/9002 [00:00<?, ?it/s, now=None][A
t:   0%|          | 9/9002 [00:00<01:39, 89.97it/s, now=None][A
t:   0%|          | 25/9002 [00:00<01:10, 127.40it/s, now=None][A
t:   0%|          | 43/9002 [00:00<00:59, 150.34it/s, now=None][A
t:   1%|          | 63/9002 [00:00<00:52, 169.03it/s, now=None][A
t:   1%|          | 84/9002 [00:00<00:49, 181.05it/s, now=None][A
t:   1%|          | 104/9002 [00:00<00:47, 187.37it/s, now=None][A
t:   1%|▏         | 124/9002 [00:00<00:46, 191.44it/s, now=None][A
t:   2%|▏         | 145/9002 [00:00<00:45, 195.44it/s, now=None][A
t:   2%|▏         | 166/9002 [00:00<00:44, 197.18it/s, now=None][A
t:   2%|▏         | 186/9002 [00:01<00:44, 197.81it/s, now=None][A
t:   2%|▏         | 206/9002 [00:01<00:44, 197.86it/s, now=None][A
t:   3%|▎         | 226/9002 [00:01<00:44, 198.49it/s, now=None][A
t:   3%|▎         | 246/9002 [00:01<00:44, 195.98it/s, now=None][A
t:   3%|▎         | 266/9002 [00:01<00:44, 196.71it/s, now=None]

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-2.mp4


 54%|█████▍    | 27030/50000 [05:02<02:45, 138.73it/s] 

Moviepy - Building video C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-3.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-3.mp4



t:   0%|          | 0/9002 [00:00<?, ?it/s, now=None][A
t:   0%|          | 11/9002 [00:00<01:23, 107.82it/s, now=None][A
t:   0%|          | 30/9002 [00:00<00:57, 154.83it/s, now=None][A
t:   1%|          | 49/9002 [00:00<00:52, 169.20it/s, now=None][A
t:   1%|          | 69/9002 [00:00<00:49, 179.18it/s, now=None][A
t:   1%|          | 88/9002 [00:00<00:48, 182.55it/s, now=None][A
t:   1%|          | 108/9002 [00:00<00:47, 186.52it/s, now=None][A
t:   1%|▏         | 127/9002 [00:00<00:47, 187.05it/s, now=None][A
t:   2%|▏         | 146/9002 [00:00<00:47, 187.97it/s, now=None][A
t:   2%|▏         | 166/9002 [00:00<00:46, 189.34it/s, now=None][A
t:   2%|▏         | 185/9002 [00:01<00:46, 188.38it/s, now=None][A
t:   2%|▏         | 204/9002 [00:01<00:47, 186.60it/s, now=None][A
t:   2%|▏         | 223/9002 [00:01<00:46, 187.01it/s, now=None][A
t:   3%|▎         | 243/9002 [00:01<00:46, 189.22it/s, now=None][A
t:   3%|▎         | 262/9002 [00:01<00:46, 188.32it/s, now=None

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-3.mp4


 54%|█████▍    | 27090/50000 [05:50<1:47:00,  3.57it/s]

Moviepy - Building video C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-4.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-4.mp4



t:   0%|          | 0/58 [00:00<?, ?it/s, now=None][A
t:  21%|██        | 12/58 [00:00<00:00, 116.31it/s, now=None][A
t:  50%|█████     | 29/58 [00:00<00:00, 146.51it/s, now=None][A
t:  79%|███████▉  | 46/58 [00:00<00:00, 154.28it/s, now=None][A
 54%|█████▍    | 27113/50000 [05:53<1:16:50,  4.96it/s]      [A

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-4.mp4


 54%|█████▍    | 27133/50000 [05:54<42:59,  8.86it/s]  

Moviepy - Building video C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-5.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-5.mp4



t:   0%|          | 0/50 [00:00<?, ?it/s, now=None][A
t:  24%|██▍       | 12/50 [00:00<00:00, 114.26it/s, now=None][A
t:  62%|██████▏   | 31/50 [00:00<00:00, 157.89it/s, now=None][A
 54%|█████▍    | 27157/50000 [05:54<24:16, 15.69it/s]        [A

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-06-01_19.22.37\rl-video-episode-5.mp4


 70%|██████▉   | 34787/50000 [06:49<01:47, 142.06it/s]

In [5]:

from tinydb import Query

# One-off maintenance: set `optimizations_done` to 0 in the model info of every stored entry.
with TinyModelDB[MitosisPolicyInfo](base_path=f'E:/saved_models/rl/MultiAgentCartPole/6/mitosis-2024-05-28_20.00.00') as policy_db:
    # To delete specific entries instead, collect their ids and call
    # policy_db.delete_entry(id, delete_state_dict=True) for each one.

    model_query = Query()
    for entry in policy_db.all_entries():
        model_info = entry['model_info']
        model_info['optimizations_done'] = 0
        # Write the modified info back to the row with the matching model id.
        matches_entry = model_query.model_id == entry['model_id']
        policy_db.db.update({'model_info': model_info}, matches_entry)