In [1]:
import time

from gymnasium import Env

from src.datetime import get_current_timestamp
from src.module_analysis import count_parameters, get_gradients_per_parameter
from src.moving_averages import ExponentialMovingAverage
from src.networks.core.tensor_shape import TensorShape
from src.networks.core.torch_wrappers.torch_net import TorchNet
from src.reinforcement_learning.core.generalized_advantage_estimate import compute_gae_and_returns
from src.reinforcement_learning.gym.envs.normalize_reward_wrapper import NormalizeRewardWrapper
from src.stopwatch import Stopwatch
from src.summary_statistics import format_summary_statics
from src.reinforcement_learning.core.policies.actor_critic_policy import ActorCriticPolicy
from typing import Any, SupportsFloat, Optional
from gymnasium.wrappers import RecordVideo, AutoResetWrapper, NormalizeReward, TransformReward, TransformObservation
from src.reinforcement_learning.core.callback import Callback
from src.reinforcement_learning.a2c.a2c import A2C
from src.reinforcement_learning.ppo.ppo import PPO
from src.reinforcement_learning.core.normalization import NormalizationType
from src.reinforcement_learning.gym.envs.step_skip_wrapper import StepSkipWrapper
from src.reinforcement_learning.core.rl_base import RLBase
from src.torch_device import set_default_torch_device
from src.reinforcement_learning.gym.envs.parallelize_env import parallelize_env_async
from torch.distributions import Normal, Categorical

import torch
from torch import optim, nn
from src.networks.core.seq_net import SeqNet
import gymnasium as gym
import numpy as np

%load_ext autoreload
%autoreload 2

In [7]:


def init_policy(continuous_actions: bool, actions_std: Optional[float]):
    class A2CNetwork(nn.Module):

        def __init__(self):
            super().__init__()

            # in_size = 4
            # shared_out_sizes = [64, 64]
            # actor_out_sizes = [64, 2]
            # critic_out_sizes = [64, 1]

            # in_size = 24
            # shared_out_sizes = [64, 128, 128]
            # actor_out_sizes = [128, 64, 4]
            # critic_out_sizes = [128, 64, 1]

            # in_size = 8
            # shared_out_sizes = [128, 256, 256]
            # actor_out_sizes = [256, 256, 128, 128, 4]
            # critic_out_sizes = [256, 256, 128, 128, 1]
            
            # in_size = 128
            # shared_out_sizes = [512, 512, 512, 512, 512]
            # actor_out_sizes = [512, 512, 512, 512, 512, 512, 5]
            # critic_out_sizes = [512, 512, 512, 512, 512, 512, 1]

            # in_size = 128
            # shared_out_sizes = [256, 256, 256]
            # actor_out_sizes = [256, 256, 256, 256, 256, 5]
            # critic_out_sizes = [256, 256, 256, 256, 256, 1]

            # in_size = 17
            # shared_out_sizes = []
            # actor_out_sizes = [64, 64, 64, 64, 64, 64, 6]
            # critic_out_sizes = [64, 64, 64, 64, 64, 64, 1]
            
            in_size = 27
            shared_out_sizes = []
            actor_out_sizes = [96, 96, 96, 96, 96, 96, 8]
            critic_out_sizes = [64, 64, 64, 64, 64, 64, 1]
            
            hidden_activation_function = nn.ELU()
            actor_out_activation_function = nn.Tanh()
            critic_out_activation_function = nn.Identity()
            
            self.has_shared = len(shared_out_sizes) > 0
            
            if self.has_shared:
                self.shared = SeqNet.from_layer_provider(
                    layer_provider=lambda layer_nr, is_last_layer, in_features, out_features: nn.Sequential(
                        nn.Linear(in_features, out_features),
                        hidden_activation_function
                    ),
                    in_size=in_size,
                    out_sizes=shared_out_sizes
                )
            else:
                self.shared = TorchNet(nn.Identity(), in_shape=TensorShape(features=in_size), out_shape=TensorShape(features=in_size))

            self.actor = SeqNet.from_layer_provider(
                layer_provider=lambda layer_nr, is_last_layer, in_features, out_features: nn.Sequential(
                    nn.Linear(in_features, out_features),
                    actor_out_activation_function if is_last_layer else hidden_activation_function
                ),
                in_size=self.shared.out_shape.get_definite_features(),
                out_sizes=actor_out_sizes
            )

            self.critic = SeqNet.from_layer_provider(
                layer_provider=lambda layer_nr, is_last_layer, in_features, out_features: nn.Sequential(
                    nn.Linear(in_features, out_features),
                    critic_out_activation_function if is_last_layer else hidden_activation_function
                ),
                in_size=self.shared.out_shape.get_definite_features(),
                out_sizes=critic_out_sizes
            )

        def forward(self, x: torch.Tensor):
            if self.has_shared:
                shared_out = self.shared(x)
            else:
                shared_out = x

            return self.actor(shared_out), self.critic(shared_out)

    return ActorCriticPolicy(A2CNetwork(), continuous_actions, actions_std)


score_mean_ema = ExponentialMovingAverage(alpha=0.1)
stopwatch = Stopwatch()

def on_optimization_done(rl: PPO, step: int, info: dict[str, Any]):
    time_taken = stopwatch.reset()
    
    if 'unnormalized_rewards' in info['rollout']:
        unnormalized_rewards = info['rollout']['unnormalized_rewards']
        _, gamma_1_returns = compute_gae_and_returns(
            value_estimates=np.zeros_like(rl.buffer.rewards[:len(unnormalized_rewards)]),
            rewards=unnormalized_rewards,
            episode_starts=rl.buffer.episode_starts[:len(unnormalized_rewards)],
            last_values=np.zeros_like(rl.buffer.rewards[0], dtype=float),
            last_dones=np.zeros_like(rl.buffer.episode_starts[0], dtype=bool),
            gamma=1.0,
            gae_lambda=1.0,
            normalize_rewards=None,
            normalize_advantages=None,
        )
    else:
        _, gamma_1_returns = rl.buffer.compute_gae_and_returns(
            last_values=torch.zeros_like(rl.buffer.value_estimates[0]),
            last_dones=np.zeros_like(rl.buffer.episode_starts[0], dtype=bool),
            gamma=1.0,
            gae_lambda=1.0,
            normalize_advantages=None,
            normalize_rewards=None,
        )
    
    episode_start_gamma_1_returns = gamma_1_returns[
        rl.buffer.episode_starts[:rl.buffer.pos]
    ]
    
    score_moving_average = score_mean_ema.update(episode_start_gamma_1_returns.mean())
    
    scores = format_summary_statics(
        episode_start_gamma_1_returns, 
        mean_format=' 6.1f',
        std_format='4.1f',
        min_value_format=' 6.1f',
        max_value_format='5.1f',
    )
    advantages = format_summary_statics(
        info['advantages'], 
        mean_format=' 6.3f',
        std_format='.1f',
        min_value_format=' 7.3f',
        max_value_format='6.3f',
    )
    abs_actor_obj = format_summary_statics(
        info['actor_objective_unreduced'].abs() * rl.actor_objective_weight,  
        mean_format=' 5.3f',
        std_format='5.3f',
        min_value_format=None,
        max_value_format=None,
    )
    critic_obj = format_summary_statics(
        info['weighted_critic_objective'], 
        mean_format='5.3f',
        std_format='5.3f',
        min_value_format=None,
        max_value_format=None,
    )
    resets = format_summary_statics(
        rl.buffer.episode_starts.astype(int).sum(axis=0), 
        mean_format='.2f',
        std_format=None,
        min_value_format='1d',
        max_value_format=None,
    )
    print(f"{step = : >7}, "
          f'{scores = :s}, '
          f'score_ema = {score_moving_average: 6.1f}, '
          f"{advantages = :s}, "
          f"{abs_actor_obj = :s}, "
          f"{critic_obj = :s}, "
          f"{resets = :s}, "
          f"time = {time_taken:4.1f}")
    
    # for param_name, param_grad in get_gradients_per_parameter(rl.policy, param_type='weight'):
    #     print(f'{param_name + ".grad":<50}: ' + format_summary_statics(
    #         param_grad,
    #         mean_format=' 8.5f',
    #         std_format='.5f',
    #         min_value_format=' 8.5f',
    #         max_value_format='7.5f',
    #     ))
    # 
    # print('\n')


device = set_default_torch_device("cuda:0") if True else set_default_torch_device('cpu')
print(f'using device {device}')

# env = parallelize_env_async(lambda: gym.make("CartPole-v1", max_episode_steps=1000, render_mode='rgb_array'), 16)
# env = parallelize_env_async(lambda: StepSkipWrapper(gym.make("BipedalWalker-v3", max_episode_steps=3000, render_mode='rgb_array'), steps_per_step=5), 8)
# env = parallelize_env_async(lambda: gym.make("LunarLander-v2", render_mode='rgb_array'), 128)
# env = parallelize_env_async(lambda: gym.make("ALE/Pacman-ram-v5", render_mode='rgb_array'), 128)
# env = parallelize_env_async(lambda: gym.make("ALE/Asteroids-ram-v5", render_mode='rgb_array'), 16)

def create_env(render_mode: str | None):
    return gym.make(env_name, render_mode=render_mode, **env_kwargs)

def wrap_env(_env: Env):
    # _env = NormalizeRewardWrapper(_env, gamma=gamma)
    # _env = TransformObservation(_env, lambda _obs: _obs / 255)
    # _env = TransformReward(_env, lambda _reward: 0.01 * _reward) 
    return _env

env_name = "Ant-v4"
env_kwargs = {'ctrl_cost_weight': 0.001, 'healthy_reward': 0.001}
num_envs = 32
env = parallelize_env_async(lambda: create_env(render_mode=None), num_envs)
gamma = 0.995
    
try:
    env = wrap_env(env)
    print(f'{env = }, {num_envs = }')
    
    # policy = init_policy(continuous_actions=True, actions_std=0.1)
    # policy.load_state_dict(torch.load('saved_models/rl/Pacman-ram-v5/2024-04-25_22.49.18---3+5x128--state_dict.pth'))
    print(f'{count_parameters(policy) = }')
    
    PPO(
        env=env,
        policy=policy.to(device),
        policy_optimizer=lambda pol: optim.Adam(pol.parameters(), lr=1e-5),
        buffer_size=2500,
        gamma=gamma,
        gae_lambda=1.0,
        normalize_rewards=None,
        normalize_advantages=NormalizationType.Std,
        actor_objective_weight=1.0,
        critic_objective_weight=0.5,
        ppo_epochs=3,
        ppo_batch_size=500,
        action_ratio_clip_range=0.13,
        log_unreduced=True,
        callback=Callback(on_optimization_done=on_optimization_done)
    ).train(5_000_000)
except KeyboardInterrupt:
    print('keyboard interrupt')
finally:
    print('closing envs')
    env.close()
    print('envs closed')

print('done')

using device cuda:0
env = AsyncVectorEnv(32), num_envs = 32
count_parameters(policy) = 72681
envs reset
step =    2500, scores =  171.7 ± 67.2 [  64.9, 297.8], score_ema =  171.7, advantages =  0.513 ± 1.0 [ -3.826,  2.976], abs_actor_obj =  2.640 ± 3.920, critic_obj = 147.997 ± 48.019, resets = 2.00 ≥ 2, time = 16.9
step =    5000, scores =  152.4 ± 112.2 [  -0.4, 297.6], score_ema =  169.8, advantages =  0.354 ± 1.0 [ -2.629,  2.657], abs_actor_obj =  2.988 ± 3.914, critic_obj = 146.312 ± 38.925, resets = 3.00 ≥ 3, time = 11.6
step =    7500, scores =  180.5 ± 66.2 [  77.3, 296.0], score_ema =  170.8, advantages =  0.457 ± 1.0 [ -3.286,  2.918], abs_actor_obj =  2.737 ± 3.844, critic_obj = 160.456 ± 46.160, resets = 2.00 ≥ 2, time = 11.8
step =   10000, scores =  143.4 ± 110.1 [  -0.6, 281.2], score_ema =  168.1, advantages =  0.202 ± 1.0 [ -2.950,  2.438], abs_actor_obj =  3.375 ± 4.210, critic_obj = 163.127 ± 24.987, resets = 3.00 ≥ 3, time = 12.2
step =   12500, scores =  180.2 ± 

In [9]:
record_env: gym.Env = create_env(render_mode='rgb_array')
record_env.metadata['render_fps'] = 20
try:
    record_env = wrap_env(record_env)
    record_env = AutoResetWrapper(
        RecordVideo(record_env, video_folder=rf'C:\Users\domin\Videos\rl\{get_current_timestamp()}', episode_trigger=lambda ep_nr: True)
    )
    def record(max_steps: int):
        obs, info = record_env.reset()
        for step in range(max_steps):
            actions_dist = policy.predict_actions(obs)
            actions = actions_dist.sample().detach().cpu().numpy()
            obs, reward, terminated, truncated, info = record_env.step(actions)
    
    record(10000)
except KeyboardInterrupt:
    print('keyboard interrupt')
finally:
    print('closing record_env')
    record_env.close()
    print('record_env closed')

Moviepy - Building video C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-0.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-0.mp4


                                                                

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-0.mp4
Moviepy - Building video C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-1.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-1.mp4


                                                                

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-1.mp4
Moviepy - Building video C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-2.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-2.mp4


                                                                

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-2.mp4
Moviepy - Building video C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-3.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-3.mp4


                                                                

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-3.mp4
Moviepy - Building video C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-4.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-4.mp4


                                                                

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-4.mp4
keyboard interrupt
closing record_env
Moviepy - Building video C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-5.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-5.mp4


                                                             

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-04-27_03.35.06\rl-video-episode-5.mp4
record_env closed


In [8]:
torch.save(policy.state_dict(), f'saved_models/rl/{env_name}/{get_current_timestamp()}-6x96_6x64--state_dict.pth')

In [30]:
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
import gymnasium as gym

# Parallel environments
vec_env = make_vec_env(lambda: gym.make('CartPole-v1', render_mode='rgb_array'), n_envs=4)

model = A2C("MlpPolicy", vec_env, verbose=2)
model.learn(total_timesteps=25000)
model.save("a2c_cartpole")

del model # remove to demonstrate saving and loading

model = A2C.load("a2c_cartpole")

obs = vec_env.reset()
for _ in range(100_000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render("human")

In [11]:
import gymnasium
record_env = gymnasium.make("ALE/Pacman-ram-v5", render_mode='rgb_array')
try:
    record_env = AutoResetWrapper(
        RecordVideo(record_env, video_folder=r'C:\Users\domin\Videos\rl\2024-04-24.1', episode_trigger=lambda ep_nr: True)
    )
    def record(max_steps: int):
        obs, info = record_env.reset()
        for step in range(max_steps):
            # actions_dist = policy.predict_actions(obs)
            # actions = actions_dist.sample().detach().cpu().numpy()
            
            actions = 2
            
            obs, reward, terminated, truncated, info = record_env.step(actions)
            
            if terminated or truncated:
                break
    
    record(10000)
except KeyboardInterrupt:
    print('keyboard interrupt')
finally:
    print('closing record_env')
    record_env.close()
    print('record_env closed')

Moviepy - Building video C:\Users\domin\Videos\rl\2024-04-24.1\rl-video-episode-0.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-04-24.1\rl-video-episode-0.mp4


                                                                

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-04-24.1\rl-video-episode-0.mp4
closing record_env
Moviepy - Building video C:\Users\domin\Videos\rl\2024-04-24.1\rl-video-episode-1.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-04-24.1\rl-video-episode-1.mp4


                                                  

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-04-24.1\rl-video-episode-1.mp4
record_env closed




<OrderEnforcing<PassiveEnvChecker<AtariEnv<ALE/Pacman-ram-v5>>>>
