In [20]:
from src.datetime import get_current_timestamp
from src.module_analysis import count_parameters, get_gradients_per_parameter
from src.reinforcement_learning.core.generalized_advantage_estimate import compute_gae_and_returns
from src.reinforcement_learning.gym.envs.normalize_reward_wrapper import NormalizeRewardWrapper
from src.summary_statistics import format_summary_statics
from src.reinforcement_learning.core.policies.actor_critic_policy import ActorCriticPolicy
from typing import Any, SupportsFloat, Optional
from gymnasium.wrappers import RecordVideo, AutoResetWrapper, NormalizeReward, TransformReward, TransformObservation
from src.reinforcement_learning.core.callback import Callback
from src.reinforcement_learning.a2c.a2c import A2C
from src.reinforcement_learning.ppo.ppo import PPO
from src.reinforcement_learning.core.normalization import NormalizationType
from src.reinforcement_learning.gym.envs.step_skip_wrapper import StepSkipWrapper
from src.reinforcement_learning.core.rl_base import RLBase
from src.torch_device import set_default_torch_device
from src.reinforcement_learning.gym.envs.parallelize_env import parallelize_env_async
from torch.distributions import Normal, Categorical

import torch
from torch import optim, nn
from src.networks.core.seq_net import SeqNet
import gymnasium as gym
import numpy as np

# %load_ext autoreload
# %autoreload 2

In [18]:


def init_policy(continuous_actions: bool, actions_std: Optional[float]):
    class A2CNetwork(nn.Module):

        def __init__(self):
            super().__init__()

            # in_size = 4
            # shared_out_sizes = [64, 64]
            # actor_out_sizes = [64, 2]
            # critic_out_sizes = [64, 1]

            # in_size = 24
            # shared_out_sizes = [64, 128, 128]
            # actor_out_sizes = [128, 64, 4]
            # critic_out_sizes = [128, 64, 1]

            # in_size = 8
            # shared_out_sizes = [128, 256, 256]
            # actor_out_sizes = [256, 256, 128, 128, 4]
            # critic_out_sizes = [256, 256, 128, 128, 1]
            
            in_size = 128
            shared_out_sizes = [512, 512, 512, 512, 512]
            actor_out_sizes = [512, 512, 512, 512, 512, 512, 5]
            critic_out_sizes = [512, 512, 512, 512, 512, 512, 1]

            self.shared = SeqNet.from_layer_provider(
                layer_provider=lambda layer_nr, is_last_layer, in_features, out_features: nn.Sequential(
                    nn.Linear(in_features, out_features),
                    nn.ReLU() if not is_last_layer else nn.ReLU()
                ),
                in_size=in_size,
                out_sizes=shared_out_sizes
            )

            self.actor = SeqNet.from_layer_provider(
                layer_provider=lambda layer_nr, is_last_layer, in_features, out_features: nn.Sequential(
                    nn.Linear(in_features, out_features),
                    nn.ReLU() if not is_last_layer else nn.Tanh()
                ),
                in_size=self.shared.out_shape.get_definite_features(),
                out_sizes=actor_out_sizes
            )

            self.critic = SeqNet.from_layer_provider(
                layer_provider=lambda layer_nr, is_last_layer, in_features, out_features: nn.Sequential(
                    nn.Linear(in_features, out_features),
                    nn.ReLU() if not is_last_layer else nn.Identity()
                ),
                in_size=self.shared.out_shape.get_definite_features(),
                out_sizes=critic_out_sizes
            )

        def forward(self, x: torch.Tensor):
            shared_out = self.shared(x)

            return self.actor(shared_out), self.critic(shared_out)

    return ActorCriticPolicy(A2CNetwork(), continuous_actions, actions_std)

def on_optimization_done(rl: PPO, step: int, info: dict[str, Any]):
    
    print(rl.buffer.pos)
    
    if 'unnormalized_rewards' in info['rollout']:
        unnormalized_rewards = info['rollout']['unnormalized_rewards']
        _, gamma_1_returns = compute_gae_and_returns(
            value_estimates=np.zeros_like(rl.buffer.rewards[:len(unnormalized_rewards)]),
            rewards=unnormalized_rewards,
            episode_starts=rl.buffer.episode_starts[:len(unnormalized_rewards)],
            last_values=np.zeros_like(rl.buffer.rewards[0], dtype=float),
            last_dones=np.zeros_like(rl.buffer.episode_starts[0], dtype=bool),
            gamma=1.0,
            gae_lambda=1.0,
            normalize_rewards=None,
            normalize_advantages=None,
        )
    else:
        _, gamma_1_returns = rl.buffer.compute_gae_and_returns(
            last_values=torch.zeros_like(rl.buffer.value_estimates[0]),
            last_dones=np.zeros_like(rl.buffer.episode_starts[0], dtype=bool),
            gamma=1.0,
            gae_lambda=1.0,
            normalize_advantages=None,
            normalize_rewards=None,
        )
    
    episode_start_gamma_1_returns = gamma_1_returns[
        rl.buffer.episode_starts[:rl.buffer.pos]
    ]
    
    scores = format_summary_statics(
        episode_start_gamma_1_returns, 
        mean_format=' 6.1f',
        std_format='4.1f',
        min_value_format=' 6.1f',
        max_value_format='5.1f',
    )
    advantages = format_summary_statics(
        info['advantages'], 
        mean_format=' 6.3f',
        std_format='.1f',
        min_value_format=' 7.3f',
        max_value_format='6.3f',
    )
    abs_actor_obj = format_summary_statics(
        info['actor_objective_unreduced'].abs() * rl.actor_objective_weight,  
        mean_format=' 5.3f',
        std_format='5.3f',
        min_value_format=None,
        max_value_format=None,
    )
    critic_obj = format_summary_statics(
        info['weighted_critic_objective'], 
        mean_format='5.3f',
        std_format='5.3f',
        min_value_format=None,
        max_value_format=None,
    )
    resets = format_summary_statics(
        rl.buffer.episode_starts.astype(int).sum(axis=0), 
        mean_format='.2f',
        std_format=None,
        min_value_format='1d',
        max_value_format=None,
    )
    print(f"{step = : >7}, "
          f'{scores = :s}, '
          f"{advantages = :s}, "
          f"{abs_actor_obj = :s}, "
          f"{critic_obj = :s}, "
          f"{resets = :s}")
    
    # for param_name, param_grad in get_gradients_per_parameter(rl.policy, param_type='weight'):
    #     print(f'{param_name + ".grad":<50}: ' + format_summary_statics(
    #         param_grad,
    #         mean_format=' 8.5f',
    #         std_format='.5f',
    #         min_value_format=' 8.5f',
    #         max_value_format='7.5f',
    #     ))
    # 
    # print('\n')


device = set_default_torch_device("cuda:0") if True else set_default_torch_device('cpu')
print(f'using device {device}')

# env = parallelize_env_async(lambda: gym.make("CartPole-v1", max_episode_steps=1000, render_mode='rgb_array'), 16)
# env = parallelize_env_async(lambda: StepSkipWrapper(gym.make("BipedalWalker-v3", max_episode_steps=3000, render_mode='rgb_array'), steps_per_step=5), 8)
# env = parallelize_env_async(lambda: gym.make("LunarLander-v2", render_mode='rgb_array'), 128)
env = parallelize_env_async(lambda: gym.make("ALE/Pacman-ram-v5", render_mode='rgb_array'), 8)

# env = TransformObservation(env, lambda obs: obs / 255)

print(f'{env = }')
    
try:
    # policy = init_policy(continuous_actions=False, actions_std=None)
    print(f'{count_parameters(policy) = }')
    
    gamma = 0.995
    
    # env = NormalizeRewardWrapper(env, gamma=gamma)
    # env = TransformReward(env, lambda reward: 0.1 * reward) 
    
    PPO(
        env=env,
        policy=policy.to(device),
        policy_optimizer=lambda pol: optim.Adam(pol.parameters(), lr=1e-4),
        buffer_size=2500,
        gamma=gamma,
        gae_lambda=1.0,
        normalize_rewards=None,
        normalize_advantages=NormalizationType.Std,
        actor_objective_weight=1.0,
        critic_objective_weight=0.1,
        ppo_epochs=3,
        ppo_batch_size=500,
        action_ratio_clip_range=0.2,
        log_unreduced=True,
        callback=Callback(on_optimization_done=on_optimization_done)
    ).train(5_000_000)
except KeyboardInterrupt:
    print('keyboard interrupt')
finally:
    print('closing envs')
    env.close()
    print('envs closed')

print('done')

using device cuda:0
env = AsyncVectorEnv(8)
count_parameters(policy) = 4271622
2500
step =    2500, scores =   13.6 ±  5.4 [   0.0,  30.0], advantages =  1.178 ± 1.0 [ -0.005,  4.950], abs_actor_obj =  1.179 ± 1.006, critic_obj = 1.656 ± 0.844, resets = 5.25 ≥ 5
2500
step =    5000, scores =   14.5 ± 10.6 [   2.0,  59.0], advantages =  0.099 ± 1.0 [ -0.909,  7.009], abs_actor_obj =  0.613 ± 0.805, critic_obj = 2.087 ± 1.098, resets = 5.75 ≥ 5
2500
step =    7500, scores =   14.3 ±  6.0 [   3.0,  28.0], advantages = -0.088 ± 1.0 [ -1.482,  5.018], abs_actor_obj =  0.815 ± 0.590, critic_obj = 0.816 ± 0.120, resets = 5.50 ≥ 5
2500
step =   10000, scores =   18.2 ± 10.2 [   4.0,  54.0], advantages =  0.209 ± 1.0 [ -0.905,  6.884], abs_actor_obj =  0.649 ± 0.792, critic_obj = 1.760 ± 1.420, resets = 5.12 ≥ 4
2500
step =   12500, scores =   16.7 ±  9.3 [   0.0,  56.0], advantages =  0.086 ± 1.0 [ -1.152,  7.008], abs_actor_obj =  0.706 ± 0.719, critic_obj = 1.459 ± 1.061, resets = 4.88 ≥ 4
2

In [28]:
record_env: gym.Env = env.env_fns[0]()
record_env.metadata['render_fps'] = 30
try:
    record_env = AutoResetWrapper(
        RecordVideo(record_env, video_folder=rf'C:\Users\domin\Videos\rl\{get_current_timestamp()}', episode_trigger=lambda ep_nr: True)
    )
    def record(max_steps: int):
        obs, info = record_env.reset()
        for step in range(max_steps):
            actions_dist = policy.predict_actions(obs)
            actions = actions_dist.sample().detach().cpu().numpy()
            obs, reward, terminated, truncated, info = record_env.step(actions)
    
    record(10000)
except KeyboardInterrupt:
    print('keyboard interrupt')
finally:
    print('closing record_env')
    record_env.close()
    print('record_env closed')

Moviepy - Building video C:\Users\domin\Videos\rl\2024-04-25_03.00.08\rl-video-episode-0.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-04-25_03.00.08\rl-video-episode-0.mp4


                                                                

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-04-25_03.00.08\rl-video-episode-0.mp4




Moviepy - Building video C:\Users\domin\Videos\rl\2024-04-25_03.00.08\rl-video-episode-1.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-04-25_03.00.08\rl-video-episode-1.mp4


                                                                

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-04-25_03.00.08\rl-video-episode-1.mp4




keyboard interrupt
closing record_env
Moviepy - Building video C:\Users\domin\Videos\rl\2024-04-25_03.00.08\rl-video-episode-2.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-04-25_03.00.08\rl-video-episode-2.mp4


                                                                

KeyboardInterrupt: 

In [25]:
torch.save(policy.state_dict(), f'saved_models/rl/Pacman-ram-v5/{get_current_timestamp()}--state_dict.pth')

In [30]:
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
import gymnasium as gym

# Parallel environments
vec_env = make_vec_env(lambda: gym.make('CartPole-v1', render_mode='rgb_array'), n_envs=4)

model = A2C("MlpPolicy", vec_env, verbose=2)
model.learn(total_timesteps=25000)
model.save("a2c_cartpole")

del model # remove to demonstrate saving and loading

model = A2C.load("a2c_cartpole")

obs = vec_env.reset()
for _ in range(100_000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render("human")

In [11]:
import gymnasium
record_env = gymnasium.make("ALE/Pacman-ram-v5", render_mode='rgb_array')
try:
    record_env = AutoResetWrapper(
        RecordVideo(record_env, video_folder=r'C:\Users\domin\Videos\rl\2024-04-24.1', episode_trigger=lambda ep_nr: True)
    )
    def record(max_steps: int):
        obs, info = record_env.reset()
        for step in range(max_steps):
            # actions_dist = policy.predict_actions(obs)
            # actions = actions_dist.sample().detach().cpu().numpy()
            
            actions = 2
            
            obs, reward, terminated, truncated, info = record_env.step(actions)
            
            if terminated or truncated:
                break
    
    record(10000)
except KeyboardInterrupt:
    print('keyboard interrupt')
finally:
    print('closing record_env')
    record_env.close()
    print('record_env closed')

Moviepy - Building video C:\Users\domin\Videos\rl\2024-04-24.1\rl-video-episode-0.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-04-24.1\rl-video-episode-0.mp4


                                                                

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-04-24.1\rl-video-episode-0.mp4
closing record_env
Moviepy - Building video C:\Users\domin\Videos\rl\2024-04-24.1\rl-video-episode-1.mp4.
Moviepy - Writing video C:\Users\domin\Videos\rl\2024-04-24.1\rl-video-episode-1.mp4


                                                  

Moviepy - Done !
Moviepy - video ready C:\Users\domin\Videos\rl\2024-04-24.1\rl-video-episode-1.mp4
record_env closed




<OrderEnforcing<PassiveEnvChecker<AtariEnv<ALE/Pacman-ram-v5>>>>
