In [None]:
%%capture

%load_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext training_rl

In [None]:
%presentation_style

In [None]:
%%capture

%set_random_seed 12

In [None]:
%load_latex_macros


$\newcommand{\vect}[1]{{\mathbf{\boldsymbol{#1}} }}$
$\newcommand{\amax}{{\text{argmax}}}$
$\newcommand{\P}{{\mathbb{P}}}$
$\newcommand{\E}{{\mathbb{E}}}$
$\newcommand{\R}{{\mathbb{R}}}$
$\newcommand{\Z}{{\mathbb{Z}}}$
$\newcommand{\N}{{\mathbb{N}}}$
$\newcommand{\C}{{\mathbb{C}}}$
$\newcommand{\abs}[1]{{ \left| #1 \right| }}$
$\newcommand{\simpl}[1]{{\Delta^{#1} }}$


<img src="_static/images/aai-institute-cover.svg" alt="aai-institute cover" style="width:100%;">
<div class="md-slide title">Training RL Agents</div>

# Training RL Agents

In this notebook we will dive a bit deeper into RL by training agents on the pendulum environment
that we already encountered before and analyzing the results.
We will be concerned with questions like reward shaping, stability of results,
generalization to non-training situations, and other issues related to real-world applications of RL.

Here we will only look at model-free RL, since model-based RL often requires domain-specific
algorithms and engineering. Also, the openly available tools for model-based RL are far less mature
than those for model-free RL.

In [None]:
%load_ext tensorboard

import gymnasium as gym
from typing import Callable

from gymnasium.envs.classic_control import PendulumEnv
from gymnasium.wrappers import TimeLimit

import datetime
import os
from collections.abc import Sequence
from tianshou.highlevel.config import SamplingConfig
from tianshou.highlevel.experiment import (
    ExperimentConfig,
    SACExperimentBuilder,
)
from tianshou.highlevel.params.alpha import AutoAlphaFactoryDefault
from tianshou.highlevel.params.policy_params import SACParams
from tianshou.highlevel.env import EnvFactory
from tianshou.highlevel.persistence import PersistableConfigProtocol
from tianshou.env import ShmemVectorEnv
from tianshou.highlevel.env import ContinuousEnvironments
from training_rl.env_utils import demo_model, collect_trajectory


In [None]:
%tensorboard --logdir log --host localhost

## The vanilla pendulum

Let us start by simply using gym's pendulum as is and training a PPO agent (proximal policy optimization, an on-policy algorithm) on it.

In [None]:
from tianshou.highlevel.params.lr_scheduler import LRSchedulerFactoryLinear
from typing import Literal
import torch
from tianshou.highlevel.params.dist_fn import DistributionFunctionFactoryIndependentGaussians
from tianshou.highlevel.params.policy_params import PPOParams
from tianshou.highlevel.experiment import PPOExperimentBuilder
from examples.mujoco.mujoco_env import MujocoEnvFactory


def train_ppo_agent(
    env_factory: EnvFactory,
    experiment_config: ExperimentConfig | None = None,
    buffer_size: int = 4096,
    hidden_sizes: Sequence[int] = (64, 64),
    lr: float = 3e-4,
    gamma: float = 0.99,
    epoch: int = 100,
    step_per_epoch: int = 30000,
    step_per_collect: int = 2048,
    repeat_per_collect: int = 10,
    batch_size: int = 64,
    training_num: int = 64,
    test_num: int = 10,
    rew_norm: bool = True,
    vf_coef: float = 0.25,
    ent_coef: float = 0.0,
    gae_lambda: float = 0.95,
    bound_action_method: Literal["clip", "tanh"] | None = "clip",
    lr_decay: bool = True,
    max_grad_norm: float = 0.5,
    eps_clip: float = 0.2,
    dual_clip: float | None = None,
    value_clip: bool = False,
    norm_adv: bool = False,
    recompute_adv: bool = True,
):
    """Build a PPO experiment for the given environments, run it and return its result.

    :param env_factory: factory handed to `PPOExperimentBuilder` for creating the environments.
    :param experiment_config: experiment configuration; when `None`, a default
        `ExperimentConfig()` is created. Its `seed` becomes part of the run name.
    :param buffer_size: forwarded to `SamplingConfig` as `buffer_size`.
    :param hidden_sizes: passed to both the default actor factory and the default critic factory.
    :param lr: forwarded to `PPOParams` as `lr`.
    :param gamma: forwarded to `PPOParams` as `discount_factor`.
    :param epoch: forwarded to `SamplingConfig` as `num_epochs`.
    :param step_per_epoch: forwarded to `SamplingConfig` as `step_per_epoch`.
    :param step_per_collect: forwarded to `SamplingConfig` as `step_per_collect`.
    :param repeat_per_collect: forwarded to `SamplingConfig` as `repeat_per_collect`.
    :param batch_size: forwarded to `SamplingConfig` as `batch_size`.
    :param training_num: forwarded to `SamplingConfig` as `num_train_envs`.
    :param test_num: forwarded to `SamplingConfig` as `num_test_envs`.
    :param rew_norm: forwarded to `PPOParams` as `reward_normalization`.
    :param vf_coef: forwarded to `PPOParams` as `vf_coef`.
    :param ent_coef: forwarded to `PPOParams` as `ent_coef`.
    :param gae_lambda: forwarded to `PPOParams` as `gae_lambda`.
    :param bound_action_method: forwarded to `PPOParams` as `action_bound_method`.
    :param lr_decay: if true, `LRSchedulerFactoryLinear(sampling_config)` is used as the
        learning-rate scheduler factory; otherwise `None` is passed instead.
    :param max_grad_norm: forwarded to `PPOParams` as `max_grad_norm`.
    :param eps_clip: forwarded to `PPOParams` as `eps_clip`.
    :param dual_clip: forwarded to `PPOParams` as `dual_clip`.
    :param value_clip: forwarded to `PPOParams` as `value_clip`.
    :param norm_adv: forwarded to `PPOParams` as `advantage_normalization`.
    :param recompute_adv: forwarded to `PPOParams` as `recompute_advantage`.
    :return: the value returned by the experiment's `run` method.
    """
    # Explicit None check instead of relying on the truthiness of the config object.
    if experiment_config is None:
        experiment_config = ExperimentConfig()
    # The run name combines the algorithm name with the seed of the experiment configuration.
    log_name = os.path.join("ppo", str(experiment_config.seed))

    sampling_config = SamplingConfig(
        num_epochs=epoch,
        step_per_epoch=step_per_epoch,
        batch_size=batch_size,
        num_train_envs=training_num,
        num_test_envs=test_num,
        buffer_size=buffer_size,
        step_per_collect=step_per_collect,
        repeat_per_collect=repeat_per_collect,
    )

    # The linear scheduler factory is parameterized by the sampling configuration;
    # with lr_decay disabled no scheduler factory is passed on.
    lr_scheduler_factory = LRSchedulerFactoryLinear(sampling_config) if lr_decay else None
    ppo_params = PPOParams(
        discount_factor=gamma,
        gae_lambda=gae_lambda,
        action_bound_method=bound_action_method,
        reward_normalization=rew_norm,
        ent_coef=ent_coef,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        value_clip=value_clip,
        advantage_normalization=norm_adv,
        eps_clip=eps_clip,
        dual_clip=dual_clip,
        recompute_advantage=recompute_adv,
        lr=lr,
        lr_scheduler_factory=lr_scheduler_factory,
        dist_fn=DistributionFunctionFactoryIndependentGaussians(),
    )

    # Actor and critic share the same hidden layer sizes and both use Tanh activations.
    experiment = (
        PPOExperimentBuilder(env_factory, experiment_config, sampling_config)
        .with_ppo_params(ppo_params)
        .with_actor_factory_default(hidden_sizes, torch.nn.Tanh, continuous_unbounded=True)
        .with_critic_factory_default(hidden_sizes, torch.nn.Tanh)
        .build()
    )
    return experiment.run(log_name)

In [None]:
def get_pendulum_env(render_mode: Literal["rgb_array"] | None = None):
    return TimeLimit(PendulumEnv(render_mode=render_mode), max_episode_steps=200)


class PendulumEnvFactory(EnvFactory):
    """Environment factory that produces pendulum environments via `get_pendulum_env`."""

    def create_envs(
        self, num_training_envs, num_test_envs, config: PersistableConfigProtocol | None = None
    ) -> ContinuousEnvironments:
        """Create one plain environment plus vectorized training and test environments.

        :param num_training_envs: number of environment constructors given to the training
            `ShmemVectorEnv`.
        :param num_test_envs: number of environment constructors given to the test
            `ShmemVectorEnv`.
        :param config: accepted as part of the signature but not used by this factory.
        :return: a `ContinuousEnvironments` holding the single environment and both vector envs.
        """
        single_env = get_pendulum_env()
        # The vector envs receive the constructor itself (not instances), once per sub-environment.
        training_constructors = [get_pendulum_env for _ in range(num_training_envs)]
        vectorized_train = ShmemVectorEnv(training_constructors)
        test_constructors = [get_pendulum_env for _ in range(num_test_envs)]
        vectorized_test = ShmemVectorEnv(test_constructors)
        return ContinuousEnvironments(
            env=single_env,
            train_envs=vectorized_train,
            test_envs=vectorized_test,
        )

In [None]:
# Short training run: a single epoch of 20000 steps, using 10 training and 1 test environment.
exp_result = train_ppo_agent(
    env_factory=PendulumEnvFactory(),
    epoch=1,
    step_per_epoch=20000,
    training_num=10,
    test_num=1,
)

In [None]:
from tianshou.data import Batch
import numpy as np

# Keep a module-level handle on the policy stored in the experiment result's `world`;
# `get_action` below reads this global.
policy = exp_result.world.policy

def get_action(obs: np.ndarray, info: dict | None = None):
    batch = Batch(obs=obs[None, :], info=info)
    forward_result = policy(batch, deterministic=True)
    return forward_result.act[0].numpy()

In [None]:
# Pendulum environment created through the gym registry, with the `rgb_array` render mode requested.
pend_env = gym.make("Pendulum-v1", render_mode="rgb_array")

In [None]:
# Reset the environment to obtain an initial observation and the accompanying info.
obs, info = pend_env.reset()

In [None]:
# Ask the policy for its action on the initial observation; the result is shown as cell output.
get_action(obs, info)

In [None]:
# Render the environment in its current state and keep the returned value for inspection.
f = pend_env.render()

In [None]:
# Show the value returned by `render()` as the cell output.
f

In [None]:
# NOTE(review): `collect_trajectory` is imported from `training_rl.env_utils`; presumably it steps
# through the environment and records what it visits — confirm against its implementation.
traj = collect_trajectory(pend_env)

In [None]:
# Inspect the `frame` attribute of the first trajectory entry.
traj[0].frame

In [None]:
# NOTE(review): `demo_model` is imported from `training_rl.env_utils`; presumably this demonstrates a
# random policy on the environment for 200 steps — confirm against its implementation.
demo_model(pend_env, "random", 200)

<img src="_static/images/aai-institute-cover.svg" alt="aai-institute cover" style="width:100%;">
<div class="md-slide title">Thank you for the attention!</div>