In [1]:
import os
# os.kill(os.getpid(), 9) # Esta linha serve para forçar a reinicialização do processo do notebook

In [None]:
from time import time
from collections import defaultdict
from tqdm import tqdm
import sys 

In [3]:
import random
import numpy as np
import torch
from torch import nn
import plotly.express as px

# Seed the PyTorch, Python `random` and NumPy generators with the same fixed value.
torch.manual_seed(10)
random.seed(10)
np.random.seed(10)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Ray[rllib]

A biblioteca RAY possui diversos [algoritmos](https://docs.ray.io/en/latest/rllib/rllib-algorithms.html) de aprendizado por reforço já implementados. Ela busca simplificar a execução e a configuração destes algoritmos por meio de objetos de configuração, como mostrado abaixo:

In [4]:
from ray.rllib.algorithms.ppo.ppo import PPOConfig
# Start from a default PPOConfig object; the cells below customize it.
ppo_config = PPOConfig()

As configurações são separadas em seções. Algumas são comuns, outras específicas de cada algoritmo. Na célula abaixo temos uma configuração do algoritmo PPO. Nem todas as opções estão presentes, pois há várias que são muito específicas. Após ler as configurações da célula abaixo, leia, por cima, as configurações na [documentação do RAY](https://docs.ray.io/en/latest/rllib/rllib-training.html#configuring-rllib-algorithms).

In [5]:
from soccer_twos import EnvType
import gymnasium as gym
from ray import tune
from ray.rllib import MultiAgentEnv
import soccer_twos
from ray.tune.logger import pretty_print

In [6]:
class RLLibWrapper(gym.core.Wrapper, MultiAgentEnv):
    """Gym wrapper that additionally inherits from RLlib's ``MultiAgentEnv``.

    The class adds no behaviour of its own: it only combines the two base
    classes so that a wrapped environment is also a ``MultiAgentEnv``.
    """


def create_rllib_env(env_config: dict = None):
    """
    Creates a RLLib environment and prepares it to be instantiated by Ray workers.

    Args:
        env_config: configuration for the environment; its items are forwarded
            as keyword arguments to ``soccer_twos.make``. ``None`` (the default)
            is treated as an empty configuration.
            You may specify the following keys:
            - variation: one of soccer_twos.EnvType. Defaults to EnvType.multiagent_player.
            - opponent_policy: a Callable for your agent to train against. Defaults to a random policy.
            - multiagent: when present and falsy, the raw environment is returned
              without the ``RLLibWrapper``.
            When the object also exposes ``worker_index`` and ``vector_index``
            attributes (presumably RLlib's EnvContext -- confirm), a ``worker_id``
            entry is derived from them so each environment copy gets its own id.

    Returns:
        The environment built by ``soccer_twos.make``, wrapped in ``RLLibWrapper``
        unless ``multiagent`` was explicitly disabled.
    """
    # Work on a shallow copy: the caller's config object is never mutated and no
    # mutable default argument is shared between calls.
    make_kwargs = dict(env_config) if env_config is not None else {}

    if hasattr(env_config, "worker_index"):
        # Unique id per (worker, vectorized sub-environment) pair.
        make_kwargs["worker_id"] = (
            env_config.worker_index * make_kwargs.get("num_envs_per_worker", 1)
            + env_config.vector_index
        )

    env = soccer_twos.make(**make_kwargs)

    if "multiagent" in make_kwargs and not make_kwargs["multiagent"]:
        # is multiagent by default, is only disabled if explicitly set to False
        return env
    return RLLibWrapper(env)

In [7]:
tune.registry.register_env("Soccer", create_rllib_env)  # register the environment with Tune under the name "Soccer"

In [8]:
# PPO configuration for the "Soccer" environment registered with Tune.
environment_id = "Soccer"

ppo_config = ppo_config.resources(
    # Ask for a GPU only when CUDA is actually available, so this cell also
    # works on a CPU-only machine instead of requesting a device that is not there.
    num_gpus = 1 if torch.cuda.is_available() else 0,
    num_cpus_per_worker = 0,
)

ppo_config = ppo_config.rollouts(
    # Number of rollout worker actors to create for parallel sampling. Setting this to 0 will force
    # rollouts to be done in the local worker (driver process or the Algorithm's actor when using Tune).
    num_rollout_workers = 8,
    # Number of environments to evaluate vector-wise per worker. This enables model inference
    # batching, which can improve performance for inference bottlenecked workloads.
    num_envs_per_worker = 2,
    # Divide episodes into fragments of this many steps each during rollouts.
    rollout_fragment_length = 8,
)

ppo_config = ppo_config.environment(
    env = environment_id,
    # Environment (soccer-twos) parameters, set through the config builder
    # rather than by assigning the `env_config` attribute directly.
    env_config = {
        "render": False,
        "time_scale": 50,
        "multiagent": False,
        "variation": EnvType.team_vs_policy,
        "flatten_branched": True,
        "single_player": True,
    },
)

ppo_config = ppo_config.framework(
    framework = "torch",
)

ppo_config = ppo_config.debugging(
    seed = 10,
)

ppo_config = ppo_config.training(
    # The default learning rate.
    lr = 5e-4,
    # Training batch size, if applicable. Must be a multiple of
    # (workers * envs_per_worker * rollout_fragment_length), i.e. 8 * 2 * 8 = 128 here.
    train_batch_size = 256,
    # Should use a critic as a baseline (otherwise don't use value baseline; required for using GAE).
    use_critic = True,
    # If true, use the Generalized Advantage Estimator (GAE) with a value function,
    # see https://arxiv.org/pdf/1506.02438.pdf.
    use_gae = True,
    # Float specifying the discount factor of the Markov Decision process.
    gamma = 0.99,
    # The GAE (lambda) parameter.
    lambda_ = 0.95,
    # Total SGD batch size across all devices for SGD. This defines the minibatch size within each epoch.
    sgd_minibatch_size = 32,
    # Number of SGD iterations in each outer loop (i.e., number of epochs to execute per train batch).
    num_sgd_iter = 6,
    # Whether to shuffle sequences in the batch when training (recommended).
    shuffle_sequences = True,
    # Coefficient of the value function loss. IMPORTANT: you must tune this if you set
    # vf_share_layers=True inside your model's config.
    vf_loss_coeff = 0.5,
    # Coefficient of the entropy regularizer.
    entropy_coeff = 0.0,
    # Clip param for the value function. Note that this is sensitive to the scale of the rewards.
    # If your expected V is large, increase this. Set very high here to effectively disable the clip.
    vf_clip_param = 100000.0,
    # PPO clip parameter.
    clip_param = 0.5,
    # Initial coefficient for KL divergence.
    kl_coeff = 0.0,
    # Arguments passed into the policy model. See models/catalog.py for a full list of the
    # available model options.
    model = {
        "fcnet_hiddens": [64, 32],
        "fcnet_activation": "relu",
        "vf_share_layers": False,
    },
)

ppo_config = ppo_config.reporting(
    min_sample_timesteps_per_iteration = 1,
    metrics_num_episodes_for_smoothing = 50,
)

Aqui aplicamos a configuração para construir o algoritmo/agente.

In [None]:
# Build the algorithm/agent object from the configuration.
PPOalgo = ppo_config.build()

Em um passo de treinamento, o RAY automaticamente cria os "workers" e dispara a coleta de experiências. Coletadas experiências suficientes, a biblioteca executa um passo de treino nos dados adquiridos e retorna um dicionário de resultados. Leia atentamente a saída da célula a seguir para que você possa se familiarizar com as métricas coletadas.

In [None]:
from ray.tune.logger import pretty_print
# Run one training iteration and print the result it returns in a readable form.
result = PPOalgo.train()
print(pretty_print(result))

Após a execução do algoritmo, para liberarmos os recursos de cpu e gpu, é preciso chamar o `ray.shutdown()`

In [None]:
import ray
# Shut Ray down once the algorithm is no longer needed.
ray.shutdown()

# PPO

Aqui temos uma configuração e execução do algoritmo PPO para o ambiente do [CartPole](https://www.gymlibrary.dev/environments/classic_control/cart_pole/)

In [None]:
import pandas as pd
# Number of episodes each experiment is trained and plotted for.
MAX_EPISODES = 1000

# One row per episode: the 1-based episode number and a constant reward
# threshold (475) used as a reference line in the plots.
all_metrics = pd.DataFrame(
    {
        "episodes": list(range(1, MAX_EPISODES + 1)),
        "threshold_reward": [475] * MAX_EPISODES,
    }
)

In [None]:
from ray.rllib.algorithms.ppo.ppo import PPOConfig
environment_id = "CartPole-v1"

# Same PPO settings expressed as one fluent chain: each builder call returns
# the configuration object, so the calls can be strung together.
ppo_config = (
    PPOConfig()
    .resources(
        num_gpus = 0,
        num_cpus_per_worker = 0,
    )
    .rollouts(
        # Number of rollout worker actors to create for parallel sampling. Setting this to 0 will force
        # rollouts to be done in the local worker (driver process or the Algorithm's actor when using Tune).
        num_rollout_workers = 4,
        # Number of environments to evaluate vector-wise per worker. This enables model inference
        # batching, which can improve performance for inference bottlenecked workloads.
        num_envs_per_worker = 2,
        # Divide episodes into fragments of this many steps each during rollouts.
        rollout_fragment_length = 8,
    )
    .environment(env = environment_id)
    .framework(framework = "torch")
    .debugging(seed = 10)
    .training(
        # The default learning rate.
        lr = 0.0005,
        # Training batch size, if applicable. Must be a multiple of
        # (workers * envs_per_worker * rollout_fragment_length).
        train_batch_size = 128,
        # Should use a critic as a baseline (otherwise don't use value baseline; required for using GAE).
        use_critic = True,
        # If true, use the Generalized Advantage Estimator (GAE) with a value function,
        # see https://arxiv.org/pdf/1506.02438.pdf.
        use_gae = True,
        # Float specifying the discount factor of the Markov Decision process.
        gamma = 0.99,
        # The GAE (lambda) parameter.
        lambda_ = 0.95,
        # Total SGD batch size across all devices for SGD. This defines the minibatch size within each epoch.
        sgd_minibatch_size = 32,
        # Number of SGD iterations in each outer loop (i.e., number of epochs to execute per train batch).
        num_sgd_iter = 6,
        # Whether to shuffle sequences in the batch when training (recommended).
        shuffle_sequences = True,
        # Coefficient of the value function loss. IMPORTANT: you must tune this if you set
        # vf_share_layers=True inside your model's config.
        vf_loss_coeff = 0.5,
        # Coefficient of the entropy regularizer.
        entropy_coeff = 0.0,
        # Clip param for the value function. Note that this is sensitive to the scale of the rewards.
        # If your expected V is large, increase this. Set very high here to effectively disable the clip.
        vf_clip_param = 100000.0,
        # PPO clip parameter.
        clip_param = 0.5,
        # Initial coefficient for KL divergence.
        kl_coeff = 0.0,
        # Arguments passed into the policy model. See models/catalog.py for a full list of the
        # available model options.
        model = dict(
            fcnet_hiddens = [64, 32],
            fcnet_activation = "relu",
            vf_share_layers = False,
        ),
    )
    .reporting(
        min_sample_timesteps_per_iteration = 1,
        metrics_num_episodes_for_smoothing = 50,
    )
)

In [None]:
from collections import deque

def run_experiment(name, config):
    """Train an algorithm built from ``config`` for ``MAX_EPISODES`` episodes and record its metrics.

    After every ``train()`` call the newly finished episodes are folded into
    50-episode moving averages of reward and episode length. The first
    ``MAX_EPISODES`` values of each series are stored in the module-level
    ``all_metrics`` DataFrame as the columns ``<name>_reward`` and ``<name>_len``.

    Note (from the original author): with the notebook's configuration one run
    takes roughly 5 minutes.

    Args:
        name: prefix for the two columns added to ``all_metrics``.
        config: configuration object exposing ``build()``; the built algorithm
            must expose ``train()`` returning a result dictionary.

    Returns:
        The trained algorithm object.
    """
    algo = config.build()

    metrics = defaultdict(list)

    # Sliding windows holding the latest 50 episode rewards / lengths.
    rew_deque = deque(maxlen=50)
    len_deque = deque(maxlen=50)

    episode = 0
    # The context manager guarantees the progress bar is closed, even on error.
    with tqdm(total=MAX_EPISODES, position=0, leave=True) as pbar:
        while episode < MAX_EPISODES:
            result = algo.train()

            # A single 'train()' call may collect and finish several episodes;
            # the code below gathers per-episode metrics for all of them and
            # serves as an example of how to use the values in 'result'.
            if result["episodes_total"] > episode:
                sampler = result["sampler_results"]
                n_new = sampler["episodes_this_iter"]
                hist = result["hist_stats"]

                # Guard against n_new == 0: `seq[-0:]` would return the whole history.
                new_rewards = hist["episode_reward"][-n_new:] if n_new else []
                new_lengths = hist["episode_lengths"][-n_new:] if n_new else []

                for v in new_rewards:
                    rew_deque.append(v)
                    metrics["train_reward"].append(np.array(rew_deque).mean())

                for v in new_lengths:
                    len_deque.append(v)
                    metrics["ep_len"].append(np.array(len_deque).mean())

                pbar.update(result["episodes_total"] - episode)
                pbar.set_description(
                    "| Mean Reward %.2f | Ep len %.2f |"
                    % (sampler["episode_reward_mean"], sampler["episode_len_mean"])
                )

                episode = result["episodes_total"]

    # Truncate to MAX_EPISODES (not a hard-coded 1000) so the column length
    # matches the rows of `all_metrics`, which are built from MAX_EPISODES.
    all_metrics[name + "_reward"] = metrics["train_reward"][:MAX_EPISODES]
    all_metrics[name + "_len"] = metrics["ep_len"][:MAX_EPISODES]

    return algo

In [None]:
# Train with the configuration above; results land in the "PPO_01_reward" / "PPO_01_len" columns of all_metrics.
agent = run_experiment(name="PPO_01", config=ppo_config)

# TensorBoard

O RAY automaticamente guarda as métricas da variável "result" em uma pasta, tanto em formato CSV quanto no formato do TensorBoard. Execute as células a seguir e explore os resultados no ambiente do TensorBoard.

In [None]:
%load_ext tensorboard
log_folder = "/root/ray_results"

In [None]:
# Open TensorBoard on the folder stored in `log_folder`.
%tensorboard --logdir={log_folder}

Para facilidade de visualização e entrega do notebook, utilizaremos ainda o plotly para observar as métricas de recompensa e tamanho de episódio.

`Obs: É normal não atingir o limiar ainda`

In [None]:
# Plot, per episode, every column whose name contains "_reward" (this includes "threshold_reward").
px.line(all_metrics, x="episodes", y=[col for col in all_metrics.columns if '_reward' in col])

In [None]:
# Plot, per episode, every column whose name contains "_len".
px.line(all_metrics, x="episodes", y=[col for col in all_metrics.columns if '_len' in col])

In [None]:
# Shut Ray down before the next experiment.
ray.shutdown()

# Exercício 1
 Analise as curvas de recompensa e tamanho do episódio, ressaltando similaridades e diferenças entre elas. Explique o porquê destas similaridades e diferenças.

# Exercício 2

A configuração anterior não foi capaz de passar do limiar de recompensa estabelecido. Altere a configuração da célula abaixo para que ela atinja o limiar antes do episódio `700`.

ATENÇÃO:
* Não altere a `seed`
* O Google Colab fornece apenas dois núcleos de CPU, aumentar muito o número de *workers* pode afetar significativamente o tempo de execução. 

In [None]:
from ray.rllib.algorithms.ppo.ppo import PPOConfig
# Exercise 2 starting point: a fresh default PPOConfig to be customized in this cell.
ppo_config = PPOConfig()

# Name of the environment used in this exercise.
environment_id = "CartPole-v1"

In [None]:
# Train with the Exercise 2 configuration; results land in the "PPO_02_reward" / "PPO_02_len" columns of all_metrics.
agent = run_experiment(name="PPO_02", config=ppo_config)

In [None]:
# Plot, per episode, every column whose name contains "_reward" (this includes "threshold_reward").
px.line(all_metrics, x="episodes", y=[col for col in all_metrics.columns if '_reward' in col])

In [None]:
# Plot, per episode, every column whose name contains "_len".
px.line(all_metrics, x="episodes", y=[col for col in all_metrics.columns if '_len' in col])

In [None]:
# Shut Ray down now that the experiment has finished.
ray.shutdown()

# Exercício 3

Detalhe QUAIS foram as alterações feitas na configuração e EXPLIQUE por que elas ajudaram o algoritmo a convergir, indicando o papel de cada uma no PPO.

In [None]:
# Explicar