# Motivation du projet

# Implémentation inspirée de l'article original
## Import des bibliothèques

In [None]:
import gymnasium as gym

import torch
import torch.nn as nn
from torch.distributions.normal import Normal
from torch.utils.data import DataLoader, Dataset

from random import shuffle

import numpy as np

import matplotlib.pyplot as plt

## Définition des modèles

Modèle utilisé par le critic

In [None]:
class Regressor(nn.Module):
    """Fully connected tanh network used as the critic's regressor.

    The network maps ``inChannel`` input features through hidden layers of
    width 32 -> 64 -> 64 -> 32 (each followed by ``Tanh``) to ``outChannel``
    linear outputs.

    Args:
        inChannel: Number of input features.
        outChannel: Number of output values.
    """

    def __init__(self, inChannel: int, outChannel: int):
        super().__init__()
        # Sub-modules are created in forward order so parameter names and the
        # order of random initialisation stay the same.
        self.inLayer = nn.Sequential(nn.Linear(inChannel, 32), nn.Tanh())

        hidden = []
        for n_in, n_out in ((32, 64), (64, 64), (64, 32)):
            hidden.append(nn.Linear(n_in, n_out))
            hidden.append(nn.Tanh())
        self.linLayer = nn.Sequential(*hidden)

        self.outLayer = nn.Sequential(nn.Linear(32, outChannel))

    def forward(self, x):
        """Cast ``x`` to float32 and return the network output."""
        features = self.linLayer(self.inLayer(x.float()))
        return self.outLayer(features)

Modèle utilisé pour l'acteur, génère les paramètres de la distribution à utiliser pour choisir l'action

In [None]:
class NormalDistribParam(nn.Module):
    """Actor network producing the parameters of a normal distribution.

    A shared tanh trunk (32 -> 64 -> 64 -> 32) feeds two linear heads: one
    for the mean and one for the standard deviation. The standard deviation
    head goes through a softplus so it is always positive.

    Args:
        inChannel: Number of input (state) features.
        outChannel: Number of distribution dimensions produced by each head.
            Defaults to 1, the original single-action behaviour.
    """

    def __init__(self, inChannel, outChannel: int = 1):
        super().__init__()
        self.inLayer = nn.Sequential(
            nn.Linear(inChannel, 32),
            nn.Tanh()
        )
        self.linLayer1 = nn.Sequential(
            nn.Linear(32, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 32),
            nn.Tanh()
        )
        self.muLayer = nn.Sequential(
            nn.Linear(32, outChannel),
        )
        self.sigmaLayer = nn.Sequential(
            nn.Linear(32, outChannel),
        )

    def forward(self, x):
        """Return ``(mean, std)`` for the input state(s) ``x``.

        ``x`` is cast to float32 before entering the network.
        """
        x = self.inLayer(x.float())
        x = self.linLayer1(x)
        # softplus(z) == log(1 + exp(z)), but the library version does not
        # overflow to ``inf`` when z is large, unlike the explicit formula.
        return self.muLayer(x), nn.functional.softplus(self.sigmaLayer(x))

Définition de l'acteur, qui décide de l'action à prendre en fonction de la *policy* apprise, ainsi que du critic, qui évalue la qualité de la décision prise.


In [None]:
def lambda_lr(epoch):
    """Return the constant multiplicative learning-rate factor (0.5)."""
    return 0.5
class Policy:
    """Bundle the actor network with its optimiser and LR scheduler.

    Attributes set in ``__init__``:
        lr: Initial learning rate given to Adam.
        reg: ``NormalDistribParam`` network built for ``dim_state`` inputs.
        optim: Adam optimiser over the parameters of ``reg``.
        scheduler: ``MultiplicativeLR`` scheduler driven by ``lambda_lr``.
    """

    def __init__(self, dim_state: int, lr=5e-3):
        self.lr = lr
        self.reg = NormalDistribParam(inChannel=dim_state)
        actor_params = self.reg.parameters()
        self.optim = torch.optim.Adam(actor_params, lr=self.lr)
        self.scheduler = torch.optim.lr_scheduler.MultiplicativeLR(
            self.optim, lr_lambda=lambda_lr
        )

class Critic:
    """Bundle the value network with its optimiser and LR scheduler.

    Attributes set in ``__init__``:
        lr: Initial learning rate given to Adam.
        reg: ``Regressor`` network with ``dim_state`` inputs and one output.
        optim: Adam optimiser over the parameters of ``reg``.
        scheduler: ``MultiplicativeLR`` scheduler driven by ``lambda_lr``.
    """

    def __init__(self, dim_state: int, lr=5e-3):
        self.lr = lr
        self.reg = Regressor(inChannel=dim_state, outChannel=1)
        critic_params = self.reg.parameters()
        self.optim = torch.optim.Adam(critic_params, lr=self.lr)
        self.scheduler = torch.optim.lr_scheduler.MultiplicativeLR(
            self.optim, lr_lambda=lambda_lr
        )

## Définition des fonctions utiles

Choisit une action et l'évalue ; utilisé pour générer les données d'entraînement.

In [None]:
def choose_action(policy, critic, state):
    """Sample an action for ``state`` and score it, without tracking gradients.

    Args:
        policy: Object whose ``reg`` attribute maps a state tensor to the
            ``(mean, std)`` of a normal distribution.
        critic: Object whose ``reg`` attribute maps a state tensor to a value
            estimate.
        state: Observation, as anything ``torch.as_tensor`` accepts (array,
            sequence, or an existing tensor).

    Returns:
        Tuple ``(action, log_prob, value)`` of tensors that carry no
        gradient: the sampled action, its log-probability under the policy
        distribution, and the critic's estimate for ``state``.
    """
    # as_tensor avoids the copy (and warning) torch.tensor performs when the
    # state is already a tensor; the state is converted once and reused.
    obs = torch.as_tensor(state)

    # This function only generates rollout data, so no autograd graph is needed.
    with torch.no_grad():
        mean, std = policy.reg(obs)

        # The epsilon keeps the scale strictly positive.
        distrib = Normal(mean + 1e-7, std + 1e-7)

        action = distrib.sample()
        p = distrib.log_prob(action)
        v = critic.reg(obs)

    return action, p, v

Utilisé lors de la phase d'apprentissage, retourne les données à optimiser

In [None]:
def evaluate(policy, critic, state, action):
    """Re-score stored ``action`` values under the current policy and critic.

    Unlike ``choose_action``, gradients are tracked here because the returned
    tensors are meant to be optimised.

    Args:
        policy: Object whose ``reg`` attribute maps a state tensor to the
            ``(mean, std)`` of a normal distribution.
        critic: Object whose ``reg`` attribute maps a state tensor to a value
            estimate.
        state: State(s), as anything ``torch.as_tensor`` accepts.
        action: Action(s) to score; must broadcast against the policy output.

    Returns:
        Tuple ``(log_prob, value, entropy)``: log-probability of ``action``
        under the current policy distribution, the critic's estimate for
        ``state``, and the entropy of the policy distribution.
    """
    # as_tensor is a no-op for tensors, whereas torch.tensor(tensor)
    # copy-constructs and emits a UserWarning on every batch.
    obs = torch.as_tensor(state)
    action = torch.as_tensor(action)

    mean, std = policy.reg(obs)

    # The epsilon keeps the scale strictly positive.
    distrib = Normal(mean + 1e-7, std + 1e-7)

    p = distrib.log_prob(action)
    v = critic.reg(obs)
    entropy = distrib.entropy()

    return p, v, entropy

Évalue la policy apprise ; retourne les résultats moyennés sur n parties.

In [None]:
def test_policy(env, policy, critic, n):
    """Play ``n`` episodes with the current policy and average the results.

    Args:
        env: Gymnasium-style environment (``reset`` returns ``(obs, info)``,
            ``step`` returns a 5-tuple).
        policy: Policy object forwarded to ``choose_action``.
        critic: Critic object forwarded to ``choose_action``.
        n: Number of episodes to play.

    Returns:
        Tuple ``(mean_steps, mean_reward)``: the number of steps and the
        summed reward, each averaged over the ``n`` episodes.
    """
    n_iter = 0
    r = 0
    for _ in range(n):
        term, trunc = False, False

        state, _ = env.reset()

        # Stop as soon as the episode is terminated OR truncated. The previous
        # condition ``not term or trunc`` kept stepping after a truncation.
        while not (term or trunc):
            action, _, _ = choose_action(policy, critic, state)

            # Gymnasium's step() returns (obs, reward, terminated, truncated,
            # info). The action is handed over as a NumPy array.
            state, reward, term, trunc, _ = env.step(np.asarray(action))

            n_iter += 1
            r += reward
    return n_iter/n, r/n

## Initialisation des données d'entrainement

### Constantes

In [None]:
# --- Loop sizes ---
iterations = 50   # outer training iterations
N = 100           # rollout segments collected per iteration
T = 3             # minimum number of timesteps per rollout segment
epochs = 50       # optimisation passes over the data of one iteration

# Mean episode length from the latest evaluation; 0 before any evaluation.
n_iter = 0

# --- PPO hyper-parameters ---
gamma = 0.95      # reward discount factor
epsilon = 0.1     # clipping range of the probability ratio
percent = 0.5     # fraction of the collected segments used for training

# Loss weights: c1 scales the value loss, c2 the entropy term.
c1 = 0.01
c2 = 0.0001

env_name = "InvertedDoublePendulum-v4"

### Création de l'environnement

In [None]:

# Build the environment, then size both networks from its observation vector.
env = gym.make(env_name)

_obs_dim = env.observation_space.shape[0]
policy = Policy(dim_state=_obs_dim)
critic = Critic(dim_state=_obs_dim)

# Mean-squared error, used for the value loss.
mse = nn.MSELoss()

À exécuter pour charger les poids des modèles pré-entraînés

In [None]:
# Restore the actor weights stored under the "model" key of its checkpoint.
_pol_path = "model-ppo/from-paper/" + env_name + "_pol.pt"
ckp_pol = torch.load(_pol_path)
policy.reg.load_state_dict(ckp_pol["model"])

# Same for the critic.
_critic_path = "model-ppo/from-paper/" + env_name + "_critic.pt"
ckp_critic = torch.load(_critic_path)
critic.reg.load_state_dict(ckp_critic["model"])

## Définition du *custom Dataset* de pytorch

L'utilisation d'un Dataset et Dataloader permet l'apprentissage en batch de manière très simple

### Constantes pour séparer les données de l'apprentissage

In [None]:
# Column layout of one row of the rollout buffer:
#   [ state (STATE_IDX_UP values) | action | advantage | log-prob | value target ]
STATE_IDX_LOW = 0
# Read the observation size from ``env`` directly, as the network
# construction does, rather than reaching into ``env.env``.
STATE_IDX_UP = env.observation_space.shape[0]
ACTION_IDX = STATE_IDX_UP
ADV_IDX = 1 + STATE_IDX_UP
P_IDX = 2 + STATE_IDX_UP
V_IDX = 3 + STATE_IDX_UP

print(STATE_IDX_LOW, STATE_IDX_UP, ACTION_IDX, ADV_IDX, P_IDX, V_IDX)

### Torch Dataset

In [None]:
class SegmentDataset(Dataset):
    """Flat dataset of rollout rows built from a list of segment tensors.

    Each row holds, in order: the state values, the action, the advantage,
    the log-probability of the action and the value target, addressed
    through the module-level ``*_IDX`` constants.

    Args:
        data: List of 2-D tensors (one per segment), one row per timestep.
        partition: Fraction of the segments (taken from the front of
            ``data``) that is kept.
    """

    def __init__(self, data: list, partition: float):
        n_kept = int(len(data) * partition)
        # Concatenate the kept segments along the timestep axis.
        self.data = torch.cat(data[:n_kept], dim=0)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(state, action, advantage, log_prob, value_target)``."""
        row = self.data[idx]
        # STATE_IDX_UP is the action column, so it is the exclusive end of the
        # state slice. The former ``STATE_IDX_UP+1`` leaked the action into
        # the state vector.
        old_state = row[STATE_IDX_LOW:STATE_IDX_UP]
        old_action = row[ACTION_IDX]
        old_adv = row[ADV_IDX]
        old_p = row[P_IDX]
        old_V = row[V_IDX]

        return old_state, old_action, old_adv, old_p, old_V

## Apprentissage

In [None]:
def _discounted_returns(rewards, dones, bootstrap, discount):
    """Return the discounted reward-to-go for every step of a segment.

    Args:
        rewards: Per-step rewards, in time order.
        dones: Per-step flags, true when the episode ended on that step.
        bootstrap: Value standing in for the rewards that would follow the
            last step of the segment.
        discount: Discount factor applied per step.

    Returns:
        List with one discounted return per step. The running return is
        reset at every ``done`` step so rewards never leak across episodes.
    """
    returns = [0.0] * len(rewards)
    running = bootstrap
    for k in reversed(range(len(rewards))):
        if dones[k]:
            running = 0.0
        running = rewards[k] + discount * running
        returns[k] = running
    return returns


graph_r = []
graph_iter = []
for iteration in range(iterations):

    old_buff = []
    # Segment length: at least T steps, growing with the mean episode length
    # measured by the latest evaluation.
    horizon = int(max(T, n_iter - 1))
    print(horizon)
    for n_actor in range(N):

        seg_states, seg_actions, seg_logp, seg_values = [], [], [], []
        seg_rewards, seg_dones = [], []

        # Every segment starts from a fresh episode.
        state, _ = env.reset()

        for timesteps in range(horizon):
            action, p, v = choose_action(policy, critic, state)
            # Gymnasium's step() returns five values.
            next_state, reward, term, trunc, _ = env.step(np.asarray(action))

            # Store the state the action was taken IN, so that state, action,
            # log-prob and value all refer to the same timestep.
            seg_states.append(state)
            seg_actions.append(action)
            seg_logp.append(p)
            seg_values.append(v)
            seg_rewards.append(reward)
            seg_dones.append(term or trunc)

            if term or trunc:
                state, _ = env.reset()
            else:
                state = next_state

        # If the segment stops mid-episode, the critic's estimate of the state
        # reached stands in for the rewards that were not observed.
        if seg_dones and not seg_dones[-1]:
            with torch.no_grad():
                bootstrap = critic.reg(torch.as_tensor(state)).item()
        else:
            bootstrap = 0.0
        seg_returns = _discounted_returns(seg_rewards, seg_dones, bootstrap, gamma)

        # Row layout: [*state, action, advantage, log-prob, value target].
        t_buff = [
            [*obs, act.item(), ret - val.item(), logp.item(), ret]
            for obs, act, logp, val, ret in zip(
                seg_states, seg_actions, seg_logp, seg_values, seg_returns
            )
        ]

        old_buff.append(torch.tensor(t_buff, dtype=torch.float32))

    shuffle(old_buff)

    train_data = SegmentDataset(old_buff, percent)
    train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)

    for epoch in range(epochs):
        avg_loss = 0.0
        for old_state, old_action, old_adv, old_p, old_V in train_dataloader:

            # Keep only the observation columns of the batch.
            old_state = old_state[:, STATE_IDX_LOW:STATE_IDX_UP]

            # Score the BATCH (not the last rollout step). Actions get a
            # trailing axis to line up with the (batch, 1) network outputs.
            newP, newV, entropy = evaluate(
                policy, critic, old_state, old_action.unsqueeze(-1)
            )
            # Back to shape (batch,) so the terms below combine element-wise
            # instead of broadcasting to (batch, batch).
            newP = newP.squeeze(-1)
            newV = newV.squeeze(-1)
            entropy = entropy.squeeze(-1)

            ratio = torch.exp(newP - old_p)

            loss1 = ratio * old_adv
            loss2 = torch.clamp(ratio, 1-epsilon, 1+epsilon)*old_adv

            l_clip = torch.min(loss1, loss2)
            l_vf = mse(newV, old_V)

            loss = -(l_clip - c1*l_vf + c2*entropy)
            loss = loss.mean()
            policy.optim.zero_grad()
            critic.optim.zero_grad()
            loss.backward()
            policy.optim.step()
            critic.optim.step()

            avg_loss += loss.item()
        # Average over the number of optimisation steps actually performed.
        print(f'Epoch [{epoch}/{epochs}] Mean Loss {avg_loss/len(train_dataloader)}')

    n_iter, r = test_policy(env, policy, critic, 20)
    graph_iter.append(n_iter)
    graph_r.append(r)
    print(f"Iter[{iteration}/{iterations}]  n_iter mean : {n_iter} reward mean : {r}")
    # Halve the learning rates once every 10 iterations. The previous test
    # ``iteration%10`` fired on every iteration that was NOT a multiple of 10.
    if iteration != 0 and iteration % 10 == 0:
        policy.scheduler.step()
        critic.scheduler.step()

# Persist the trained weights of both networks under the "model" key.
for _module, _path in ((policy.reg, "policy.pt"), (critic.reg, "critic.pt")):
    torch.save({"model": _module.state_dict()}, _path)

# Learning curves: mean episode length and mean reward per iteration.
plt.figure()
for _curve in (graph_iter, graph_r):
    plt.plot(_curve)
plt.grid()
plt.legend(["iter", "reward"])

Evaluation des résultats

In [None]:
# Final evaluation: mean episode length and mean reward over 20 episodes.
_eval_stats = test_policy(env, policy, critic, 20)
n_iter, r = _eval_stats
print(*_eval_stats)

# [Comparaison avec le tutoriel de PyTorch](https://pytorch.org/rl/tutorials/coding_ppo.html)

In [None]:
from collections import defaultdict

import matplotlib.pyplot as plt
import torch
from tensordict.nn import TensorDictModule
from tensordict.nn.distributions import NormalParamExtractor
from torch import nn

from torchrl.collectors import SyncDataCollector
from torchrl.data.replay_buffers import ReplayBuffer
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
from torchrl.data.replay_buffers.storages import LazyTensorStorage
from torchrl.envs import (
    Compose,
    DoubleToFloat,
    ObservationNorm,
    StepCounter,
    TransformedEnv,
)
from torchrl.envs.libs.gym import GymEnv
from torchrl.envs.utils import check_env_specs, ExplorationType, set_exploration_type
from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator
from torchrl.objectives import ClipPPOLoss
from torchrl.objectives.value import GAE
from tqdm import tqdm

## Define Hyperparameters

In [None]:
# Use the first GPU only when one is actually usable at runtime.
# ``torch.has_cuda`` is deprecated and only reports whether this PyTorch build
# was compiled with CUDA support, not whether a device is present.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
num_cells = 256  # number of cells in each layer i.e. output dim.
lr = 3e-4
max_grad_norm = 1.0

## Data collection parameters

In [None]:
frame_skip = 1
# Both frame budgets are divided by frame_skip.
# For a complete training, bring the number of frames up to 1M
frames_per_batch, total_frames = 1000 // frame_skip, 10_000 // frame_skip

## PPO parameters

In [None]:
# cardinality of the sub-samples gathered from the current data in the inner loop
sub_batch_size = 64
# optimisation steps per batch of data collected
num_epochs = 10
# clip value for PPO loss: see the equation in the intro for more context.
clip_epsilon = 0.2
gamma = 0.99
lmbda = 0.95
entropy_eps = 1e-4

## Define an environment

In [None]:
# Untransformed environment, created on the selected device.
base_env = GymEnv(
    "InvertedDoublePendulum-v4",
    frame_skip=frame_skip,
    device=device,
)

## Normalisation

In [None]:
# Transforms applied on top of the base environment, in this order:
# observation normalisation, double -> float cast, step counting.
_env_transforms = Compose(
    ObservationNorm(in_keys=["observation"]),
    DoubleToFloat(in_keys=["observation"]),
    StepCounter(),
)
env = TransformedEnv(base_env, _env_transforms)

In [None]:
# The first transform is the ObservationNorm; initialise its statistics.
_obs_norm = env.transform[0]
_obs_norm.init_stats(num_iter=1000, reduce_dim=0, cat_dim=0)

In [None]:
# Shape of the normalisation offset held by the first transform.
_loc_shape = env.transform[0].loc.shape
print("normalization constant shape:", _loc_shape)

In [None]:
# Dump every spec of the transformed environment, one per line.
for _label, _attr in (
    ("observation_spec:", "observation_spec"),
    ("reward_spec:", "reward_spec"),
    ("done_spec:", "done_spec"),
    ("action_spec:", "action_spec"),
    ("state_spec:", "state_spec"),
):
    print(_label, getattr(env, _attr))

In [None]:
# Short rollout to inspect the structure of the collected data.
rollout = env.rollout(3)
for _label, _value in (
    ("rollout of three steps:", rollout),
    ("Shape of the rollout TensorDict:", rollout.batch_size),
):
    print(_label, _value)

## Policy

In [None]:
# Three tanh hidden layers of width num_cells.
_actor_layers = []
for _depth in range(3):
    _actor_layers.append(nn.LazyLinear(num_cells, device=device))
    _actor_layers.append(nn.Tanh())
# The output layer emits two values per action dimension; presumably
# NormalParamExtractor splits them into loc and scale (confirm in tensordict docs).
_actor_layers.append(nn.LazyLinear(2 * env.action_spec.shape[-1], device=device))
_actor_layers.append(NormalParamExtractor())
actor_net = nn.Sequential(*_actor_layers)

In [None]:
# Wrap the network so it reads "observation" and writes "loc" and "scale".
_actor_in_keys = ["observation"]
_actor_out_keys = ["loc", "scale"]
policy_module = TensorDictModule(
    actor_net,
    in_keys=_actor_in_keys,
    out_keys=_actor_out_keys,
)

In [None]:
# Bounds handed to the TanhNormal distribution, taken from the action spec.
_action_bounds = {
    "min": env.action_spec.space.minimum,
    "max": env.action_spec.space.maximum,
}
policy_module = ProbabilisticActor(
    module=policy_module,
    spec=env.action_spec,
    in_keys=["loc", "scale"],
    distribution_class=TanhNormal,
    distribution_kwargs=_action_bounds,
    # we'll need the log-prob for the numerator of the importance weights
    return_log_prob=True,
)

## Value network

In [None]:
# Three tanh hidden layers of width num_cells, then a single linear output.
_value_layers = []
for _depth in range(3):
    _value_layers.append(nn.LazyLinear(num_cells, device=device))
    _value_layers.append(nn.Tanh())
_value_layers.append(nn.LazyLinear(1, device=device))
value_net = nn.Sequential(*_value_layers)

# The value operator reads the "observation" entry.
value_module = ValueOperator(module=value_net, in_keys=["observation"])

In [None]:
# Run each module once on a freshly reset environment and print the result.
for _label, _module in (
    ("Running policy:", policy_module),
    ("Running value:", value_module),
):
    print(_label, _module(env.reset()))

## Data collector

In [None]:
# Collector that runs the policy in the environment and yields batches of
# frames_per_batch frames until total_frames have been gathered.
_collector_options = dict(
    frames_per_batch=frames_per_batch,
    total_frames=total_frames,
    split_trajs=False,
    device=device,
)
collector = SyncDataCollector(env, policy_module, **_collector_options)

## Replay buffer

In [None]:
# Buffer holding one collected batch, sampled without replacement.
_storage = LazyTensorStorage(frames_per_batch)
_sampler = SamplerWithoutReplacement()
replay_buffer = ReplayBuffer(storage=_storage, sampler=_sampler)

## Loss function

In [None]:
# Advantage estimator built on the value network.
advantage_module = GAE(
    gamma=gamma, lmbda=lmbda, value_network=value_module, average_gae=True
)

# Clipped PPO objective combining actor, critic and entropy terms.
loss_module = ClipPPOLoss(
    actor=policy_module,
    critic=value_module,
    clip_epsilon=clip_epsilon,
    entropy_bonus=bool(entropy_eps),
    entropy_coef=entropy_eps,
    # these keys match by default but we set this for completeness
    value_target_key=advantage_module.value_target_key,
    critic_coef=1.0,
    # Reuse the shared discount factor instead of a second hard-coded 0.99,
    # so the loss and the advantage estimator cannot silently diverge.
    gamma=gamma,
    loss_critic_type="smooth_l1",
)

optim = torch.optim.Adam(loss_module.parameters(), lr)
# Anneal the learning rate to 0.0 over the number of collected batches.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optim, total_frames // frames_per_batch, 0.0
)

## Training loop

In [None]:
# Training metrics, one list per metric name; the plotting code reads them back.
logs = defaultdict(list)
pbar = tqdm(total=total_frames * frame_skip)
# Last evaluation summary; stays empty until the first evaluation has run.
eval_str = ""

# We iterate over the collector until it reaches the total number of frames it was
# designed to collect:
for i, tensordict_data in enumerate(collector):
    # we now have a batch of data to work with. Let's learn something from it.
    for _ in range(num_epochs):
        # We'll need an "advantage" signal to make PPO work.
        # We re-compute it at each epoch as its value depends on the value
        # network which is updated in the inner loop.
        with torch.no_grad():
            advantage_module(tensordict_data)
        # Flatten the batch and load it into the replay buffer (on CPU).
        data_view = tensordict_data.reshape(-1)
        replay_buffer.extend(data_view.cpu())
        for _ in range(frames_per_batch // sub_batch_size):
            subdata = replay_buffer.sample(sub_batch_size)
            loss_vals = loss_module(subdata.to(device))
            # Total loss: the sum of the three terms returned by the loss module.
            loss_value = (
                loss_vals["loss_objective"]
                + loss_vals["loss_critic"]
                + loss_vals["loss_entropy"]
            )

            # Optimization: backward, grad clipping and optim step
            loss_value.backward()
            # this is not strictly mandatory but it's good practice to keep
            # your gradient norm bounded
            torch.nn.utils.clip_grad_norm_(loss_module.parameters(), max_grad_norm)
            optim.step()
            # Clear the gradients so the next sub-batch starts from zero.
            optim.zero_grad()

    # Book-keeping for the progress bar and the result plots.
    logs["reward"].append(tensordict_data["next", "reward"].mean().item())
    pbar.update(tensordict_data.numel() * frame_skip)
    cum_reward_str = (
        f"average reward={logs['reward'][-1]: 4.4f} (init={logs['reward'][0]: 4.4f})"
    )
    logs["step_count"].append(tensordict_data["step_count"].max().item())
    stepcount_str = f"step count (max): {logs['step_count'][-1]}"
    logs["lr"].append(optim.param_groups[0]["lr"])
    lr_str = f"lr policy: {logs['lr'][-1]: 4.4f}"
    if i % 10 == 0:
        # We evaluate the policy once every 10 batches of data.
        # Evaluation is rather simple: execute the policy without exploration
        # (take the expected value of the action distribution) for a given
        # number of steps (1000, which is our env horizon).
        # The ``rollout`` method of the env can take a policy as argument:
        # it will then execute this policy at each step.
        with set_exploration_type(ExplorationType.MEAN), torch.no_grad():
            # execute a rollout with the trained policy
            eval_rollout = env.rollout(1000, policy_module)
            logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
            logs["eval reward (sum)"].append(
                eval_rollout["next", "reward"].sum().item()
            )
            logs["eval step_count"].append(eval_rollout["step_count"].max().item())
            eval_str = (
                f"eval cumulative reward: {logs['eval reward (sum)'][-1]: 4.4f} "
                f"(init: {logs['eval reward (sum)'][0]: 4.4f}), "
                f"eval step-count: {logs['eval step_count'][-1]}"
            )
            # Drop the reference to the evaluation rollout once it is logged.
            del eval_rollout
    pbar.set_description(", ".join([eval_str, cum_reward_str, stepcount_str, lr_str]))

    # We're also using a learning rate scheduler. Like the gradient clipping,
    # this is a nice-to-have but nothing necessary for PPO to work.
    scheduler.step()

## Result

In [None]:
# One panel per logged metric, laid out on a 2x2 grid.
_panels = (
    ("reward", "training rewards (average)"),
    ("step_count", "Max step count (training)"),
    ("eval reward (sum)", "Return (test)"),
    ("eval step_count", "Max step count (test)"),
)

plt.figure(figsize=(10, 10))
for _slot, (_metric, _title) in enumerate(_panels, start=1):
    plt.subplot(2, 2, _slot)
    plt.plot(logs[_metric])
    plt.title(_title)
plt.show()

# Conclusion