# Implémentation de l'algorithme REINFORCE

L'implémentation proposée est inspirée du tutoriel de [Gymnasium](https://gymnasium.farama.org/tutorials/training_agents/reinforce_invpend_gym_v26/).

## Import des bibliothèques

In [None]:

import torch
import torch.nn as nn
from torch.distributions.normal import Normal
import gymnasium as gym
import matplotlib.pyplot as plt

## Définition du modèle de régression

In [None]:
class NormalDistribParam(nn.Module):
    """Small MLP mapping an input vector to the parameters of a normal law.

    Two tanh-activated hidden layers of width 32 feed two separate linear
    heads of size 1: one for the mean and one for the standard deviation.
    """

    def __init__(self, inChannel):
        """Build the layers.

        Args:
            inChannel: Number of input features expected by the first
                linear layer.
        """
        super().__init__()
        self.inLayer = nn.Sequential(
            nn.Linear(inChannel, 32),
            nn.Tanh()
        )
        self.linLayer1 = nn.Sequential(
            nn.Linear(32, 32),
            nn.Tanh()
        )
        # Head producing the mean.
        self.muLayer = nn.Sequential(
            nn.Linear(32, 1),
        )
        # Head producing the pre-activation of the standard deviation.
        self.sigmaLayer = nn.Sequential(
            nn.Linear(32, 1),
        )

    def forward(self, x):
        """Return ``(mean, std)`` computed from ``x``.

        Args:
            x: Tensor whose last dimension has ``inChannel`` entries; it is
                cast to float before going through the network.

        Returns:
            A tuple ``(mean, std)``; ``std`` is the softplus of the sigma
            head and is therefore never negative.
        """
        x = self.inLayer(x.float())
        x = self.linLayer1(x)
        # Use the built-in softplus: the literal log(1 + exp(z)) overflows
        # to inf as soon as exp(z) exceeds the float range.
        return self.muLayer(x), nn.functional.softplus(self.sigmaLayer(x))

## Définition de la *policy* à optimiser

In [None]:
class Policy():
    """Gaussian stochastic policy with its optimizer and LR scheduler."""

    def __init__(self, inChannel, lr) -> None:
        """Create the network, the optimizer and the learning-rate scheduler.

        Args:
            inChannel: Number of features of a state, forwarded to
                ``NormalDistribParam``.
            lr: Initial learning rate of the AdamW optimizer.
        """
        # Network predicting the mean and standard deviation of the action.
        self.reg = NormalDistribParam(inChannel=inChannel)
        self.opti = torch.optim.AdamW(self.reg.parameters(), lr=lr)
        # Each ``scheduler.step()`` multiplies the learning rate by 0.9.
        self.scheduler = torch.optim.lr_scheduler.ExponentialLR(self.opti, gamma=.9)

    def choose_action(self, state):
        """Sample an action for ``state`` from the predicted normal law.

        Args:
            state: Current observation, as a tensor or anything
                ``torch.as_tensor`` accepts (NumPy array, list, ...).

        Returns:
            A tuple ``(action, log_prob)`` where ``log_prob`` is the
            log-probability of ``action`` under the predicted distribution.
        """
        # The network calls ``.float()`` on its input, which only exists on
        # tensors; convert here so raw environment observations are accepted.
        state = torch.as_tensor(state, dtype=torch.float32)

        mean, std = self.reg(state)

        # A small constant keeps the scale strictly positive for numerical
        # stability; the mean needs no such offset.
        distrib = Normal(mean, std + 1e-7)

        action = distrib.sample()
        log_prob = distrib.log_prob(action)

        return action, log_prob

In [None]:
def mean(liste: list) -> float:
    """Return the arithmetic mean of the values in ``liste``.

    Args:
        liste: Non-empty list of numbers.

    Returns:
        The sum of the values divided by their count.

    Raises:
        ZeroDivisionError: If ``liste`` is empty.
    """
    return sum(liste) / len(liste)

In [None]:
def test_policy(env, policy, n):
    """Run ``n`` evaluation episodes and return average length and reward.

    Args:
        env: Environment following the Gymnasium API: ``reset()`` returns
            ``(observation, info)`` and ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        policy: Object whose ``choose_action(state)`` returns
            ``(action, log_prob)``.
        n: Number of episodes to run; must be non-zero.

    Returns:
        A tuple ``(average number of steps per episode,
        average total reward per episode)``.
    """
    n_iter = 0
    r = 0
    for _ in range(n):
        term, trunc = False, False

        state, _ = env.reset()

        # An episode ends on termination OR truncation.
        while not (term or trunc):
            action, _ = policy.choose_action(state)

            state, reward, term, trunc, _ = env.step(action)

            n_iter += 1
            r += reward
    return n_iter/n, r/n


# The name matches pytest's default ``test_*`` pattern; flag the function so
# it is never collected as a test case.
test_policy.__test__ = False

In [None]:
# Environment to train on.
env_name = "InvertedDoublePendulum-v4"

# Training schedule: total number of episodes, and number of episodes
# collected between two optimizer updates.
nEpisode = 29000
frequenceUpdate = 250

# Discount factor applied to future rewards.
gamma = .9

# Buffers emptied after every update: per-episode log-probabilities,
# per-episode rewards, and the flat list of every step reward.
l_ps, l_rewards = [], []
avg_reward = []

# Buffers kept for the whole run: total reward and length of each episode.
ttlRewardEpisode, lenEpisode = [], []
lenEpisode = []

In [None]:
# Instantiate the environment selected by ``env_name``.
env = gym.make(env_name)

# A Box observation space cannot be subscripted; the number of observation
# features is the first entry of its ``shape``.
policy = Policy(inChannel=env.observation_space.shape[0], lr=1e-3)

In [None]:
def _discounted_returns(rewards, discount):
    """Return the discounted return of every step of one episode.

    The return at step ``t`` is ``sum(discount**(k - t) * rewards[k])`` over
    all ``k >= t``.
    """
    returns = []
    running_return = 0.0
    # Walk backwards so that each return reuses the following one:
    # G_t = r_t + discount * G_{t+1}. This is linear in the episode length.
    for reward in reversed(rewards):
        running_return = reward + discount * running_return
        returns.append(running_return)
    returns.reverse()
    return returns


# REINFORCE training: collect episodes with the current policy and, every
# ``frequenceUpdate`` episodes, take one gradient step on the accumulated
# policy-gradient loss.
for episode in range(1, nEpisode+1):
    state, _ = env.reset()
    running = True
    l_reward = []
    l_p = []

    # Roll out one full episode.
    while running:

        action, p = policy.choose_action(state=state)
        state, r, terminated, truncated, _ = env.step(action=action)
        l_p.append(p)
        running = not (terminated or truncated)
        # The reward of the step that ends the episode is replaced by -1.
        step_reward = r if running else -1
        l_reward.append(step_reward)
        avg_reward.append(step_reward)

    l_ps.append(l_p)
    l_rewards.append(l_reward)
    ttlRewardEpisode.append(sum(l_reward))
    lenEpisode.append(len(l_p))

    if episode % frequenceUpdate == 0:
        loss = 0
        policy.opti.zero_grad()

        print("AVG nStep : ", mean([len(log_probs) for log_probs in l_ps]))
        for log_probs, rewards in zip(l_ps, l_rewards):
            # One scalar log-probability per step of the episode.
            step_log_probs = torch.stack(log_probs).reshape(len(log_probs), -1).sum(dim=1)
            # Each step is weighted by the discounted sum of the rewards
            # that FOLLOW it (reward at t_prime, not the reward at t).
            returns = torch.tensor(
                _discounted_returns(rewards, gamma),
                dtype=step_log_probs.dtype,
                device=step_log_probs.device,
            )
            loss = loss - (step_log_probs * returns).sum()

        loss.backward()
        # The optimizer attribute of ``Policy`` is named ``opti``.
        policy.opti.step()

        # Decay the learning rate once every five updates.
        if episode % (frequenceUpdate*5) == 0:
            policy.scheduler.step()
        print(f"Episode : {episode}/{nEpisode} Average Reward : {mean(avg_reward)}")
        # Start the next batch with empty buffers.
        l_ps = []
        l_rewards = []
        avg_reward = []

# Draw one figure per recorded training curve; each entry is
# (values, y-axis label, figure title).
for curve, y_label, figure_title in (
    (ttlRewardEpisode, "Reward", "Total Reward per episode"),
    (lenEpisode, "Length of an episode", "Length of each episode episode"),
):
    plt.figure()
    plt.plot(curve)
    plt.grid()
    plt.xlabel("Episode")
    plt.ylabel(y_label)
    plt.title(figure_title)