<a href="https://colab.research.google.com/github/arthurst38/deep_learning/blob/main/RL_Chapitre_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapitre 4

## Partie 1
Introduction du RL et structure de base agent + environnement

In [3]:
pip install tensorboardX

Collecting tensorboardX
[?25l  Downloading https://files.pythonhosted.org/packages/07/84/46421bd3e0e89a92682b1a38b40efc22dafb6d8e3d947e4ceefd4a5fabc7/tensorboardX-2.2-py2.py3-none-any.whl (120kB)
[K     |██▊                             | 10kB 18.1MB/s eta 0:00:01[K     |█████▍                          | 20kB 24.5MB/s eta 0:00:01[K     |████████▏                       | 30kB 25.3MB/s eta 0:00:01[K     |██████████▉                     | 40kB 22.7MB/s eta 0:00:01[K     |█████████████▋                  | 51kB 24.0MB/s eta 0:00:01[K     |████████████████▎               | 61kB 16.4MB/s eta 0:00:01[K     |███████████████████             | 71kB 17.6MB/s eta 0:00:01[K     |█████████████████████▊          | 81kB 16.2MB/s eta 0:00:01[K     |████████████████████████▌       | 92kB 16.3MB/s eta 0:00:01[K     |███████████████████████████▏    | 102kB 16.7MB/s eta 0:00:01[K     |██████████████████████████████  | 112kB 16.7MB/s eta 0:00:01[K     |████████████████████████████████

In [None]:
#!/usr/bin/env python3
import gym
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

In [5]:
HIDDEN_SIZE = 128 # Number of neurons in the hidden layer
BATCH_SIZE = 16 # Number of episodes collected per training batch
PERCENTILE = 70 # Filter used by the cross-entropy method: episodes whose reward is below this percentile of the batch are dropped



class Net(nn.Module):
    """Fully connected policy network with a single ReLU hidden layer.

    The network maps an observation vector to one raw score per action.
    No softmax is applied inside the network, so the output is not a
    probability distribution.

    Args:
        obs_size: Number of input features of an observation.
        hidden_size: Number of units in the hidden layer.
        n_actions: Number of output scores, one per action.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # Layers are built in input-to-output order and wrapped in a single
        # Sequential stored under ``self.net``.
        input_layer = nn.Linear(obs_size, hidden_size)
        output_layer = nn.Linear(hidden_size, n_actions)
        self.net = nn.Sequential(input_layer, nn.ReLU(), output_layer)

    def forward(self, x):
        """Return the raw action scores computed for the input ``x``."""
        scores = self.net(x)
        return scores


# An episode: the total reward it accumulated and the list of its steps.
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
# One step of an episode: the observation seen and the action taken from it.
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])


def iterate_batches(env, net, batch_size):
    """Play episodes with the current policy and yield them in batches.

    For every step the observation is fed to ``net``, the scores are turned
    into probabilities with a softmax, and an action is sampled from them
    with ``np.random.choice``.  When the environment reports the end of an
    episode, the episode is stored and the environment is reset.  Each time
    ``batch_size`` episodes have been collected the list is yielded and a new
    one is started.  The generator never finishes on its own.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(observation, reward, done, info)``.
        net: Callable mapping a float32 tensor holding a batch of one
            observation to one row of action scores.
        batch_size: Number of episodes per yielded batch.

    Yields:
        A list of ``batch_size`` ``Episode`` tuples.
    """
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    # The softmax lives outside the network: the network outputs raw scores.
    sm = nn.Softmax(dim=1)
    while True:
        # Go through a single NumPy array instead of a Python list of arrays,
        # which is the slow path of the tensor constructor.
        obs_v = torch.as_tensor(np.asarray([obs], dtype=np.float32))
        # Acting does not need gradients; skip building the autograd graph.
        with torch.no_grad():
            act_probs = sm(net(obs_v)).numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _ = env.step(action)
        episode_reward += reward
        # Store the observation the action was chosen from, not the result.
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs


def filter_batch(batch, percentile):
    """Keep the best episodes of a batch and turn them into training tensors.

    Args:
        batch: Sequence of ``(reward, steps)`` episode tuples exposing a
            ``reward`` attribute; each step exposes ``observation`` and
            ``action``.
        percentile: Percentile of the batch rewards used as the threshold.
            Episodes whose reward is strictly below it are discarded.

    Returns:
        A tuple ``(train_obs_v, train_act_v, reward_bound, reward_mean)``:
        a float32 tensor of the kept observations, an int64 tensor of the
        matching actions, the reward threshold, and the mean reward of the
        whole batch as a Python float.
    """
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    train_obs = []
    train_act = []
    for reward, steps in batch:
        if reward < reward_bound:
            continue
        train_obs.extend(step.observation for step in steps)
        train_act.extend(step.action for step in steps)

    # Stack into one NumPy array first: building a tensor directly from a
    # Python list of arrays is the slow path of the tensor constructor.
    train_obs_v = torch.as_tensor(np.asarray(train_obs, dtype=np.float32))
    train_act_v = torch.as_tensor(np.asarray(train_act, dtype=np.int64))
    return train_obs_v, train_act_v, reward_bound, reward_mean



In [6]:
if __name__ == "__main__":
    # Train a policy network on CartPole with the cross-entropy method:
    # play a batch of episodes, keep the best ones, and fit the network to
    # the actions taken in them.
    env = gym.make("CartPole-v0")
    # env = gym.wrappers.Monitor(env, directory="mon", force=True)
    obs_size = env.observation_space.shape[0]  # length of an observation vector
    n_actions = env.action_space.n  # number of discrete actions available

    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    # CrossEntropyLoss takes the raw scores of the network, which is why Net
    # has no softmax of its own.
    objective = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=net.parameters(), lr=0.01)
    writer = SummaryWriter(comment="-cartpole")

    # The loop only ends on success, an error, or a manual interruption;
    # try/finally makes sure the writer is closed in every case.
    try:
        for iter_no, batch in enumerate(iterate_batches(
                env, net, BATCH_SIZE)):
            obs_v, acts_v, reward_b, reward_m = \
                filter_batch(batch, PERCENTILE)
            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()
            print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" % (
                iter_no, loss_v.item(), reward_m, reward_b))
            writer.add_scalar("loss", loss_v.item(), iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)
            # Stop once the mean reward of a batch exceeds 199 (presumably
            # just under the per-episode cap of this environment -- confirm).
            if reward_m > 199:
                print("Solved!")
                break
    finally:
        writer.close()



0: loss=0.695, reward_mean=27.9, rw_bound=36.5
1: loss=0.680, reward_mean=22.9, rw_bound=25.5
2: loss=0.660, reward_mean=37.8, rw_bound=45.5
3: loss=0.651, reward_mean=32.1, rw_bound=35.5
4: loss=0.653, reward_mean=33.7, rw_bound=41.5
5: loss=0.653, reward_mean=44.1, rw_bound=54.0
6: loss=0.626, reward_mean=48.1, rw_bound=57.0
7: loss=0.620, reward_mean=45.2, rw_bound=50.0
8: loss=0.633, reward_mean=49.6, rw_bound=58.5
9: loss=0.599, reward_mean=59.0, rw_bound=72.5
10: loss=0.619, reward_mean=55.8, rw_bound=64.0
11: loss=0.628, reward_mean=51.1, rw_bound=61.0
12: loss=0.601, reward_mean=57.3, rw_bound=65.5
13: loss=0.612, reward_mean=52.6, rw_bound=58.0
14: loss=0.606, reward_mean=62.9, rw_bound=71.0
15: loss=0.590, reward_mean=72.2, rw_bound=81.0
16: loss=0.594, reward_mean=59.6, rw_bound=68.0
17: loss=0.589, reward_mean=68.2, rw_bound=84.0
18: loss=0.578, reward_mean=77.2, rw_bound=94.0
19: loss=0.567, reward_mean=66.7, rw_bound=84.5
20: loss=0.586, reward_mean=86.2, rw_bound=96.0
21

## Partie 2: Utilisation FrozenLake

In [7]:
#!/usr/bin/env python3
import gym, gym.spaces
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim



In [8]:

# Number of units in the hidden layer of Net.
HIDDEN_SIZE = 128
# Number of episodes collected per batch by iterate_batches.
BATCH_SIZE = 16
# Percentile of the batch rewards passed to filter_batch as the threshold;
# episodes with a reward below it are dropped.
PERCENTILE = 70


class DiscreteOneHotWrapper(gym.ObservationWrapper):
    """Observation wrapper that one-hot encodes a discrete observation.

    The wrapped environment must have a ``gym.spaces.Discrete`` observation
    space with ``n`` values.  The wrapper advertises a ``Box`` space of shape
    ``(n,)`` bounded by 0.0 and 1.0 with dtype float32, and converts every
    observation into a vector holding a single 1.0.
    """

    def __init__(self, env):
        super().__init__(env)
        source_space = env.observation_space
        assert isinstance(source_space, gym.spaces.Discrete)
        n_states = source_space.n
        self.observation_space = gym.spaces.Box(
            0.0, 1.0, (n_states, ), dtype=np.float32)

    def observation(self, observation):
        """Return the one-hot vector with 1.0 at index ``observation``."""
        # Start from a copy of the lower bound of the Box (built with 0.0).
        one_hot = np.copy(self.observation_space.low)
        one_hot[observation] = 1.0
        return one_hot


class Net(nn.Module):
    """Fully connected policy network with a single ReLU hidden layer.

    The network maps an observation vector to one raw score per action.
    No softmax is applied inside the network, so the output is not a
    probability distribution.

    Args:
        obs_size: Number of input features of an observation.
        hidden_size: Number of units in the hidden layer.
        n_actions: Number of output scores, one per action.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # Layers are built in input-to-output order and wrapped in a single
        # Sequential stored under ``self.net``.
        input_layer = nn.Linear(obs_size, hidden_size)
        output_layer = nn.Linear(hidden_size, n_actions)
        self.net = nn.Sequential(input_layer, nn.ReLU(), output_layer)

    def forward(self, x):
        """Return the raw action scores computed for the input ``x``."""
        scores = self.net(x)
        return scores


# An episode: the total reward it accumulated and the list of its steps.
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
# One step of an episode: the observation seen and the action taken from it.
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])


def iterate_batches(env, net, batch_size):
    """Play episodes with the current policy and yield them in batches.

    For every step the observation is fed to ``net``, the scores are turned
    into probabilities with a softmax, and an action is sampled from them
    with ``np.random.choice``.  When the environment reports the end of an
    episode, the episode is stored and the environment is reset.  Each time
    ``batch_size`` episodes have been collected the list is yielded and a new
    one is started.  The generator never finishes on its own.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(observation, reward, done, info)``.
        net: Callable mapping a float32 tensor holding a batch of one
            observation to one row of action scores.
        batch_size: Number of episodes per yielded batch.

    Yields:
        A list of ``batch_size`` ``Episode`` tuples.
    """
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    # The softmax lives outside the network: the network outputs raw scores.
    sm = nn.Softmax(dim=1)
    while True:
        # Go through a single NumPy array instead of a Python list of arrays,
        # which is the slow path of the tensor constructor.
        obs_v = torch.as_tensor(np.asarray([obs], dtype=np.float32))
        # Acting does not need gradients; skip building the autograd graph.
        with torch.no_grad():
            act_probs = sm(net(obs_v)).numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _ = env.step(action)
        episode_reward += reward
        # Store the observation the action was chosen from, not the result.
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs


def filter_batch(batch, percentile):
    """Keep the best episodes of a batch and turn them into training tensors.

    Args:
        batch: Sequence of episodes exposing ``reward`` and ``steps``; each
            step exposes ``observation`` and ``action``.
        percentile: Percentile of the batch rewards used as the threshold.
            Episodes whose reward is strictly below it are discarded.

    Returns:
        A tuple ``(train_obs_v, train_act_v, reward_bound, reward_mean)``:
        a float32 tensor of the kept observations, an int64 tensor of the
        matching actions, the reward threshold, and the mean reward of the
        whole batch as a Python float.
    """
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    train_obs = []
    train_act = []
    for example in batch:
        if example.reward < reward_bound:
            continue
        train_obs.extend(step.observation for step in example.steps)
        train_act.extend(step.action for step in example.steps)

    # Stack into one NumPy array first: building a tensor directly from a
    # Python list of arrays is the slow path of the tensor constructor.
    train_obs_v = torch.as_tensor(np.asarray(train_obs, dtype=np.float32))
    train_act_v = torch.as_tensor(np.asarray(train_act, dtype=np.int64))
    return train_obs_v, train_act_v, reward_bound, reward_mean


In [9]:
if __name__ == "__main__":
    # Train a policy network on FrozenLake with the plain cross-entropy
    # method; observations are one-hot encoded by the wrapper.
    env = DiscreteOneHotWrapper(gym.make("FrozenLake-v0"))
    # env = gym.wrappers.Monitor(env, directory="mon", force=True)
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    objective = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=net.parameters(), lr=0.01)
    writer = SummaryWriter(comment="-frozenlake-naive")

    # The loop only ends on success, an error, or a manual interruption;
    # try/finally makes sure the writer is closed in every case.
    try:
        for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
            obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)
            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()
            print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" % (
                iter_no, loss_v.item(), reward_m, reward_b))
            writer.add_scalar("loss", loss_v.item(), iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)
            # Stop once the mean reward of a batch exceeds 0.8.
            if reward_m > 0.8:
                print("Solved!")
                break
    finally:
        writer.close()

0: loss=1.384, reward_mean=0.0, reward_bound=0.0
1: loss=1.367, reward_mean=0.1, reward_bound=0.0
2: loss=1.384, reward_mean=0.0, reward_bound=0.0
3: loss=1.345, reward_mean=0.0, reward_bound=0.0
4: loss=1.366, reward_mean=0.1, reward_bound=0.0
5: loss=1.359, reward_mean=0.0, reward_bound=0.0
6: loss=1.344, reward_mean=0.0, reward_bound=0.0
7: loss=1.341, reward_mean=0.0, reward_bound=0.0
8: loss=1.330, reward_mean=0.0, reward_bound=0.0
9: loss=1.326, reward_mean=0.0, reward_bound=0.0
10: loss=1.235, reward_mean=0.0, reward_bound=0.0
11: loss=1.325, reward_mean=0.0, reward_bound=0.0
12: loss=1.288, reward_mean=0.0, reward_bound=0.0
13: loss=1.320, reward_mean=0.0, reward_bound=0.0
14: loss=1.219, reward_mean=0.1, reward_bound=0.0
15: loss=1.182, reward_mean=0.0, reward_bound=0.0
16: loss=1.213, reward_mean=0.0, reward_bound=0.0
17: loss=1.136, reward_mean=0.1, reward_bound=0.0
18: loss=1.222, reward_mean=0.0, reward_bound=0.0
19: loss=1.179, reward_mean=0.0, reward_bound=0.0
20: loss=1

KeyboardInterrupt: ignored

## Partie 3: Utilisation de FrozenLake (version ajustée)

In [12]:
#!/usr/bin/env python3
import random
import gym
import gym.spaces
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim


# Number of units in the hidden layer of Net.
HIDDEN_SIZE = 128
# Number of episodes collected per batch by iterate_batches.
BATCH_SIZE = 100
# Percentile of the discounted episode rewards passed to filter_batch as the
# threshold; only episodes strictly above it are kept.
PERCENTILE = 30
# Discount base: filter_batch scales an episode reward by GAMMA ** (number of steps).
GAMMA = 0.9


class DiscreteOneHotWrapper(gym.ObservationWrapper):
    """Observation wrapper that one-hot encodes a discrete observation.

    The wrapped environment must have a ``gym.spaces.Discrete`` observation
    space with ``n`` values.  The wrapper advertises a ``Box`` space of shape
    ``(n,)`` bounded by 0.0 and 1.0 with dtype float32, and converts every
    observation into a vector holding a single 1.0.
    """

    def __init__(self, env):
        super().__init__(env)
        source_space = env.observation_space
        assert isinstance(source_space, gym.spaces.Discrete)
        n_states = source_space.n
        self.observation_space = gym.spaces.Box(
            0.0, 1.0, (n_states, ), dtype=np.float32)

    def observation(self, observation):
        """Return the one-hot vector with 1.0 at index ``observation``."""
        # Start from a copy of the lower bound of the Box (built with 0.0).
        one_hot = np.copy(self.observation_space.low)
        one_hot[observation] = 1.0
        return one_hot


class Net(nn.Module):
    """Fully connected policy network with a single ReLU hidden layer.

    The network maps an observation vector to one raw score per action.
    No softmax is applied inside the network, so the output is not a
    probability distribution.

    Args:
        obs_size: Number of input features of an observation.
        hidden_size: Number of units in the hidden layer.
        n_actions: Number of output scores, one per action.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # Layers are built in input-to-output order and wrapped in a single
        # Sequential stored under ``self.net``.
        input_layer = nn.Linear(obs_size, hidden_size)
        output_layer = nn.Linear(hidden_size, n_actions)
        self.net = nn.Sequential(input_layer, nn.ReLU(), output_layer)

    def forward(self, x):
        """Return the raw action scores computed for the input ``x``."""
        scores = self.net(x)
        return scores


# An episode: the total reward it accumulated and the list of its steps.
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
# One step of an episode: the observation seen and the action taken from it.
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])


def iterate_batches(env, net, batch_size):
    """Play episodes with the current policy and yield them in batches.

    For every step the observation is fed to ``net``, the scores are turned
    into probabilities with a softmax, and an action is sampled from them
    with ``np.random.choice``.  When the environment reports the end of an
    episode, the episode is stored and the environment is reset.  Each time
    ``batch_size`` episodes have been collected the list is yielded and a new
    one is started.  The generator never finishes on its own.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(observation, reward, done, info)``.
        net: Callable mapping a float32 tensor holding a batch of one
            observation to one row of action scores.
        batch_size: Number of episodes per yielded batch.

    Yields:
        A list of ``batch_size`` ``Episode`` tuples.
    """
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    # The softmax lives outside the network: the network outputs raw scores.
    sm = nn.Softmax(dim=1)
    while True:
        # Go through a single NumPy array instead of a Python list of arrays,
        # which is the slow path of the tensor constructor.
        obs_v = torch.as_tensor(np.asarray([obs], dtype=np.float32))
        # Acting does not need gradients; skip building the autograd graph.
        with torch.no_grad():
            act_probs = sm(net(obs_v)).numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _ = env.step(action)
        episode_reward += reward
        # Store the observation the action was chosen from, not the result.
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs


def filter_batch(batch, percentile, gamma=None):
    """Select the elite episodes of a batch using length-discounted rewards.

    Each episode reward is multiplied by ``gamma ** len(episode.steps)``.
    Episodes whose discounted reward is strictly above the requested
    percentile of these values are kept.

    Args:
        batch: Sequence of episodes exposing ``reward`` and ``steps``; each
            step exposes ``observation`` and ``action``.
        percentile: Percentile of the discounted rewards used as threshold.
        gamma: Discount base.  When ``None`` (the default) the module-level
            ``GAMMA`` constant is used.

    Returns:
        A tuple ``(elite_batch, train_obs, train_act, reward_bound)``: the
        kept episodes, the list of their observations, the list of the
        matching actions, and the threshold value.
    """
    if gamma is None:
        gamma = GAMMA
    disc_rewards = [
        episode.reward * (gamma ** len(episode.steps)) for episode in batch
    ]
    reward_bound = np.percentile(disc_rewards, percentile)

    train_obs = []
    train_act = []
    elite_batch = []
    for example, discounted_reward in zip(batch, disc_rewards):
        # Strict comparison: episodes sitting exactly on the bound are
        # dropped, which removes all zero-reward episodes when the bound is 0.
        if discounted_reward > reward_bound:
            train_obs.extend(step.observation for step in example.steps)
            train_act.extend(step.action for step in example.steps)
            elite_batch.append(example)

    return elite_batch, train_obs, train_act, reward_bound



In [None]:

if __name__ == "__main__":
    # Train on FrozenLake with the tweaked cross-entropy method: larger
    # batches, length-discounted rewards, and elite episodes carried over
    # from one iteration to the next.
    seed = 12345
    random.seed(seed)
    # Actions are sampled with np.random and the network is initialised by
    # torch, so both generators must be seeded for the run to be repeatable.
    np.random.seed(seed)
    torch.manual_seed(seed)
    env = DiscreteOneHotWrapper(gym.make("FrozenLake-v0"))
    # Seed the environment too when this gym version exposes ``seed``.
    if hasattr(env, "seed"):
        env.seed(seed)
    # env = gym.wrappers.Monitor(env, directory="mon", force=True)
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    objective = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=net.parameters(), lr=0.001)
    writer = SummaryWriter(comment="-frozenlake-tweaked")

    # The loop only ends on success, an error, or a manual interruption;
    # try/finally makes sure the writer is closed in every case.
    try:
        full_batch = []
        for iter_no, batch in enumerate(iterate_batches(
                env, net, BATCH_SIZE)):
            # The mean is computed on the freshly played episodes only.
            reward_mean = float(np.mean([ep.reward for ep in batch]))
            # Filter the previous elite together with the new episodes.
            full_batch, obs, acts, reward_bound = \
                filter_batch(full_batch + batch, PERCENTILE)
            if not full_batch:
                continue
            # Stack through NumPy: building a tensor from a Python list of
            # arrays is the slow path of the tensor constructor.
            obs_v = torch.as_tensor(np.asarray(obs, dtype=np.float32))
            acts_v = torch.as_tensor(np.asarray(acts, dtype=np.int64))
            # Carry at most the 500 most recent elite episodes forward.
            full_batch = full_batch[-500:]

            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()
            print("%d: loss=%.3f, rw_mean=%.3f, "
                  "rw_bound=%.3f, batch=%d" % (
                iter_no, loss_v.item(), reward_mean,
                reward_bound, len(full_batch)))
            writer.add_scalar("loss", loss_v.item(), iter_no)
            writer.add_scalar("reward_mean", reward_mean, iter_no)
            writer.add_scalar("reward_bound", reward_bound, iter_no)
            # Stop once the mean reward of a batch exceeds 0.8.
            if reward_mean > 0.8:
                print("Solved!")
                break
    finally:
        writer.close()

0: loss=1.386, rw_mean=0.010, rw_bound=0.000, batch=1
1: loss=1.374, rw_mean=0.010, rw_bound=0.000, batch=2
2: loss=1.382, rw_mean=0.020, rw_bound=0.000, batch=4
3: loss=1.380, rw_mean=0.000, rw_bound=0.000, batch=4
4: loss=1.372, rw_mean=0.020, rw_bound=0.000, batch=6
5: loss=1.369, rw_mean=0.000, rw_bound=0.000, batch=6
6: loss=1.364, rw_mean=0.010, rw_bound=0.000, batch=7
7: loss=1.362, rw_mean=0.030, rw_bound=0.000, batch=10
8: loss=1.360, rw_mean=0.020, rw_bound=0.000, batch=12
9: loss=1.358, rw_mean=0.000, rw_bound=0.000, batch=12
10: loss=1.361, rw_mean=0.020, rw_bound=0.000, batch=14
11: loss=1.361, rw_mean=0.010, rw_bound=0.000, batch=15
12: loss=1.359, rw_mean=0.040, rw_bound=0.000, batch=19
13: loss=1.357, rw_mean=0.000, rw_bound=0.000, batch=19
14: loss=1.357, rw_mean=0.010, rw_bound=0.000, batch=20
15: loss=1.353, rw_mean=0.020, rw_bound=0.000, batch=22
16: loss=1.352, rw_mean=0.030, rw_bound=0.000, batch=25
17: loss=1.352, rw_mean=0.010, rw_bound=0.000, batch=26
18: loss=

## Partie 4: Monitors

In [10]:
16

1825