<a href="https://colab.research.google.com/github/alerotta/DRL/blob/main/01%20-%20Cross%20Entropy/Cross_entropy_cartpole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium[classic-control] --quiet
!pip install torch --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import numpy as np
import gymnasium as gym
from dataclasses import dataclass # decorator that simplifies the process of creating classes.
import typing as tt # module to add type hints
from torch.utils.tensorboard.writer import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

In [8]:
HIDDEN_SIZE = 256  # number of units in the network's hidden layer
BATCH_SIZE = 16  # number of episodes played before each network update
PERCENTILE = 70  # reward percentile used as cut-off: episodes scoring below it are discarded

## N.N. definition
# This is a simple NN; we do not need anything too complex for this example:
# two linear layers and a ReLU.

class MyNet(nn.Module):
  """Small feed-forward network: a ReLU-activated hidden layer followed by a linear output.

  The output holds one raw score per action; no softmax is applied inside
  the network.
  """

  def __init__(self, obs_size, hidden_size, n_actions):
    """Create the two linear layers.

    Args:
      obs_size: number of features in one observation.
      hidden_size: number of units in the hidden layer.
      n_actions: number of output scores, one per action.
    """
    super().__init__()
    self.input_layer = nn.Linear(obs_size, hidden_size)
    self.hidden_layer = nn.Linear(hidden_size, n_actions)

  def forward(self, x):
    """Return the action scores computed for the observations in x."""
    hidden = self.input_layer(x)
    activated = nn.functional.relu(hidden)
    return self.hidden_layer(activated)

## definition of the classes used to hold the data of the learning phase

@dataclass
class EpisodeStep:
  """One decision made by the agent: the observation it saw and the action it chose."""
  observation: np.ndarray  # environment state seen by the agent before acting (provided by gym)
  action: int  # index of the action the agent took for that observation

@dataclass
class Episode:
  """A finished episode: its total reward together with the steps that produced it."""
  reward: float  # total (undiscounted) reward collected over the episode
  steps: tt.List[EpisodeStep]  # all steps of the episode, in the order they were played

## function to 'play' the episode batch.

def run_batches(environment, network, batch_size) -> tt.Generator[tt.List[Episode],None,None]:
  """Play episodes endlessly with the current policy and yield them in batches.

  The network is queried on every environment step, so weight updates made by
  the caller between two yielded batches are picked up immediately.

  Args:
    environment: object whose reset() returns (observation, info) and whose
      step(action) returns (observation, reward, terminated, truncated, info).
    network: callable mapping a (1, obs_size) float tensor to one row of
      action scores.
    batch_size: number of finished episodes per yielded batch; must be >= 1.

  Yields:
    Lists of batch_size Episode objects.

  Raises:
    ValueError: on the first iteration if batch_size is smaller than 1
      (otherwise the generator would loop forever without yielding).
  """
  if batch_size < 1:
    raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))

  batch = []
  episode_reward = 0.0
  episode_steps = []
  obs, _ = environment.reset()  # numpy vector
  softmax = nn.Softmax(dim=1)  # turns the network's scores into probabilities
  while True:
    obs_v = torch.tensor(obs, dtype=torch.float32)
    # Only the sampled action is needed here, not gradients: skip building
    # an autograd graph for every single environment step.
    with torch.no_grad():
      act_probs_v = softmax(network(obs_v.unsqueeze(0)))
    act_probs = act_probs_v.numpy()[0]
    # Sample from the policy distribution; store a plain int, as declared
    # by EpisodeStep.action.
    action = int(np.random.choice(len(act_probs), p=act_probs))
    next_obs, reward, is_done, is_trunc, _ = environment.step(action)
    episode_reward += float(reward)  # cumulative reward (not discounted)
    # Record the observation the action was chosen for, not the resulting one.
    episode_steps.append(EpisodeStep(obs, action))
    if is_done or is_trunc:
      batch.append(Episode(episode_reward, episode_steps))
      episode_reward = 0.0
      episode_steps = []
      next_obs, _ = environment.reset()
      if len(batch) >= batch_size:
        yield batch
        batch = []
    obs = next_obs

## function to discard 'bad' episodes and create data for the training

def filter_episode(batch, percentile):
  """Keep the best episodes of a batch and flatten them into training tensors.

  Args:
    batch: sequence of episodes, each exposing `reward` and `steps`; every
      step exposes `observation` and `action`.
    percentile: percentile of the batch rewards used as the cut-off.

  Returns:
    A tuple (observations, actions, reward_bound, reward_mean): a float tensor
    with one row per kept step, a long tensor with the matching actions, the
    reward cut-off and the mean reward of the whole batch.
  """
  episode_rewards = [episode.reward for episode in batch]
  reward_bound = float(np.percentile(episode_rewards, percentile))
  reward_mean = float(np.mean(episode_rewards))

  # Only episodes strictly below the bound are dropped; ties are kept.
  kept_steps = [
    step
    for episode in batch
    if not episode.reward < reward_bound
    for step in episode.steps
  ]
  observations = [step.observation for step in kept_steps]
  actions = [step.action for step in kept_steps]

  train_obs_v = torch.FloatTensor(np.vstack(observations))
  train_act_v = torch.LongTensor(actions)
  return train_obs_v, train_act_v, reward_bound, reward_mean


def train():
  """Train a policy on CartPole-v1 with the cross-entropy method.

  Repeatedly plays BATCH_SIZE episodes, keeps those at or above the PERCENTILE
  reward bound and fits the network to the actions taken in them. Progress is
  printed and logged to TensorBoard. Training stops once the mean reward of a
  batch exceeds 475.

  The environment and the TensorBoard writer are released even when training
  is interrupted or fails.
  """
  env = gym.make("CartPole-v1")
  try:
    obs_size = env.observation_space.shape[0]  # length of one observation vector
    n_actions = int(env.action_space.n)  # number of possible actions

    net = MyNet(obs_size, HIDDEN_SIZE, n_actions)
    # The network outputs raw scores; CrossEntropyLoss takes them directly
    # together with the indices of the actions taken.
    objective = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=net.parameters(), lr=0.01)
    writer = SummaryWriter(comment="-cartpole")
    try:
      for iter_no, batch in enumerate(run_batches(env, net, BATCH_SIZE)):
        obs_v, acts_v, reward_b, reward_m = filter_episode(batch, PERCENTILE)
        optimizer.zero_grad()
        # Call the module itself rather than .forward() so that module hooks run.
        action_scores_v = net(obs_v)
        loss_v = objective(action_scores_v, acts_v)
        loss_v.backward()
        optimizer.step()

        loss = loss_v.item()
        print(f"{iter_no:d}: loss={loss:.3f}, reward_mean={reward_m:.1f}, rw_bound={reward_b:.1f}")
        writer.add_scalar("loss", loss, iter_no)
        writer.add_scalar("reward_bound", reward_b, iter_no)
        writer.add_scalar("reward_mean", reward_m, iter_no)
        if reward_m > 475:
          print("Solved!")
          break
    finally:
      writer.close()  # flush pending TensorBoard events on any exit path
  finally:
    env.close()



# Run the training loop; it prints one progress line per batch and returns
# once the mean batch reward exceeds 475.
train()









0: loss=0.685, reward_mean=17.1, rw_bound=18.5
1: loss=0.671, reward_mean=27.1, rw_bound=29.5
2: loss=0.647, reward_mean=36.7, rw_bound=41.5
3: loss=0.648, reward_mean=37.6, rw_bound=44.0
4: loss=0.613, reward_mean=41.0, rw_bound=47.0
5: loss=0.605, reward_mean=48.6, rw_bound=58.0
6: loss=0.589, reward_mean=58.2, rw_bound=64.0
7: loss=0.566, reward_mean=48.8, rw_bound=50.5
8: loss=0.558, reward_mean=61.1, rw_bound=62.5
9: loss=0.552, reward_mean=75.9, rw_bound=74.0
10: loss=0.528, reward_mean=63.9, rw_bound=72.0
11: loss=0.535, reward_mean=74.7, rw_bound=77.0
12: loss=0.532, reward_mean=91.1, rw_bound=101.0
13: loss=0.495, reward_mean=106.8, rw_bound=121.0
14: loss=0.507, reward_mean=108.8, rw_bound=130.0
15: loss=0.491, reward_mean=85.1, rw_bound=88.5
16: loss=0.492, reward_mean=94.4, rw_bound=119.5
17: loss=0.473, reward_mean=104.6, rw_bound=115.5
18: loss=0.477, reward_mean=127.0, rw_bound=139.5
19: loss=0.483, reward_mean=176.8, rw_bound=192.5
20: loss=0.466, reward_mean=176.7, rw_