<a href="https://colab.research.google.com/github/alerotta/DRL/blob/main/01%20-%20Cross%20Entropy/Cross_Entropy_FrozenLake.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install gymnasium[toy-text] --quiet
!pip install torch --quiet

In [3]:

import numpy as np
import gymnasium as gym
from dataclasses import dataclass
import typing as tt
import random
from torch.utils.tensorboard.writer import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F



In [None]:

# This is only an example: the implementation is not optimized, and the
# cross-entropy method is not the best choice even for simple environments.
# The very slow training observed here is evidence of that.



# Width of the hidden layer; passed to MyNet as n_hidden in train().
HIDDEN_SIZE = 128
# Number of finished episodes collected before iterate_batches yields a batch.
BATCH_SIZE = 100
# Percentile of discounted episode rewards used by filter_batch as the cutoff.
PERCENTILE = 30
# Discount base: filter_batch scores an episode as reward * GAMMA ** n_steps.
GAMMA = 0.9

#definition of a wrapper of the env to have one-hot encodings
class DiscreteOneHotWrapper(gym.ObservationWrapper):
  """Observation wrapper that turns a Discrete observation into a one-hot vector.

  The wrapped environment must have a ``gym.spaces.Discrete`` observation
  space with ``n`` values; the wrapper advertises a float32 ``Box`` of shape
  ``(n,)`` bounded by 0.0 and 1.0 instead.
  """

  def __init__(self, env):
    super().__init__(env)
    assert isinstance(env.observation_space, gym.spaces.Discrete)
    n_states = env.observation_space.n
    self.observation_space = gym.spaces.Box(
      low=0.0, high=1.0, shape=(n_states,), dtype=np.float32
    )

  def observation(self, observation):
    """Return a copy of the space's lower bound with index ``observation`` set to 1.0."""
    # The lower bound of the Box is all zeros, so a copy is a blank template.
    one_hot = np.copy(self.observation_space.low)
    one_hot[observation] = 1.0
    return one_hot

# network class
class MyNet(nn.Module):
  """Two-layer fully connected network: Linear -> ReLU -> Linear.

  The output is one raw score per action; no softmax is applied here.
  """

  def __init__(self, n_input: int, n_hidden: int, n_actions: int):
    super().__init__()
    # Maps the input vector to the hidden representation.
    self.input_layer = nn.Linear(n_input, n_hidden)
    # Maps the hidden representation to one score per action.
    self.hidden_layer = nn.Linear(n_hidden, n_actions)

  def forward(self, x):
    """Return the action scores for input tensor ``x``."""
    hidden = F.relu(self.input_layer(x))
    scores = self.hidden_layer(hidden)
    return scores

@dataclass
class EpisodeStep:
  """One step of an episode: the observation the agent saw and the action it took."""
  # Observation the action was chosen from. Annotated as np.ndarray because
  # np.array is a factory function, not a type.
  observation: np.ndarray
  # Index of the action taken from that observation.
  action: int

@dataclass
class Episode:
  """A finished episode: its total reward and the steps that produced it."""
  # Sum of the rewards received over the whole episode.
  reward: float
  # Every step of the episode, in the order it was played.
  steps: tt.List[EpisodeStep]

def iterate_batches (env: gym.Env, net: MyNet, batch_size: int):
  """Play episodes with the current policy and yield them in batches.

  This is an infinite generator: it keeps stepping ``env``, sampling each
  action from the softmax of the scores ``net`` produces for the current
  observation, and yields a list of ``batch_size`` finished ``Episode``
  objects every time that many episodes have ended.

  Args:
    env: environment to interact with; reset whenever an episode ends.
    net: network mapping a batch of observations to per-action scores.
    batch_size: number of finished episodes per yielded batch.

  Yields:
    A list of ``batch_size`` ``Episode`` objects.
  """
  batch = []
  episode_reward = 0.0
  episode_steps = []
  obs, _ = env.reset()
  sm = nn.Softmax(dim=1)
  while True:
    obs_v = torch.tensor(obs, dtype=torch.float32)
    # Experience collection only samples from the policy; no gradients are
    # needed, so skip building the autograd graph.
    with torch.no_grad():
      act_probs_v = sm(net(obs_v.unsqueeze(0)))
    act_probs = act_probs_v.numpy()[0]
    # Store a plain int so the step matches EpisodeStep's declared field type.
    action = int(np.random.choice(len(act_probs), p=act_probs))
    next_obs, reward, is_done, is_trunc, _ = env.step(action)
    episode_reward += float(reward)
    # Record the observation the action was chosen FROM, not the resulting one.
    episode_steps.append(EpisodeStep(observation=obs, action=action))
    if is_done or is_trunc:
      batch.append(Episode(reward=episode_reward, steps=episode_steps))
      episode_reward = 0.0
      episode_steps = []
      next_obs, _ = env.reset()
      if len(batch) == batch_size:
        yield batch
        batch = []
    # Advance the current observation on EVERY step. Doing this only when an
    # episode ends would make every step of an episode reuse the episode's
    # initial observation, both for sampling and in the recorded steps.
    obs = next_obs

def filter_batch (batch: tt.List[Episode], percentile: float):
  """Keep the episodes whose discounted reward is strictly above a percentile.

  Each episode is scored as ``reward * GAMMA ** len(steps)``. The cutoff is
  the given percentile of those scores, and only episodes scoring strictly
  above it are kept.

  Args:
    batch: episodes to filter.
    percentile: percentile (as accepted by ``np.percentile``) used as cutoff.

  Returns:
    A tuple ``(elite_batch, train_obs, train_act, reward_bound)``: the kept
    episodes, the observations and actions of all their steps flattened in
    order, and the cutoff value.
  """
  # Discounted score of every episode, aligned with `batch`.
  scores = [episode.reward * (GAMMA ** len(episode.steps)) for episode in batch]
  reward_bound = np.percentile(scores, percentile)

  # Episodes that beat the cutoff; the caller may carry them into later iterations.
  elite_batch: tt.List[Episode] = [
    episode for episode, score in zip(batch, scores) if score > reward_bound
  ]

  # Flatten the kept episodes into parallel observation / action lists.
  train_obs: tt.List[np.ndarray] = [
    step.observation for episode in elite_batch for step in episode.steps
  ]
  train_act: tt.List[int] = [
    step.action for episode in elite_batch for step in episode.steps
  ]

  return elite_batch, train_obs, train_act, reward_bound

def train ():
  """Train MyNet on one-hot FrozenLake-v1 with the cross-entropy method.

  Each iteration collects BATCH_SIZE episodes, merges them with the elite
  episodes kept from earlier iterations, filters the merged set with
  filter_batch, and runs one Adam step of cross-entropy loss on the
  observations and actions of the surviving episodes. Loss, mean reward and
  reward bound are printed and written to TensorBoard. Training stops once
  the mean reward of a freshly collected batch exceeds 0.8.
  """
  seed = 12345
  # Seeding only Python's `random` has no effect on this run: actions are
  # sampled with np.random and the weights are initialised by torch, so those
  # generators must be seeded as well.
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)

  env = DiscreteOneHotWrapper(gym.make("FrozenLake-v1"))
  # Seed the environment's own RNG once; later unseeded resets keep using it.
  env.reset(seed=seed)
  obs_size = env.observation_space.shape[0]
  n_actions = env.action_space.n

  net = MyNet(obs_size, HIDDEN_SIZE, n_actions)
  loss = nn.CrossEntropyLoss()
  optimizer = optim.Adam(params=net.parameters(), lr=0.001)
  writer = SummaryWriter(comment="-frozenlake-tweaked")

  full_batch = []
  try:
    for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
      # Mean is taken over the fresh batch only, not the retained elites.
      reward_mean = float(np.mean([episode.reward for episode in batch]))
      full_batch, obs, acts, reward_bound = filter_batch(full_batch + batch, PERCENTILE)
      if not full_batch:
        # No episode beat the cutoff: nothing to train on this iteration.
        continue
      obs_v = torch.FloatTensor(np.vstack(obs))
      acts_v = torch.LongTensor(acts)
      # Cap the number of elite episodes carried into the next iteration.
      full_batch = full_batch[-500:]

      optimizer.zero_grad()
      action_scores_v = net(obs_v)
      loss_v = loss(action_scores_v, acts_v)
      loss_v.backward()
      optimizer.step()
      print("%d: loss=%.3f, rw_mean=%.3f, "
                "rw_bound=%.3f, batch=%d" % (
              iter_no, loss_v.item(), reward_mean,
              reward_bound, len(full_batch)))
      writer.add_scalar("loss", loss_v.item(), iter_no)
      writer.add_scalar("reward_mean", reward_mean, iter_no)
      writer.add_scalar("reward_bound", reward_bound, iter_no)
      if reward_mean > 0.8:
        print("Solved!")
        break
  finally:
    # Release the writer and the environment even if training is interrupted.
    writer.close()
    env.close()


# Start training as soon as this cell is executed.
train()