# Cross-Entropy Method (CEM) RL Cartpole case
The general idea is to record samples using different policies, keep the best-performing policies, and mutate them to obtain a better policy.

In this implementation, the author keeps a single policy (one network state) but selects the episodes in which that policy took the more rewarding actions.

The policy is then improved by optimization on those selected episodes. I am not sure this is correct.

The main idea of this implementation is the following:
* Actions are chosen by a neural network (NN) composed of two Linear layers.
* A batch of episodes is generated using the current state of the NN.
* The best episodes are selected, and the NN learns from them via backpropagation.

In [1]:
import torch, gym
import torch.nn as nn
from collections import namedtuple
import numpy as np
from torch.utils.tensorboard import SummaryWriter
#tensorboard --logdir 'runs\RL' --host localhost --port 8888
import datetime




In [2]:
# Width of the hidden layer; passed to Net as `hidden_size`.
HIDDEN_SIZE = 128
# Number of complete episodes collected per batch by iterate_batches().
BATCH_SIZE = 16
# Reward percentile handed to filter_batch(); episodes whose total reward
# falls below this percentile of the batch are dropped from training.
PERCENTILE = 70

class Net(nn.Module):
    """
        Small feed-forward policy network: Linear -> ReLU -> Linear.

        Maps an observation to one raw score (logit) per action.
        No SoftMax is applied here: the training loop feeds the logits to
        nn.CrossEntropyLoss(), which applies it internally. Code that needs
        real probabilities (e.g. action sampling) must apply SoftMax itself.
    """
    def __init__(self, obs_size, hidden_size, n_actions) -> None:
        super().__init__()

        # Layer order matters: it fixes both the parameter names
        # (net.0.*, net.2.*) and the order of random initialisation.
        layers = (
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        )
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # x: batch of observations -> logits, one row per observation.
        logits = self.net(x)
        return logits
    
# An episode has no fixed length, so its steps are kept in a plain list.
# Episode:     total (undiscounted) reward of the episode + list of EpisodeStep.
# EpisodeStep: observation the agent saw and the action it took in response.
Episode     = namedtuple('Episode'      , field_names= ['reward'        , 'steps'])
EpisodeStep = namedtuple('EpisodeStep'  , field_names= ['observation'   , 'action'])


# Generator function that will indefinitely (if/when asked) return batched episodes
def iterate_batches(env, net, batch_size):
    """Endlessly yield batches of episodes sampled with the current policy.

    Args:
        env: environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        net: callable mapping a float32 tensor of shape [1, obs_size] to
            un-normalized action scores of shape [1, n_actions].
        batch_size: number of complete episodes in every yielded batch.

    Yields:
        list[Episode]: exactly ``batch_size`` finished episodes. Because
        ``net`` is queried on every step, weight updates made by the caller
        between two ``next()`` calls affect the following batch.
    """
    # setup first 'iteration'
    batch           = []
    episode_reward  = 0.0
    episode_steps   = []
    obs             = env.reset()[0]
    sm              = nn.Softmax(dim=1)
    # The loop is infinite: each next() call runs it until the next 'yield'.
    while True:
        # Go through one ndarray instead of a Python list of ndarrays;
        # torch.FloatTensor([obs]) takes a slow path and emits a warning.
        obs_v = torch.as_tensor(np.asarray(obs, dtype=np.float32)).unsqueeze(0)
        # Sampling needs no gradients, so do not build an autograd graph.
        with torch.no_grad():
            act_probs_v = sm(net(obs_v))
        act_probs   = act_probs_v.numpy()[0]
        action      = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, terminated, truncated, _info = env.step(action)
        # An episode is over when the task ends (terminated) OR when the
        # environment cuts it off, e.g. by a time limit (truncated).
        is_done     = bool(terminated) or bool(truncated)
        episode_reward += reward
        # Store the observation the action was chosen for, not the resulting one.
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        # end of episode. collect episode data and append to batch. reset env and metrics.
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward  = 0.0
            episode_steps   = []
            next_obs        = env.reset()[0]
            if len(batch) == batch_size: # when batch is full
                yield batch     # Hand the batch to the caller and pause here.
                batch = []      # On the next call start a fresh batch.
        obs = next_obs


def filter_batch(batch, percentile):
    """Keep the best episodes of a batch and flatten them into training data.

    Args:
        batch: non-empty sequence of ``(reward, steps)`` records (e.g.
            ``Episode`` tuples); every step exposes ``.observation`` and
            ``.action``.
        percentile: value in [0, 100]; episodes whose total reward is below
            this percentile of the batch rewards are discarded.

    Returns:
        tuple ``(train_obs_v, train_act_v, reward_bound, reward_mean)``:
        float32 tensor of observations and int64 tensor of actions from the
        kept episodes (one row / entry per step, in batch order), the reward
        threshold that was applied, and the mean reward of the whole batch.

    Raises:
        ValueError: if ``batch`` is empty.
    """
    if not batch:
        raise ValueError("filter_batch() needs at least one episode")

    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    train_obs = []
    train_act = []
    for reward, steps in batch:
        # Episodes exactly on the bound are kept, so the best one always survives.
        if reward < reward_bound:
            continue
        train_obs.extend(step.observation for step in steps)
        train_act.extend(step.action for step in steps)

    # Stack into one ndarray first: building a tensor straight from a list of
    # ndarrays is the slow path PyTorch warns about.
    train_obs_v = torch.as_tensor(np.asarray(train_obs, dtype=np.float32))
    train_act_v = torch.as_tensor(np.asarray(train_act, dtype=np.int64))

    return train_obs_v, train_act_v, reward_bound, reward_mean

if __name__ == "__main__":
    # Training script: sample batches with the current network, keep the best
    # episodes and fit the network to the actions taken in them.
    env         = gym.make("CartPole-v1")
    obs_size    = env.observation_space.shape[0]
    n_actions   = env.action_space.n

    net         = Net(obs_size, HIDDEN_SIZE, n_actions)
    objective   = nn.CrossEntropyLoss()
    optimizer   = torch.optim.Adam(params=net.parameters(), lr = 0.01)

    # Training stops once the mean batch reward exceeds this value.
    # NOTE(review): CartPole-v1 episodes can run up to 500 steps, so this is
    # an early-stop target, not the environment's own "solved" threshold.
    reward_goal = 199

    now         = datetime.datetime.now()
    s2          = now.strftime("%H_%M_%S")
    # The former `comment` argument was dropped: SummaryWriter ignores it
    # whenever an explicit log_dir is given.
    writer      = SummaryWriter(fr'runs/RL/{s2}')

    try:
        # generate batches using the current state of the neural network.
        for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
            # take only top percentile episodes
            obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)
            optimizer.zero_grad()
            action_scores_v = net(obs_v)                # generate logits [n_steps, n_actions]
            loss_v = objective(action_scores_v, acts_v) # SoftMax and CrossEntropy
            loss_v.backward()
            optimizer.step()

            loss = loss_v.item()
            print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" % (
                iter_no, loss, reward_m, reward_b))
            writer.add_scalar("loss", loss, iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)

            if reward_m > reward_goal:
                print("SOLVED!")
                break
    finally:
        # Flush the event file and release the environment even when the
        # loop is interrupted or raises.
        writer.close()
        env.close()

  obs_v       = torch.FloatTensor([obs])
  if not isinstance(terminated, (bool, np.bool8)):


0: loss=0.702, reward_mean=17.2, rw_bound=18.0
1: loss=0.688, reward_mean=22.4, rw_bound=30.0
2: loss=0.680, reward_mean=26.7, rw_bound=27.5
3: loss=0.656, reward_mean=28.3, rw_bound=36.0
4: loss=0.650, reward_mean=34.4, rw_bound=45.0
5: loss=0.646, reward_mean=38.6, rw_bound=38.5
6: loss=0.635, reward_mean=40.9, rw_bound=47.5
7: loss=0.641, reward_mean=37.1, rw_bound=44.5
8: loss=0.627, reward_mean=42.9, rw_bound=53.0
9: loss=0.599, reward_mean=41.1, rw_bound=54.5
10: loss=0.599, reward_mean=40.9, rw_bound=48.5
11: loss=0.602, reward_mean=56.6, rw_bound=65.5
12: loss=0.596, reward_mean=56.1, rw_bound=70.5
13: loss=0.593, reward_mean=60.7, rw_bound=69.0
14: loss=0.582, reward_mean=65.5, rw_bound=73.5
15: loss=0.571, reward_mean=63.1, rw_bound=70.5
16: loss=0.576, reward_mean=68.2, rw_bound=83.0
17: loss=0.584, reward_mean=60.6, rw_bound=64.5
18: loss=0.562, reward_mean=66.8, rw_bound=77.5
19: loss=0.583, reward_mean=61.8, rw_bound=73.0
20: loss=0.556, reward_mean=63.5, rw_bound=67.0
21

A dummy generator function that illustrates how the batches are generated.

In [3]:
def return_samples(func, inp, sample_size=10):
    """Endlessly yield lists of ``sample_size`` results of ``func(inp)``.

    Toy counterpart of a batching generator: work is accumulated inside an
    infinite loop and handed out in fixed-size chunks.

    Args:
        func: callable invoked as ``func(inp)`` once per collected sample.
        inp: argument passed unchanged to every ``func`` call.
        sample_size: number of results per yielded list (default 10).

    Yields:
        list: ``sample_size`` consecutive results of ``func(inp)``.

    Raises:
        ValueError: if ``sample_size`` is smaller than 1 (raised on the
            first ``next()`` call); the loop could otherwise never fill a chunk.
    """
    if sample_size < 1:
        raise ValueError("sample_size must be at least 1")

    # init data
    sample = []
    while True:
        # do some stuff
        sample.append(func(inp))
        # enough samples?
        if len(sample) >= sample_size:
            yield sample    # return the chunk and pause here
            # Execution resumes here on the next call: start a new list
            # instead of clearing, so the yielded one stays intact.
            sample = []