# Cross-Entropy Method

---

In this notebook, we will train an agent with the Cross-Entropy Method on OpenAI Gym's MountainCarContinuous environment.

### 1. Import the Necessary Packages

In [None]:
import gym
import math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf

### 2. Instantiate the Environment and Agent

In [None]:
# Create the MountainCarContinuous environment, then seed both the environment
# and NumPy's global RNG with the same fixed value (101).
env = gym.make('MountainCarContinuous-v0')
env.seed(101)
np.random.seed(101)

# Print the observation space, the action space, and the action bounds for reference.
print('observation space:', env.observation_space)
print('action space:', env.action_space)
print('  - low:', env.action_space.low)
print('  - high:', env.action_space.high)

class Linear(tf.keras.layers.Layer):
    """Dense affine layer computing ``inputs @ w + b`` with float64 parameters.

    Params
    ======
        units (int): number of output features
        input_dim (int): number of input features
    """

    def __init__(self, units=32, input_dim=32):
        # autocast=False is forwarded to the Keras base layer unchanged.
        super().__init__(autocast=False)
        # Both parameters are trainable float64 variables.
        common = dict(dtype=tf.float64, trainable=True)
        self.w = self.add_weight(name='weight',
                                 shape=(input_dim, units),
                                 initializer='random_normal',
                                 **common)
        self.b = self.add_weight(name='bias',
                                 shape=(units,),
                                 initializer='zeros',
                                 **common)

    def call(self, inputs):
        """Return the affine transform of ``inputs``."""
        projected = tf.matmul(inputs, self.w)
        return projected + self.b

class Agent(tf.keras.Model):
    """Two-layer policy network (ReLU hidden layer, tanh output) tied to a Gym environment.

    All parameters can be loaded from a single flat NumPy vector via
    ``set_weights`` and scored on the environment via ``evaluate``.
    """

    def __init__(self, env, h_size=16):
        """Size the network from the environment's observation and action spaces.

        Params
        ======
            env: Gym environment exposing ``observation_space`` and ``action_space``
            h_size (int): number of hidden units
        """
        super(Agent, self).__init__(name='cross-entropy-method')
        self.env = env
        # state, hidden layer, action sizes
        self.s_size = env.observation_space.shape[0]
        self.h_size = h_size
        self.a_size = env.action_space.shape[0]
        # define layers
        self.fc1 = Linear(units=self.h_size, input_dim=self.s_size)
        self.fc2 = Linear(units=self.a_size, input_dim=self.h_size)

    def set_weights(self, weights):
        """Load every layer parameter from one flat vector.

        Layout of ``weights``: fc1 kernel (s_size*h_size values, row-major),
        fc1 bias (h_size), fc2 kernel (h_size*a_size, row-major), fc2 bias (a_size).

        NOTE: this overrides ``tf.keras.Model.set_weights`` and takes a flat
        vector rather than a list of per-variable arrays.

        Params
        ======
            weights (array-like): 1-D vector of length ``get_weights_dim()``

        Raises
        ======
            ValueError: if ``weights`` does not have exactly ``get_weights_dim()`` entries
        """
        s_size = self.s_size
        h_size = self.h_size
        a_size = self.a_size
        # The layer variables are float64, so normalise the input once up front.
        weights = np.asarray(weights, dtype=np.float64)
        expected = self.get_weights_dim()
        # Validate before assigning anything so a bad vector cannot leave the
        # network half-updated.
        if weights.shape != (expected,):
            raise ValueError('expected a flat weight vector of length {}, got shape {}'.format(
                expected, weights.shape))
        # separate the weights for each layer
        fc1_end = (s_size*h_size)+h_size
        fc2_w_end = fc1_end+(h_size*a_size)
        fc1_W = tf.convert_to_tensor(weights[:s_size*h_size].reshape(s_size, h_size))
        fc1_b = tf.convert_to_tensor(weights[s_size*h_size:fc1_end])
        fc2_W = tf.convert_to_tensor(weights[fc1_end:fc2_w_end].reshape(h_size, a_size))
        fc2_b = tf.convert_to_tensor(weights[fc2_w_end:])
        # set the weights for each layer
        self.fc1.w.assign(fc1_W)
        self.fc1.b.assign(fc1_b)
        self.fc2.w.assign(fc2_W)
        self.fc2.b.assign(fc2_b)

    def get_weights_dim(self):
        """Return the total number of scalar parameters (kernels plus biases of both layers)."""
        return (self.s_size+1)*self.h_size + (self.h_size+1)*self.a_size

    @tf.function
    def call(self, x):
        """Map a batch of states ``x`` to actions squashed into [-1, 1] by tanh."""
        out = tf.nn.relu(self.fc1(x))
        out = tf.nn.tanh(self.fc2(out))
        return out

    def evaluate(self, weights, gamma=1.0, max_t=5000):
        """Run one episode with the given weights and return its discounted return.

        Params
        ======
            weights (array-like): flat parameter vector, see ``set_weights``
            gamma (float): discount rate applied per timestep
            max_t (int): maximum number of timesteps to run

        Returns
        =======
            float: sum of ``reward * gamma**t`` over the episode
        """
        self.set_weights(weights)
        episode_return = 0.0
        state = self.env.reset()
        for t in range(max_t):
            # The layers hold float64 weights and do not autocast, so force the
            # observation to float64 whatever dtype the environment returns.
            state = np.asarray(state, dtype=np.float64)[None]
            # Hand the environment a plain NumPy action instead of a tf.Tensor so
            # no tensors leak into its internal state or the returned reward.
            action = self.call(state).numpy()[0]
            state, reward, done, _ = self.env.step(action)
            episode_return += float(reward) * math.pow(gamma, t)
            if done:
                break
        return episode_return
    
# Module-level agent instance; cem() reads this global directly.
agent = Agent(env)

### 3. Train the Agent with the Cross-Entropy Method

Run the code cell below to train the agent from scratch. Alternatively, if a checkpoint has already been saved under `model/`, you can skip to the next code cell to load the trained weights from it.

In [None]:
def cem(n_iterations=500, max_t=1000, gamma=1.0, print_every=10, pop_size=50, elite_frac=0.2, sigma=0.5):
    """Tensorflow implementation of the cross-entropy method.

    Trains the module-level ``agent`` and saves a checkpoint under ``model/``
    after every iteration (the three most recent are kept).

    Params
    ======
        n_iterations (int): maximum number of training iterations
        max_t (int): maximum number of timesteps per episode
        gamma (float): discount rate
        print_every (int): how often to print average score (over last 100 iterations)
        pop_size (int): size of population at each iteration
        elite_frac (float): fraction of top performers to use in update
        sigma (float): standard deviation of additive noise

    Returns
    =======
        list: undiscounted score of the mean elite weights, one entry per iteration
    """
    ckpt = tf.train.Checkpoint(step=tf.Variable(0), model=agent)
    ckpt_manager = tf.train.CheckpointManager(ckpt, 'model/', max_to_keep=3)

    # Keep at least one elite: with n_elite == 0 the slice [-0:] used below
    # would silently select the whole population instead of nothing.
    n_elite = max(1, int(pop_size*elite_frac))
    # The parameter count never changes, so look it up once.
    n_weights = agent.get_weights_dim()

    scores_deque = deque(maxlen=100)
    scores = []
    best_weight = sigma*np.random.randn(n_weights)

    for i_iteration in range(1, n_iterations+1):
        # Sample a population around the current mean and score every candidate.
        weights_pop = [best_weight + (sigma*np.random.randn(n_weights)) for _ in range(pop_size)]
        rewards = np.array([agent.evaluate(weights, gamma, max_t) for weights in weights_pop])

        # The new mean is the average of the n_elite highest-scoring candidates.
        elite_idxs = rewards.argsort()[-n_elite:]
        elite_weights = [weights_pop[i] for i in elite_idxs]
        best_weight = np.array(elite_weights).mean(axis=0)

        # Score the new mean undiscounted, with evaluate's default step cap.
        # This call also leaves the agent holding best_weight, which is what
        # the checkpoint below captures.
        reward = agent.evaluate(best_weight, gamma=1.0)
        scores_deque.append(reward)
        scores.append(reward)

        ckpt.step.assign_add(1)
        ckpt_manager.save()

        avg_score = np.mean(scores_deque)
        if i_iteration % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_iteration, avg_score))

        if avg_score >= 90.0:
            # Report the iteration at which the averaging window began. Using
            # the actual window length keeps the count from going negative when
            # the threshold is reached before the window holds 100 scores.
            solved_at = i_iteration - len(scores_deque)
            print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}'.format(solved_at, avg_score))
            break
    return scores

# Train with the default hyperparameters and keep the per-iteration scores.
scores = cem()

# Chart the score against its 1-based position in the run.
fig, ax = plt.subplots()
ax.plot(np.arange(len(scores)) + 1, scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

### 4. Watch a Smart Agent!

In the next code cell, you will load the trained weights from file to watch a smart agent!

In [None]:
# Wrap the environment so the rollout is recorded under videos/, then build a
# fresh agent around it and restore the latest training checkpoint into it.
env = gym.wrappers.Monitor(env, 'videos/', force=True)
new_agent = Agent(env)
ckpt = tf.train.Checkpoint(model=new_agent)
latestSnapshot = tf.train.latest_checkpoint("model/")
if not latestSnapshot:
    raise Exception('No saved model found in: ' + 'model/')

# The training checkpoint also stores a `step` counter that is not restored
# here; expect_partial() silences the warnings about such unused values.
ckpt.restore(latestSnapshot).expect_partial()
print("Restored saved model from latest snapshot")

try:
    state = env.reset()
    while True:
        # Same conventions as Agent.evaluate: float64 observation in, NumPy action out.
        state = np.asarray(state, dtype=np.float64)[None]
        # Use the restored agent, not the global training `agent`.
        # NOTE(review): call() is invoked directly, as Agent.evaluate does;
        # Model.__call__ may autocast the float64 input to the model's default
        # float dtype, which the float64 layers would reject. Confirm.
        action = new_agent.call(state).numpy()[0]
        env.render()
        next_state, reward, done, _ = env.step(action)
        state = next_state
        if done:
            break
finally:
    # Always release the environment (and let the Monitor finish writing),
    # even if the rollout raises.
    env.close()