# Requirements

In [1]:
try:
    import google.colab
    USE_COLAB = True
except:
    USE_COLAB = False

RUN_TESTS = True

if USE_COLAB:
    print("Don't forget to avoid disconnections:")
    print("""
    function ClickConnect(){
        console.log("Clicking"); 
        document.querySelector("colab-connect-button").click() 
    }
    setInterval(ClickConnect,60000)
    
    """)

In [2]:
import numpy as np
from numpy.linalg import inv

import scipy.signal

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Normal

import gym

import matplotlib.pyplot as plt

import time
from itertools import count
from collections import OrderedDict

# Description

Let's use TRPO to train evil robots! (pick any of two)
* [MuJoCo robots](https://gym.openai.com/envs#mujoco)
* [Box2d robot](https://gym.openai.com/envs/BipedalWalker-v2)

The catch here is that those environments have continuous action spaces. 

Luckily, TRPO is a policy gradient method, so it's gonna work for any parametric $\pi_\theta(a|s)$. We recommend starting with gaussian policy:

$$\pi_\theta(a|s) = N(\mu_\theta(s),\sigma^2_\theta(s)) = {1 \over \sqrt { 2 \pi {\sigma^2}_\theta(s) } } e^{ (a - 
\mu_\theta(s))^2 \over 2 {\sigma^2}_\theta(s) } $$

In the $\sqrt { 2 \pi {\sigma^2}_\theta(s) }$ clause, $\pi$ means ~3.1415926, not agent's policy.

This essentially means that you will need two output layers:
* $\mu_\theta(s)$, a dense layer with linear activation
* ${\sigma^2}_\theta(s)$, a dense layer with activation tf.exp (to make it positive; like rho from bandits)

For multidimensional actions, you can use fully factorized gaussian (basically a vector of gaussians).

__bonus task__: compare performance of continuous action space method to action space discretization

# Data Exploration

Explore given environment

In [3]:
test_env = gym.make("BipedalWalker-v2")
test_obs = test_env.reset()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  result = entry_point.load(False)


In [4]:
print(test_obs.shape)
print(test_env.action_space)

(24,)
Box(4,)


There are 4 possible actions: for all 4 Joints. Each parameter in range [-1, 1]. 

Let model learn it by **iteslf**

# Define functional

---
## 1. model

### defenition

In [5]:
class TRPOAgent(nn.Module):
    def __init__(self, state_shape, n_actions, hidden_size=[64, 32]):
        '''
        Here you should define your model
        You should have LOG-PROBABILITIES as output because you will need it to compute loss
        We recommend that you start simple: 
        use 1-2 hidden layers with 100-500 units and relu for the first try
        '''
        nn.Module.__init__(self)

        self.body = nn.Sequential(
            nn.Linear(state_shape[0], hidden_size[0]),
            nn.ReLU(),
            nn.Linear(hidden_size[0], hidden_size[1]),
            nn.ReLU(),
        )
        self.mu_head = nn.Sequential(
            nn.Linear(hidden_size[1], n_actions),
            nn.ReLU()
        )
        self.std_head = nn.Sequential(
            nn.Linear(hidden_size[1], n_actions),
        )

    def forward(self, states):
        """
        takes agent's observation (Variable), returns gaussian distribution parameters
        :param states: a batch of states, shape = [batch_size, state_shape]
        """
        hidden_state = self.body(states)
        return self.mu_head(hidden_state), torch.exp(self.std_head(hidden_state))
    
    def get_prob_dist(self, obs):
        """
        takes agent's observation (Variable), returns gaussian distribution
        :param states: a batch of states, shape = [batch_size, state_shape]
        """

        mu, std = self.forward(Variable(torch.FloatTensor(obs)))
        dist = Normal(mu, std)
        return dist

    def act(self, obs):
        '''
        Samples action from policy distribution
        :param: obs - single observation vector
        :returns: action (single integer) and probabilities class for all actions
        '''
        
        with torch.no_grad():
            dist = self.get_prob_dist(obs)
            
        action = dist.sample().numpy()
        return action, dist

### testing

In [6]:
test_model = TRPOAgent(test_obs.shape, test_env.action_space.shape[0])
test_input = torch.ones((24))
test_next_action, test_dist = test_model.act(test_obs)
assert (type(test_next_action) == np.ndarray)
assert (test_next_action.shape == (4,))
assert (test_dist.mean.shape == (4,))

print('Test passed')

Test passed


---
## 2. flat parameters operations

### definition

In [7]:
def get_flat_params_from(model):
    params = []
    for param in model.parameters():
        params.append(param.data.view(-1))

    flat_params = torch.cat(params)
    return flat_params


def set_flat_params_to(model, flat_params):
    prev_ind = 0
    for param in model.parameters():
        flat_size = int(np.prod(list(param.size())))
        param.data.copy_(
            flat_params[prev_ind:prev_ind + flat_size].view(param.size()))
        prev_ind += flat_size

---
## 3. count cummulative

### definition



In [8]:
def get_cummulative_returns(r, gamma=1):
    """
    Computes cummulative discounted rewards given immediate rewards
    G_i = r_i + gamma*r_{i+1} + gamma^2*r_{i+2} + ...
    Also known as R(s,a).
    """
    r = np.array(r)
    assert r.ndim >= 1
    return scipy.signal.lfilter([1], [1, -gamma], r[::-1], axis=0)[::-1]

### test

In [9]:
# simple demo on rewards [0,0,1,0,0,1]
assert (get_cummulative_returns([0, 0, 1, 0, 0, 1], gamma=1.) == np.array([2., 2., 2., 1., 1., 1.])).all()
print('Tests passed')

Tests passed


## 4. transformation list of distributions to one multidimensial distribution

### definition

In [10]:
def cat_normal(normals):
    """
    cat list of Normal distributions to single distribution
    :param: normals - list of Normal distributions
    """
    mu_s = [dist.mean for dist in normals]
    std_s = [dist.stddev for dist in normals]
    if len(mu_s[0].shape) == 1:
        return Normal(torch.stack(mu_s), torch.stack(std_s))
    else:
        return Normal(torch.cat(mu_s), torch.cat(std_s))

### testing


In [11]:
A = Normal(torch.tensor([0., 0.]), torch.tensor([1., 1.]))
B = Normal(torch.tensor([2., 2.]), torch.tensor([3., 3.]))
AB = cat_normal([A, B])
ABAB = cat_normal([AB, AB])

assert (AB.mean == torch.tensor([[0., 0.], [2., 2.]])).all()
assert (AB.stddev == torch.tensor([[1., 1.], [3., 3.]])).all()
assert (ABAB.mean.shape == torch.Size([4, 2]))
print('Test passed')

Test passed


---
## 5. rollout

### definition

In [12]:
def rollout(env, agent, max_pathlength=2500, n_timesteps=50000):
    """
    Generate rollouts for training.
    :param: env - environment in which we will make actions to generate rollouts.
    :param: act - the function that can return policy and action given observation.
    :param: max_pathlength - maximum size of one path that we generate.
    :param: n_timesteps - total sum of sizes of all pathes we generate.
    """
    paths = []

    total_timesteps = 0
    while total_timesteps < n_timesteps:
        obervations, actions, rewards, probs_dist = [], [], [], []
        obervation = env.reset()
        for _ in range(max_pathlength):
            action, prob_dist = agent.act(obervation)
            obervations.append(obervation)
            actions.append(action)
            probs_dist.append(prob_dist)
            obervation, reward, done, _ = env.step(action)
            rewards.append(reward)
            total_timesteps += 1
            if done or total_timesteps == n_timesteps:
                path = {"observations": np.array(obervations),
                        "policy": cat_normal(probs_dist),
                        "actions": np.array(actions),
                        "rewards": np.array(rewards),
                        "cumulative_returns": get_cummulative_returns(rewards, gamma=0.99),
                        }
                paths.append(path)
                break
    return paths

### testing

In [13]:
test_agent = TRPOAgent(test_obs.shape, test_env.action_space.shape[0])
paths = rollout(test_env, test_agent, max_pathlength=5, n_timesteps=100)
print(paths[-1])
assert (type(paths[0]['policy']) == torch.distributions.normal.Normal)
assert (paths[0]['policy'].mean.shape == (5,test_env.action_space.shape[0]))
assert (paths[0]['cumulative_returns'].shape == (5,))
assert (paths[0]['rewards'].shape == (5,))
assert (paths[0]['observations'].shape == (5,)+test_obs.shape)
assert (paths[0]['actions'].shape == (5,test_env.action_space.shape[0]))
print('Test Passed')

{'observations': array([[ 2.74687400e-03,  4.21241100e-06, -5.49243987e-04,
        -1.60000253e-02,  9.22860801e-02,  1.27502775e-03,
         8.60011801e-01,  3.89398929e-04,  1.00000000e+00,
         3.26558985e-02,  1.27499446e-03,  8.53638798e-01,
        -8.67553909e-04,  1.00000000e+00,  4.40813661e-01,
         4.45819765e-01,  4.61422414e-01,  4.89549786e-01,
         5.34102380e-01,  6.02460563e-01,  7.09148288e-01,
         8.85931075e-01,  1.00000000e+00,  1.00000000e+00],
       [ 1.04732243e-02, -1.21068752e-02,  5.64398959e-03,
         2.68419623e-02, -2.76028782e-01, -3.35582793e-01,
         1.70778561e+00,  1.03140895e+00,  1.00000000e+00,
        -3.75023395e-01, -6.68609977e-01,  1.71342707e+00,
         9.91381168e-01,  1.00000000e+00,  4.56697792e-01,
         4.61884290e-01,  4.78049159e-01,  5.07190049e-01,
         5.53348005e-01,  6.24169409e-01,  7.34701514e-01,
         9.17854428e-01,  1.00000000e+00,  1.00000000e+00],
       [-3.39294709e-02, -7.79704332e

---
## 6. Surrogate loss

Now let's define the loss functions and something else for actual TRPO training.

The surrogate reward should be
$$J_{surr}= {1 \over N} \sum\limits_{i=0}^N \frac{\pi_{\theta}(s_i, a_i)}{\pi_{\theta_{old}}(s_i, a_i)}A_{\theta_{old}(s_i, a_i)}$$

For simplicity, let's use cummulative returns instead of advantage for now:
$$J'_{surr}= {1 \over N} \sum\limits_{i=0}^N \frac{\pi_{\theta}(s_i, a_i)}{\pi_{\theta_{old}}(s_i, a_i)}G_{\theta_{old}(s_i, a_i)}$$

Or alternatively, minimize the surrogate loss:
$$ L_{surr} = - J'_{surr} $$  


### definition

In [14]:
def get_loss(agent, observations, actions, cummulative_returns, old_prob_dist):
    """
    Computes TRPO objective
    :param: observations - batch of observations
    :param: actions - batch of actions
    :param: cummulative_returns - batch of cummulative returns
    :param: old_prob_dist - torch gaussian distribution with shape of actions batch shape.
    :returns: scalar value of the objective function
    """
    batch_size, n_actions = actions.shape
    
    assert old_prob_dist.mean.shape == torch.Size([batch_size, n_actions])
    
    prob_dist = agent.get_prob_dist(observations)
    
    probs_for_actions = torch.exp(prob_dist.log_prob(torch.tensor(actions)))
    old_probs_for_actions = torch.exp(old_prob_dist.log_prob(torch.tensor(actions)))
    
    probs_for_actions = torch.prod(probs_for_actions, dim=1)
    old_probs_for_actions = torch.prod(old_probs_for_actions, dim=1)

    Loss = -torch.mean(probs_for_actions / old_probs_for_actions * cummulative_returns)

    assert Loss.shape == torch.Size([])
    return Loss

### testing

In [15]:
test_agent = TRPOAgent(test_obs.shape, test_env.action_space.shape[0])
paths = rollout(test_env, test_agent, max_pathlength=5, n_timesteps=100)

with torch.no_grad():
    assert get_loss(
        test_agent,
        paths[0]['observations'],
        paths[0]['actions'],
        paths[0]['cumulative_returns'],
        paths[0]['policy']
    ) > 0
print('Test passed')

Test passed


---
## 7. KL metric

We can ascend these gradients as long as our $pi_\theta(a|s)$ satisfies the constraint
$$E_{s,\pi_{\Theta_{t}}}\Big[KL(\pi(\Theta_{t}, s) \:||\:\pi(\Theta_{t+1}, s))\Big]< \alpha$$


The Kullback-Leibler divergence between a Gaussian distribution $p$ with mean $\mu_1$ and variance $\sigma_1^2$ and a Gaussian distribution $q$ with mean $\mu_2$ and variance $\sigma_2^2$ is following: 
$$\displaystyle \text{KL}(p, q) = \log \frac{\sigma_2}{\sigma_1} + \frac{\sigma_1^2 + (\mu_1 - \mu_2)^2}{2\sigma_2^2} - \frac{1}{2}$$

### definition

In [16]:
def get_kl(agent, observations, actions, cummulative_returns, old_prob_dist):
    """
    Computes KL-divergence between network policy and old policy
    :param: observations - batch of observations
    :param: actions - batch of actions
    :param: cummulative_returns - batch of cummulative returns (we don't need it actually)
    :param: old_prob_dist - torch gaussian distribution with shape of actions batch shape.
    :returns: scalar value of the KL-divergence
    """
    batch_size, n_actions = actions.shape
    
    assert old_prob_dist.mean.shape == torch.Size([batch_size, n_actions])
    
    prob_dist = agent.get_prob_dist(observations)
    
    # Compute Kullback-Leibler divergence (see formula above)
    # shape [batch_size, n_actions]
    kl_s = torch.log(prob_dist.stddev / old_prob_dist.stddev) +\
        (old_prob_dist.stddev ** 2 + (old_prob_dist.mean - prob_dist.mean) ** 2) / (2 * prob_dist.stddev ** 2) - 0.5
    kl = torch.mean(kl_s)

    assert kl_s.shape == torch.Size([batch_size, n_actions])
    assert kl.shape == torch.Size([])
    assert (kl > -0.0001).all() and (kl < 10000).all()
    return kl

### testing

In [17]:
test_agent = TRPOAgent(test_obs.shape, test_env.action_space.shape[0])
paths = rollout(test_env, test_agent, max_pathlength=5, n_timesteps=100)

with torch.no_grad():
    assert get_kl(
        test_agent,
        paths[0]['observations'],
        paths[0]['actions'],
        paths[0]['cumulative_returns'],
        paths[0]['policy']
    ) < 0.001

print('Test passed')

Test passed


---
## 8. Entropy

### definition

In [18]:
def get_entropy(agent, observations):
    """
    Computes entropy of the network policy 
    :param: observations - batch of observations
    :returns: scalar value of the entropy
    """

    probs_dist = agent.get_prob_dist(observations)
    entropy = torch.mean(probs_dist.entropy())

    assert entropy.shape == torch.Size([])
    return entropy

### testing

In [19]:
test_agent = TRPOAgent(test_obs.shape, test_env.action_space.shape[0])
assert get_entropy(test_agent, test_obs) > 0
print('Test passed')

Test passed


---
## 9. Linear search
TRPO in its core involves ascending surrogate policy gradient constrained by KL divergence. 

In order to enforce this constraint, we're gonna use linesearch. You can find out more about it [here](https://en.wikipedia.org/wiki/Linear_search)

### definition

In [20]:
def linesearch(f, x, fullstep, max_kl):
    """
    Linesearch finds the best parameters of neural networks in the direction of fullstep contrainted by KL divergence.
    :param: f - function that returns loss, kl and arbitrary third component.
    :param: x - old parameters of neural network.
    :param: fullstep - direction in which we make search.
    :param: max_kl - constraint of KL divergence.
    :returns:
    """
    max_backtracks = 10
    loss, _, = f(x)
    for stepfrac in .5**np.arange(max_backtracks):
        xnew = x + stepfrac * fullstep
        new_loss, kl = f(xnew)
        actual_improve = new_loss - loss
        if kl.data.numpy() <= max_kl and actual_improve.data.numpy() < 0:
            x = xnew
            loss = new_loss
    return x

### testing

---
## 10. Conjugate gradients

Since TRPO includes contrainted optimization, we will need to solve Ax=b using conjugate gradients.

In general, CG is an algorithm that solves Ax=b where A is positive-defined. A is Hessian matrix so A is positive-defined. You can find out more about them [here](https://en.wikipedia.org/wiki/Conjugate_gradient_method)

### definition

In [21]:
def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """
    This method solves system of equation Ax=b using iterative method called conjugate gradients
    :f_Ax: function that returns Ax
    :b: targets for Ax
    :cg_iters: how many iterations this method should do
    :residual_tol: epsilon for stability
    """
    p = b.clone()
    r = b.clone()
    x = torch.zeros(b.size())
    rdotr = torch.sum(r*r)
    for i in range(cg_iters):
        z = f_Ax(p)
        v = rdotr / (torch.sum(p*z) + 1e-8)
        x += v * p
        r -= v * z
        newrdotr = torch.sum(r*r)
        mu = newrdotr / (rdotr + 1e-8)
        p = r + mu * p
        rdotr = newrdotr
        if rdotr < residual_tol:
            break
    return x

### testing

In [22]:
# This code validates conjugate gradients
A = np.random.rand(8, 8)
A = np.matmul(np.transpose(A), A)


def f_Ax(x):
    return torch.matmul(torch.FloatTensor(A), x.view((-1, 1))).view(-1)


b = np.random.rand(8)

w = np.matmul(np.matmul(inv(np.matmul(np.transpose(A), A)),
                        np.transpose(A)), b.reshape((-1, 1))).reshape(-1)
print(w)
print(conjugate_gradient(f_Ax, torch.FloatTensor(b)).numpy())
print('Test passed')

[ 0.03856398 -6.38060178  3.14657855  7.48432058  2.39630744 -5.13671443
 -5.79899611  7.39295898]
[ 0.03730226 -6.3783817   3.145097    7.4860563   2.3960571  -5.1406593
 -5.7969265   7.3925166 ]
Test passed


---
## 11. update_step

In this section we construct the whole update step function.

### definition

In [23]:
def update_step(agent, observations, actions, cummulative_returns, old_prob_dist, max_kl):
    """
    This function does the TRPO update step
    :param: observations - batch of observations
    :param: actions - batch of actions
    :param: cummulative_returns - batch of cummulative returns
    :param: old_prob_dist - torch gaussian distribution with shape of actions batch shape.
    :param: max_kl - controls how big KL divergence may be between old and new policy every step.
    :returns: KL between new and old policies and the value of the loss function.
    """

    # Here we prepare the information
    actions = torch.LongTensor(actions)
    cummulative_returns = Variable(torch.FloatTensor(cummulative_returns))
#     old_probs = Variable(torch.FloatTensor(old_probs))

    # Here we compute gradient of the loss function
    loss = get_loss(agent, observations, actions,
                    cummulative_returns, old_prob_dist)
    grads = torch.autograd.grad(loss, agent.parameters())
    loss_grad = torch.cat([grad.view(-1) for grad in grads]).data

    def Fvp(v):
        # Here we compute Fx to do solve Fx = g using conjugate gradients
        # We actually do here a couple of tricks to compute it efficiently

        kl = get_kl(agent, observations, actions,
                    cummulative_returns, old_prob_dist)

        grads = torch.autograd.grad(kl, agent.parameters(), create_graph=True)
        flat_grad_kl = torch.cat([grad.view(-1) for grad in grads])

        kl_v = (flat_grad_kl * Variable(v)).sum()
        grads = torch.autograd.grad(kl_v, agent.parameters())
        flat_grad_grad_kl = torch.cat(
            [grad.contiguous().view(-1) for grad in grads]).data

        return flat_grad_grad_kl + v * 0.1

    # Here we solveolve Fx = g system using conjugate gradients
    stepdir = conjugate_gradient(Fvp, -loss_grad, 10)

    # Here we compute the initial vector to do linear search
    shs = 0.5 * (stepdir * Fvp(stepdir)).sum(0, keepdim=True)

    lm = torch.sqrt(shs / max_kl)
    fullstep = stepdir / lm[0]

    neggdotstepdir = (-loss_grad * stepdir).sum(0, keepdim=True)

    # Here we get the start point
    prev_params = get_flat_params_from(agent)

    def get_loss_kl(params):
        # Helper for linear search
        set_flat_params_to(agent, params)
        return [get_loss(agent, observations, actions, cummulative_returns, old_prob_dist),
                get_kl(agent, observations, actions, cummulative_returns, old_prob_dist)]

    # Here we find our new parameters
    new_params = linesearch(get_loss_kl, prev_params, fullstep, max_kl)

    # And we set it to our network
    set_flat_params_to(agent, new_params)

    return get_loss_kl(new_params)

### testing

# Learning

## 0. Hyperparameters

In [24]:
# this is hyperparameter of TRPO. It controls how big KL divergence may be between old and new policy every step.
max_kl = 0.01
numeptotal = 0  # this is number of episodes that we played.

## 1. Objects definition

### environment

In [25]:
env = gym.make("BipedalWalker-v2")
observation_shape = env.observation_space.shape
n_actions = env.action_space.shape[0]

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


### model 

In [26]:
agent = TRPOAgent(observation_shape, n_actions)
agent

TRPOAgent(
  (body): Sequential(
    (0): Linear(in_features=24, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
  )
  (mu_head): Sequential(
    (0): Linear(in_features=32, out_features=4, bias=True)
    (1): ReLU()
  )
  (std_head): Sequential(
    (0): Linear(in_features=32, out_features=4, bias=True)
  )
)

## 1 Main Loop

In [None]:
start_time = time.time()

for i in count(1):
    print("\n********** Iteration %i ************" % i)

    # Generating paths.
    print("Rollout")
    paths = rollout(env, agent)
    print("Made rollout")

    # Updating policy.
    observations = np.concatenate([path["observations"] for path in paths])
    actions = np.concatenate([path["actions"] for path in paths])
    returns = np.concatenate([path["cumulative_returns"] for path in paths])
    old_prob_dist = cat_normal([path["policy"] for path in paths])

    loss, kl = update_step(agent, observations, actions,
                           returns, old_prob_dist, max_kl)

    # Report current progress
    episode_rewards = np.array([path["rewards"].sum() for path in paths])

    stats = OrderedDict()
    numeptotal += len(episode_rewards)
    stats["Total number of episodes"] = numeptotal
    stats["Average sum of rewards per episode"] = episode_rewards.mean()
    stats["Std of rewards per episode"] = episode_rewards.std()
    stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time)/60.)
    stats["KL between old and new distribution"] = kl.data.numpy()
    stats["Entropy"] = get_entropy(agent, observations).data.numpy()
    stats["Surrogate loss"] = loss.data.numpy()
    for k, v in stats.items():
        print(k + ": " + " " * (40 - len(k)) + str(v))
    i += 1


********** Iteration 1 ************
Rollout
Made rollout


  app.launch_new_instance()


Total number of episodes:                 92
Average sum of rewards per episode:       -110.49455157808742
Std of rewards per episode:               8.699661752949737
Time elapsed:                             0.70 mins
KL between old and new distribution:      0.009974399
Entropy:                                  1.5376222
Surrogate loss:                           10.355807

********** Iteration 2 ************
Rollout
Made rollout
Total number of episodes:                 174
Average sum of rewards per episode:       -111.62752920504987
Std of rewards per episode:               10.939797486945727
Time elapsed:                             1.39 mins
KL between old and new distribution:      0.00997244
Entropy:                                  1.639484
Surrogate loss:                           9.663303

********** Iteration 3 ************
Rollout
Made rollout
Total number of episodes:                 252
Average sum of rewards per episode:       -115.3731337503246
Std of rewards per episo

Made rollout
Total number of episodes:                 2290
Average sum of rewards per episode:       -121.75184992044531
Std of rewards per episode:               24.981355655395653
Time elapsed:                             13.89 mins
KL between old and new distribution:      0.009979391
Entropy:                                  3.5696425
Surrogate loss:                           23.395838

********** Iteration 21 ************
Rollout
Made rollout
Total number of episodes:                 2434
Average sum of rewards per episode:       -125.69107150444064
Std of rewards per episode:               31.19299495132951
Time elapsed:                             14.56 mins
KL between old and new distribution:      0.009976594
Entropy:                                  3.6308494
Surrogate loss:                           21.908875

********** Iteration 22 ************
Rollout
Made rollout
Total number of episodes:                 2608
Average sum of rewards per episode:       -120.90407341757417

Made rollout
Total number of episodes:                 5042
Average sum of rewards per episode:       -131.23004082390872
Std of rewards per episode:               37.32373217335336
Time elapsed:                             26.66 mins
KL between old and new distribution:      0.009974509
Entropy:                                  5.0752206
Surrogate loss:                           23.308353

********** Iteration 40 ************
Rollout
Made rollout
Total number of episodes:                 5161
Average sum of rewards per episode:       -136.46407367416066
Std of rewards per episode:               44.973369823014814
Time elapsed:                             27.32 mins
KL between old and new distribution:      0.009975338
Entropy:                                  5.116951
Surrogate loss:                           21.237057

********** Iteration 41 ************
Rollout
Made rollout
Total number of episodes:                 5261
Average sum of rewards per episode:       -138.15252060544572


Made rollout
Total number of episodes:                 7873
Average sum of rewards per episode:       -125.65321890339963
Std of rewards per episode:               34.18806989640902
Time elapsed:                             39.72 mins
KL between old and new distribution:      0.009979739
Entropy:                                  5.879926
Surrogate loss:                           23.9726

********** Iteration 59 ************
Rollout
Made rollout
Total number of episodes:                 8041
Average sum of rewards per episode:       -126.93054449038577
Std of rewards per episode:               35.71206233005799
Time elapsed:                             40.42 mins
KL between old and new distribution:      0.0099924905
Entropy:                                  5.9550257
Surrogate loss:                           25.286335

********** Iteration 60 ************
Rollout
Made rollout
Total number of episodes:                 8177
Average sum of rewards per episode:       -129.74219456023778
St