# Requirements

In [1]:
try:
    import google.colab
    USE_COLAB = True
except:
    USE_COLAB = False

RUN_TESTS = True

if USE_COLAB:
    print("Don't forget to avoid disconnections:")
    print("""
    function ClickConnect(){
        console.log("Clicking"); 
        document.querySelector("colab-connect-button").click() 
    }
    setInterval(ClickConnect,60000)
    
    """)

In [7]:
import numpy as np
from numpy.linalg import inv

import scipy.signal

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Normal
from torch.distributions.multivariate_normal import MultivariateNormal

import gym

import matplotlib.pyplot as plt

import time
from itertools import count
from collections import OrderedDict

# Description

Let's use TRPO to train evil robots! (pick any of two)
* [MuJoCo robots](https://gym.openai.com/envs#mujoco)
* [Box2d robot](https://gym.openai.com/envs/BipedalWalker-v2)

The catch here is that those environments have continuous action spaces. 

Luckily, TRPO is a policy gradient method, so it's gonna work for any parametric $\pi_\theta(a|s)$. We recommend starting with gaussian policy:

$$\pi_\theta(a|s) = N(\mu_\theta(s),\sigma^2_\theta(s)) = {1 \over \sqrt { 2 \pi {\sigma^2}_\theta(s) } } e^{ (a - 
\mu_\theta(s))^2 \over 2 {\sigma^2}_\theta(s) } $$

In the $\sqrt { 2 \pi {\sigma^2}_\theta(s) }$ clause, $\pi$ means ~3.1415926, not agent's policy.

This essentially means that you will need two output layers:
* $\mu_\theta(s)$, a dense layer with linear activation
* ${\sigma^2}_\theta(s)$, a dense layer with activation tf.exp (to make it positive; like rho from bandits)

For multidimensional actions, you can use fully factorized gaussian (basically a vector of gaussians).

__bonus task__: compare performance of continuous action space method to action space discretization

# Data Exploration

Explore given environment

In [3]:
test_env = gym.make("BipedalWalker-v2")
test_obs = test_env.reset()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  result = entry_point.load(False)


In [4]:
print(test_obs.shape)
print(test_env.action_space)

(24,)
Box(4,)


There are 4 possible actions: for all 4 Joints. Each parameter in range [-1, 1]. 

Let model learn it by **iteslf**

# Define functional

---
## 0. utils

### defenition

In [5]:
def init_weights(m):
    if type(m) in (nn.Linear, nn.Conv2d):
        nn.init.orthogonal_(m.weight.data, np.sqrt(2.0))
        if m.bias is not None:
            m.bias.data.fill_(0)

def get_body_network(layers_size):
    layers = []
    for in_size, out_size in zip(layers_size[:-1], layers_size[1:]):
        layers.append(nn.Linear(in_size, out_size))
        layers.append(nn.Tanh())
    return nn.Sequential(*layers)

---
## 1. model

### defenition

In [9]:
class TRPOAgents(nn.Module):
    def __init__(self, state_shape, n_actions, hidden_size=[128, 64, 64]):
        '''
        Here you should define your model
        '''
        super(TRPOAgent, self).__init__()
        
        layers_size = [state_shape[0]] + hidden_size

        self.body = get_body_network(layers_size).apply(init_weights)
        self.mu_head = nn.Linear(hidden_size[1], n_actions).apply(init_weights)
        self.log_std_head = nn.Parameter(torch.zeros(n_actions))
    
    
    def forward(self, states):
        """
        takes agent's observation (Variable), returns gaussian distribution parameters
        :param states: a batch of states, shape = [batch_size, state_shape]
        """
        hidden_state = self.body(states)
        
        mu = self.mu_head(hidden_state)
        log_std = self.log_std_head
        std = torch.exp(log_std)
        return mu, std

### testing

In [26]:
test_model = TRPOAgent(test_obs.shape, test_env.action_space.shape[0])
test_input1d = torch.ones(24)
test_input2d = torch.ones((2, 24))

mu, std = test_model(test_input1d)
assert mu.shape == torch.Size([4])
assert (std == torch.tensor([1., 1., 1., 1.])).all()

mu, std = test_model(test_input2d)
assert mu.shape == torch.Size([2, 4])
assert (std == torch.tensor([1., 1., 1., 1.])).all()

print('Test passed')

Test passed


---
## 2. Policy

### definition

In [35]:
class Policy:
    def __init__(self, model):
        self.model = model
    
    def get_prob_dist(self, states):
        """
        takes agent's observation (Variable), returns gaussian distribution
        :param states: a batch of states, shape = [batch_size, state_shape]
        """

        mu, std = self.model(states)
        dist = MultivariateNormal(mu, torch.diag(std))
        return dist

    def act(self, inputs, training=False):
        '''
        Samples action from policy distribution
        :param: inputs - observations vector
        :returns: action (single integer) and probabilities class for all actions
        '''
        states = Variable(torch.FloatTensor(inputs))
        
        if training:
            with torch.no_grad():
                distribution = self.get_prob_dist(states)
        else:
            distribution = self.get_prob_dist(states)
        
        actions = distribution.sample()
        return actions, distribution

### testing

In [49]:
test_model = TRPOAgent(test_obs.shape, test_env.action_space.shape[0])
test_policy = Policy(test_model)
test_input1d = np.ones(24)
test_input2d = np.ones((2, 24))

test_actions, test_distribution = test_policy.act(test_input1d)
assert test_actions.shape == torch.Size([4])
assert (test_distribution.stddev == torch.ones(4)).all()

test_actions, test_distribution = test_policy.act(test_input2d)
assert test_actions.shape == torch.Size([2, 4])
assert (test_distribution.stddev == torch.ones(4)).all()
print('Test passed')

Test passed


---
## 3. flat parameters operations

### definition

In [50]:
def get_flat_params_from(model):
    params = []
    for param in model.parameters():
        params.append(param.data.view(-1))

    flat_params = torch.cat(params)
    return flat_params


def set_flat_params_to(model, flat_params):
    prev_ind = 0
    for param in model.parameters():
        flat_size = int(np.prod(list(param.size())))
        param.data.copy_(
            flat_params[prev_ind:prev_ind + flat_size].view(param.size()))
        prev_ind += flat_size

---
## 4. count cummulative

### definition



In [51]:
def get_cummulative_returns(r, gamma=0.99):
    """
    Computes cummulative discounted rewards given immediate rewards
    G_i = r_i + gamma*r_{i+1} + gamma^2*r_{i+2} + ...
    Also known as R(s,a).
    """
    r = np.array(r)
    assert r.ndim >= 1
    return scipy.signal.lfilter([1], [1, -gamma], r[::-1], axis=0)[::-1]

### test

In [52]:
# simple demo on rewards [0,0,1,0,0,1]
assert (get_cummulative_returns([0, 0, 1, 0, 0, 1], gamma=1.) == np.array([2., 2., 2., 1., 1., 1.])).all()
print('Tests passed')

Tests passed


## 5. transformation list of distributions to one multidimensial distribution

### definition

In [11]:
def cat_normal(normals):
    """
    cat list of Normal distributions to single distribution
    :param: normals - list of Normal distributions
    """
    mu_s = [dist.mean for dist in normals]
    std_s = [dist.stddev for dist in normals]
    if len(mu_s[0].shape) == 1:
        return Normal(torch.stack(mu_s), torch.stack(std_s))
    else:
        return Normal(torch.cat(mu_s), torch.cat(std_s))

### testing


In [12]:
A = Normal(torch.tensor([0., 0.]), torch.tensor([1., 1.]))
B = Normal(torch.tensor([2., 2.]), torch.tensor([3., 3.]))
AB = cat_normal([A, B])
ABAB = cat_normal([AB, AB])

assert (AB.mean == torch.tensor([[0., 0.], [2., 2.]])).all()
assert (AB.stddev == torch.tensor([[1., 1.], [3., 3.]])).all()
assert (ABAB.mean.shape == torch.Size([4, 2]))
print('Test passed')

Test passed


---
## 5. rollout

### definition

In [13]:
def rollout(env, agent, max_pathlength=2500, n_timesteps=50000):
    """
    Generate rollouts for training.
    :param: env - environment in which we will make actions to generate rollouts.
    :param: act - the function that can return policy and action given observation.
    :param: max_pathlength - maximum size of one path that we generate.
    :param: n_timesteps - total sum of sizes of all pathes we generate.
    """
    paths = []

    total_timesteps = 0
    while total_timesteps < n_timesteps:
        obervations, actions, rewards, probs_dist = [], [], [], []
        obervation = env.reset()
        for _ in range(max_pathlength):
            action, prob_dist = agent.act(obervation)
            obervations.append(obervation)
            actions.append(action)
            probs_dist.append(prob_dist)
            obervation, reward, done, _ = env.step(action)
            rewards.append(reward)
            total_timesteps += 1
            if done or total_timesteps == n_timesteps:
                path = {"observations": np.array(obervations),
                        "policy": cat_normal(probs_dist),
                        "actions": np.array(actions),
                        "rewards": np.array(rewards),
                        "cumulative_returns": get_cummulative_returns(rewards, gamma=0.99),
                        }
                paths.append(path)
                break
    return paths

### testing

In [14]:
test_agent = TRPOAgent(test_obs.shape, test_env.action_space.shape[0])
paths = rollout(test_env, test_agent, max_pathlength=5, n_timesteps=100)
print(paths[-1])
assert (type(paths[0]['policy']) == torch.distributions.normal.Normal)
assert (paths[0]['policy'].mean.shape == (5,test_env.action_space.shape[0]))
assert (paths[0]['cumulative_returns'].shape == (5,))
assert (paths[0]['rewards'].shape == (5,))
assert (paths[0]['observations'].shape == (5,)+test_obs.shape)
assert (paths[0]['actions'].shape == (5,test_env.action_space.shape[0]))
print('Test Passed')

{'observations': array([[ 2.74737785e-03, -1.57472654e-05,  1.22499783e-03,
        -1.59998930e-02,  9.19331312e-02, -1.61658891e-03,
         8.60285625e-01,  2.59747620e-03,  1.00000000e+00,
         3.23389731e-02, -1.61647156e-03,  8.53834331e-01,
         1.14326132e-03,  1.00000000e+00,  4.40814018e-01,
         4.45820123e-01,  4.61422771e-01,  4.89550203e-01,
         5.34102798e-01,  6.02461040e-01,  7.09148884e-01,
         8.85931849e-01,  1.00000000e+00,  1.00000000e+00],
       [ 7.57573347e-04, -1.22843349e-02, -8.50787848e-03,
         3.14872551e-02,  4.03214425e-01,  9.99907255e-01,
         1.64986551e-01, -9.99957800e-01,  1.00000000e+00,
        -3.80436748e-01, -6.59479856e-01,  1.76042593e+00,
         9.97951349e-01,  1.00000000e+00,  4.53531981e-01,
         4.58682507e-01,  4.74735320e-01,  5.03674269e-01,
         5.49512267e-01,  6.19842708e-01,  7.29608595e-01,
         9.11491930e-01,  1.00000000e+00,  1.00000000e+00],
       [ 1.70010999e-02,  4.19181108e

---
## 6. Surrogate loss

Now let's define the loss functions and something else for actual TRPO training.

The surrogate reward should be
$$J_{surr}= {1 \over N} \sum\limits_{i=0}^N \frac{\pi_{\theta}(s_i, a_i)}{\pi_{\theta_{old}}(s_i, a_i)}A_{\theta_{old}(s_i, a_i)}$$

For simplicity, let's use cummulative returns instead of advantage for now:
$$J'_{surr}= {1 \over N} \sum\limits_{i=0}^N \frac{\pi_{\theta}(s_i, a_i)}{\pi_{\theta_{old}}(s_i, a_i)}G_{\theta_{old}(s_i, a_i)}$$

Or alternatively, minimize the surrogate loss:
$$ L_{surr} = - J'_{surr} $$  


### definition

In [15]:
def get_loss(agent, observations, actions, cummulative_returns, old_prob_dist):
    """
    Computes TRPO objective
    :param: observations - batch of observations
    :param: actions - batch of actions
    :param: cummulative_returns - batch of cummulative returns
    :param: old_prob_dist - torch gaussian distribution with shape of actions batch shape.
    :returns: scalar value of the objective function
    """
    batch_size, n_actions = actions.shape
    
    assert old_prob_dist.mean.shape == torch.Size([batch_size, n_actions])
    
    prob_dist = agent.get_prob_dist(observations)
    
    probs_for_actions = torch.exp(prob_dist.log_prob(torch.tensor(actions)))
    old_probs_for_actions = torch.exp(old_prob_dist.log_prob(torch.tensor(actions)))
    
    probs_for_actions = torch.prod(probs_for_actions, dim=1)
    old_probs_for_actions = torch.prod(old_probs_for_actions, dim=1)

    Loss = -torch.mean(probs_for_actions / old_probs_for_actions * cummulative_returns)

    assert Loss.shape == torch.Size([])
    return Loss

### testing

In [16]:
test_agent = TRPOAgent(test_obs.shape, test_env.action_space.shape[0])
paths = rollout(test_env, test_agent, max_pathlength=5, n_timesteps=100)

with torch.no_grad():
    assert get_loss(
        test_agent,
        paths[0]['observations'],
        paths[0]['actions'],
        paths[0]['cumulative_returns'],
        paths[0]['policy']
    ) > 0
print('Test passed')

Test passed


---
## 7. KL metric

We can ascend these gradients as long as our $pi_\theta(a|s)$ satisfies the constraint
$$E_{s,\pi_{\Theta_{t}}}\Big[KL(\pi(\Theta_{t}, s) \:||\:\pi(\Theta_{t+1}, s))\Big]< \alpha$$


The Kullback-Leibler divergence between a Gaussian distribution $p$ with mean $\mu_1$ and variance $\sigma_1^2$ and a Gaussian distribution $q$ with mean $\mu_2$ and variance $\sigma_2^2$ is following: 
$$\displaystyle \text{KL}(p, q) = \log \frac{\sigma_2}{\sigma_1} + \frac{\sigma_1^2 + (\mu_1 - \mu_2)^2}{2\sigma_2^2} - \frac{1}{2}$$

### definition

In [17]:
def get_kl(agent, observations, actions, cummulative_returns, old_prob_dist):
    """
    Computes KL-divergence between network policy and old policy
    :param: observations - batch of observations
    :param: actions - batch of actions
    :param: cummulative_returns - batch of cummulative returns (we don't need it actually)
    :param: old_prob_dist - torch gaussian distribution with shape of actions batch shape.
    :returns: scalar value of the KL-divergence
    """
    batch_size, n_actions = actions.shape
    
    assert old_prob_dist.mean.shape == torch.Size([batch_size, n_actions])
    
    prob_dist = agent.get_prob_dist(observations)
    
    # Compute Kullback-Leibler divergence (see formula above)
    # shape [batch_size, n_actions]
    kl_s = torch.log(prob_dist.stddev / old_prob_dist.stddev) +\
        (old_prob_dist.stddev ** 2 + (old_prob_dist.mean - prob_dist.mean) ** 2) / (2 * prob_dist.stddev ** 2) - 0.5
    kl = torch.mean(torch.sum(kl_s, dim=1))

    assert kl_s.shape == torch.Size([batch_size, n_actions])
    assert kl.shape == torch.Size([])
    assert (kl > -0.0001).all() and (kl < 10000).all()
    return kl

### testing

In [18]:
test_agent = TRPOAgent(test_obs.shape, test_env.action_space.shape[0])
paths = rollout(test_env, test_agent, max_pathlength=5, n_timesteps=100)

with torch.no_grad():
    assert get_kl(
        test_agent,
        paths[0]['observations'],
        paths[0]['actions'],
        paths[0]['cumulative_returns'],
        paths[0]['policy']
    ) < 0.001

print('Test passed')

Test passed


---
## 8. Entropy

### definition

In [19]:
def get_entropy(agent, observations):
    """
    Computes entropy of the network policy 
    :param: observations - batch of observations
    :returns: scalar value of the entropy
    """

    probs_dist = agent.get_prob_dist(observations)
    entropy = torch.mean(probs_dist.entropy())

    assert entropy.shape == torch.Size([])
    return entropy

### testing

In [20]:
test_agent = TRPOAgent(test_obs.shape, test_env.action_space.shape[0])
print(get_entropy(test_agent, test_obs))
print('Test passed')

tensor(1.4189, grad_fn=<MeanBackward0>)
Test passed


---
## 9. Linear search
TRPO in its core involves ascending surrogate policy gradient constrained by KL divergence. 

In order to enforce this constraint, we're gonna use linesearch. You can find out more about it [here](https://en.wikipedia.org/wiki/Linear_search)

### definition

In [21]:
def linesearch(f, x, fullstep, max_kl):
    """
    Linesearch finds the best parameters of neural networks in the direction of fullstep contrainted by KL divergence.
    :param: f - function that returns loss, kl and arbitrary third component.
    :param: x - old parameters of neural network.
    :param: fullstep - direction in which we make search.
    :param: max_kl - constraint of KL divergence.
    :returns:
    """
    max_backtracks = 10
    loss, _, = f(x)
    for stepfrac in .5**np.arange(max_backtracks):
        xnew = x + stepfrac * fullstep
        new_loss, kl = f(xnew)
        actual_improve = new_loss - loss
        if kl.data.numpy() <= max_kl and actual_improve.data.numpy() < 0:
            x = xnew
            loss = new_loss
    return x

### testing

---
## 10. Conjugate gradients

Since TRPO includes contrainted optimization, we will need to solve Ax=b using conjugate gradients.

In general, CG is an algorithm that solves Ax=b where A is positive-defined. A is Hessian matrix so A is positive-defined. You can find out more about them [here](https://en.wikipedia.org/wiki/Conjugate_gradient_method)

### definition

In [22]:
def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """
    This method solves system of equation Ax=b using iterative method called conjugate gradients
    :f_Ax: function that returns Ax
    :b: targets for Ax
    :cg_iters: how many iterations this method should do
    :residual_tol: epsilon for stability
    """
    p = b.clone()
    r = b.clone()
    x = torch.zeros(b.size())
    rdotr = torch.sum(r*r)
    for i in range(cg_iters):
        z = f_Ax(p)
        v = rdotr / (torch.sum(p*z) + 1e-8)
        x += v * p
        r -= v * z
        newrdotr = torch.sum(r*r)
        mu = newrdotr / (rdotr + 1e-8)
        p = r + mu * p
        rdotr = newrdotr
        if rdotr < residual_tol:
            break
    return x

### testing

In [23]:
# This code validates conjugate gradients
A = np.random.rand(8, 8)
A = np.matmul(np.transpose(A), A)


def f_Ax(x):
    return torch.matmul(torch.FloatTensor(A), x.view((-1, 1))).view(-1)


b = np.random.rand(8)

w = np.matmul(np.matmul(inv(np.matmul(np.transpose(A), A)),
                        np.transpose(A)), b.reshape((-1, 1))).reshape(-1)
print(w)
print(conjugate_gradient(f_Ax, torch.FloatTensor(b)).numpy())
print('Test passed')

[-0.70671541 -1.51468377 -0.36074676 -2.5384675   2.74491156  2.4741922
 -1.37270964  1.09881173]
[-0.70670116 -1.5146954  -0.36071748 -2.5384498   2.7449124   2.4741642
 -1.3727542   1.098823  ]
Test passed


---
## 11. update_step

In this section we construct the whole update step function.

### definition

In [24]:
def update_step(agent, observations, actions, cummulative_returns, old_prob_dist, max_kl):
    """
    This function does the TRPO update step
    :param: observations - batch of observations
    :param: actions - batch of actions
    :param: cummulative_returns - batch of cummulative returns
    :param: old_prob_dist - torch gaussian distribution with shape of actions batch shape.
    :param: max_kl - controls how big KL divergence may be between old and new policy every step.
    :returns: KL between new and old policies and the value of the loss function.
    """

    # Here we prepare the information
    actions = torch.LongTensor(actions)
    cummulative_returns = Variable(torch.FloatTensor(cummulative_returns))
#     old_probs = Variable(torch.FloatTensor(old_probs))

    # Here we compute gradient of the loss function
    loss = get_loss(agent, observations, actions,
                    cummulative_returns, old_prob_dist)
    grads = torch.autograd.grad(loss, agent.parameters())
    loss_grad = torch.cat([grad.view(-1) for grad in grads]).data

    def Fvp(v):
        # Here we compute Fx to do solve Fx = g using conjugate gradients
        # We actually do here a couple of tricks to compute it efficiently

        kl = get_kl(agent, observations, actions,
                    cummulative_returns, old_prob_dist)

        grads = torch.autograd.grad(kl, agent.parameters(), create_graph=True)
        flat_grad_kl = torch.cat([grad.view(-1) for grad in grads])

        kl_v = (flat_grad_kl * Variable(v)).sum()
        grads = torch.autograd.grad(kl_v, agent.parameters())
        flat_grad_grad_kl = torch.cat(
            [grad.contiguous().view(-1) for grad in grads]).data

        return flat_grad_grad_kl + v * 0.1

    # Here we solveolve Fx = g system using conjugate gradients
    stepdir = conjugate_gradient(Fvp, -loss_grad, 10)

    # Here we compute the initial vector to do linear search
    shs = 0.5 * (stepdir * Fvp(stepdir)).sum(0, keepdim=True)

    lm = torch.sqrt(shs / max_kl)
    fullstep = stepdir / lm[0]

    neggdotstepdir = (-loss_grad * stepdir).sum(0, keepdim=True)

    # Here we get the start point
    prev_params = get_flat_params_from(agent)

    def get_loss_kl(params):
        # Helper for linear search
        set_flat_params_to(agent, params)
        return [get_loss(agent, observations, actions, cummulative_returns, old_prob_dist),
                get_kl(agent, observations, actions, cummulative_returns, old_prob_dist)]

    # Here we find our new parameters
    new_params = linesearch(get_loss_kl, prev_params, fullstep, max_kl)

    # And we set it to our network
    set_flat_params_to(agent, new_params)

    return get_loss_kl(new_params)

### testing

# Learning

---
## 0. Hyperparameters

In [25]:
# this is hyperparameter of TRPO. It controls how big KL divergence may be between old and new policy every step.
max_kl = 0.01
numeptotal = 0  # this is number of episodes that we played.

rollout_max_pathlength = 3000
rollout_n_timesteps = 500000
# rollout_max_pathlength = 30
# rollout_n_timesteps = 200

---
## 1. Objects definition

### environment

In [26]:
env = gym.make("BipedalWalker-v2")
observation_shape = env.observation_space.shape
n_actions = env.action_space.shape[0]

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


### model 

In [27]:
agent = TRPOAgent(observation_shape, n_actions)
agent

TRPOAgent(
  (body): Sequential(
    (0): Linear(in_features=24, out_features=128, bias=True)
    (1): Tanh()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): Tanh()
    (4): Linear(in_features=64, out_features=64, bias=True)
    (5): Tanh()
  )
  (mu_head): Linear(in_features=64, out_features=4, bias=True)
)

---
## 2. Main Loop

In [28]:
start_time = time.time()

for i in count(1):
    print("\n********** Iteration %i ************" % i)

    # Generating paths.
    print("Rollout")
    paths = rollout(env, agent, max_pathlength=rollout_max_pathlength, n_timesteps=rollout_n_timesteps)
    print("Made rollout")

    # Updating policy.
    observations = np.concatenate([path["observations"] for path in paths])
    actions = np.concatenate([path["actions"] for path in paths])
    returns = np.concatenate([path["cumulative_returns"] for path in paths])
    old_prob_dist = cat_normal([path["policy"] for path in paths])

    loss, kl = update_step(agent, observations, actions,
                           returns, old_prob_dist, max_kl)

    # Report current progress
    episode_rewards = np.array([path["rewards"].sum() for path in paths])

    stats = OrderedDict()
    numeptotal += len(episode_rewards)
    stats["Total number of episodes"] = numeptotal
    stats["Average sum of rewards per episode"] = episode_rewards.mean()
    stats["Std of rewards per episode"] = episode_rewards.std()
    stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time)/60.)
    stats["KL between old and new distribution"] = kl.data.numpy()
    stats["Entropy"] = get_entropy(agent, observations).data.numpy()
    stats["Surrogate loss"] = loss.data.numpy()
    stats["Total paths"] = len(paths)
    stats["Mean path length"] = float(np.mean([paths[i]['observations'].shape[0] for i in range(len(paths))]))
    for k, v in stats.items():
        print(k + ": " + " " * (40 - len(k)) + str(v))
    i += 1


********** Iteration 1 ************
Rollout
Made rollout


  app.launch_new_instance()


Total number of episodes:                 803
Average sum of rewards per episode:       -122.05639855127235
Std of rewards per episode:               23.78502772787846
Time elapsed:                             12.54 mins
KL between old and new distribution:      0.009968565
Entropy:                                  1.459826
Surrogate loss:                           11.7816
Total paths:                              803
Mean path length:                         622.66500622665

********** Iteration 2 ************
Rollout
Made rollout
Total number of episodes:                 1717
Average sum of rewards per episode:       -122.48672018999443
Std of rewards per episode:               25.014194899715044
Time elapsed:                             20.83 mins
KL between old and new distribution:      0.009995617
Entropy:                                  1.4974455
Surrogate loss:                           12.885286
Total paths:                              914
Mean path length:                  

Made rollout
Total number of episodes:                 13615
Average sum of rewards per episode:       -168.77037496788924
Std of rewards per episode:               44.01150171889259
Time elapsed:                             151.69 mins
KL between old and new distribution:      0.009963588
Entropy:                                  2.0101202
Surrogate loss:                           22.001259
Total paths:                              862
Mean path length:                         580.046403712297

********** Iteration 18 ************
Rollout
Made rollout
Total number of episodes:                 14563
Average sum of rewards per episode:       -163.4063818204725
Std of rewards per episode:               43.69199867763981
Time elapsed:                             160.52 mins
KL between old and new distribution:      0.009964312
Entropy:                                  2.0387876
Surrogate loss:                           22.857285
Total paths:                              948
Mean path leng

Made rollout
Total number of episodes:                 29349
Average sum of rewards per episode:       -157.15363162309032
Std of rewards per episode:               38.54572979009522
Time elapsed:                             321.63 mins
KL between old and new distribution:      0.009973323
Entropy:                                  2.4145079
Surrogate loss:                           28.492682
Total paths:                              1200
Mean path length:                         416.6666666666667

********** Iteration 33 ************
Rollout
Made rollout
Total number of episodes:                 30528
Average sum of rewards per episode:       -158.7514711205224
Std of rewards per episode:               38.81307863254617
Time elapsed:                             329.83 mins
KL between old and new distribution:      0.009981131
Entropy:                                  2.4382715
Surrogate loss:                           28.540419
Total paths:                              1179
Mean path l

Made rollout
Total number of episodes:                 48131
Average sum of rewards per episode:       -156.72719794807946
Std of rewards per episode:               40.97084333255264
Time elapsed:                             451.70 mins
KL between old and new distribution:      0.009979413
Entropy:                                  2.6981375
Surrogate loss:                           30.02851
Total paths:                              1275
Mean path length:                         392.15686274509807

********** Iteration 48 ************
Rollout
Made rollout
Total number of episodes:                 49503
Average sum of rewards per episode:       -153.98054465901936
Std of rewards per episode:               39.61796354206393
Time elapsed:                             460.46 mins
KL between old and new distribution:      0.009995975
Entropy:                                  2.710285
Surrogate loss:                           31.266808
Total paths:                              1372
Mean path l

Made rollout
Total number of episodes:                 67543
Average sum of rewards per episode:       -155.34672488475164
Std of rewards per episode:               44.109996531545626
Time elapsed:                             583.10 mins
KL between old and new distribution:      0.009999125
Entropy:                                  2.8570082
Surrogate loss:                           30.03298
Total paths:                              1312
Mean path length:                         381.0975609756098

********** Iteration 63 ************
Rollout
Made rollout
Total number of episodes:                 68885
Average sum of rewards per episode:       -154.382861842666
Std of rewards per episode:               44.22567314731277
Time elapsed:                             591.82 mins
KL between old and new distribution:      0.00998532
Entropy:                                  2.8642206
Surrogate loss:                           30.371021
Total paths:                              1342
Mean path len

Made rollout
Total number of episodes:                 86228
Average sum of rewards per episode:       -160.76901376069313
Std of rewards per episode:               43.96404143952083
Time elapsed:                             715.35 mins
KL between old and new distribution:      0.009971526
Entropy:                                  2.9381094
Surrogate loss:                           28.73919
Total paths:                              1172
Mean path length:                         426.6211604095563

********** Iteration 78 ************
Rollout
Made rollout
Total number of episodes:                 87392
Average sum of rewards per episode:       -159.3527354937026
Std of rewards per episode:               43.38237511416692
Time elapsed:                             724.10 mins
KL between old and new distribution:      0.0099785505
Entropy:                                  2.94071
Surrogate loss:                           28.220766
Total paths:                              1164
Mean path len

KeyboardInterrupt: 