In [None]:
# Make the project root importable so modules living there can be imported below.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
import sys

project_root = '/data/ad181/RemoteDir/ada_multigrid_ppo'
sys.path.append(project_root)

In [2]:
%matplotlib notebook
import numpy as np
import time
import pickle
import os
import matplotlib.pyplot as plt
from copy import copy, deepcopy

import gym
from stable_baselines3.ppo import PPO, MlpPolicy
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines3.common.callbacks import CallbackList
from utils.custom_eval_callback import CustomEvalCallback, CustomEvalCallbackParallel
from utils.env_wrappers import StateCoarse, BufferWrapper, EnvCoarseWrapper, StateCoarseMultiGrid
from typing import Callable
from utils.plot_functions import plot_learning
from utils.multigrid_framework_functions import env_wrappers_multigrid, make_env, generate_beta_environement, parallalize_env, multigrid_framework

from model.ressim import Grid
from ressim_env import ResSimEnv_v0, ResSimEnv_v1

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Run configuration: default seed, experiment label, and output locations.
seed = 1
case = 'case_2_multigrid_fixed'
data_dir = './data'
# Per-case log directory; resolves to './data/case_2_multigrid_fixed'.
log_dir = f'./data/{case}'

In [4]:
# Create the output directories up front; exist_ok=True keeps this cell safe to re-run.
for directory in (data_dir, log_dir):
    os.makedirs(directory, exist_ok=True)

In [5]:
# Load the pickled training environment object used by every cell below.
# The handle is named `env_file` rather than `input` so the builtin input() is not
# shadowed in the notebook namespace after this cell runs.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# this file if it comes from a trusted source.
with open('../envs_params/env_data/env_train.pkl', 'rb') as env_file:
    env_train = pickle.load(env_file)

# define RL model and callbacks

In [6]:
def generate_model(env_train, seed):
    """Construct a PPO agent on a vectorised placeholder environment.

    Args:
        env_train: training environment; its ``p_x`` and ``p_y`` attributes are
            forwarded to ``generate_beta_environement``.
        seed: seed forwarded to the environment generator, to
            ``parallalize_env``, and to ``PPO``.

    Returns:
        The newly constructed ``PPO`` model.
    """
    # 0.5 is presumably the grid fidelity factor of this placeholder env — TODO confirm.
    base_env = generate_beta_environement(env_train, 0.5, env_train.p_x, env_train.p_y, seed)
    vec_env = parallalize_env(base_env, num_actor=64, seed=seed)

    # All PPO hyperparameters in one place.
    ppo_settings = {
        "learning_rate": 1e-4,
        "n_steps": 40,
        "batch_size": 16,
        "n_epochs": 20,
        "gamma": 0.99,
        "gae_lambda": 0.95,
        "clip_range": 0.15,
        "clip_range_vf": None,
        "ent_coef": 0.001,
        "vf_coef": 0.5,
        "max_grad_norm": 0.5,
        "use_sde": False,
        "create_eval_env": False,
        "policy_kwargs": {"net_arch": [70, 70, 50], "log_std_init": -1.7},
        "verbose": 1,
        "target_kl": 0.1,
        "seed": seed,
        "device": "auto",
    }
    return PPO(policy=MlpPolicy, env=vec_env, **ppo_settings)

def generate_callback(env_train, best_model_save_path, log_path, eval_freq, eval_seed=None):
    """Build the evaluation callback used during training.

    Args:
        env_train: training environment; its ``p_x`` and ``p_y`` attributes are
            forwarded to ``generate_beta_environement``.
        best_model_save_path: forwarded to ``CustomEvalCallbackParallel``.
        log_path: forwarded to ``CustomEvalCallbackParallel``.
        eval_freq: forwarded to ``CustomEvalCallbackParallel``.
        eval_seed: seed for the evaluation environment. When omitted, the
            notebook-level ``seed`` variable is used, as before.

    Returns:
        A ``CustomEvalCallbackParallel`` configured with one evaluation episode.
    """
    if eval_seed is None:
        # Backward-compatible default: existing 4-argument callers keep using the
        # notebook-global `seed` (which the training loop rebinds on every run).
        eval_seed = seed
    # 0.5 is presumably the grid fidelity factor of this placeholder env — TODO confirm.
    eval_env = generate_beta_environement(env_train, 0.5, env_train.p_x, env_train.p_y, eval_seed)
    return CustomEvalCallbackParallel(
        eval_env,
        best_model_save_path=best_model_save_path,
        n_eval_episodes=1,
        log_path=log_path,
        eval_freq=eval_freq,
    )

# multigrid framework

In [None]:
# Run the multigrid framework once per seed (1, 2, 3).
# NOTE: the loop variable is intentionally named `seed`. It rebinds the notebook-level
# `seed`, which generate_callback reads, so the evaluation env follows the current run.
# `model` is overwritten on every pass, so only the last run's return value is kept.
for seed in (1, 2, 3):
    # Built inside the loop so every run receives fresh list objects.
    framework_settings = dict(
        delta_pcent=0.2,
        n=np.inf,
        grid_fidelity_factor_array=[0.25, 0.5, 1.0],
        episode_limit_array=[50000, 50000, 50000],
        log_dir=log_dir,
        seed=seed,
    )
    model = multigrid_framework(env_train, generate_model, generate_callback, **framework_settings)

  for j in range(len(p_1)-1):


Using cuda device
seed 1: grid fidelity factor 0.25 learning ..
environement grid size (nx x ny ): 7 x 22




Eval num_timesteps=2560, episode_reward=0.68 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
---------------------------------
| eval/              |          |
|    mean_ep_length  | 5        |
|    mean_reward     | 0.679    |
| time/              |          |
|    fps             | 137      |
|    iterations      | 1        |
|    time_elapsed    | 18       |
|    total_timesteps | 2560     |
---------------------------------
policy iteration runtime: 50 seconds

Total episode rollouts: 512

Eval num_timesteps=2560, episode_reward=0.68 +/- 0.00
Episode length: 5.00 +/- 0.00
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 5         |
|    mean_reward          | 0.679     |
| time/                   |           |
|    fps                  | 746       |
|    iterations           | 1         |
|    time_elapsed         | 3         |
|    total_timesteps      | 2560      |
| train/                  |           |
|  

policy iteration runtime: 37 seconds

Total episode rollouts: 4608

Eval num_timesteps=2560, episode_reward=0.72 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
--------------------------------------
| eval/                   |          |
|    mean_ep_length       | 5        |
|    mean_reward          | 0.717    |
| time/                   |          |
|    fps                  | 736      |
|    iterations           | 1        |
|    time_elapsed         | 3        |
|    total_timesteps      | 2560     |
| train/                  |          |
|    approx_kl            | 0.049085 |
|    clip_fraction        | 0.493    |
|    clip_range           | 0.15     |
|    entropy_loss         | 6.11     |
|    explained_variance   | 0.961    |
|    learning_rate        | 0.0001   |
|    loss                 | -0.0788  |
|    n_updates            | 180      |
|    policy_gradient_loss | -0.0467  |
|    std                  | 0.181    |
|    value_loss           | 0.00163  |
-------

policy iteration runtime: 32 seconds

Total episode rollouts: 8704

Eval num_timesteps=2560, episode_reward=0.74 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.745      |
| time/                   |            |
|    fps                  | 762        |
|    iterations           | 1          |
|    time_elapsed         | 3          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.05409802 |
|    clip_fraction        | 0.51       |
|    clip_range           | 0.15       |
|    entropy_loss         | 6.46       |
|    explained_variance   | 0.965      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0583    |
|    n_updates            | 340        |
|    policy_gradient_loss | -0.045     |
|    std                  | 0.178      |
|    v

policy iteration runtime: 33 seconds

Total episode rollouts: 12800

Eval num_timesteps=2560, episode_reward=0.77 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.771      |
| time/                   |            |
|    fps                  | 739        |
|    iterations           | 1          |
|    time_elapsed         | 3          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.07528596 |
|    clip_fraction        | 0.558      |
|    clip_range           | 0.15       |
|    entropy_loss         | 6.65       |
|    explained_variance   | 0.968      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0489    |
|    n_updates            | 500        |
|    policy_gradient_loss | -0.0427    |
|    std                  | 0.177      |
|    

policy iteration runtime: 30 seconds

Total episode rollouts: 16896

Eval num_timesteps=2560, episode_reward=0.78 +/- 0.00
Episode length: 5.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5           |
|    mean_reward          | 0.777       |
| time/                   |             |
|    fps                  | 741         |
|    iterations           | 1           |
|    time_elapsed         | 3           |
|    total_timesteps      | 2560        |
| train/                  |             |
|    approx_kl            | 0.065170564 |
|    clip_fraction        | 0.575       |
|    clip_range           | 0.15        |
|    entropy_loss         | 6.96        |
|    explained_variance   | 0.971       |
|    learning_rate        | 0.0001      |
|    loss                 | -0.065      |
|    n_updates            | 660         |
|    policy_gradient_loss | -0.0417     |
|    std                  | 0.174       |
|    va

policy iteration runtime: 34 seconds

Total episode rollouts: 20992

Eval num_timesteps=2560, episode_reward=0.79 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.791      |
| time/                   |            |
|    fps                  | 779        |
|    iterations           | 1          |
|    time_elapsed         | 3          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.06294056 |
|    clip_fraction        | 0.579      |
|    clip_range           | 0.15       |
|    entropy_loss         | 7.45       |
|    explained_variance   | 0.974      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0117    |
|    n_updates            | 820        |
|    policy_gradient_loss | -0.0348    |
|    std                  | 0.17       |
|    

policy iteration runtime: 36 seconds

Total episode rollouts: 25088

Eval num_timesteps=2560, episode_reward=0.80 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.798      |
| time/                   |            |
|    fps                  | 760        |
|    iterations           | 1          |
|    time_elapsed         | 3          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.07259835 |
|    clip_fraction        | 0.587      |
|    clip_range           | 0.15       |
|    entropy_loss         | 8.09       |
|    explained_variance   | 0.974      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0453    |
|    n_updates            | 980        |
|    policy_gradient_loss | -0.0353    |
|    std                  | 0.165      |
|    value_loss           |

policy iteration runtime: 43 seconds

Total episode rollouts: 29184

Eval num_timesteps=2560, episode_reward=0.81 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5           |
|    mean_reward          | 0.808       |
| time/                   |             |
|    fps                  | 718         |
|    iterations           | 1           |
|    time_elapsed         | 3           |
|    total_timesteps      | 2560        |
| train/                  |             |
|    approx_kl            | 0.068703726 |
|    clip_fraction        | 0.597       |
|    clip_range           | 0.15        |
|    entropy_loss         | 8.94        |
|    explained_variance   | 0.98        |
|    learning_rate        | 0.0001      |
|    loss                 | -0.0173     |
|    n_updates            | 1140        |
|    policy_gradient_loss | -0.0325     |
|    std                  |

policy iteration runtime: 33 seconds

Total episode rollouts: 33280

Eval num_timesteps=2560, episode_reward=0.81 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.814      |
| time/                   |            |
|    fps                  | 728        |
|    iterations           | 1          |
|    time_elapsed         | 3          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.07281404 |
|    clip_fraction        | 0.6        |
|    clip_range           | 0.15       |
|    entropy_loss         | 9.56       |
|    explained_variance   | 0.985      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0728    |
|    n_updates            | 1300       |
|    policy_gradient_loss | -0.0275    |
|    std                  | 0.155      |
|    

policy iteration runtime: 33 seconds

Total episode rollouts: 37376

Eval num_timesteps=2560, episode_reward=0.82 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5           |
|    mean_reward          | 0.82        |
| time/                   |             |
|    fps                  | 775         |
|    iterations           | 1           |
|    time_elapsed         | 3           |
|    total_timesteps      | 2560        |
| train/                  |             |
|    approx_kl            | 0.091239884 |
|    clip_fraction        | 0.613       |
|    clip_range           | 0.15        |
|    entropy_loss         | 10.2        |
|    explained_variance   | 0.984       |
|    learning_rate        | 0.0001      |
|    loss                 | -0.0114     |
|    n_updates            | 1460        |
|    policy_gradient_loss | -0.0285     |
|    std                  |

policy iteration runtime: 33 seconds

Total episode rollouts: 41472

Eval num_timesteps=2560, episode_reward=0.82 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.823      |
| time/                   |            |
|    fps                  | 754        |
|    iterations           | 1          |
|    time_elapsed         | 3          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.06918831 |
|    clip_fraction        | 0.619      |
|    clip_range           | 0.15       |
|    entropy_loss         | 10.8       |
|    explained_variance   | 0.988      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0685    |
|    n_updates            | 1620       |
|    policy_gradient_loss | -0.0257    |
|    std                  | 0.146      |
|    

policy iteration runtime: 34 seconds

Total episode rollouts: 45568

Eval num_timesteps=2560, episode_reward=0.82 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.824      |
| time/                   |            |
|    fps                  | 783        |
|    iterations           | 1          |
|    time_elapsed         | 3          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.09592082 |
|    clip_fraction        | 0.622      |
|    clip_range           | 0.15       |
|    entropy_loss         | 11.3       |
|    explained_variance   | 0.985      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0407    |
|    n_updates            | 1780       |
|    policy_gradient_loss | -0.0238    |
|    std                  | 0.143      |
|    

policy iteration runtime: 32 seconds

Total episode rollouts: 49664

Eval num_timesteps=2560, episode_reward=0.83 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.826      |
| time/                   |            |
|    fps                  | 748        |
|    iterations           | 1          |
|    time_elapsed         | 3          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.08383379 |
|    clip_fraction        | 0.628      |
|    clip_range           | 0.15       |
|    entropy_loss         | 11.6       |
|    explained_variance   | 0.989      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0279    |
|    n_updates            | 1940       |
|    policy_gradient_loss | -0.0241    |
|    std                  | 0.142      |
|    value_loss           |



Eval num_timesteps=2560, episode_reward=0.84 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.843      |
| time/                   |            |
|    fps                  | 139        |
|    iterations           | 1          |
|    time_elapsed         | 18         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.10461812 |
|    clip_fraction        | 0.621      |
|    clip_range           | 0.15       |
|    entropy_loss         | 11.7       |
|    explained_variance   | 0.988      |
|    learning_rate        | 0.0001     |
|    loss                 | 4.91e-05   |
|    n_updates            | 1960       |
|    policy_gradient_loss | -0.0192    |
|    std                  | 0.141      |
|    value_loss           | 0.000713   |
---------------------------------

policy iteration runtime: 34 seconds

Total episode rollouts: 54272

Eval num_timesteps=2560, episode_reward=0.84 +/- 0.00
Episode length: 5.00 +/- 0.00
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 5         |
|    mean_reward          | 0.844     |
| time/                   |           |
|    fps                  | 629       |
|    iterations           | 1         |
|    time_elapsed         | 4         |
|    total_timesteps      | 2560      |
| train/                  |           |
|    approx_kl            | 0.1038513 |
|    clip_fraction        | 0.623     |
|    clip_range           | 0.15      |
|    entropy_loss         | 12.1      |
|    explained_variance   | 0.981     |
|    learning_rate        | 0.0001    |
|    loss                 | -0.0318   |
|    n_updates            | 2120      |
|    policy_gradient_loss | -0.022    |
|    std                  | 0.138     |
|    value_loss           | 0.00112   |
-------

policy iteration runtime: 39 seconds

Total episode rollouts: 58368

Eval num_timesteps=2560, episode_reward=0.84 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.844      |
| time/                   |            |
|    fps                  | 637        |
|    iterations           | 1          |
|    time_elapsed         | 4          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.09064202 |
|    clip_fraction        | 0.617      |
|    clip_range           | 0.15       |
|    entropy_loss         | 12.5       |
|    explained_variance   | 0.98       |
|    learning_rate        | 0.0001     |
|    loss                 | 0.0174     |
|    n_updates            | 2280       |
|    policy_gradient_loss | -0.0145    |
|    std                  | 0.136      |
|    value_loss           |

policy iteration runtime: 24 seconds

Total episode rollouts: 62464

Eval num_timesteps=2560, episode_reward=0.84 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.844      |
| time/                   |            |
|    fps                  | 649        |
|    iterations           | 1          |
|    time_elapsed         | 3          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.10422556 |
|    clip_fraction        | 0.643      |
|    clip_range           | 0.15       |
|    entropy_loss         | 12.5       |
|    explained_variance   | 0.984      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.048     |
|    n_updates            | 2440       |
|    policy_gradient_loss | -0.0196    |
|    std                  | 0.136      |
|    value_loss           |

policy iteration runtime: 31 seconds

Total episode rollouts: 66560

Eval num_timesteps=2560, episode_reward=0.84 +/- 0.00
Episode length: 5.00 +/- 0.00
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 5         |
|    mean_reward          | 0.843     |
| time/                   |           |
|    fps                  | 645       |
|    iterations           | 1         |
|    time_elapsed         | 3         |
|    total_timesteps      | 2560      |
| train/                  |           |
|    approx_kl            | 0.0935068 |
|    clip_fraction        | 0.633     |
|    clip_range           | 0.15      |
|    entropy_loss         | 12.8      |
|    explained_variance   | 0.983     |
|    learning_rate        | 0.0001    |
|    loss                 | -0.0397   |
|    n_updates            | 2600      |
|    policy_gradient_loss | -0.0165   |
|    std                  | 0.134     |
|    value_loss           | 0.00104   |
-------

policy iteration runtime: 34 seconds

Total episode rollouts: 70656

Eval num_timesteps=2560, episode_reward=0.84 +/- 0.00
Episode length: 5.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5           |
|    mean_reward          | 0.844       |
| time/                   |             |
|    fps                  | 652         |
|    iterations           | 1           |
|    time_elapsed         | 3           |
|    total_timesteps      | 2560        |
| train/                  |             |
|    approx_kl            | 0.104668796 |
|    clip_fraction        | 0.64        |
|    clip_range           | 0.15        |
|    entropy_loss         | 12.9        |
|    explained_variance   | 0.984       |
|    learning_rate        | 0.0001      |
|    loss                 | -0.00394    |
|    n_updates            | 2760        |
|    policy_gradient_loss | -0.0186     |
|    std                  | 0.133       |
|    va

policy iteration runtime: 25 seconds

Total episode rollouts: 74752

Eval num_timesteps=2560, episode_reward=0.84 +/- 0.00
Episode length: 5.00 +/- 0.00
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 5         |
|    mean_reward          | 0.844     |
| time/                   |           |
|    fps                  | 657       |
|    iterations           | 1         |
|    time_elapsed         | 3         |
|    total_timesteps      | 2560      |
| train/                  |           |
|    approx_kl            | 0.0980331 |
|    clip_fraction        | 0.647     |
|    clip_range           | 0.15      |
|    entropy_loss         | 13.2      |
|    explained_variance   | 0.983     |
|    learning_rate        | 0.0001    |
|    loss                 | -0.0273   |
|    n_updates            | 2920      |
|    policy_gradient_loss | -0.0162   |
|    std                  | 0.131     |
|    value_loss           | 0.00102   |
-------

policy iteration runtime: 30 seconds

Total episode rollouts: 78848

Eval num_timesteps=2560, episode_reward=0.84 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.845      |
| time/                   |            |
|    fps                  | 649        |
|    iterations           | 1          |
|    time_elapsed         | 3          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.09705423 |
|    clip_fraction        | 0.647      |
|    clip_range           | 0.15       |
|    entropy_loss         | 13.5       |
|    explained_variance   | 0.985      |
|    learning_rate        | 0.0001     |
|    loss                 | 0.00723    |
|    n_updates            | 3080       |
|    policy_gradient_loss | -0.0155    |
|    std                  | 0.13       |
|    

policy iteration runtime: 24 seconds

Total episode rollouts: 82944

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5           |
|    mean_reward          | 0.846       |
| time/                   |             |
|    fps                  | 646         |
|    iterations           | 1           |
|    time_elapsed         | 3           |
|    total_timesteps      | 2560        |
| train/                  |             |
|    approx_kl            | 0.102111004 |
|    clip_fraction        | 0.657       |
|    clip_range           | 0.15        |
|    entropy_loss         | 13.8        |
|    explained_variance   | 0.984       |
|    learning_rate        | 0.0001      |
|    loss                 | -0.0563     |
|    n_updates            | 3240        |
|    policy_gradient_loss | -0.0134     |
|    std                  | 0.128       |
|    va

policy iteration runtime: 30 seconds

Total episode rollouts: 87040

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.847      |
| time/                   |            |
|    fps                  | 654        |
|    iterations           | 1          |
|    time_elapsed         | 3          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.11964104 |
|    clip_fraction        | 0.65       |
|    clip_range           | 0.15       |
|    entropy_loss         | 14.1       |
|    explained_variance   | 0.984      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.00117   |
|    n_updates            | 3400       |
|    policy_gradient_loss | -0.0103    |
|    std                  | 0.126      |
|    value_loss           |

policy iteration runtime: 30 seconds

Total episode rollouts: 91136

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.848      |
| time/                   |            |
|    fps                  | 656        |
|    iterations           | 1          |
|    time_elapsed         | 3          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.13030349 |
|    clip_fraction        | 0.663      |
|    clip_range           | 0.15       |
|    entropy_loss         | 14.3       |
|    explained_variance   | 0.984      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0221    |
|    n_updates            | 3560       |
|    policy_gradient_loss | -0.013     |
|    std                  | 0.125      |
|    value_loss           |

Early stopping at step 18 due to reaching max kl: 0.15
policy iteration runtime: 33 seconds

Total episode rollouts: 95232

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.848      |
| time/                   |            |
|    fps                  | 648        |
|    iterations           | 1          |
|    time_elapsed         | 3          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15379074 |
|    clip_fraction        | 0.65       |
|    clip_range           | 0.15       |
|    entropy_loss         | 14.3       |
|    explained_variance   | 0.984      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0408    |
|    n_updates            | 3720       |
|    policy_gradient_loss | -0.0105    |
|    std     

policy iteration runtime: 26 seconds

Total episode rollouts: 99328

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.849      |
| time/                   |            |
|    fps                  | 642        |
|    iterations           | 1          |
|    time_elapsed         | 3          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.13225384 |
|    clip_fraction        | 0.648      |
|    clip_range           | 0.15       |
|    entropy_loss         | 14.4       |
|    explained_variance   | 0.987      |
|    learning_rate        | 0.0001     |
|    loss                 | 0.0263     |
|    n_updates            | 3880       |
|    policy_gradient_loss | -0.00861   |
|    std                  | 0.125      |
|    value_loss           |



Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.854      |
| time/                   |            |
|    fps                  | 115        |
|    iterations           | 1          |
|    time_elapsed         | 22         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.12636353 |
|    clip_fraction        | 0.654      |
|    clip_range           | 0.15       |
|    entropy_loss         | 14.4       |
|    explained_variance   | 0.986      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0402    |
|    n_updates            | 3920       |
|    policy_gradient_loss | -0.0139    |
|    std                  | 0.125      |
|    value_loss           | 0.000898   |
---------------------------------

policy iteration runtime: 35 seconds

Total episode rollouts: 104448

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.854      |
| time/                   |            |
|    fps                  | 349        |
|    iterations           | 1          |
|    time_elapsed         | 7          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.12423446 |
|    clip_fraction        | 0.657      |
|    clip_range           | 0.15       |
|    entropy_loss         | 14.4       |
|    explained_variance   | 0.981      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0341    |
|    n_updates            | 4080       |
|    policy_gradient_loss | -0.0154    |
|    std                  | 0.125      |
|    value_loss           

Early stopping at step 14 due to reaching max kl: 0.16
policy iteration runtime: 30 seconds

Total episode rollouts: 108544

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.854      |
| time/                   |            |
|    fps                  | 353        |
|    iterations           | 1          |
|    time_elapsed         | 7          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15877259 |
|    clip_fraction        | 0.66       |
|    clip_range           | 0.15       |
|    entropy_loss         | 14.3       |
|    explained_variance   | 0.98       |
|    learning_rate        | 0.0001     |
|    loss                 | 0.0859     |
|    n_updates            | 4240       |
|    policy_gradient_loss | -0.0112    |
|    std    

Early stopping at step 14 due to reaching max kl: 0.15
policy iteration runtime: 30 seconds

Total episode rollouts: 112640

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.855      |
| time/                   |            |
|    fps                  | 350        |
|    iterations           | 1          |
|    time_elapsed         | 7          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15025549 |
|    clip_fraction        | 0.654      |
|    clip_range           | 0.15       |
|    entropy_loss         | 14.5       |
|    explained_variance   | 0.979      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0169    |
|    n_updates            | 4400       |
|    policy_gradient_loss | -0.00726   |
|    std    

policy iteration runtime: 36 seconds

Total episode rollouts: 116736

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 5         |
|    mean_reward          | 0.854     |
| time/                   |           |
|    fps                  | 348       |
|    iterations           | 1         |
|    time_elapsed         | 7         |
|    total_timesteps      | 2560      |
| train/                  |           |
|    approx_kl            | 0.1419702 |
|    clip_fraction        | 0.672     |
|    clip_range           | 0.15      |
|    entropy_loss         | 14.7      |
|    explained_variance   | 0.98      |
|    learning_rate        | 0.0001    |
|    loss                 | -0.0547   |
|    n_updates            | 4560      |
|    policy_gradient_loss | -0.0158   |
|    std                  | 0.124     |
|    value_loss           | 0.00126   |
------

policy iteration runtime: 34 seconds

Total episode rollouts: 120832

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.854      |
| time/                   |            |
|    fps                  | 349        |
|    iterations           | 1          |
|    time_elapsed         | 7          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.13393903 |
|    clip_fraction        | 0.66       |
|    clip_range           | 0.15       |
|    entropy_loss         | 14.6       |
|    explained_variance   | 0.981      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0781    |
|    n_updates            | 4720       |
|    policy_gradient_loss | -0.0113    |
|    std                  | 0.124      |
|    value_loss           

Early stopping at step 17 due to reaching max kl: 0.15
policy iteration runtime: 35 seconds

Total episode rollouts: 124928

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.854      |
| time/                   |            |
|    fps                  | 349        |
|    iterations           | 1          |
|    time_elapsed         | 7          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15392235 |
|    clip_fraction        | 0.659      |
|    clip_range           | 0.15       |
|    entropy_loss         | 14.6       |
|    explained_variance   | 0.979      |
|    learning_rate        | 0.0001     |
|    loss                 | 0.0032     |
|    n_updates            | 4880       |
|    policy_gradient_loss | -0.0124    |
|    std    

Early stopping at step 10 due to reaching max kl: 0.15
policy iteration runtime: 23 seconds

Total episode rollouts: 129024

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.853      |
| time/                   |            |
|    fps                  | 350        |
|    iterations           | 1          |
|    time_elapsed         | 7          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15216534 |
|    clip_fraction        | 0.658      |
|    clip_range           | 0.15       |
|    entropy_loss         | 14.6       |
|    explained_variance   | 0.979      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0447    |
|    n_updates            | 5040       |
|    policy_gradient_loss | -0.000174  |
|    std    

Early stopping at step 19 due to reaching max kl: 0.15
policy iteration runtime: 36 seconds

Total episode rollouts: 133120

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.854      |
| time/                   |            |
|    fps                  | 353        |
|    iterations           | 1          |
|    time_elapsed         | 7          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15126094 |
|    clip_fraction        | 0.668      |
|    clip_range           | 0.15       |
|    entropy_loss         | 14.6       |
|    explained_variance   | 0.981      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0304    |
|    n_updates            | 5200       |
|    policy_gradient_loss | -0.0164    |
|    std    

Early stopping at step 14 due to reaching max kl: 0.15
policy iteration runtime: 29 seconds

Total episode rollouts: 137216

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 5         |
|    mean_reward          | 0.853     |
| time/                   |           |
|    fps                  | 345       |
|    iterations           | 1         |
|    time_elapsed         | 7         |
|    total_timesteps      | 2560      |
| train/                  |           |
|    approx_kl            | 0.1529886 |
|    clip_fraction        | 0.665     |
|    clip_range           | 0.15      |
|    entropy_loss         | 14.6      |
|    explained_variance   | 0.979     |
|    learning_rate        | 0.0001    |
|    loss                 | 0.096     |
|    n_updates            | 5360      |
|    policy_gradient_loss | -0.00577  |
|    std                  | 0.1

policy iteration runtime: 38 seconds

Total episode rollouts: 141312

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.854      |
| time/                   |            |
|    fps                  | 348        |
|    iterations           | 1          |
|    time_elapsed         | 7          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.12749545 |
|    clip_fraction        | 0.669      |
|    clip_range           | 0.15       |
|    entropy_loss         | 14.6       |
|    explained_variance   | 0.982      |
|    learning_rate        | 0.0001     |
|    loss                 | 0.0106     |
|    n_updates            | 5520       |
|    policy_gradient_loss | -0.0128    |
|    std                  | 0.124      |
|    value_loss           

Early stopping at step 11 due to reaching max kl: 0.15
policy iteration runtime: 24 seconds

Total episode rollouts: 145408

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.854      |
| time/                   |            |
|    fps                  | 353        |
|    iterations           | 1          |
|    time_elapsed         | 7          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15158339 |
|    clip_fraction        | 0.655      |
|    clip_range           | 0.15       |
|    entropy_loss         | 14.7       |
|    explained_variance   | 0.983      |
|    learning_rate        | 0.0001     |
|    loss                 | -0.0284    |
|    n_updates            | 5680       |
|    policy_gradient_loss | -0.00122   |
|    std    

Early stopping at step 13 due to reaching max kl: 0.16
policy iteration runtime: 28 seconds

Total episode rollouts: 149504

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
--------------------------------------
| eval/                   |          |
|    mean_ep_length       | 5        |
|    mean_reward          | 0.854    |
| time/                   |          |
|    fps                  | 350      |
|    iterations           | 1        |
|    time_elapsed         | 7        |
|    total_timesteps      | 2560     |
| train/                  |          |
|    approx_kl            | 0.155241 |
|    clip_fraction        | 0.666    |
|    clip_range           | 0.15     |
|    entropy_loss         | 14.8     |
|    explained_variance   | 0.979    |
|    learning_rate        | 0.0001   |
|    loss                 | -0.03    |
|    n_updates            | 5840     |
|    policy_gradient_loss | -0.00598 |
|    std                  | 0.122    |
|    value_

<IPython.core.display.Javascript object>