In [1]:
# to access functions from root directory
import os
import sys

# The project root can be overridden with the ADA_MULTIGRID_PPO_ROOT environment
# variable; when it is unset the original checkout location is used unchanged.
PROJECT_ROOT = os.environ.get('ADA_MULTIGRID_PPO_ROOT',
                              '/data/ad181/RemoteDir/ada_multigrid_ppo')

# Only append once, so re-running this cell in a live kernel does not pile up
# duplicate entries on sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
%matplotlib notebook
import numpy as np
import time
import pickle
import os
import matplotlib.pyplot as plt
from copy import copy, deepcopy

import gym
from stable_baselines3.ppo import PPO, MlpPolicy
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines3.common.callbacks import CallbackList
from utils.custom_eval_callback import CustomEvalCallback, CustomEvalCallbackParallel
from utils.env_wrappers import StateCoarse, BufferWrapper, EnvCoarseWrapper, StateCoarseMultiGrid
from typing import Callable
from utils.plot_functions import plot_learning
from utils.multigrid_framework_functions import env_wrappers_multigrid, make_env, generate_beta_environement, parallalize_env, multigrid_framework

from model.ressim import Grid
from ressim_env import ResSimEnv_v0, ResSimEnv_v1

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Experiment configuration: default seed, case label and output folders.
seed = 1
case = 'case_2_singlegrid_one'
data_dir = './data'
# Per-case log folder: the case label appended to the './data/' prefix.
log_dir = f'./data/{case}'

In [4]:
# Create the data folder and the per-case log folder; exist_ok=True makes this
# a no-op when they are already present.
for directory in (data_dir, log_dir):
    os.makedirs(directory, exist_ok=True)

In [5]:
# Load the pickled training environment object from disk.
# NOTE: pickle.load can execute arbitrary code while deserialising, so this file
# must only ever come from a trusted source.
# The handle is named `env_file` (not `input`) so the built-in input() is not
# shadowed for the remainder of the kernel session.
with open('../envs_params/env_data/env_train.pkl', 'rb') as env_file:
    env_train = pickle.load(env_file)

# define RL model and callbacks

In [6]:
def generate_model(env_train, seed):
    """Create the PPO agent for one training run.

    An environment is derived from ``env_train`` via
    ``generate_beta_environement`` (called with the value 0.5 and the
    ``p_x`` / ``p_y`` attributes of ``env_train``), wrapped by
    ``parallalize_env`` with ``num_actor=64``, and handed to a PPO model
    configured with the fixed hyperparameters below.

    Args:
        env_train: base environment object; must expose ``p_x`` and ``p_y``.
        seed: seed forwarded to the environment helpers and to PPO.

    Returns:
        The constructed (untrained) ``PPO`` instance.
    """
    template_env = generate_beta_environement(env_train, 0.5, env_train.p_x, env_train.p_y, seed)
    vectorized_env = parallalize_env(template_env, num_actor=64, seed=seed)

    # Network layout and initial log-std passed through to the policy.
    network_settings = dict(net_arch=[70,70,50], log_std_init=-1.7)

    ppo_settings = dict(
        policy=MlpPolicy,
        env=vectorized_env,
        learning_rate=5e-4,
        n_steps=40,
        batch_size=64,
        n_epochs=20,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
        clip_range_vf=None,
        ent_coef=0.001,
        vf_coef=0.5,
        max_grad_norm=0.5,
        use_sde=False,
        create_eval_env=False,
        policy_kwargs=network_settings,
        verbose=1,
        target_kl=0.1,
        seed=seed,
        device="auto",
    )
    return PPO(**ppo_settings)

def generate_callback(env_train, best_model_save_path, log_path, eval_freq, eval_seed=None):
    """Create the evaluation callback used during training.

    An evaluation environment is derived from ``env_train`` via
    ``generate_beta_environement`` (called with the value 0.5 and the
    ``p_x`` / ``p_y`` attributes of ``env_train``) and wrapped in a
    ``CustomEvalCallbackParallel`` that runs one evaluation episode per
    evaluation.

    Args:
        env_train: base environment object; must expose ``p_x`` and ``p_y``.
        best_model_save_path: forwarded to the callback unchanged.
        log_path: forwarded to the callback unchanged.
        eval_freq: forwarded to the callback unchanged.
        eval_seed: optional seed for the evaluation environment. When omitted,
            the notebook-level ``seed`` variable is used, which is what this
            function always did before the parameter existed.

    Returns:
        The configured ``CustomEvalCallbackParallel`` instance.
    """
    # Callers that do not pass a seed keep the previous behaviour of reading
    # the notebook-level `seed` at call time; passing `eval_seed` removes that
    # hidden dependency on global state.
    env_seed = seed if eval_seed is None else eval_seed
    eval_env = generate_beta_environement(env_train, 0.5, env_train.p_x, env_train.p_y, env_seed)
    return CustomEvalCallbackParallel(eval_env,
                                      best_model_save_path=best_model_save_path,
                                      n_eval_episodes=1,
                                      log_path=log_path,
                                      eval_freq=eval_freq)

# multigrid framework

In [7]:
# Run the framework once for each of the seeds 1, 2 and 3.
# NOTE: the loop variable rebinds the notebook-level `seed`; generate_callback
# reads that variable when it builds its evaluation environment, so the name
# must stay `seed`.
for seed in (1, 2, 3):
    multigrid_framework(
        env_train,
        generate_model,
        generate_callback,
        delta_pcent=0.3,
        n=np.inf,
        grid_fidelity_factor_array=[1.0],
        episode_limit_array=[120000],
        log_dir=log_dir,
        seed=seed,
    )

Using cuda device
seed 1: grid fidelity factor 1.0 learning ..
environement grid size (nx x ny ): 31 x 91




Eval num_timesteps=2560, episode_reward=0.71 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
---------------------------------
| eval/              |          |
|    mean_ep_length  | 5        |
|    mean_reward     | 0.712    |
| time/              |          |
|    fps             | 132      |
|    iterations      | 1        |
|    time_elapsed    | 19       |
|    total_timesteps | 2560     |
---------------------------------
Early stopping at step 7 due to reaching max kl: 0.16

Total episode rollouts: 512

Eval num_timesteps=2560, episode_reward=0.69 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.694      |
| time/                   |            |
|    fps                  | 149        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/         

Early stopping at step 6 due to reaching max kl: 0.15

Total episode rollouts: 4608

Eval num_timesteps=2560, episode_reward=0.75 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 5         |
|    mean_reward          | 0.746     |
| time/                   |           |
|    fps                  | 160       |
|    iterations           | 1         |
|    time_elapsed         | 15        |
|    total_timesteps      | 2560      |
| train/                  |           |
|    approx_kl            | 0.1545586 |
|    clip_fraction        | 0.569     |
|    clip_range           | 0.2       |
|    entropy_loss         | 5.99      |
|    explained_variance   | 0.933     |
|    learning_rate        | 0.0005    |
|    loss                 | -0.0104   |
|    n_updates            | 180       |
|    policy_gradient_loss | -0.0281   |
|    std                  | 0.182     |
|    valu

Early stopping at step 4 due to reaching max kl: 0.16

Total episode rollouts: 8704

Eval num_timesteps=2560, episode_reward=0.77 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.767      |
| time/                   |            |
|    fps                  | 150        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.16102105 |
|    clip_fraction        | 0.569      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.25       |
|    explained_variance   | 0.944      |
|    learning_rate        | 0.0005     |
|    loss                 | -0.0243    |
|    n_updates            | 340        |
|    policy_gradient_loss | -0.00789   |
|    std                  | 0.

Early stopping at step 3 due to reaching max kl: 0.16

Total episode rollouts: 12800

Eval num_timesteps=2560, episode_reward=0.79 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.789      |
| time/                   |            |
|    fps                  | 152        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.16380718 |
|    clip_fraction        | 0.563      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.19       |
|    explained_variance   | 0.943      |
|    learning_rate        | 0.0005     |
|    loss                 | -0.0143    |
|    n_updates            | 500        |
|    policy_gradient_loss | 0.000898   |
|    std                  | 0

Early stopping at step 3 due to reaching max kl: 0.15

Total episode rollouts: 16896

Eval num_timesteps=2560, episode_reward=0.80 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.799      |
| time/                   |            |
|    fps                  | 148        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15373662 |
|    clip_fraction        | 0.576      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.19       |
|    explained_variance   | 0.948      |
|    learning_rate        | 0.0005     |
|    loss                 | -0.0256    |
|    n_updates            | 660        |
|    policy_gradient_loss | 0.00074    |
|    std                  | 0

Early stopping at step 3 due to reaching max kl: 0.16

Total episode rollouts: 20992

Eval num_timesteps=2560, episode_reward=0.81 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.811      |
| time/                   |            |
|    fps                  | 152        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15953472 |
|    clip_fraction        | 0.599      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.29       |
|    explained_variance   | 0.96       |
|    learning_rate        | 0.0005     |
|    loss                 | 0.00293    |
|    n_updates            | 820        |
|    policy_gradient_loss | 0.0141     |
|    std                  | 0

Early stopping at step 3 due to reaching max kl: 0.16

Total episode rollouts: 25088

Eval num_timesteps=2560, episode_reward=0.82 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.817      |
| time/                   |            |
|    fps                  | 156        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15924972 |
|    clip_fraction        | 0.583      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.43       |
|    explained_variance   | 0.964      |
|    learning_rate        | 0.0005     |
|    loss                 | -0.00634   |
|    n_updates            | 980        |
|    policy_gradient_loss | 0.0161     |
|    std                  | 0.179      |
|    value

Early stopping at step 4 due to reaching max kl: 0.16

Total episode rollouts: 29184

Eval num_timesteps=2560, episode_reward=0.82 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.823      |
| time/                   |            |
|    fps                  | 148        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15995161 |
|    clip_fraction        | 0.622      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.63       |
|    explained_variance   | 0.972      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0652     |
|    n_updates            | 1140       |
|    policy_gradient_loss | 0.00443    |
|    std                  | 0

Early stopping at step 3 due to reaching max kl: 0.16

Total episode rollouts: 33280

Eval num_timesteps=2560, episode_reward=0.83 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.828      |
| time/                   |            |
|    fps                  | 148        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15517561 |
|    clip_fraction        | 0.614      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.61       |
|    explained_variance   | 0.973      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.00277    |
|    n_updates            | 1300       |
|    policy_gradient_loss | 0.0218     |
|    std                  | 0.177      |
|    value

Early stopping at step 1 due to reaching max kl: 0.16

Total episode rollouts: 37376

Eval num_timesteps=2560, episode_reward=0.83 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.834      |
| time/                   |            |
|    fps                  | 148        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15755494 |
|    clip_fraction        | 0.585      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.68       |
|    explained_variance   | 0.967      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.141      |
|    n_updates            | 1460       |
|    policy_gradient_loss | 0.0382     |
|    std                  | 0

Early stopping at step 2 due to reaching max kl: 0.15

Total episode rollouts: 41472

Eval num_timesteps=2560, episode_reward=0.84 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.84       |
| time/                   |            |
|    fps                  | 148        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15183409 |
|    clip_fraction        | 0.609      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.91       |
|    explained_variance   | 0.973      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0699     |
|    n_updates            | 1620       |
|    policy_gradient_loss | 0.0342     |
|    std                  | 0.175      |
|    value

Early stopping at step 2 due to reaching max kl: 0.18

Total episode rollouts: 45568

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.847      |
| time/                   |            |
|    fps                  | 150        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.18273625 |
|    clip_fraction        | 0.596      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.05       |
|    explained_variance   | 0.972      |
|    learning_rate        | 0.0005     |
|    loss                 | -0.0202    |
|    n_updates            | 1780       |
|    policy_gradient_loss | 0.0301     |
|    std                  | 0.174      |
|    value

Early stopping at step 2 due to reaching max kl: 0.20

Total episode rollouts: 49664

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 5         |
|    mean_reward          | 0.848     |
| time/                   |           |
|    fps                  | 147       |
|    iterations           | 1         |
|    time_elapsed         | 17        |
|    total_timesteps      | 2560      |
| train/                  |           |
|    approx_kl            | 0.2010636 |
|    clip_fraction        | 0.618     |
|    clip_range           | 0.2       |
|    entropy_loss         | 7.12      |
|    explained_variance   | 0.972     |
|    learning_rate        | 0.0005    |
|    loss                 | 0.0852    |
|    n_updates            | 1940      |
|    policy_gradient_loss | 0.0418    |
|    std                  | 0.174     |
|    val

Early stopping at step 1 due to reaching max kl: 0.15

Total episode rollouts: 54272

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.852      |
| time/                   |            |
|    fps                  | 150        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15470675 |
|    clip_fraction        | 0.585      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.09       |
|    explained_variance   | 0.976      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0246     |
|    n_updates            | 2120       |
|    policy_gradient_loss | 0.053      |
|    std                  | 0.174      |
|    value

Early stopping at step 1 due to reaching max kl: 0.16

Total episode rollouts: 58368

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.851      |
| time/                   |            |
|    fps                  | 152        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15680878 |
|    clip_fraction        | 0.595      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.14       |
|    explained_variance   | 0.976      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0794     |
|    n_updates            | 2280       |
|    policy_gradient_loss | 0.0529     |
|    std                  | 0.174      |
|    value

Early stopping at step 2 due to reaching max kl: 0.17

Total episode rollouts: 62464

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.851      |
| time/                   |            |
|    fps                  | 149        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.17106171 |
|    clip_fraction        | 0.601      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.01       |
|    explained_variance   | 0.978      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0626     |
|    n_updates            | 2440       |
|    policy_gradient_loss | 0.0512     |
|    std                  | 0.175      |
|    value

Early stopping at step 2 due to reaching max kl: 0.15

Total episode rollouts: 66560

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.849      |
| time/                   |            |
|    fps                  | 150        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15057087 |
|    clip_fraction        | 0.589      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.99       |
|    explained_variance   | 0.981      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0637     |
|    n_updates            | 2600       |
|    policy_gradient_loss | 0.0423     |
|    std                  | 0.175      |
|    value

Early stopping at step 2 due to reaching max kl: 0.15

Total episode rollouts: 70656

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.85       |
| time/                   |            |
|    fps                  | 152        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15329893 |
|    clip_fraction        | 0.593      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.96       |
|    explained_variance   | 0.98       |
|    learning_rate        | 0.0005     |
|    loss                 | 0.134      |
|    n_updates            | 2760       |
|    policy_gradient_loss | 0.0468     |
|    std                  | 0.175      |
|    value

Early stopping at step 3 due to reaching max kl: 0.18

Total episode rollouts: 74752

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.852      |
| time/                   |            |
|    fps                  | 150        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.18418786 |
|    clip_fraction        | 0.609      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.9        |
|    explained_variance   | 0.982      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0267     |
|    n_updates            | 2920       |
|    policy_gradient_loss | 0.0343     |
|    std                  | 0

Early stopping at step 3 due to reaching max kl: 0.19

Total episode rollouts: 78848

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.852      |
| time/                   |            |
|    fps                  | 150        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.19267634 |
|    clip_fraction        | 0.605      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.83       |
|    explained_variance   | 0.982      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0216     |
|    n_updates            | 3080       |
|    policy_gradient_loss | 0.0366     |
|    std                  | 0.177      |
|    value

Early stopping at step 1 due to reaching max kl: 0.20

Total episode rollouts: 82944

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.852      |
| time/                   |            |
|    fps                  | 151        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.19696197 |
|    clip_fraction        | 0.587      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.9        |
|    explained_variance   | 0.982      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0514     |
|    n_updates            | 3240       |
|    policy_gradient_loss | 0.0486     |
|    std                  | 0.176      |
|    value

Early stopping at step 3 due to reaching max kl: 0.17

Total episode rollouts: 87040

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.852      |
| time/                   |            |
|    fps                  | 150        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.17069212 |
|    clip_fraction        | 0.626      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.74       |
|    explained_variance   | 0.985      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0555     |
|    n_updates            | 3400       |
|    policy_gradient_loss | 0.0306     |
|    std                  | 0.178      |
|    value

Early stopping at step 3 due to reaching max kl: 0.16

Total episode rollouts: 91136

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.853      |
| time/                   |            |
|    fps                  | 147        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15592198 |
|    clip_fraction        | 0.621      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.67       |
|    explained_variance   | 0.986      |
|    learning_rate        | 0.0005     |
|    loss                 | -0.00214   |
|    n_updates            | 3560       |
|    policy_gradient_loss | 0.0307     |
|    std                  | 0.178      |
|    value

Early stopping at step 1 due to reaching max kl: 0.20

Total episode rollouts: 95744

Eval num_timesteps=2560, episode_reward=0.86 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.855      |
| time/                   |            |
|    fps                  | 148        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.19934411 |
|    clip_fraction        | 0.617      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.66       |
|    explained_variance   | 0.986      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0752     |
|    n_updates            | 3740       |
|    policy_gradient_loss | 0.059      |
|    std                  | 0

Early stopping at step 3 due to reaching max kl: 0.15

Total episode rollouts: 99840

Eval num_timesteps=2560, episode_reward=0.86 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.855      |
| time/                   |            |
|    fps                  | 151        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15410616 |
|    clip_fraction        | 0.613      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.57       |
|    explained_variance   | 0.987      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0563     |
|    n_updates            | 3900       |
|    policy_gradient_loss | 0.0289     |
|    std                  | 0.179      |
|    value

Early stopping at step 3 due to reaching max kl: 0.20

Total episode rollouts: 103936

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 5         |
|    mean_reward          | 0.855     |
| time/                   |           |
|    fps                  | 152       |
|    iterations           | 1         |
|    time_elapsed         | 16        |
|    total_timesteps      | 2560      |
| train/                  |           |
|    approx_kl            | 0.2034039 |
|    clip_fraction        | 0.613     |
|    clip_range           | 0.2       |
|    entropy_loss         | 6.52      |
|    explained_variance   | 0.987     |
|    learning_rate        | 0.0005    |
|    loss                 | 0.0433    |
|    n_updates            | 4060      |
|    policy_gradient_loss | 0.0433    |
|    std                  | 0.179     |
|    value_loss           | 0

Early stopping at step 3 due to reaching max kl: 0.16

Total episode rollouts: 108032

Eval num_timesteps=2560, episode_reward=0.85 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.855      |
| time/                   |            |
|    fps                  | 150        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.16416538 |
|    clip_fraction        | 0.637      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.43       |
|    explained_variance   | 0.989      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.068      |
|    n_updates            | 4220       |
|    policy_gradient_loss | 0.0492     |
|    std                  | 0.181      |
|    valu

Early stopping at step 1 due to reaching max kl: 0.16

Total episode rollouts: 112128

Eval num_timesteps=2560, episode_reward=0.86 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.855      |
| time/                   |            |
|    fps                  | 152        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.16289917 |
|    clip_fraction        | 0.591      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.3        |
|    explained_variance   | 0.988      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0619     |
|    n_updates            | 4380       |
|    policy_gradient_loss | 0.0451     |
|    std                  | 0.182      |
|    valu

Early stopping at step 2 due to reaching max kl: 0.17

Total episode rollouts: 116224

Eval num_timesteps=2560, episode_reward=0.86 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.856      |
| time/                   |            |
|    fps                  | 147        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.16687155 |
|    clip_fraction        | 0.607      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.23       |
|    explained_variance   | 0.986      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.103      |
|    n_updates            | 4540       |
|    policy_gradient_loss | 0.0512     |
|    std                  | 

Early stopping at step 1 due to reaching max kl: 0.18

Total episode rollouts: 120320



<IPython.core.display.Javascript object>

Using cuda device
seed 2: grid fidelity factor 1.0 learning ..
environement grid size (nx x ny ): 31 x 91




Eval num_timesteps=2560, episode_reward=0.69 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.694      |
| time/                   |            |
|    fps                  | 142        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.17885463 |
|    clip_fraction        | 0.614      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.05       |
|    explained_variance   | 0.989      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0532     |
|    n_updates            | 4700       |
|    policy_gradient_loss | 0.0555     |
|    std                  | 0.184      |
|    value_loss           | 0.000768   |
---------------------------------

Early stopping at step 7 due to reaching max kl: 0.16

Total episode rollouts: 4096

Eval num_timesteps=2560, episode_reward=0.74 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 5         |
|    mean_reward          | 0.735     |
| time/                   |           |
|    fps                  | 154       |
|    iterations           | 1         |
|    time_elapsed         | 16        |
|    total_timesteps      | 2560      |
| train/                  |           |
|    approx_kl            | 0.1599662 |
|    clip_fraction        | 0.57      |
|    clip_range           | 0.2       |
|    entropy_loss         | 6.19      |
|    explained_variance   | 0.932     |
|    learning_rate        | 0.0005    |
|    loss                 | -0.0412   |
|    n_updates            | 160       |
|    policy_gradient_loss | -0.0369   |
|    std                  | 0.18      |
|    valu

Early stopping at step 4 due to reaching max kl: 0.15

Total episode rollouts: 8192

Eval num_timesteps=2560, episode_reward=0.74 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 5         |
|    mean_reward          | 0.745     |
| time/                   |           |
|    fps                  | 152       |
|    iterations           | 1         |
|    time_elapsed         | 16        |
|    total_timesteps      | 2560      |
| train/                  |           |
|    approx_kl            | 0.1543515 |
|    clip_fraction        | 0.559     |
|    clip_range           | 0.2       |
|    entropy_loss         | 6.08      |
|    explained_variance   | 0.942     |
|    learning_rate        | 0.0005    |
|    loss                 | -0.00596  |
|    n_updates            | 320       |
|    policy_gradient_loss | -0.0138   |
|    std                  | 0.181     |
|    valu

Early stopping at step 3 due to reaching max kl: 0.16

Total episode rollouts: 12288

Eval num_timesteps=2560, episode_reward=0.78 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.777      |
| time/                   |            |
|    fps                  | 149        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15667725 |
|    clip_fraction        | 0.567      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.22       |
|    explained_variance   | 0.941      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0154     |
|    n_updates            | 480        |
|    policy_gradient_loss | -0.00196   |
|    std                  | 0.18       |
|    value

Early stopping at step 3 due to reaching max kl: 0.16

Total episode rollouts: 16384

Eval num_timesteps=2560, episode_reward=0.80 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.799      |
| time/                   |            |
|    fps                  | 150        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15787064 |
|    clip_fraction        | 0.598      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.29       |
|    explained_variance   | 0.955      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.00802    |
|    n_updates            | 640        |
|    policy_gradient_loss | 0.00873    |
|    std                  | 0

Early stopping at step 5 due to reaching max kl: 0.16

Total episode rollouts: 20480

Eval num_timesteps=2560, episode_reward=0.83 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.83       |
| time/                   |            |
|    fps                  | 150        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15540962 |
|    clip_fraction        | 0.615      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.66       |
|    explained_variance   | 0.967      |
|    learning_rate        | 0.0005     |
|    loss                 | -0.0358    |
|    n_updates            | 800        |
|    policy_gradient_loss | 6.24e-05   |
|    std                  | 0

Early stopping at step 2 due to reaching max kl: 0.22

Total episode rollouts: 24576

Eval num_timesteps=2560, episode_reward=0.84 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.844      |
| time/                   |            |
|    fps                  | 147        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.22424634 |
|    clip_fraction        | 0.618      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.11       |
|    explained_variance   | 0.973      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0747     |
|    n_updates            | 960        |
|    policy_gradient_loss | 0.0458     |
|    std                  | 0

Early stopping at step 4 due to reaching max kl: 0.17

Total episode rollouts: 28672

Eval num_timesteps=2560, episode_reward=0.86 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.856      |
| time/                   |            |
|    fps                  | 147        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.17363636 |
|    clip_fraction        | 0.627      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.32       |
|    explained_variance   | 0.983      |
|    learning_rate        | 0.0005     |
|    loss                 | -0.0147    |
|    n_updates            | 1120       |
|    policy_gradient_loss | 0.0188     |
|    std                  | 0

Early stopping at step 3 due to reaching max kl: 0.16

Total episode rollouts: 32768

Eval num_timesteps=2560, episode_reward=0.86 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.86       |
| time/                   |            |
|    fps                  | 151        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.16322204 |
|    clip_fraction        | 0.606      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.69       |
|    explained_variance   | 0.988      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.068      |
|    n_updates            | 1280       |
|    policy_gradient_loss | 0.0346     |
|    std                  | 0

Early stopping at step 2 due to reaching max kl: 0.15

Total episode rollouts: 36864

Eval num_timesteps=2560, episode_reward=0.86 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.864      |
| time/                   |            |
|    fps                  | 148        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15087953 |
|    clip_fraction        | 0.601      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.77       |
|    explained_variance   | 0.988      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0408     |
|    n_updates            | 1440       |
|    policy_gradient_loss | 0.0437     |
|    std                  | 0

Early stopping at step 7 due to reaching max kl: 0.17

Total episode rollouts: 40960

Eval num_timesteps=2560, episode_reward=0.87 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.871      |
| time/                   |            |
|    fps                  | 147        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.16788384 |
|    clip_fraction        | 0.643      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.78       |
|    explained_variance   | 0.99       |
|    learning_rate        | 0.0005     |
|    loss                 | -0.0293    |
|    n_updates            | 1600       |
|    policy_gradient_loss | 0.0195     |
|    std                  | 0

Early stopping at step 2 due to reaching max kl: 0.20

Total episode rollouts: 45056

Eval num_timesteps=2560, episode_reward=0.87 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.875      |
| time/                   |            |
|    fps                  | 150        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.19581237 |
|    clip_fraction        | 0.61       |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.84       |
|    explained_variance   | 0.989      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0784     |
|    n_updates            | 1760       |
|    policy_gradient_loss | 0.0433     |
|    std                  | 0

Early stopping at step 2 due to reaching max kl: 0.18

Total episode rollouts: 49152

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.877      |
| time/                   |            |
|    fps                  | 150        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.18167345 |
|    clip_fraction        | 0.613      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.96       |
|    explained_variance   | 0.992      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.03       |
|    n_updates            | 1920       |
|    policy_gradient_loss | 0.0491     |
|    std                  | 0

Early stopping at step 4 due to reaching max kl: 0.15

Total episode rollouts: 53248

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.879      |
| time/                   |            |
|    fps                  | 146        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15152326 |
|    clip_fraction        | 0.629      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.89       |
|    explained_variance   | 0.991      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.139      |
|    n_updates            | 2080       |
|    policy_gradient_loss | 0.037      |
|    std                  | 0

Early stopping at step 1 due to reaching max kl: 0.17

Total episode rollouts: 57344

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.88       |
| time/                   |            |
|    fps                  | 148        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.17185768 |
|    clip_fraction        | 0.599      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.99       |
|    explained_variance   | 0.991      |
|    learning_rate        | 0.0005     |
|    loss                 | -0.0086    |
|    n_updates            | 2240       |
|    policy_gradient_loss | 0.0543     |
|    std                  | 0

Early stopping at step 1 due to reaching max kl: 0.15

Total episode rollouts: 61440

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.88       |
| time/                   |            |
|    fps                  | 152        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15234956 |
|    clip_fraction        | 0.604      |
|    clip_range           | 0.2        |
|    entropy_loss         | 8.09       |
|    explained_variance   | 0.991      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0708     |
|    n_updates            | 2400       |
|    policy_gradient_loss | 0.053      |
|    std                  | 0.167      |
|    value

Early stopping at step 2 due to reaching max kl: 0.17

Total episode rollouts: 65536

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.879      |
| time/                   |            |
|    fps                  | 152        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.16928832 |
|    clip_fraction        | 0.604      |
|    clip_range           | 0.2        |
|    entropy_loss         | 8.09       |
|    explained_variance   | 0.992      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0628     |
|    n_updates            | 2560       |
|    policy_gradient_loss | 0.0499     |
|    std                  | 0.167      |
|    value

Early stopping at step 1 due to reaching max kl: 0.22

Total episode rollouts: 69632

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.88       |
| time/                   |            |
|    fps                  | 150        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.21964207 |
|    clip_fraction        | 0.617      |
|    clip_range           | 0.2        |
|    entropy_loss         | 8.06       |
|    explained_variance   | 0.992      |
|    learning_rate        | 0.0005     |
|    loss                 | -0.00121   |
|    n_updates            | 2720       |
|    policy_gradient_loss | 0.0713     |
|    std                  | 0

Early stopping at step 1 due to reaching max kl: 0.17

Total episode rollouts: 73728

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.88       |
| time/                   |            |
|    fps                  | 151        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.16839425 |
|    clip_fraction        | 0.619      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.93       |
|    explained_variance   | 0.992      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.00198    |
|    n_updates            | 2880       |
|    policy_gradient_loss | 0.0706     |
|    std                  | 0

Early stopping at step 1 due to reaching max kl: 0.21

Total episode rollouts: 77824

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.881      |
| time/                   |            |
|    fps                  | 151        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.20575733 |
|    clip_fraction        | 0.61       |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.88       |
|    explained_variance   | 0.993      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0717     |
|    n_updates            | 3040       |
|    policy_gradient_loss | 0.0619     |
|    std                  | 0.169      |
|    value

Early stopping at step 1 due to reaching max kl: 0.17

Total episode rollouts: 81920

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.881      |
| time/                   |            |
|    fps                  | 152        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.17078471 |
|    clip_fraction        | 0.613      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.82       |
|    explained_variance   | 0.993      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.00924    |
|    n_updates            | 3200       |
|    policy_gradient_loss | 0.0619     |
|    std                  | 0.17       |
|    value

Early stopping at step 9 due to reaching max kl: 0.15

Total episode rollouts: 86016

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.881      |
| time/                   |            |
|    fps                  | 151        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15313362 |
|    clip_fraction        | 0.651      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.84       |
|    explained_variance   | 0.992      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0151     |
|    n_updates            | 3360       |
|    policy_gradient_loss | 0.0234     |
|    std                  | 0

Early stopping at step 6 due to reaching max kl: 0.16

Total episode rollouts: 90112

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.88       |
| time/                   |            |
|    fps                  | 149        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.16016154 |
|    clip_fraction        | 0.643      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.87       |
|    explained_variance   | 0.992      |
|    learning_rate        | 0.0005     |
|    loss                 | -0.0047    |
|    n_updates            | 3520       |
|    policy_gradient_loss | 0.0261     |
|    std                  | 0.17       |
|    value

Early stopping at step 3 due to reaching max kl: 0.16

Total episode rollouts: 94208

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.88       |
| time/                   |            |
|    fps                  | 153        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15994406 |
|    clip_fraction        | 0.631      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.81       |
|    explained_variance   | 0.993      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.095      |
|    n_updates            | 3680       |
|    policy_gradient_loss | 0.0439     |
|    std                  | 0.17       |
|    value

Early stopping at step 1 due to reaching max kl: 0.16

Total episode rollouts: 98304

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.88       |
| time/                   |            |
|    fps                  | 147        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.16231982 |
|    clip_fraction        | 0.622      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.87       |
|    explained_variance   | 0.993      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0373     |
|    n_updates            | 3840       |
|    policy_gradient_loss | 0.0609     |
|    std                  | 0.17       |
|    value

Early stopping at step 1 due to reaching max kl: 0.15

Total episode rollouts: 102400

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.88       |
| time/                   |            |
|    fps                  | 151        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15034151 |
|    clip_fraction        | 0.602      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.81       |
|    explained_variance   | 0.992      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0684     |
|    n_updates            | 4000       |
|    policy_gradient_loss | 0.0589     |
|    std                  | 0.17       |
|    valu

Early stopping at step 2 due to reaching max kl: 0.18

Total episode rollouts: 106496

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.88       |
| time/                   |            |
|    fps                  | 149        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.17532834 |
|    clip_fraction        | 0.614      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.74       |
|    explained_variance   | 0.992      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0648     |
|    n_updates            | 4160       |
|    policy_gradient_loss | 0.0453     |
|    std                  | 0.171      |
|    valu

Early stopping at step 2 due to reaching max kl: 0.17

Total episode rollouts: 110592

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.88       |
| time/                   |            |
|    fps                  | 148        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.16836476 |
|    clip_fraction        | 0.613      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.68       |
|    explained_variance   | 0.992      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0149     |
|    n_updates            | 4320       |
|    policy_gradient_loss | 0.0505     |
|    std                  | 0.172      |
|    valu

Early stopping at step 2 due to reaching max kl: 0.16

Total episode rollouts: 114688

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.88       |
| time/                   |            |
|    fps                  | 148        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.16499186 |
|    clip_fraction        | 0.615      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.67       |
|    explained_variance   | 0.993      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0464     |
|    n_updates            | 4480       |
|    policy_gradient_loss | 0.0423     |
|    std                  | 0.171      |
|    valu

Early stopping at step 2 due to reaching max kl: 0.16

Total episode rollouts: 118784

Eval num_timesteps=2560, episode_reward=0.88 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.88       |
| time/                   |            |
|    fps                  | 151        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15889768 |
|    clip_fraction        | 0.596      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.66       |
|    explained_variance   | 0.994      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0369     |
|    n_updates            | 4640       |
|    policy_gradient_loss | 0.0375     |
|    std                  | 0.172      |
|    valu

<IPython.core.display.Javascript object>

Using cuda device
seed 3: grid fidelity factor 1.0 learning ..
environement grid size (nx x ny ): 31 x 91




Eval num_timesteps=2560, episode_reward=0.70 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.695      |
| time/                   |            |
|    fps                  | 131        |
|    iterations           | 1          |
|    time_elapsed         | 19         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15283477 |
|    clip_fraction        | 0.613      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.56       |
|    explained_variance   | 0.994      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0219     |
|    n_updates            | 4700       |
|    policy_gradient_loss | 0.0524     |
|    std                  | 0.172      |
|    value_loss           | 0.000518   |
---------------------------------

Early stopping at step 11 due to reaching max kl: 0.16

Total episode rollouts: 4096

Eval num_timesteps=2560, episode_reward=0.74 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.736      |
| time/                   |            |
|    fps                  | 149        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15798809 |
|    clip_fraction        | 0.597      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.11       |
|    explained_variance   | 0.939      |
|    learning_rate        | 0.0005     |
|    loss                 | -0.0658    |
|    n_updates            | 160        |
|    policy_gradient_loss | -0.0471    |
|    std                  | 0.181      |
|    value

Early stopping at step 4 due to reaching max kl: 0.15

Total episode rollouts: 8192

Eval num_timesteps=2560, episode_reward=0.77 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.766      |
| time/                   |            |
|    fps                  | 149        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15120777 |
|    clip_fraction        | 0.554      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.33       |
|    explained_variance   | 0.933      |
|    learning_rate        | 0.0005     |
|    loss                 | -0.0635    |
|    n_updates            | 320        |
|    policy_gradient_loss | -0.0121    |
|    std                  | 0.

Early stopping at step 3 due to reaching max kl: 0.16

Total episode rollouts: 12288

Eval num_timesteps=2560, episode_reward=0.78 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 5         |
|    mean_reward          | 0.782     |
| time/                   |           |
|    fps                  | 153       |
|    iterations           | 1         |
|    time_elapsed         | 16        |
|    total_timesteps      | 2560      |
| train/                  |           |
|    approx_kl            | 0.1563157 |
|    clip_fraction        | 0.56      |
|    clip_range           | 0.2       |
|    entropy_loss         | 6.6       |
|    explained_variance   | 0.936     |
|    learning_rate        | 0.0005    |
|    loss                 | -0.00551  |
|    n_updates            | 480       |
|    policy_gradient_loss | -0.00199  |
|    std                  | 0.177     |
|    val

Early stopping at step 3 due to reaching max kl: 0.16

Total episode rollouts: 16384

Eval num_timesteps=2560, episode_reward=0.79 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.794      |
| time/                   |            |
|    fps                  | 142        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15967788 |
|    clip_fraction        | 0.596      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.74       |
|    explained_variance   | 0.945      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0783     |
|    n_updates            | 640        |
|    policy_gradient_loss | 0.00979    |
|    std                  | 0

Early stopping at step 1 due to reaching max kl: 0.16

Total episode rollouts: 20480

Eval num_timesteps=2560, episode_reward=0.80 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.803      |
| time/                   |            |
|    fps                  | 150        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15751919 |
|    clip_fraction        | 0.58       |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.83       |
|    explained_variance   | 0.953      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.00252    |
|    n_updates            | 800        |
|    policy_gradient_loss | 0.0296     |
|    std                  | 0.175      |
|    value

Early stopping at step 2 due to reaching max kl: 0.18

Total episode rollouts: 24576

Eval num_timesteps=2560, episode_reward=0.82 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.818      |
| time/                   |            |
|    fps                  | 151        |
|    iterations           | 1          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.17568152 |
|    clip_fraction        | 0.552      |
|    clip_range           | 0.2        |
|    entropy_loss         | 6.99       |
|    explained_variance   | 0.947      |
|    learning_rate        | 0.0005     |
|    loss                 | 0.0724     |
|    n_updates            | 960        |
|    policy_gradient_loss | 0.015      |
|    std                  | 0

Early stopping at step 1 due to reaching max kl: 0.15

Total episode rollouts: 28672

Eval num_timesteps=2560, episode_reward=0.83 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.83       |
| time/                   |            |
|    fps                  | 148        |
|    iterations           | 1          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.15106432 |
|    clip_fraction        | 0.581      |
|    clip_range           | 0.2        |
|    entropy_loss         | 7.1        |
|    explained_variance   | 0.96       |
|    learning_rate        | 0.0005     |
|    loss                 | 0.014      |
|    n_updates            | 1120       |
|    policy_gradient_loss | 0.0488     |
|    std                  | 0

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ad181/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-6af32b7e68ec>", line 10, in <module>
    seed=seed)
  File "/data/ad181/RemoteDir/ada_multigrid_ppo/utils/multigrid_framework_functions.py", line 102, in multigrid_framework
    model.learn(total_timesteps= env.terminal_step*episodes_per_iteration, callback=callback)
  File "/home/ad181/anaconda3/lib/python3.7/site-packages/stable_baselines3/ppo/ppo.py", line 264, in learn
    reset_num_timesteps=reset_num_timesteps,
  File "/home/ad181/anaconda3/lib/python3.7/site-packages/stable_baselines3/common/on_policy_algorithm.py", line 222, in learn
    continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
  File "/home/ad181/anaconda3/lib/python3.7/site-packages/stable_baselines3/common/on_policy_algorithm.py

KeyboardInterrupt: 