In [1]:
# Put the project root directory on sys.path so that modules located there
# can be imported by the cells below.
# The location defaults to the original hard-coded path and can be overridden
# with the MULTILEVEL_PPO_ROOT environment variable, so the notebook is not
# tied to one user's filesystem layout.
import os
import sys

PROJECT_ROOT = os.environ.get('MULTILEVEL_PPO_ROOT', '/data/ad181/RemoteDir/multilevel_ppo')
# Guard the append so re-running this cell does not stack duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
%matplotlib notebook
import numpy as np
import time
import pickle
import os
import matplotlib.pyplot as plt
from copy import copy, deepcopy
from tqdm.notebook import trange, tqdm

import gym
from stable_baselines3.ppo import PPO, MlpPolicy
from stable_baselines3.ppo_multi_level import PPO_ML
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env.subproc_vec_multi_level_env import SubprocVecMultiLevelEnv
from stable_baselines3.common.envs.multi_level_ressim_env import MultiLevelRessimEnv
from stable_baselines3.common.logger import configure

from utils.custom_eval_callback import CustomEvalCallback, CustomEvalCallbackParallel
from utils.plot_functions import plot_learning
from utils.env_evaluate_functions import eval_actions

In [3]:
# Run configuration: seed value, case label, and the output folders derived
# from them (log_dir is the case-specific sub-folder of data_dir).
seed, case = 1, 'ppo_1l'
data_dir = './data'
log_dir = f'{data_dir}/{case}'

In [4]:
# Create the output folders up front; folders that already exist are accepted
# as-is (exist_ok=True).
for _out_dir in (data_dir, log_dir):
    os.makedirs(_out_dir, exist_ok=True)

In [5]:
# Load the pickled dictionary of training environments into `env_ck_dict`.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at a file from a trusted source.
# The handle is named `env_file` instead of `input` so that the built-in
# `input()` is not rebound at notebook scope for the rest of the session.
with open('../envs_params/env_data_v1/env_train_dict.pkl', 'rb') as env_file:
    env_ck_dict = pickle.load(env_file)

In [6]:
# Build `env_dict_`: take the `n_levels` highest-numbered entries of
# `env_ck_dict` and re-key them 1..n_levels (printing each "new -> old" pair).
# The variable naming suggests the highest key is the finest level -- confirm
# against the code that produced the pickle.
n_levels = 1
fine_level = len(env_ck_dict)
key_offset = fine_level - n_levels
env_dict_ = {}
for new_key in range(1, n_levels + 1):
    old_key = new_key + key_offset
    print(new_key, '->', old_key)
    env_dict_[new_key] = env_ck_dict[old_key]

1 -> 5


In [7]:
# Train one PPO_ML agent per seed (1, 2, 3). For every seed the logger output,
# the evaluation results and the saved model go to ./data/<case>/seed_<seed>.
# NOTE: this loop rebinds the notebook-level names `seed`, `log_dir` and
# `fine_level` that earlier cells defined.
for seed in range(1,4):
    print(f'seed {seed}')
    log_dir = './data/'+case+'/seed_'+str(seed)
    os.makedirs(log_dir, exist_ok=True)
    T = 50 # n_steps
    N = 50 # number of actors
    M = 250 # minibatch size
    I = 300 # number of iterations
    K = 20 # number of epochs

    log_interval = I/30
    # Hand the callback an integer frequency (at least 1) rather than the
    # float that `I/30 * T` produces.
    eval_freq = max(1, int(round(log_interval*T)))

    fine_level = len(env_dict_)

    print('generate callback ...')
    eval_callback = CustomEvalCallback( env_dict_[fine_level],
                                        best_model_save_path=None,
                                        n_eval_episodes=1,
                                        log_path=str(log_dir)+'/results_eval',
                                        eval_freq=eval_freq)

    print('vectorize environment ...')

    # generate PPO_ML parameters for MLMC analysis.
    # we choose same n_steps and batch_size values on levels because only fine level values are used in the analysis
    env_dict = {}
    n_steps_dict = {}
    batch_size_dict = {}
    # Everything that can fail after the first vectorised env is spawned sits
    # inside try/finally, so the subprocess workers are always shut down --
    # including when training raises or the kernel is interrupted.
    try:
        for level, env in env_dict_.items():
            print(f"vectorize env level {level}")
            env_dict[level] = make_vec_env( MultiLevelRessimEnv,
                                    n_envs=N,
                                    seed=seed,
                                    env_kwargs= {"ressim_params":env.ressim_params, "level":env.level},
                                    vec_env_cls=SubprocVecMultiLevelEnv )
            n_steps_dict[level] = T
            batch_size_dict[level] = M

        # Index with `fine_level` explicitly instead of relying on the loop
        # variable `level` leaking out of the loop above.
        print(env_dict_[fine_level].observation_space)
        print('model definition ..')
        model = PPO_ML(policy=MlpPolicy,
                           env=env_dict,
                           learning_rate = 1e-5,
                           n_steps = n_steps_dict,
                           batch_size = batch_size_dict,
                           n_epochs = K,
                           clip_range = 0.1,
                           ent_coef = 0.001,
                           vf_coef = 0.5,
                           policy_kwargs = dict(net_arch=[150,100,80], log_std_init=-2.9),
                           verbose = 1,
                           seed = seed,
                           target_kl = 0.05,
                           device = "auto")
        # set logger for the model
        new_logger = configure(log_dir)
        model.set_logger(new_logger)
        print('policy learning ..')
        model.learn(total_timesteps=N*T*I, callback=eval_callback)
        model.save(log_dir+'/PPO', exclude=['env_dict'])
        del model
    finally:
        for vec_env in env_dict.values():
            vec_env.close()


seed 1
generate callback ...
vectorize environment ...
vectorize env level 1
Box(-1.0, 1.0, (96,), float64)
model definition ..
Using cuda device
Logging to ./data/case_1_mlmc_ppo_1_level/seed_1
policy learning ..




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 5        |
|    ep_rew_mean     | 0.576    |
| time/              |          |
|    fps             | 11       |
|    iterations      | 1        |
|    time_elapsed    | 225      |
|    total_timesteps | 2500     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.578       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 2           |
|    time_elapsed         | 447         |
|    total_timesteps      | 5000        |
| train/                  |             |
|    approx_kl            | 0.006550616 |
|    clip_fraction        | 0.358       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | -3.95       |
|    learning_rate        | 1e

  for j in range(len(p_1)-1):


Eval num_timesteps=25000, episode_reward=0.61 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5           |
|    mean_reward          | 0.608       |
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.581       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 10          |
|    time_elapsed         | 2259        |
|    total_timesteps      | 25000       |
| train/                  |             |
|    approx_kl            | 0.007449447 |
|    clip_fraction        | 0.36        |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.636       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0577      |
|    n_updates            | 180         |
|    policy

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 5          |
|    ep_rew_mean          | 0.588      |
| time/                   |            |
|    fps                  | 11         |
|    iterations           | 19         |
|    time_elapsed         | 4274       |
|    total_timesteps      | 47500      |
| train/                  |            |
|    approx_kl            | 0.00839086 |
|    clip_fraction        | 0.4        |
|    clip_range           | 0.1        |
|    entropy_loss         | -94.8      |
|    explained_variance   | 0.763      |
|    learning_rate        | 1e-05      |
|    loss                 | 0.0524     |
|    n_updates            | 360        |
|    policy_gradient_loss | -0.0365    |
|    std                  | 0.055      |
|    value_loss           | 0.00736    |
----------------------------------------
Eval num_timesteps=50000, episode_reward=0.63 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.596       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 28          |
|    time_elapsed         | 6316        |
|    total_timesteps      | 70000       |
| train/                  |             |
|    approx_kl            | 0.007490708 |
|    clip_fraction        | 0.372       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.812       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0541      |
|    n_updates            | 540         |
|    policy_gradient_loss | -0.0352     |
|    std                  | 0.055       |
|    value_loss           | 0.00587     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.598       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 37          |
|    time_elapsed         | 8354        |
|    total_timesteps      | 92500       |
| train/                  |             |
|    approx_kl            | 0.007746609 |
|    clip_fraction        | 0.385       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.7       |
|    explained_variance   | 0.832       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.049       |
|    n_updates            | 720         |
|    policy_gradient_loss | -0.0354     |
|    std                  | 0.0551      |
|    value_loss           | 0.00533     |
-----------------------------------------
------------------------------------------
| rollout/                |      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.6         |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 46          |
|    time_elapsed         | 10387       |
|    total_timesteps      | 115000      |
| train/                  |             |
|    approx_kl            | 0.008300274 |
|    clip_fraction        | 0.407       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.7       |
|    explained_variance   | 0.858       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0447      |
|    n_updates            | 900         |
|    policy_gradient_loss | -0.0374     |
|    std                  | 0.0551      |
|    value_loss           | 0.00456     |
-----------------------------------------
------------------------------------------
| rollout/                |      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.609       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 55          |
|    time_elapsed         | 12418       |
|    total_timesteps      | 137500      |
| train/                  |             |
|    approx_kl            | 0.008832661 |
|    clip_fraction        | 0.39        |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.7       |
|    explained_variance   | 0.872       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.054       |
|    n_updates            | 1080        |
|    policy_gradient_loss | -0.0358     |
|    std                  | 0.0551      |
|    value_loss           | 0.00415     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.614        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 64           |
|    time_elapsed         | 14448        |
|    total_timesteps      | 160000       |
| train/                  |              |
|    approx_kl            | 0.0080512995 |
|    clip_fraction        | 0.376        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.7        |
|    explained_variance   | 0.876        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0487       |
|    n_updates            | 1260         |
|    policy_gradient_loss | -0.0349      |
|    std                  | 0.0551       |
|    value_loss           | 0.00406      |
------------------------------------------
------------------------------------------
| rollout/ 

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.617       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 73          |
|    time_elapsed         | 16477       |
|    total_timesteps      | 182500      |
| train/                  |             |
|    approx_kl            | 0.009090847 |
|    clip_fraction        | 0.4         |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.7       |
|    explained_variance   | 0.887       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0483      |
|    n_updates            | 1440        |
|    policy_gradient_loss | -0.0379     |
|    std                  | 0.0551      |
|    value_loss           | 0.0037      |
-----------------------------------------
------------------------------------------
| rollout/                |      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.618       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 82          |
|    time_elapsed         | 18504       |
|    total_timesteps      | 205000      |
| train/                  |             |
|    approx_kl            | 0.009061124 |
|    clip_fraction        | 0.396       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.892       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0492      |
|    n_updates            | 1620        |
|    policy_gradient_loss | -0.0363     |
|    std                  | 0.055       |
|    value_loss           | 0.00359     |
-----------------------------------------
------------------------------------------
| rollout/                |      

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.621        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 91           |
|    time_elapsed         | 20539        |
|    total_timesteps      | 227500       |
| train/                  |              |
|    approx_kl            | 0.0077897217 |
|    clip_fraction        | 0.394        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.8        |
|    explained_variance   | 0.895        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0521       |
|    n_updates            | 1800         |
|    policy_gradient_loss | -0.0378      |
|    std                  | 0.055        |
|    value_loss           | 0.0035       |
------------------------------------------
-----------------------------------------
| rollout/  

Eval num_timesteps=250000, episode_reward=0.66 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5           |
|    mean_reward          | 0.663       |
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.625       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 100         |
|    time_elapsed         | 22575       |
|    total_timesteps      | 250000      |
| train/                  |             |
|    approx_kl            | 0.010278452 |
|    clip_fraction        | 0.432       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.893       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0454      |
|    n_updates            | 1980        |
|    polic

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.626       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 109         |
|    time_elapsed         | 24583       |
|    total_timesteps      | 272500      |
| train/                  |             |
|    approx_kl            | 0.008096466 |
|    clip_fraction        | 0.409       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.891       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0513      |
|    n_updates            | 2160        |
|    policy_gradient_loss | -0.038      |
|    std                  | 0.055       |
|    value_loss           | 0.00368     |
-----------------------------------------
Eval num_timesteps=275000, episode_reward=0.66 +/- 0.00
Episode length: 5.00

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 5          |
|    ep_rew_mean          | 0.633      |
| time/                   |            |
|    fps                  | 11         |
|    iterations           | 118        |
|    time_elapsed         | 26615      |
|    total_timesteps      | 295000     |
| train/                  |            |
|    approx_kl            | 0.00784478 |
|    clip_fraction        | 0.391      |
|    clip_range           | 0.1        |
|    entropy_loss         | -94.8      |
|    explained_variance   | 0.894      |
|    learning_rate        | 1e-05      |
|    loss                 | 0.0468     |
|    n_updates            | 2340       |
|    policy_gradient_loss | -0.0361    |
|    std                  | 0.055      |
|    value_loss           | 0.00356    |
----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.639        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 127          |
|    time_elapsed         | 28645        |
|    total_timesteps      | 317500       |
| train/                  |              |
|    approx_kl            | 0.0077059334 |
|    clip_fraction        | 0.402        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.8        |
|    explained_variance   | 0.907        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0442       |
|    n_updates            | 2520         |
|    policy_gradient_loss | -0.0364      |
|    std                  | 0.055        |
|    value_loss           | 0.00315      |
------------------------------------------
------------------------------------------
| rollout/ 

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.637       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 136         |
|    time_elapsed         | 30678       |
|    total_timesteps      | 340000      |
| train/                  |             |
|    approx_kl            | 0.008570824 |
|    clip_fraction        | 0.413       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.908       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0482      |
|    n_updates            | 2700        |
|    policy_gradient_loss | -0.0376     |
|    std                  | 0.055       |
|    value_loss           | 0.00315     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.642       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 145         |
|    time_elapsed         | 32713       |
|    total_timesteps      | 362500      |
| train/                  |             |
|    approx_kl            | 0.008303338 |
|    clip_fraction        | 0.394       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.907       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0486      |
|    n_updates            | 2880        |
|    policy_gradient_loss | -0.0359     |
|    std                  | 0.055       |
|    value_loss           | 0.00322     |
-----------------------------------------
------------------------------------------
| rollout/                |      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.648       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 154         |
|    time_elapsed         | 34749       |
|    total_timesteps      | 385000      |
| train/                  |             |
|    approx_kl            | 0.010137902 |
|    clip_fraction        | 0.417       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.913       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.053       |
|    n_updates            | 3060        |
|    policy_gradient_loss | -0.0377     |
|    std                  | 0.055       |
|    value_loss           | 0.003       |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.647       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 163         |
|    time_elapsed         | 36780       |
|    total_timesteps      | 407500      |
| train/                  |             |
|    approx_kl            | 0.007510219 |
|    clip_fraction        | 0.407       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.901       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0491      |
|    n_updates            | 3240        |
|    policy_gradient_loss | -0.0365     |
|    std                  | 0.055       |
|    value_loss           | 0.00342     |
-----------------------------------------
----------------------------------------
| rollout/                |        

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 5          |
|    ep_rew_mean          | 0.647      |
| time/                   |            |
|    fps                  | 11         |
|    iterations           | 172        |
|    time_elapsed         | 38811      |
|    total_timesteps      | 430000     |
| train/                  |            |
|    approx_kl            | 0.00792976 |
|    clip_fraction        | 0.414      |
|    clip_range           | 0.1        |
|    entropy_loss         | -94.9      |
|    explained_variance   | 0.911      |
|    learning_rate        | 1e-05      |
|    loss                 | 0.0571     |
|    n_updates            | 3420       |
|    policy_gradient_loss | -0.0376    |
|    std                  | 0.055      |
|    value_loss           | 0.00306    |
----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.648       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 181         |
|    time_elapsed         | 40853       |
|    total_timesteps      | 452500      |
| train/                  |             |
|    approx_kl            | 0.007497557 |
|    clip_fraction        | 0.37        |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.913       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0521      |
|    n_updates            | 3600        |
|    policy_gradient_loss | -0.0353     |
|    std                  | 0.055       |
|    value_loss           | 0.00302     |
-----------------------------------------
------------------------------------------
| rollout/                |      

Eval num_timesteps=475000, episode_reward=0.66 +/- 0.00
Episode length: 5.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 5          |
|    mean_reward          | 0.662      |
| rollout/                |            |
|    ep_len_mean          | 5          |
|    ep_rew_mean          | 0.651      |
| time/                   |            |
|    fps                  | 11         |
|    iterations           | 190        |
|    time_elapsed         | 42885      |
|    total_timesteps      | 475000     |
| train/                  |            |
|    approx_kl            | 0.00733943 |
|    clip_fraction        | 0.397      |
|    clip_range           | 0.1        |
|    entropy_loss         | -94.9      |
|    explained_variance   | 0.907      |
|    learning_rate        | 1e-05      |
|    loss                 | 0.0538     |
|    n_updates            | 3780       |
|    policy_gradient_loss | -0.0353    |
|    std    

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.65        |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 199         |
|    time_elapsed         | 44896       |
|    total_timesteps      | 497500      |
| train/                  |             |
|    approx_kl            | 0.008022196 |
|    clip_fraction        | 0.401       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.905       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0491      |
|    n_updates            | 3960        |
|    policy_gradient_loss | -0.0352     |
|    std                  | 0.0549      |
|    value_loss           | 0.00333     |
-----------------------------------------
Eval num_timesteps=500000, episode_reward=0.66 +/- 0.00
Episode length: 5.00

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.653       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 208         |
|    time_elapsed         | 46927       |
|    total_timesteps      | 520000      |
| train/                  |             |
|    approx_kl            | 0.007847186 |
|    clip_fraction        | 0.407       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.908       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0483      |
|    n_updates            | 4140        |
|    policy_gradient_loss | -0.036      |
|    std                  | 0.0549      |
|    value_loss           | 0.0032      |
-----------------------------------------
----------------------------------------
| rollout/                |        

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.654       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 217         |
|    time_elapsed         | 48964       |
|    total_timesteps      | 542500      |
| train/                  |             |
|    approx_kl            | 0.007995717 |
|    clip_fraction        | 0.383       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.912       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0558      |
|    n_updates            | 4320        |
|    policy_gradient_loss | -0.0347     |
|    std                  | 0.0549      |
|    value_loss           | 0.00307     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.655       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 226         |
|    time_elapsed         | 51004       |
|    total_timesteps      | 565000      |
| train/                  |             |
|    approx_kl            | 0.007079096 |
|    clip_fraction        | 0.38        |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.91        |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0519      |
|    n_updates            | 4500        |
|    policy_gradient_loss | -0.0345     |
|    std                  | 0.0549      |
|    value_loss           | 0.00315     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.655       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 235         |
|    time_elapsed         | 53034       |
|    total_timesteps      | 587500      |
| train/                  |             |
|    approx_kl            | 0.008342955 |
|    clip_fraction        | 0.385       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.916       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0478      |
|    n_updates            | 4680        |
|    policy_gradient_loss | -0.0347     |
|    std                  | 0.0549      |
|    value_loss           | 0.00295     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.658       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 244         |
|    time_elapsed         | 55067       |
|    total_timesteps      | 610000      |
| train/                  |             |
|    approx_kl            | 0.008339905 |
|    clip_fraction        | 0.396       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.913       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0509      |
|    n_updates            | 4860        |
|    policy_gradient_loss | -0.0359     |
|    std                  | 0.0549      |
|    value_loss           | 0.00307     |
-----------------------------------------
------------------------------------------
| rollout/                |      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.661       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 253         |
|    time_elapsed         | 57093       |
|    total_timesteps      | 632500      |
| train/                  |             |
|    approx_kl            | 0.007699651 |
|    clip_fraction        | 0.391       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.912       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0524      |
|    n_updates            | 5040        |
|    policy_gradient_loss | -0.0348     |
|    std                  | 0.0549      |
|    value_loss           | 0.00311     |
-----------------------------------------
----------------------------------------
| rollout/                |        

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.658       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 262         |
|    time_elapsed         | 59124       |
|    total_timesteps      | 655000      |
| train/                  |             |
|    approx_kl            | 0.008314547 |
|    clip_fraction        | 0.388       |
|    clip_range           | 0.1         |
|    entropy_loss         | -95         |
|    explained_variance   | 0.914       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0502      |
|    n_updates            | 5220        |
|    policy_gradient_loss | -0.0356     |
|    std                  | 0.0549      |
|    value_loss           | 0.00305     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.658       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 271         |
|    time_elapsed         | 61157       |
|    total_timesteps      | 677500      |
| train/                  |             |
|    approx_kl            | 0.008740005 |
|    clip_fraction        | 0.398       |
|    clip_range           | 0.1         |
|    entropy_loss         | -95         |
|    explained_variance   | 0.914       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0475      |
|    n_updates            | 5400        |
|    policy_gradient_loss | -0.0357     |
|    std                  | 0.0549      |
|    value_loss           | 0.00304     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

Eval num_timesteps=700000, episode_reward=0.66 +/- 0.00
Episode length: 5.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5           |
|    mean_reward          | 0.663       |
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.657       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 280         |
|    time_elapsed         | 63190       |
|    total_timesteps      | 700000      |
| train/                  |             |
|    approx_kl            | 0.009173853 |
|    clip_fraction        | 0.385       |
|    clip_range           | 0.1         |
|    entropy_loss         | -95         |
|    explained_variance   | 0.915       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.053       |
|    n_updates            | 5580        |
|    policy_gradient_loss | -0.0

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.661       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 289         |
|    time_elapsed         | 65200       |
|    total_timesteps      | 722500      |
| train/                  |             |
|    approx_kl            | 0.008594423 |
|    clip_fraction        | 0.414       |
|    clip_range           | 0.1         |
|    entropy_loss         | -95         |
|    explained_variance   | 0.917       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0497      |
|    n_updates            | 5760        |
|    policy_gradient_loss | -0.0356     |
|    std                  | 0.0549      |
|    value_loss           | 0.00294     |
-----------------------------------------
Eval num_timesteps=725000, episode_reward=0.66 +/- 0.00
Episode length: 5.00

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.66        |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 298         |
|    time_elapsed         | 67243       |
|    total_timesteps      | 745000      |
| train/                  |             |
|    approx_kl            | 0.007827274 |
|    clip_fraction        | 0.398       |
|    clip_range           | 0.1         |
|    entropy_loss         | -95         |
|    explained_variance   | 0.917       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0483      |
|    n_updates            | 5940        |
|    policy_gradient_loss | -0.0348     |
|    std                  | 0.0549      |
|    value_loss           | 0.00294     |
-----------------------------------------
----------------------------------------
| rollout/                |        

  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in

seed 2
generate callback ...
vectorize environment ...
vectorize env level 1
Box(-1.0, 1.0, (96,), float64)
model definition ..
Using cuda device
Logging to ./data/case_1_mlmc_ppo_1_level/seed_2
policy learning ..




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 5        |
|    ep_rew_mean     | 0.578    |
| time/              |          |
|    fps             | 11       |
|    iterations      | 1        |
|    time_elapsed    | 225      |
|    total_timesteps | 2500     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.578        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 2            |
|    time_elapsed         | 451          |
|    total_timesteps      | 5000         |
| train/                  |              |
|    approx_kl            | 0.0077587944 |
|    clip_fraction        | 0.368        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.8        |
|    explained_variance   | -6.89        |
|    learning_r

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.584       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 11          |
|    time_elapsed         | 2487        |
|    total_timesteps      | 27500       |
| train/                  |             |
|    approx_kl            | 0.007828591 |
|    clip_fraction        | 0.374       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.694       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0504      |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.0346     |
|    std                  | 0.055       |
|    value_loss           | 0.00933     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

Eval num_timesteps=50000, episode_reward=0.62 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5           |
|    mean_reward          | 0.617       |
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.587       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 20          |
|    time_elapsed         | 4523        |
|    total_timesteps      | 50000       |
| train/                  |             |
|    approx_kl            | 0.008980239 |
|    clip_fraction        | 0.389       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.756       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0485      |
|    n_updates            | 380         |
|    policy

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.591        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 29           |
|    time_elapsed         | 6537         |
|    total_timesteps      | 72500        |
| train/                  |              |
|    approx_kl            | 0.0071494314 |
|    clip_fraction        | 0.385        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.8        |
|    explained_variance   | 0.797        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0568       |
|    n_updates            | 560          |
|    policy_gradient_loss | -0.0353      |
|    std                  | 0.0551       |
|    value_loss           | 0.00644      |
------------------------------------------
Eval num_timesteps=75000, episode_reward=0.63 +/- 0.00

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.597       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 38          |
|    time_elapsed         | 8570        |
|    total_timesteps      | 95000       |
| train/                  |             |
|    approx_kl            | 0.008807858 |
|    clip_fraction        | 0.403       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.7       |
|    explained_variance   | 0.822       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0468      |
|    n_updates            | 740         |
|    policy_gradient_loss | -0.0371     |
|    std                  | 0.0551      |
|    value_loss           | 0.00569     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.602       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 47          |
|    time_elapsed         | 10607       |
|    total_timesteps      | 117500      |
| train/                  |             |
|    approx_kl            | 0.009435235 |
|    clip_fraction        | 0.412       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.7       |
|    explained_variance   | 0.85        |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0485      |
|    n_updates            | 920         |
|    policy_gradient_loss | -0.0371     |
|    std                  | 0.0551      |
|    value_loss           | 0.00481     |
-----------------------------------------
------------------------------------------
| rollout/                |      

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 5          |
|    ep_rew_mean          | 0.602      |
| time/                   |            |
|    fps                  | 11         |
|    iterations           | 56         |
|    time_elapsed         | 12637      |
|    total_timesteps      | 140000     |
| train/                  |            |
|    approx_kl            | 0.00740358 |
|    clip_fraction        | 0.386      |
|    clip_range           | 0.1        |
|    entropy_loss         | -94.7      |
|    explained_variance   | 0.865      |
|    learning_rate        | 1e-05      |
|    loss                 | 0.0502     |
|    n_updates            | 1100       |
|    policy_gradient_loss | -0.0357    |
|    std                  | 0.0551     |
|    value_loss           | 0.00439    |
----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.613       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 65          |
|    time_elapsed         | 14674       |
|    total_timesteps      | 162500      |
| train/                  |             |
|    approx_kl            | 0.007945226 |
|    clip_fraction        | 0.385       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.7       |
|    explained_variance   | 0.868       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0509      |
|    n_updates            | 1280        |
|    policy_gradient_loss | -0.0368     |
|    std                  | 0.0551      |
|    value_loss           | 0.0043      |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.615       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 74          |
|    time_elapsed         | 16718       |
|    total_timesteps      | 185000      |
| train/                  |             |
|    approx_kl            | 0.009281209 |
|    clip_fraction        | 0.386       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.871       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0538      |
|    n_updates            | 1460        |
|    policy_gradient_loss | -0.0352     |
|    std                  | 0.0551      |
|    value_loss           | 0.00428     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.619       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 83          |
|    time_elapsed         | 18761       |
|    total_timesteps      | 207500      |
| train/                  |             |
|    approx_kl            | 0.007728235 |
|    clip_fraction        | 0.383       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.7       |
|    explained_variance   | 0.883       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0516      |
|    n_updates            | 1640        |
|    policy_gradient_loss | -0.0352     |
|    std                  | 0.0551      |
|    value_loss           | 0.00389     |
-----------------------------------------
----------------------------------------
| rollout/                |        

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.623       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 92          |
|    time_elapsed         | 20799       |
|    total_timesteps      | 230000      |
| train/                  |             |
|    approx_kl            | 0.009118105 |
|    clip_fraction        | 0.388       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.7       |
|    explained_variance   | 0.883       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0496      |
|    n_updates            | 1820        |
|    policy_gradient_loss | -0.0358     |
|    std                  | 0.0551      |
|    value_loss           | 0.00391     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.627        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 101          |
|    time_elapsed         | 22835        |
|    total_timesteps      | 252500       |
| train/                  |              |
|    approx_kl            | 0.0074043972 |
|    clip_fraction        | 0.384        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.8        |
|    explained_variance   | 0.886        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0448       |
|    n_updates            | 2000         |
|    policy_gradient_loss | -0.0352      |
|    std                  | 0.055        |
|    value_loss           | 0.00382      |
------------------------------------------
-----------------------------------------
| rollout/  

Eval num_timesteps=275000, episode_reward=0.66 +/- 0.00
Episode length: 5.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 5            |
|    mean_reward          | 0.665        |
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.629        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 110          |
|    time_elapsed         | 24868        |
|    total_timesteps      | 275000       |
| train/                  |              |
|    approx_kl            | 0.0069478746 |
|    clip_fraction        | 0.381        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.8        |
|    explained_variance   | 0.894        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0537       |
|    n_updates            | 2180         |
|    policy

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.633       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 119         |
|    time_elapsed         | 26898       |
|    total_timesteps      | 297500      |
| train/                  |             |
|    approx_kl            | 0.008544518 |
|    clip_fraction        | 0.414       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.895       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0468      |
|    n_updates            | 2360        |
|    policy_gradient_loss | -0.0371     |
|    std                  | 0.055       |
|    value_loss           | 0.00357     |
-----------------------------------------
Eval num_timesteps=300000, episode_reward=0.66 +/- 0.00
Episode length: 5.00

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.635        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 128          |
|    time_elapsed         | 28922        |
|    total_timesteps      | 320000       |
| train/                  |              |
|    approx_kl            | 0.0070554125 |
|    clip_fraction        | 0.411        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.8        |
|    explained_variance   | 0.894        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0487       |
|    n_updates            | 2540         |
|    policy_gradient_loss | -0.0352      |
|    std                  | 0.055        |
|    value_loss           | 0.00364      |
------------------------------------------
-----------------------------------------
| rollout/  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.639       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 137         |
|    time_elapsed         | 30958       |
|    total_timesteps      | 342500      |
| train/                  |             |
|    approx_kl            | 0.007678201 |
|    clip_fraction        | 0.391       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.898       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0487      |
|    n_updates            | 2720        |
|    policy_gradient_loss | -0.0347     |
|    std                  | 0.055       |
|    value_loss           | 0.0035      |
-----------------------------------------
-----------------------------------------
| rollout/                |       

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 5          |
|    ep_rew_mean          | 0.644      |
| time/                   |            |
|    fps                  | 11         |
|    iterations           | 146        |
|    time_elapsed         | 32991      |
|    total_timesteps      | 365000     |
| train/                  |            |
|    approx_kl            | 0.00993668 |
|    clip_fraction        | 0.388      |
|    clip_range           | 0.1        |
|    entropy_loss         | -94.8      |
|    explained_variance   | 0.906      |
|    learning_rate        | 1e-05      |
|    loss                 | 0.0555     |
|    n_updates            | 2900       |
|    policy_gradient_loss | -0.0341    |
|    std                  | 0.055      |
|    value_loss           | 0.00321    |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_me

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.643       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 155         |
|    time_elapsed         | 35014       |
|    total_timesteps      | 387500      |
| train/                  |             |
|    approx_kl            | 0.008204329 |
|    clip_fraction        | 0.394       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.907       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0508      |
|    n_updates            | 3080        |
|    policy_gradient_loss | -0.0356     |
|    std                  | 0.055       |
|    value_loss           | 0.00318     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.647       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 164         |
|    time_elapsed         | 37048       |
|    total_timesteps      | 410000      |
| train/                  |             |
|    approx_kl            | 0.008581808 |
|    clip_fraction        | 0.403       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.906       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0529      |
|    n_updates            | 3260        |
|    policy_gradient_loss | -0.0354     |
|    std                  | 0.055       |
|    value_loss           | 0.00327     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.649        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 173          |
|    time_elapsed         | 39084        |
|    total_timesteps      | 432500       |
| train/                  |              |
|    approx_kl            | 0.0070940885 |
|    clip_fraction        | 0.399        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.8        |
|    explained_variance   | 0.911        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0476       |
|    n_updates            | 3440         |
|    policy_gradient_loss | -0.0359      |
|    std                  | 0.055        |
|    value_loss           | 0.00309      |
------------------------------------------
-----------------------------------------
| rollout/  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.649       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 182         |
|    time_elapsed         | 41119       |
|    total_timesteps      | 455000      |
| train/                  |             |
|    approx_kl            | 0.007950384 |
|    clip_fraction        | 0.369       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.905       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0504      |
|    n_updates            | 3620        |
|    policy_gradient_loss | -0.0344     |
|    std                  | 0.055       |
|    value_loss           | 0.00329     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.645       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 191         |
|    time_elapsed         | 43146       |
|    total_timesteps      | 477500      |
| train/                  |             |
|    approx_kl            | 0.008851797 |
|    clip_fraction        | 0.414       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.915       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0521      |
|    n_updates            | 3800        |
|    policy_gradient_loss | -0.0358     |
|    std                  | 0.055       |
|    value_loss           | 0.00295     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

Eval num_timesteps=500000, episode_reward=0.66 +/- 0.00
Episode length: 5.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 5            |
|    mean_reward          | 0.664        |
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.651        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 200          |
|    time_elapsed         | 45183        |
|    total_timesteps      | 500000       |
| train/                  |              |
|    approx_kl            | 0.0078010303 |
|    clip_fraction        | 0.413        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.9        |
|    explained_variance   | 0.914        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0459       |
|    n_updates            | 3980         |
|    policy

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 5          |
|    ep_rew_mean          | 0.652      |
| time/                   |            |
|    fps                  | 11         |
|    iterations           | 209        |
|    time_elapsed         | 47192      |
|    total_timesteps      | 522500     |
| train/                  |            |
|    approx_kl            | 0.00666383 |
|    clip_fraction        | 0.375      |
|    clip_range           | 0.1        |
|    entropy_loss         | -94.9      |
|    explained_variance   | 0.914      |
|    learning_rate        | 1e-05      |
|    loss                 | 0.0539     |
|    n_updates            | 4160       |
|    policy_gradient_loss | -0.0343    |
|    std                  | 0.055      |
|    value_loss           | 0.003      |
----------------------------------------
Eval num_timesteps=525000, episode_reward=0.66 +/- 0.00
Episode length: 5.00 +/- 0.00
------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.654       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 218         |
|    time_elapsed         | 49221       |
|    total_timesteps      | 545000      |
| train/                  |             |
|    approx_kl            | 0.006632684 |
|    clip_fraction        | 0.372       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.912       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0572      |
|    n_updates            | 4340        |
|    policy_gradient_loss | -0.0332     |
|    std                  | 0.0549      |
|    value_loss           | 0.00309     |
-----------------------------------------
------------------------------------------
| rollout/                |      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.652       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 227         |
|    time_elapsed         | 51253       |
|    total_timesteps      | 567500      |
| train/                  |             |
|    approx_kl            | 0.008788474 |
|    clip_fraction        | 0.389       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.914       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0495      |
|    n_updates            | 4520        |
|    policy_gradient_loss | -0.034      |
|    std                  | 0.0549      |
|    value_loss           | 0.00299     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.653       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 236         |
|    time_elapsed         | 53287       |
|    total_timesteps      | 590000      |
| train/                  |             |
|    approx_kl            | 0.008260016 |
|    clip_fraction        | 0.392       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.913       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0479      |
|    n_updates            | 4700        |
|    policy_gradient_loss | -0.0343     |
|    std                  | 0.0549      |
|    value_loss           | 0.00303     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.657        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 245          |
|    time_elapsed         | 55317        |
|    total_timesteps      | 612500       |
| train/                  |              |
|    approx_kl            | 0.0071356646 |
|    clip_fraction        | 0.404        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.9        |
|    explained_variance   | 0.909        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0517       |
|    n_updates            | 4880         |
|    policy_gradient_loss | -0.035       |
|    std                  | 0.0549       |
|    value_loss           | 0.00317      |
------------------------------------------
------------------------------------------
| rollout/ 

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.656        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 254          |
|    time_elapsed         | 57348        |
|    total_timesteps      | 635000       |
| train/                  |              |
|    approx_kl            | 0.0079350285 |
|    clip_fraction        | 0.401        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.9        |
|    explained_variance   | 0.911        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0512       |
|    n_updates            | 5060         |
|    policy_gradient_loss | -0.0326      |
|    std                  | 0.0549       |
|    value_loss           | 0.00315      |
------------------------------------------
-----------------------------------------
| rollout/  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.657       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 263         |
|    time_elapsed         | 59377       |
|    total_timesteps      | 657500      |
| train/                  |             |
|    approx_kl            | 0.008704318 |
|    clip_fraction        | 0.397       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.912       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0557      |
|    n_updates            | 5240        |
|    policy_gradient_loss | -0.0331     |
|    std                  | 0.0549      |
|    value_loss           | 0.00305     |
-----------------------------------------
------------------------------------------
| rollout/                |      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.657       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 272         |
|    time_elapsed         | 61404       |
|    total_timesteps      | 680000      |
| train/                  |             |
|    approx_kl            | 0.007307296 |
|    clip_fraction        | 0.371       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.917       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0593      |
|    n_updates            | 5420        |
|    policy_gradient_loss | -0.0319     |
|    std                  | 0.0549      |
|    value_loss           | 0.00293     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.657       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 281         |
|    time_elapsed         | 63439       |
|    total_timesteps      | 702500      |
| train/                  |             |
|    approx_kl            | 0.007945287 |
|    clip_fraction        | 0.404       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.917       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0556      |
|    n_updates            | 5600        |
|    policy_gradient_loss | -0.034      |
|    std                  | 0.0549      |
|    value_loss           | 0.00293     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

Eval num_timesteps=725000, episode_reward=0.67 +/- 0.00
Episode length: 5.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5           |
|    mean_reward          | 0.665       |
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.66        |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 290         |
|    time_elapsed         | 65470       |
|    total_timesteps      | 725000      |
| train/                  |             |
|    approx_kl            | 0.007719998 |
|    clip_fraction        | 0.391       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.919       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0471      |
|    n_updates            | 5780        |
|    policy_gradient_loss | -0.0

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.658        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 299          |
|    time_elapsed         | 67479        |
|    total_timesteps      | 747500       |
| train/                  |              |
|    approx_kl            | 0.0075664087 |
|    clip_fraction        | 0.396        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.9        |
|    explained_variance   | 0.922        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0524       |
|    n_updates            | 5960         |
|    policy_gradient_loss | -0.0339      |
|    std                  | 0.0549       |
|    value_loss           | 0.00275      |
------------------------------------------
Eval num_timesteps=750000, episode_reward=0.66 +/- 0.0

  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in

seed 3
generate callback ...
vectorize environment ...
vectorize env level 1
Box(-1.0, 1.0, (96,), float64)
model definition ..
Using cuda device
Logging to ./data/case_1_mlmc_ppo_1_level/seed_3
policy learning ..




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 5        |
|    ep_rew_mean     | 0.579    |
| time/              |          |
|    fps             | 10       |
|    iterations      | 1        |
|    time_elapsed    | 229      |
|    total_timesteps | 2500     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.585       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 2           |
|    time_elapsed         | 452         |
|    total_timesteps      | 5000        |
| train/                  |             |
|    approx_kl            | 0.006904635 |
|    clip_fraction        | 0.37        |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | -4.01       |
|    learning_rate        | 1e

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.586       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 11          |
|    time_elapsed         | 2474        |
|    total_timesteps      | 27500       |
| train/                  |             |
|    approx_kl            | 0.007514202 |
|    clip_fraction        | 0.383       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.665       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0486      |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.0367     |
|    std                  | 0.055       |
|    value_loss           | 0.0103      |
-----------------------------------------
------------------------------------------
| rollout/                |      

Eval num_timesteps=50000, episode_reward=0.63 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5           |
|    mean_reward          | 0.634       |
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.587       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 20          |
|    time_elapsed         | 4510        |
|    total_timesteps      | 50000       |
| train/                  |             |
|    approx_kl            | 0.007901218 |
|    clip_fraction        | 0.386       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.752       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0516      |
|    n_updates            | 380         |
|    policy

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 5         |
|    ep_rew_mean          | 0.595     |
| time/                   |           |
|    fps                  | 11        |
|    iterations           | 29        |
|    time_elapsed         | 6513      |
|    total_timesteps      | 72500     |
| train/                  |           |
|    approx_kl            | 0.0088255 |
|    clip_fraction        | 0.379     |
|    clip_range           | 0.1       |
|    entropy_loss         | -94.8     |
|    explained_variance   | 0.801     |
|    learning_rate        | 1e-05     |
|    loss                 | 0.0518    |
|    n_updates            | 560       |
|    policy_gradient_loss | -0.0369   |
|    std                  | 0.0551    |
|    value_loss           | 0.00637   |
---------------------------------------
Eval num_timesteps=75000, episode_reward=0.65 +/- 0.00
Episode length: 5.00 +/- 0.00
New best mean reward!
-------------

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.599        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 38           |
|    time_elapsed         | 8542         |
|    total_timesteps      | 95000        |
| train/                  |              |
|    approx_kl            | 0.0074480814 |
|    clip_fraction        | 0.383        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.7        |
|    explained_variance   | 0.823        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0514       |
|    n_updates            | 740          |
|    policy_gradient_loss | -0.0376      |
|    std                  | 0.0551       |
|    value_loss           | 0.00566      |
------------------------------------------
-----------------------------------------
| rollout/  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.604       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 47          |
|    time_elapsed         | 10579       |
|    total_timesteps      | 117500      |
| train/                  |             |
|    approx_kl            | 0.007932332 |
|    clip_fraction        | 0.399       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.7       |
|    explained_variance   | 0.857       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0447      |
|    n_updates            | 920         |
|    policy_gradient_loss | -0.0372     |
|    std                  | 0.0551      |
|    value_loss           | 0.00462     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.606       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 56          |
|    time_elapsed         | 12611       |
|    total_timesteps      | 140000      |
| train/                  |             |
|    approx_kl            | 0.007693678 |
|    clip_fraction        | 0.399       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.7       |
|    explained_variance   | 0.872       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0474      |
|    n_updates            | 1100        |
|    policy_gradient_loss | -0.0372     |
|    std                  | 0.0551      |
|    value_loss           | 0.00416     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.615        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 65           |
|    time_elapsed         | 14633        |
|    total_timesteps      | 162500       |
| train/                  |              |
|    approx_kl            | 0.0072293933 |
|    clip_fraction        | 0.381        |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.8        |
|    explained_variance   | 0.87         |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0526       |
|    n_updates            | 1280         |
|    policy_gradient_loss | -0.0379      |
|    std                  | 0.055        |
|    value_loss           | 0.00427      |
------------------------------------------
-----------------------------------------
| rollout/  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.616       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 74          |
|    time_elapsed         | 16666       |
|    total_timesteps      | 185000      |
| train/                  |             |
|    approx_kl            | 0.007365689 |
|    clip_fraction        | 0.397       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.881       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0514      |
|    n_updates            | 1460        |
|    policy_gradient_loss | -0.0368     |
|    std                  | 0.0551      |
|    value_loss           | 0.00393     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.623       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 83          |
|    time_elapsed         | 18694       |
|    total_timesteps      | 207500      |
| train/                  |             |
|    approx_kl            | 0.008429658 |
|    clip_fraction        | 0.398       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.884       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0479      |
|    n_updates            | 1640        |
|    policy_gradient_loss | -0.0367     |
|    std                  | 0.0551      |
|    value_loss           | 0.00384     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.621       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 92          |
|    time_elapsed         | 20723       |
|    total_timesteps      | 230000      |
| train/                  |             |
|    approx_kl            | 0.009005575 |
|    clip_fraction        | 0.422       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.891       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0438      |
|    n_updates            | 1820        |
|    policy_gradient_loss | -0.0391     |
|    std                  | 0.055       |
|    value_loss           | 0.00363     |
-----------------------------------------
------------------------------------------
| rollout/                |      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.626       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 101         |
|    time_elapsed         | 22756       |
|    total_timesteps      | 252500      |
| train/                  |             |
|    approx_kl            | 0.008449148 |
|    clip_fraction        | 0.381       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.899       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0553      |
|    n_updates            | 2000        |
|    policy_gradient_loss | -0.0362     |
|    std                  | 0.055       |
|    value_loss           | 0.00339     |
-----------------------------------------
------------------------------------------
| rollout/                |      

Eval num_timesteps=275000, episode_reward=0.66 +/- 0.00
Episode length: 5.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5           |
|    mean_reward          | 0.665       |
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.627       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 110         |
|    time_elapsed         | 24791       |
|    total_timesteps      | 275000      |
| train/                  |             |
|    approx_kl            | 0.008454921 |
|    clip_fraction        | 0.41        |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.896       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.047       |
|    n_updates            | 2180        |
|    policy_gradient_loss | -0.0

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.631        |
| time/                   |              |
|    fps                  | 11           |
|    iterations           | 119          |
|    time_elapsed         | 26801        |
|    total_timesteps      | 297500       |
| train/                  |              |
|    approx_kl            | 0.0068763252 |
|    clip_fraction        | 0.37         |
|    clip_range           | 0.1          |
|    entropy_loss         | -94.8        |
|    explained_variance   | 0.899        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0498       |
|    n_updates            | 2360         |
|    policy_gradient_loss | -0.035       |
|    std                  | 0.055        |
|    value_loss           | 0.00344      |
------------------------------------------
Eval num_timesteps=300000, episode_reward=0.66 +/- 0.0

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.639       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 128         |
|    time_elapsed         | 28831       |
|    total_timesteps      | 320000      |
| train/                  |             |
|    approx_kl            | 0.006874793 |
|    clip_fraction        | 0.394       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.899       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.05        |
|    n_updates            | 2540        |
|    policy_gradient_loss | -0.0368     |
|    std                  | 0.0551      |
|    value_loss           | 0.00346     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.639       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 137         |
|    time_elapsed         | 30864       |
|    total_timesteps      | 342500      |
| train/                  |             |
|    approx_kl            | 0.008515933 |
|    clip_fraction        | 0.397       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.898       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0436      |
|    n_updates            | 2720        |
|    policy_gradient_loss | -0.0366     |
|    std                  | 0.055       |
|    value_loss           | 0.00349     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.642       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 146         |
|    time_elapsed         | 32897       |
|    total_timesteps      | 365000      |
| train/                  |             |
|    approx_kl            | 0.009611568 |
|    clip_fraction        | 0.388       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.91        |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0488      |
|    n_updates            | 2900        |
|    policy_gradient_loss | -0.036      |
|    std                  | 0.055       |
|    value_loss           | 0.00309     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.642       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 155         |
|    time_elapsed         | 34927       |
|    total_timesteps      | 387500      |
| train/                  |             |
|    approx_kl            | 0.009416835 |
|    clip_fraction        | 0.4         |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.906       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0501      |
|    n_updates            | 3080        |
|    policy_gradient_loss | -0.0373     |
|    std                  | 0.055       |
|    value_loss           | 0.00323     |
-----------------------------------------
------------------------------------------
| rollout/                |      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.644       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 164         |
|    time_elapsed         | 36968       |
|    total_timesteps      | 410000      |
| train/                  |             |
|    approx_kl            | 0.008168137 |
|    clip_fraction        | 0.393       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.91        |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0459      |
|    n_updates            | 3260        |
|    policy_gradient_loss | -0.0371     |
|    std                  | 0.055       |
|    value_loss           | 0.00311     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.65        |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 173         |
|    time_elapsed         | 39001       |
|    total_timesteps      | 432500      |
| train/                  |             |
|    approx_kl            | 0.006809397 |
|    clip_fraction        | 0.401       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.908       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.055       |
|    n_updates            | 3440        |
|    policy_gradient_loss | -0.0369     |
|    std                  | 0.055       |
|    value_loss           | 0.00318     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.647       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 182         |
|    time_elapsed         | 41034       |
|    total_timesteps      | 455000      |
| train/                  |             |
|    approx_kl            | 0.009446802 |
|    clip_fraction        | 0.411       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.8       |
|    explained_variance   | 0.907       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0474      |
|    n_updates            | 3620        |
|    policy_gradient_loss | -0.0378     |
|    std                  | 0.055       |
|    value_loss           | 0.00321     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 5         |
|    ep_rew_mean          | 0.644     |
| time/                   |           |
|    fps                  | 11        |
|    iterations           | 191       |
|    time_elapsed         | 43061     |
|    total_timesteps      | 477500    |
| train/                  |           |
|    approx_kl            | 0.0087344 |
|    clip_fraction        | 0.413     |
|    clip_range           | 0.1       |
|    entropy_loss         | -94.8     |
|    explained_variance   | 0.912     |
|    learning_rate        | 1e-05     |
|    loss                 | 0.0432    |
|    n_updates            | 3800      |
|    policy_gradient_loss | -0.0384   |
|    std                  | 0.055     |
|    value_loss           | 0.00305   |
---------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5       

Eval num_timesteps=500000, episode_reward=0.66 +/- 0.00
Episode length: 5.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5           |
|    mean_reward          | 0.662       |
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.651       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 200         |
|    time_elapsed         | 45095       |
|    total_timesteps      | 500000      |
| train/                  |             |
|    approx_kl            | 0.007848656 |
|    clip_fraction        | 0.39        |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.907       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0558      |
|    n_updates            | 3980        |
|    policy_gradient_loss | -0.0

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.65        |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 209         |
|    time_elapsed         | 47103       |
|    total_timesteps      | 522500      |
| train/                  |             |
|    approx_kl            | 0.007788081 |
|    clip_fraction        | 0.398       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.906       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0445      |
|    n_updates            | 4160        |
|    policy_gradient_loss | -0.037      |
|    std                  | 0.0549      |
|    value_loss           | 0.00328     |
-----------------------------------------
Eval num_timesteps=525000, episode_reward=0.66 +/- 0.00
Episode length: 5.00

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.652       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 218         |
|    time_elapsed         | 49138       |
|    total_timesteps      | 545000      |
| train/                  |             |
|    approx_kl            | 0.007352876 |
|    clip_fraction        | 0.372       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.911       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0449      |
|    n_updates            | 4340        |
|    policy_gradient_loss | -0.036      |
|    std                  | 0.0549      |
|    value_loss           | 0.00312     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.654       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 227         |
|    time_elapsed         | 51174       |
|    total_timesteps      | 567500      |
| train/                  |             |
|    approx_kl            | 0.008437086 |
|    clip_fraction        | 0.381       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.91        |
|    learning_rate        | 1e-05       |
|    loss                 | 0.052       |
|    n_updates            | 4520        |
|    policy_gradient_loss | -0.0354     |
|    std                  | 0.0549      |
|    value_loss           | 0.00313     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.654       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 236         |
|    time_elapsed         | 53201       |
|    total_timesteps      | 590000      |
| train/                  |             |
|    approx_kl            | 0.008223009 |
|    clip_fraction        | 0.398       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.912       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0509      |
|    n_updates            | 4700        |
|    policy_gradient_loss | -0.0359     |
|    std                  | 0.0549      |
|    value_loss           | 0.00307     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.658       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 245         |
|    time_elapsed         | 55224       |
|    total_timesteps      | 612500      |
| train/                  |             |
|    approx_kl            | 0.006270281 |
|    clip_fraction        | 0.385       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.91        |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0509      |
|    n_updates            | 4880        |
|    policy_gradient_loss | -0.0355     |
|    std                  | 0.0549      |
|    value_loss           | 0.00316     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.655       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 254         |
|    time_elapsed         | 57252       |
|    total_timesteps      | 635000      |
| train/                  |             |
|    approx_kl            | 0.008173032 |
|    clip_fraction        | 0.401       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.91        |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0539      |
|    n_updates            | 5060        |
|    policy_gradient_loss | -0.0359     |
|    std                  | 0.0549      |
|    value_loss           | 0.00315     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.658       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 263         |
|    time_elapsed         | 59286       |
|    total_timesteps      | 657500      |
| train/                  |             |
|    approx_kl            | 0.008515628 |
|    clip_fraction        | 0.382       |
|    clip_range           | 0.1         |
|    entropy_loss         | -94.9       |
|    explained_variance   | 0.914       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0481      |
|    n_updates            | 5240        |
|    policy_gradient_loss | -0.0357     |
|    std                  | 0.0549      |
|    value_loss           | 0.00304     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.658       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 272         |
|    time_elapsed         | 61311       |
|    total_timesteps      | 680000      |
| train/                  |             |
|    approx_kl            | 0.007981088 |
|    clip_fraction        | 0.394       |
|    clip_range           | 0.1         |
|    entropy_loss         | -95         |
|    explained_variance   | 0.91        |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0534      |
|    n_updates            | 5420        |
|    policy_gradient_loss | -0.0357     |
|    std                  | 0.0549      |
|    value_loss           | 0.0032      |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.658       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 281         |
|    time_elapsed         | 63343       |
|    total_timesteps      | 702500      |
| train/                  |             |
|    approx_kl            | 0.009585737 |
|    clip_fraction        | 0.4         |
|    clip_range           | 0.1         |
|    entropy_loss         | -95         |
|    explained_variance   | 0.916       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0446      |
|    n_updates            | 5600        |
|    policy_gradient_loss | -0.0358     |
|    std                  | 0.0549      |
|    value_loss           | 0.00297     |
-----------------------------------------
------------------------------------------
| rollout/                |      

Eval num_timesteps=725000, episode_reward=0.66 +/- 0.00
Episode length: 5.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5           |
|    mean_reward          | 0.661       |
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.66        |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 290         |
|    time_elapsed         | 65374       |
|    total_timesteps      | 725000      |
| train/                  |             |
|    approx_kl            | 0.007947528 |
|    clip_fraction        | 0.396       |
|    clip_range           | 0.1         |
|    entropy_loss         | -95         |
|    explained_variance   | 0.918       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.0486      |
|    n_updates            | 5780        |
|    policy_gradient_loss | -0.0

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.658       |
| time/                   |             |
|    fps                  | 11          |
|    iterations           | 299         |
|    time_elapsed         | 67385       |
|    total_timesteps      | 747500      |
| train/                  |             |
|    approx_kl            | 0.006966761 |
|    clip_fraction        | 0.398       |
|    clip_range           | 0.1         |
|    entropy_loss         | -95         |
|    explained_variance   | 0.92        |
|    learning_rate        | 1e-05       |
|    loss                 | 0.053       |
|    n_updates            | 5960        |
|    policy_gradient_loss | -0.0359     |
|    std                  | 0.0549      |
|    value_loss           | 0.00285     |
-----------------------------------------
Eval num_timesteps=750000, episode_reward=0.66 +/- 0.00
Episode length: 5.00

  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in range(len(p_1)-1):
  for j in