In [1]:
# to access functions from other locations
# NOTE(review): this is a machine-specific absolute path. The project-local
# imports further down (utils, model, ressim_env, k_distributions) presumably
# resolve through it -- confirm, and consider making it configurable.
import sys
sys.path.append('/data/ad181/RemoteDir/rl_robust_owc')

In [2]:
%matplotlib notebook
import numpy as np
import time
import pickle
import os
import matplotlib.pyplot as plt

import gym
from stable_baselines3.ppo import PPO, MlpPolicy
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines3.common.callbacks import CallbackList
from utils.custom_eval_callback import CustomEvalCallback
from typing import Callable

from utils.plot_functions import plot_learning

from model.ressim import Grid
from ressim_env import ResSimEnv_v0, ResSimEnv_v1, ResSimEnv_v2
from k_distributions.generate_constr_k import generate_cond_
from utils.env_wrappers import StepReset, StateCoarse

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Run configuration.
# `case` names the output sub-directory created under ./data.
# `seed` is only an initial value: the training loop's `for seed in ...`
# rebinds it.
seed = 1
case = "case_2_ppo"

In [4]:
# Make sure the output directories exist before anything is written to them.
for _out_dir in ('./data', './data/'+case):
    os.makedirs(_out_dir, exist_ok=True)

In [5]:
# Load the pre-built training and evaluation environments from disk.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
# The file handles are deliberately not named `input`, so the builtin
# `input()` is not shadowed for the rest of the kernel session.
with open('../envs_params/env_data/env_train.pkl', 'rb') as train_file:
    env_train = pickle.load(train_file)

with open('../envs_params/env_data/env_eval.pkl', 'rb') as eval_file:
    env_eval = pickle.load(eval_file)

In [6]:
# Coordinates handed to the state-coarsening env wrapper (the wrapper that
# reduces the state space); read from the training environment.
x_coords = env_train.p_x
y_coords = env_train.p_y

def env_wrappers(env, x_coords, y_coords):
    """
    Apply the standard wrapper stack to an environment.

    The environment is wrapped in ``StepReset`` first; the result is then
    wrapped in ``StateCoarse`` with ``include_well_pr=True``.

    :param env: environment to wrap
    :param x_coords: forwarded to ``StateCoarse`` as its second argument
    :param y_coords: forwarded to ``StateCoarse`` as its third argument
    :return: the ``StateCoarse``-wrapped environment
    """
    step_reset_env = StepReset(env)
    return StateCoarse(step_reset_env, x_coords, y_coords, include_well_pr=True)

In [7]:
# Disabled manual check: wrap the training env, then step it with an all-ones
# action until the episode ends, printing each state.
# NOTE: if re-enabled as written, the first line rebinds `env_train` to the
# wrapped environment, so the training loop further down would wrap it a
# second time.
# env_train = env_wrappers(env_train, x_coords, y_coords)
# print(env_train.observation_space)
# base_action = np.ones(env_train.action_space.shape[0])

# state, done = env_train.reset(), False
# print(state)
# while not done:
#     state, reward, done, info = env_train.step(base_action)
#     print(state)

In [8]:
def make_env(env, rank: int, seed: int) -> Callable:
    """
    Utility function for multiprocessed env.

    Builds a zero-argument factory of the kind passed in a list to a
    vectorized-env constructor (e.g. ``SubprocVecEnv``). Each call of the
    factory returns an independent deep copy of ``env``, seeded with
    ``seed + rank``; the ``env`` passed in is never mutated, so factories
    built from the same environment do not share state.

    :param env: the environment instance used as a template
    :param rank: (int) index of the subprocess
    :param seed: (int) the initial seed for RNG
    :return: (Callable) factory returning the seeded environment copy
    """
    # Local import keeps this definition self-contained.
    import copy

    # The return annotation is quoted so that `gym.Env` is not evaluated each
    # time make_env is called.
    def _init() -> "gym.Env":
        # Copy rather than reuse `env`: otherwise every factory would hand
        # back (and re-seed) the very same object.
        env_ = copy.deepcopy(env)
        env_.seed(seed + rank)
        return env_
    return _init

In [9]:
# Train one PPO agent per seed (1, 2, 3). For each seed this writes the
# callback logs, the best eval model, the final model and two learning-curve
# figures under ./data/<case>/seed_<seed>.
for seed in range(1,4):
    print(f'seed {seed}')
    log_dir = './data/'+case+'/seed_'+str(seed)
    os.makedirs(log_dir, exist_ok=True)
    num_cpu = 64  # number of SubprocVecEnv worker processes

    # Seed the base environments, then build the wrapped versions used by the
    # callbacks and as the template for the vectorized training env.
    env_train.seed(seed)
    env_eval.seed(seed)
    env_train_ = env_wrappers(env_train, x_coords, y_coords)
    env_eval_ = env_wrappers(env_eval, x_coords, y_coords)

    # One callback evaluates on the training env, the other on the eval env;
    # only the eval callback saves a best model.
    # NOTE(review): eval_freq is presumably counted in callback calls (one per
    # vectorized step) -- confirm against CustomEvalCallback.
    train_callback = CustomEvalCallback(env_train_,
                                        best_model_save_path=None,
                                        n_eval_episodes=1,
                                        log_path=str(log_dir)+'/results_train',
                                        eval_freq=100)
    eval_callback = CustomEvalCallback(env_eval_,
                                       best_model_save_path=str(log_dir)+'/best_model_eval',
                                       n_eval_episodes=1,
                                       log_path=str(log_dir)+'/results_eval',
                                       eval_freq=100)
    callback_list = [train_callback, eval_callback]
    callback = CallbackList(callback_list)

    env = SubprocVecEnv([make_env(env_train_, i, seed) for i in range(num_cpu)])
    try:
        print(env.observation_space)
        print(f'seed {seed}: model definition ..')
        model = PPO(policy=MlpPolicy,
                    env=env,
                    learning_rate = 5e-5,
                    n_steps = 50,
                    batch_size = 16,
                    n_epochs = 20,
                    gamma = 0.99,
                    gae_lambda = 0.95,
                    clip_range = 0.1,
                    clip_range_vf = None,
                    ent_coef = 0.001,
                    vf_coef = 0.5,
                    max_grad_norm = 0.5,
                    use_sde= False,
                    create_eval_env= False,
                    policy_kwargs = dict(net_arch=[20,20], log_std_init=-1.9),
                    verbose = 1,
                    target_kl =0.05,
                    seed = seed,
                    device = "auto")
        print(f'seed {seed}: learning ..')
        model.learn(total_timesteps=300000, callback=callback)
        model.save(log_dir+'/PPO')
    finally:
        # Always shut down the worker processes, even if training fails;
        # otherwise every seed leaves its num_cpu subprocesses running.
        env.close()

    # Learning curves produced by plot_learning from the files in log_dir.
    fig = plot_learning(log_dir, case='train')
    fig.savefig(log_dir+'/learn_train.png')
    fig = plot_learning(log_dir, case='eval')
    fig.savefig(log_dir+'/learn_eval.png')

seed 1
Box(-100000.0, 100000.0, (9,), float64)
seed 1: model definition ..
Using cuda device
seed 1: learning ..




-----------------------------
| time/              |      |
|    fps             | 251  |
|    iterations      | 1    |
|    time_elapsed    | 12   |
|    total_timesteps | 3200 |
-----------------------------
Eval num_timesteps=6400, episode_reward=0.59 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=6400, episode_reward=0.57 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 4            |
|    mean_reward          | 0.572        |
| time/                   |              |
|    fps                  | 109          |
|    iterations           | 2            |
|    time_elapsed         | 58           |
|    total_timesteps      | 6400         |
| train/                  |              |
|    approx_kl            | 0.0026606661 |
|    clip_fraction        | 0.106        |
|    clip_range           | 0.1          |
|    entropy_loss  

------------------------------------------
| time/                   |              |
|    fps                  | 83           |
|    iterations           | 11           |
|    time_elapsed         | 422          |
|    total_timesteps      | 35200        |
| train/                  |              |
|    approx_kl            | 0.0017997539 |
|    clip_fraction        | 0.164        |
|    clip_range           | 0.1          |
|    entropy_loss         | 2.54         |
|    explained_variance   | 0.932        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.0124      |
|    n_updates            | 200          |
|    policy_gradient_loss | -0.00889     |
|    std                  | 0.145        |
|    value_loss           | 0.00205      |
------------------------------------------
Eval num_timesteps=38400, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=38400, episode_reward=0.60 +/- 0.00
Episode length: 4.00 

Eval num_timesteps=64000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=64000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 4            |
|    mean_reward          | 0.607        |
| time/                   |              |
|    fps                  | 79           |
|    iterations           | 20           |
|    time_elapsed         | 805          |
|    total_timesteps      | 64000        |
| train/                  |              |
|    approx_kl            | 0.0023344038 |
|    clip_fraction        | 0.136        |
|    clip_range           | 0.1          |
|    entropy_loss         | 2.7          |
|    explained_variance   | 0.936        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.0106      |
|    n_updates            | 380          |
|    policy_

------------------------------------------
| time/                   |              |
|    fps                  | 78           |
|    iterations           | 29           |
|    time_elapsed         | 1186         |
|    total_timesteps      | 92800        |
| train/                  |              |
|    approx_kl            | 0.0031549064 |
|    clip_fraction        | 0.149        |
|    clip_range           | 0.1          |
|    entropy_loss         | 3            |
|    explained_variance   | 0.944        |
|    learning_rate        | 5e-05        |
|    loss                 | 0.0239       |
|    n_updates            | 560          |
|    policy_gradient_loss | -0.0062      |
|    std                  | 0.133        |
|    value_loss           | 0.00173      |
------------------------------------------
Eval num_timesteps=96000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=96000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 

Eval num_timesteps=121600, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=121600, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 4            |
|    mean_reward          | 0.607        |
| time/                   |              |
|    fps                  | 76           |
|    iterations           | 38           |
|    time_elapsed         | 1586         |
|    total_timesteps      | 121600       |
| train/                  |              |
|    approx_kl            | 0.0014356733 |
|    clip_fraction        | 0.13         |
|    clip_range           | 0.1          |
|    entropy_loss         | 3.28         |
|    explained_variance   | 0.947        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.0103      |
|    n_updates            | 740          |
|    polic

------------------------------------------
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 47           |
|    time_elapsed         | 1941         |
|    total_timesteps      | 150400       |
| train/                  |              |
|    approx_kl            | 0.0038312613 |
|    clip_fraction        | 0.154        |
|    clip_range           | 0.1          |
|    entropy_loss         | 3.54         |
|    explained_variance   | 0.953        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.00825     |
|    n_updates            | 920          |
|    policy_gradient_loss | -0.00518     |
|    std                  | 0.12         |
|    value_loss           | 0.00152      |
------------------------------------------
Eval num_timesteps=153600, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=153600, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
-----------

Eval num_timesteps=179200, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=179200, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 4            |
|    mean_reward          | 0.609        |
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 56           |
|    time_elapsed         | 2325         |
|    total_timesteps      | 179200       |
| train/                  |              |
|    approx_kl            | 0.0023153112 |
|    clip_fraction        | 0.138        |
|    clip_range           | 0.1          |
|    entropy_loss         | 3.88         |
|    explained_variance   | 0.957        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.00872     |
|    n_updates            | 1100         |
|    policy_gradient_loss | -0.00433     |
|    std   

------------------------------------------
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 65           |
|    time_elapsed         | 2699         |
|    total_timesteps      | 208000       |
| train/                  |              |
|    approx_kl            | 0.0031137771 |
|    clip_fraction        | 0.131        |
|    clip_range           | 0.1          |
|    entropy_loss         | 4.16         |
|    explained_variance   | 0.96         |
|    learning_rate        | 5e-05        |
|    loss                 | 0.0155       |
|    n_updates            | 1280         |
|    policy_gradient_loss | -0.00303     |
|    std                  | 0.106        |
|    value_loss           | 0.00135      |
------------------------------------------
Eval num_timesteps=211200, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=211200, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
-----------

Eval num_timesteps=236800, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=236800, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 4            |
|    mean_reward          | 0.61         |
| time/                   |              |
|    fps                  | 76           |
|    iterations           | 74           |
|    time_elapsed         | 3081         |
|    total_timesteps      | 236800       |
| train/                  |              |
|    approx_kl            | 0.0003227004 |
|    clip_fraction        | 0.127        |
|    clip_range           | 0.1          |
|    entropy_loss         | 4.41         |
|    explained_variance   | 0.964        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.00623     |
|    n_updates            | 1460         |
|    policy_gradient_loss | -0.00293     |
|    std   

------------------------------------------
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 83           |
|    time_elapsed         | 3445         |
|    total_timesteps      | 265600       |
| train/                  |              |
|    approx_kl            | 0.0015489731 |
|    clip_fraction        | 0.135        |
|    clip_range           | 0.1          |
|    entropy_loss         | 4.66         |
|    explained_variance   | 0.967        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.0151      |
|    n_updates            | 1640         |
|    policy_gradient_loss | -0.00309     |
|    std                  | 0.0969       |
|    value_loss           | 0.00112      |
------------------------------------------
Eval num_timesteps=268800, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=268800, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
-----------

Eval num_timesteps=294400, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=294400, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 4            |
|    mean_reward          | 0.61         |
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 92           |
|    time_elapsed         | 3813         |
|    total_timesteps      | 294400       |
| train/                  |              |
|    approx_kl            | 0.0017922219 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.1          |
|    entropy_loss         | 4.86         |
|    explained_variance   | 0.97         |
|    learning_rate        | 5e-05        |
|    loss                 | -0.0154      |
|    n_updates            | 1820         |
|    policy_gradient_loss | -0.00214     |
|    std   

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

seed 2
Box(-100000.0, 100000.0, (9,), float64)
seed 2: model definition ..
Using cuda device
seed 2: learning ..




------------------------------------------
| time/                   |              |
|    fps                  | 259          |
|    iterations           | 1            |
|    time_elapsed         | 12           |
|    total_timesteps      | 3200         |
| train/                  |              |
|    approx_kl            | 0.0013083592 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.1          |
|    entropy_loss         | 4.95         |
|    explained_variance   | 0.97         |
|    learning_rate        | 5e-05        |
|    loss                 | 0.00107      |
|    n_updates            | 1880         |
|    policy_gradient_loss | -0.00178     |
|    std                  | 0.092        |
|    value_loss           | 0.00104      |
------------------------------------------
Eval num_timesteps=6400, episode_reward=0.58 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=6400, episode_reward=0.57 +/- 0.00
Episode length: 4.00 +/

Eval num_timesteps=32000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=32000, episode_reward=0.60 +/- 0.00
Episode length: 4.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 4           |
|    mean_reward          | 0.599       |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 10          |
|    time_elapsed         | 398         |
|    total_timesteps      | 32000       |
| train/                  |             |
|    approx_kl            | 0.004057133 |
|    clip_fraction        | 0.127       |
|    clip_range           | 0.1         |
|    entropy_loss         | 2.46        |
|    explained_variance   | 0.931       |
|    learning_rate        | 5e-05       |
|    loss                 | 0.00399     |
|    n_updates            | 180         |
|    policy_gradient_loss | -0.00658    |
|    std                  | 0.14

------------------------------------------
| time/                   |              |
|    fps                  | 79           |
|    iterations           | 19           |
|    time_elapsed         | 765          |
|    total_timesteps      | 60800        |
| train/                  |              |
|    approx_kl            | 0.0012736561 |
|    clip_fraction        | 0.139        |
|    clip_range           | 0.1          |
|    entropy_loss         | 2.65         |
|    explained_variance   | 0.944        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.00685     |
|    n_updates            | 360          |
|    policy_gradient_loss | -0.00666     |
|    std                  | 0.142        |
|    value_loss           | 0.00172      |
------------------------------------------
Eval num_timesteps=64000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=64000, episode_reward=0.60 +/- 0.00
Episode length: 4.00 

Eval num_timesteps=89600, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=89600, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 4            |
|    mean_reward          | 0.605        |
| time/                   |              |
|    fps                  | 78           |
|    iterations           | 28           |
|    time_elapsed         | 1148         |
|    total_timesteps      | 89600        |
| train/                  |              |
|    approx_kl            | 0.0025214092 |
|    clip_fraction        | 0.119        |
|    clip_range           | 0.1          |
|    entropy_loss         | 2.93         |
|    explained_variance   | 0.948        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.0152      |
|    n_updates            | 540          |
|    policy_gradient_loss | -0.0048      |
|    std     

-----------------------------------------
| time/                   |             |
|    fps                  | 78          |
|    iterations           | 37          |
|    time_elapsed         | 1507        |
|    total_timesteps      | 118400      |
| train/                  |             |
|    approx_kl            | 0.002254803 |
|    clip_fraction        | 0.135       |
|    clip_range           | 0.1         |
|    entropy_loss         | 3.19        |
|    explained_variance   | 0.954       |
|    learning_rate        | 5e-05       |
|    loss                 | 0.0084      |
|    n_updates            | 720         |
|    policy_gradient_loss | -0.00554    |
|    std                  | 0.128       |
|    value_loss           | 0.00147     |
-----------------------------------------
Eval num_timesteps=121600, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=121600, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
--------

Eval num_timesteps=147200, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=147200, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 4            |
|    mean_reward          | 0.609        |
| time/                   |              |
|    fps                  | 78           |
|    iterations           | 46           |
|    time_elapsed         | 1886         |
|    total_timesteps      | 147200       |
| train/                  |              |
|    approx_kl            | 0.0033597657 |
|    clip_fraction        | 0.13         |
|    clip_range           | 0.1          |
|    entropy_loss         | 3.45         |
|    explained_variance   | 0.958        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.0184      |
|    n_updates            | 900          |
|    policy_gradient_loss | -0.00408     |
|    std   

------------------------------------------
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 55           |
|    time_elapsed         | 2259         |
|    total_timesteps      | 176000       |
| train/                  |              |
|    approx_kl            | 0.0002719511 |
|    clip_fraction        | 0.12         |
|    clip_range           | 0.1          |
|    entropy_loss         | 3.75         |
|    explained_variance   | 0.963        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.0152      |
|    n_updates            | 1080         |
|    policy_gradient_loss | -0.00324     |
|    std                  | 0.115        |
|    value_loss           | 0.00121      |
------------------------------------------
Eval num_timesteps=179200, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=179200, episode_reward=0.61 +/- 0.00
Episode length: 4.0

Eval num_timesteps=204800, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=204800, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 4            |
|    mean_reward          | 0.61         |
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 64           |
|    time_elapsed         | 2630         |
|    total_timesteps      | 204800       |
| train/                  |              |
|    approx_kl            | 0.0013683971 |
|    clip_fraction        | 0.131        |
|    clip_range           | 0.1          |
|    entropy_loss         | 3.97         |
|    explained_variance   | 0.962        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.018       |
|    n_updates            | 1260         |
|    policy_gradient_loss | -0.00335     |
|    std   

------------------------------------------
| time/                   |              |
|    fps                  | 78           |
|    iterations           | 73           |
|    time_elapsed         | 2984         |
|    total_timesteps      | 233600       |
| train/                  |              |
|    approx_kl            | 0.0035115355 |
|    clip_fraction        | 0.136        |
|    clip_range           | 0.1          |
|    entropy_loss         | 4.19         |
|    explained_variance   | 0.967        |
|    learning_rate        | 5e-05        |
|    loss                 | 0.0217       |
|    n_updates            | 1440         |
|    policy_gradient_loss | -0.00388     |
|    std                  | 0.106        |
|    value_loss           | 0.00111      |
------------------------------------------
Eval num_timesteps=236800, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=236800, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best me

Eval num_timesteps=262400, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=262400, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 4            |
|    mean_reward          | 0.611        |
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 82           |
|    time_elapsed         | 3376         |
|    total_timesteps      | 262400       |
| train/                  |              |
|    approx_kl            | 0.0014431815 |
|    clip_fraction        | 0.132        |
|    clip_range           | 0.1          |
|    entropy_loss         | 4.49         |
|    explained_variance   | 0.968        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.018       |
|    n_updates            | 1620         |
|    polic

------------------------------------------
| time/                   |              |
|    fps                  | 78           |
|    iterations           | 91           |
|    time_elapsed         | 3727         |
|    total_timesteps      | 291200       |
| train/                  |              |
|    approx_kl            | 0.0022251995 |
|    clip_fraction        | 0.143        |
|    clip_range           | 0.1          |
|    entropy_loss         | 4.72         |
|    explained_variance   | 0.967        |
|    learning_rate        | 5e-05        |
|    loss                 | 0.0256       |
|    n_updates            | 1800         |
|    policy_gradient_loss | -0.00343     |
|    std                  | 0.0958       |
|    value_loss           | 0.00113      |
------------------------------------------
Eval num_timesteps=294400, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=294400, episode_reward=0.61 +/- 0.00
Episode length: 4.0

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

seed 3
Box(-100000.0, 100000.0, (9,), float64)
seed 3: model definition ..
Using cuda device
seed 3: learning ..




------------------------------------------
| time/                   |              |
|    fps                  | 259          |
|    iterations           | 1            |
|    time_elapsed         | 12           |
|    total_timesteps      | 3200         |
| train/                  |              |
|    approx_kl            | 0.0025084144 |
|    clip_fraction        | 0.152        |
|    clip_range           | 0.1          |
|    entropy_loss         | 4.82         |
|    explained_variance   | 0.97         |
|    learning_rate        | 5e-05        |
|    loss                 | -0.00575     |
|    n_updates            | 1880         |
|    policy_gradient_loss | -0.00385     |
|    std                  | 0.0942       |
|    value_loss           | 0.00102      |
------------------------------------------
Eval num_timesteps=6400, episode_reward=0.59 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=6400, episode_reward=0.58 +/- 0.00
Episode length: 4.00 +/

Eval num_timesteps=32000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=32000, episode_reward=0.60 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 4            |
|    mean_reward          | 0.604        |
| time/                   |              |
|    fps                  | 80           |
|    iterations           | 10           |
|    time_elapsed         | 399          |
|    total_timesteps      | 32000        |
| train/                  |              |
|    approx_kl            | 0.0036049315 |
|    clip_fraction        | 0.148        |
|    clip_range           | 0.1          |
|    entropy_loss         | 2.46         |
|    explained_variance   | 0.928        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.0152      |
|    n_updates            | 180          |
|    policy_gradient_loss | -0.008

------------------------------------------
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 19           |
|    time_elapsed         | 788          |
|    total_timesteps      | 60800        |
| train/                  |              |
|    approx_kl            | 0.0016256609 |
|    clip_fraction        | 0.132        |
|    clip_range           | 0.1          |
|    entropy_loss         | 2.69         |
|    explained_variance   | 0.938        |
|    learning_rate        | 5e-05        |
|    loss                 | 0.0427       |
|    n_updates            | 360          |
|    policy_gradient_loss | -0.00674     |
|    std                  | 0.141        |
|    value_loss           | 0.00189      |
------------------------------------------
Eval num_timesteps=64000, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=64000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 

Eval num_timesteps=89600, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=89600, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 4            |
|    mean_reward          | 0.608        |
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 28           |
|    time_elapsed         | 1161         |
|    total_timesteps      | 89600        |
| train/                  |              |
|    approx_kl            | 0.0019759065 |
|    clip_fraction        | 0.143        |
|    clip_range           | 0.1          |
|    entropy_loss         | 2.93         |
|    explained_variance   | 0.942        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.0109      |
|    n_updates            | 540          |
|    policy_gradient_loss | -0.00551     |
|    std     

------------------------------------------
| time/                   |              |
|    fps                  | 78           |
|    iterations           | 37           |
|    time_elapsed         | 1517         |
|    total_timesteps      | 118400       |
| train/                  |              |
|    approx_kl            | 0.0042150794 |
|    clip_fraction        | 0.123        |
|    clip_range           | 0.1          |
|    entropy_loss         | 3.18         |
|    explained_variance   | 0.949        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.000601    |
|    n_updates            | 720          |
|    policy_gradient_loss | -0.00465     |
|    std                  | 0.128        |
|    value_loss           | 0.00161      |
------------------------------------------
Eval num_timesteps=121600, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=121600, episode_reward=0.61 +/- 0.00
Episode length: 4.0

Eval num_timesteps=147200, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=147200, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 4            |
|    mean_reward          | 0.61         |
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 46           |
|    time_elapsed         | 1904         |
|    total_timesteps      | 147200       |
| train/                  |              |
|    approx_kl            | 0.0010552302 |
|    clip_fraction        | 0.121        |
|    clip_range           | 0.1          |
|    entropy_loss         | 3.5          |
|    explained_variance   | 0.954        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.0245      |
|    n_updates            | 900          |
|    policy_gradient_loss | -0.00418     |
|    std   

------------------------------------------
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 55           |
|    time_elapsed         | 2274         |
|    total_timesteps      | 176000       |
| train/                  |              |
|    approx_kl            | 0.0015959807 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.1          |
|    entropy_loss         | 3.81         |
|    explained_variance   | 0.958        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.0173      |
|    n_updates            | 1080         |
|    policy_gradient_loss | -0.00306     |
|    std                  | 0.114        |
|    value_loss           | 0.00136      |
------------------------------------------
Eval num_timesteps=179200, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=179200, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
-----------

Eval num_timesteps=204800, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=204800, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 4           |
|    mean_reward          | 0.611       |
| time/                   |             |
|    fps                  | 76          |
|    iterations           | 64          |
|    time_elapsed         | 2665        |
|    total_timesteps      | 204800      |
| train/                  |             |
|    approx_kl            | 0.006549967 |
|    clip_fraction        | 0.13        |
|    clip_range           | 0.1         |
|    entropy_loss         | 4.03        |
|    explained_variance   | 0.961       |
|    learning_rate        | 5e-05       |
|    loss                 | -0.0134     |
|    n_updates            | 1260        |
|    policy_gradient_loss | -0.004      |
|    std

------------------------------------------
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 73           |
|    time_elapsed         | 3025         |
|    total_timesteps      | 233600       |
| train/                  |              |
|    approx_kl            | 0.0031448216 |
|    clip_fraction        | 0.117        |
|    clip_range           | 0.1          |
|    entropy_loss         | 4.26         |
|    explained_variance   | 0.965        |
|    learning_rate        | 5e-05        |
|    loss                 | 0.0177       |
|    n_updates            | 1440         |
|    policy_gradient_loss | -0.00339     |
|    std                  | 0.105        |
|    value_loss           | 0.00121      |
------------------------------------------
Eval num_timesteps=236800, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=236800, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
-----------

Eval num_timesteps=262400, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=262400, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
--------------------------------------------
| eval/                   |                |
|    mean_ep_length       | 4              |
|    mean_reward          | 0.612          |
| time/                   |                |
|    fps                  | 76             |
|    iterations           | 82             |
|    time_elapsed         | 3408           |
|    total_timesteps      | 262400         |
| train/                  |                |
|    approx_kl            | -8.0754235e-06 |
|    clip_fraction        | 0.12           |
|    clip_range           | 0.1            |
|    entropy_loss         | 4.56           |
|    explained_variance   | 0.967          |
|    learning_rate        | 5e-05          |
|    loss                 | 0.0154         |
|    n_updates            | 1620         

------------------------------------------
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 91           |
|    time_elapsed         | 3769         |
|    total_timesteps      | 291200       |
| train/                  |              |
|    approx_kl            | 0.0017943489 |
|    clip_fraction        | 0.132        |
|    clip_range           | 0.1          |
|    entropy_loss         | 4.73         |
|    explained_variance   | 0.967        |
|    learning_rate        | 5e-05        |
|    loss                 | -0.0154      |
|    n_updates            | 1800         |
|    policy_gradient_loss | -0.00259     |
|    std                  | 0.0962       |
|    value_loss           | 0.00113      |
------------------------------------------
Eval num_timesteps=294400, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=294400, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
-----------

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>