In [6]:
import numpy as np
from rl_env_sarl import SARLDraftEnv
import gymnasium as gym
# from stable_baselines3 import PPO
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.env_checker import check_env
from pprint import pprint

In [7]:

    
def learning_rate_schedule(initial_lr=1e-4, final_lr=5e-5):
    """Build a linear learning-rate schedule.

    Args:
        initial_lr: Rate returned when ``progress_remaining`` is 1.
        final_lr: Rate returned when ``progress_remaining`` is 0.

    Returns:
        A one-argument callable mapping ``progress_remaining`` to a learning
        rate interpolated linearly between ``final_lr`` and ``initial_lr``.
        The callable has a descriptive ``__name__`` so that code which
        serialises callables by name records something more useful than
        ``"<lambda>"``.
    """
    def linear_schedule(progress_remaining):
        return progress_remaining * (initial_lr - final_lr) + final_lr

    # Encode the endpoints in the name; the notebook stores callables by __name__.
    linear_schedule.__name__ = f"linear_schedule_{initial_lr:g}_to_{final_lr:g}"
    return linear_schedule

# Learning rate interpolated linearly between 1e-4 (progress_remaining=1) and 5e-5 (progress_remaining=0)
learning_rate_schedule_fn = learning_rate_schedule(initial_lr=1e-4, final_lr=5e-5)

# Keyword arguments unpacked into the RecurrentPPO constructor in the training cell.
ppo_params = {
    "policy": "MlpLstmPolicy",
    "learning_rate": learning_rate_schedule_fn,  # Adaptive learning rate
    "n_steps": 60,  # Adjusted to be a multiple of your sequence length
    # NOTE(review): with a single environment the rollout holds n_steps=60 samples,
    # which is fewer than batch_size; confirm that 256 is the intended minibatch size.
    "batch_size": 256,
    "n_epochs": 15,
    "gamma": 0.99,
    "gae_lambda": 0.95,
    "clip_range": 0.2,
    "clip_range_vf": None,
    "normalize_advantage": True,
    "ent_coef": 0.03,  # Entropy coefficient for the loss calculation
    "vf_coef": 0.5,
    "max_grad_norm": 0.7,
    "use_sde": False,
    "sde_sample_freq": -1,
    "target_kl": 0.01,
    "stats_window_size": 100,
    "policy_kwargs": dict(
        # Pass the pi/vf layout as a dict directly; the list-wrapped form
        # ([dict(pi=..., vf=...)]) is deprecated since SB3 v1.8.
        net_arch=dict(pi=[128, 128], vf=[128, 128]),
        n_lstm_layers=1,  # Number of LSTM layers
        lstm_hidden_size=128  # Size of the LSTM's hidden state
    ),
    "verbose": 1,
    "seed": 42,
    "device": "auto",
    "_init_setup_model": True
}

In [8]:
# Run the Stable-Baselines3 environment checker on a fresh SARLDraftEnv instance before training.
check_env(SARLDraftEnv())

In [9]:
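# Disabled DQN experiment, kept for reference only. It cannot be re-enabled as-is:
# neither `DQN` nor `dqn_params` is imported or defined in the cells shown here.
# env = SARLDraftEnv()
# model = DQN(env=env, **dqn_params)
# model.learn(total_timesteps=100000)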

In [10]:
import json
import os
from datetime import datetime

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback, EvalCallback
from stable_baselines3.common.logger import configure
from stable_baselines3.common.monitor import Monitor

# Define your environment
env = SARLDraftEnv()

# Generate a unique run ID (timestamp) so each run writes to its own directory
run_id = datetime.now().strftime("%Y%m%d-%H%M%S")

# Choose significant parameters to include in the log directory name
# (only consumed by the alternative, commented-out param_str below)
lr_str = ppo_params['learning_rate'].__name__ if callable(ppo_params['learning_rate']) else ppo_params['learning_rate']
gamma_str = ppo_params['gamma'].__name__ if callable(ppo_params['gamma']) else ppo_params['gamma']
param_str = "RNN-PPO"
# param_str = f"PPO_lr_{lr_str}_gamma_{gamma_str}"
log_dir = f"./logs/{param_str}_{run_id}/"
os.makedirs(log_dir, exist_ok=True)

# JSON-safe view of the hyperparameters: callables (e.g. the learning-rate schedule)
# are replaced by their name, falling back to repr() for callables without __name__.
json_params = {
    key: (getattr(value, "__name__", repr(value)) if callable(value) else value)
    for key, value in ppo_params.items()
}

# Save ppo_params to a JSON file
params_save_path = os.path.join(log_dir, f'ppo_params_{run_id}.json')
with open(params_save_path, 'w') as f:
    json.dump(json_params, f, indent=2)

# Configure the logger
new_logger = configure(log_dir, ["stdout", "csv", "tensorboard"])

# Define the PPO model and attach the logger
model = RecurrentPPO(**ppo_params, env=env, tensorboard_log=log_dir)
model.set_logger(new_logger)

# Log ppo_params to TensorBoard/CSV.
# Scalars and None are recorded unchanged; anything else (the policy_kwargs dict,
# the schedule function) is recorded as its string form so it is written as text
# rather than as a raw Python object.
for key, value in ppo_params.items():
    is_plain = value is None or isinstance(value, (bool, int, float, str))
    model.logger.record(f'params/{key}', value if is_plain else str(value))


# Custom callback for monitoring and saving during training
class TrainingMonitorCallback(BaseCallback):
    """Track per-step rewards, keep the best-scoring model, and archive drafts.

    On every step the callback appends the step reward (averaged over
    environments) to ``rewards_history``. Every ``log_interval`` calls it
    records the mean of the last ``log_interval`` step rewards under the
    ``mean_reward`` logger key, saves the model when that mean is a new
    maximum, and rewrites the rewards/losses JSON file. When an episode ends,
    every ``draft_save_interval``-th episode's ``info['draft']`` is appended
    to the draft history JSON file.

    Args:
        log_interval: Number of callback calls between reward summaries.
        save_interval: Stored on the instance; not used by this callback.
        draft_save_interval: Save the draft of every N-th finished episode.
        log_dir: Directory receiving the model and JSON artifacts.
        run_id: Suffix embedded in every artifact file name.
        verbose: Print a message on model saves / skipped drafts when > 0.

    Raises:
        ValueError: If ``log_interval`` or ``draft_save_interval`` is not positive.
    """

    def __init__(self, log_interval: int, save_interval: int, draft_save_interval: int, log_dir: str, run_id: str, verbose=1):
        super().__init__(verbose)
        # Both values are used as modulo divisors in _on_step, so fail fast here.
        if log_interval <= 0:
            raise ValueError(f"log_interval must be positive, got {log_interval}")
        if draft_save_interval <= 0:
            raise ValueError(f"draft_save_interval must be positive, got {draft_save_interval}")

        self.log_interval = log_interval
        self.save_interval = save_interval
        self.draft_save_interval = draft_save_interval
        self.log_dir = log_dir
        self.run_id = run_id
        self.best_mean_reward = -float("inf")
        self.rewards_history = []  # one plain float per callback call
        self.losses_history = []   # one plain float per call that exposed a 'loss' local
        self.episode_counter = 0
        self.draft_history = []  # List to store draft dicts

        self.model_save_path = os.path.join(log_dir, f'best_model_{run_id}')
        self.rewards_losses_path = os.path.join(log_dir, f'rewards_losses_{run_id}.json')
        self.drafts_save_path = os.path.join(log_dir, f'draft_history_{run_id}.json')

    @staticmethod
    def _as_float(value) -> float:
        """Convert a scalar or single-element numpy array to a plain float."""
        return float(value.item() if isinstance(value, np.ndarray) else value)

    def _init_callback(self) -> None:
        os.makedirs(self.log_dir, exist_ok=True)

    def _on_step(self) -> bool:
        # Store the reward as a JSON-ready float. Averaging over the env axis
        # equals the raw reward for a single env and also works with several.
        self.rewards_history.append(float(np.mean(self.locals['rewards'])))

        # Log losses only if the training loop exposes a 'loss' local at this point
        if 'loss' in self.locals:
            self.losses_history.append(self._as_float(self.locals['loss']))

        # Periodic logging and saving
        if self.n_calls % self.log_interval == 0:
            mean_reward = float(np.mean(self.rewards_history[-self.log_interval:]))
            self.logger.record('mean_reward', mean_reward)

            if mean_reward > self.best_mean_reward:
                self.best_mean_reward = mean_reward
                self.model.save(self.model_save_path)
                if self.verbose > 0:
                    print(f"New best mean reward: {self.best_mean_reward}. Model saved to {self.model_save_path}")

            # Histories already hold plain floats, so they can be dumped directly
            with open(self.rewards_losses_path, 'w') as f:
                json.dump({'rewards': self.rewards_history, 'losses': self.losses_history}, f)

        # Handle every environment whose episode ended on this step
        for done, info in zip(self.locals['dones'], self.locals['infos']):
            if not done:
                continue

            if self.episode_counter % self.draft_save_interval == 0:
                draft = info.get('draft')
                if draft is None:
                    # Do not abort training because one info dict lacks the optional payload
                    if self.verbose > 0:
                        print(f"Episode {self.episode_counter}: no 'draft' entry in info; draft not saved")
                else:
                    self.draft_history.append(draft)
                    with open(self.drafts_save_path, 'w') as f:
                        json.dump(self.draft_history, f, indent=4)

            self.episode_counter += 1

        return True

# Initialize and use the custom callback
# Initialize and use the custom callback
monitor_callback = TrainingMonitorCallback(log_interval=1000, save_interval=50, draft_save_interval=250, log_dir=log_dir, run_id=run_id)

# Create a checkpoint callback to save the model periodically
checkpoint_callback = CheckpointCallback(save_freq=5000, save_path=log_dir, name_prefix='ppo_model')

# Create an evaluation callback to log evaluations during training.
# Evaluation gets its own environment instance: reusing the training `env` would let
# evaluation episodes reset and step the very object the training rollout is in the
# middle of. Monitor wraps it so episode statistics are recorded for evaluation.
eval_env = Monitor(SARLDraftEnv())
eval_callback = EvalCallback(eval_env, best_model_save_path=log_dir, log_path=log_dir, eval_freq=5000, deterministic=True, render=False)

# Combine all callbacks
callback = [checkpoint_callback, eval_callback, monitor_callback]

# Train the model (total_timesteps is an integer step budget, so avoid the float literal 2e6)
model.learn(total_timesteps=2_000_000, callback=callback)

Logging to ./logs/RNN-PPO_20240827-013859/
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




-----------------------------------------------------------------
| params/                |                                      |
|    _init_setup_model   | True                                 |
|    batch_size          | 256                                  |
|    clip_range          | 0.2                                  |
|    clip_range_vf       | None                                 |
|    device              | auto                                 |
|    ent_coef            | 0.03                                 |
|    gae_lambda          | 0.95                                 |
|    gamma               | 0.99                                 |
|    learning_rate       | <function learning_rate_schedule.... |
|    max_grad_norm       | 0.7                                  |
|    n_epochs            | 15                                   |
|    n_steps             | 60                                   |
|    normalize_advantage | True                                 |
|    polic



Eval num_timesteps=5000, episode_reward=-0.84 +/- 0.00
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | -0.841      |
| time/                   |             |
|    total_timesteps      | 5000        |
| train/                  |             |
|    approx_kl            | 0.009225871 |
|    clip_fraction        | 0.0189      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.62       |
|    explained_variance   | 0.607       |
|    learning_rate        | 9.99e-05    |
|    loss                 | -0.067      |
|    n_updates            | 1245        |
|    policy_gradient_loss | -0.0123     |
|    value_loss           | 0.0637      |
-----------------------------------------
New best mean reward!
New best mean reward: -0.012029980309307575. Model saved to ./logs/RNN-PPO_20240827-013859/best_model_20240827-013859
------------------



Eval num_timesteps=10000, episode_reward=-0.85 +/- 0.00
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | -0.851      |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.009521178 |
|    clip_fraction        | 0.0333      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.47       |
|    explained_variance   | -0.0093     |
|    learning_rate        | 9.98e-05    |
|    loss                 | 0.0093      |
|    n_updates            | 2490        |
|    policy_gradient_loss | -0.0114     |
|    value_loss           | 0.199       |
-----------------------------------------
-------------------------------------
| mean_reward        | -0.031597365 |
| rollout/           |              |
|    ep_len_mean     | 15.1         |
|    



Eval num_timesteps=15000, episode_reward=-0.83 +/- 0.00
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | -0.834      |
| time/                   |             |
|    total_timesteps      | 15000       |
| train/                  |             |
|    approx_kl            | 0.008334978 |
|    clip_fraction        | 0.0611      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.15       |
|    explained_variance   | 0.489       |
|    learning_rate        | 9.96e-05    |
|    loss                 | -0.0342     |
|    n_updates            | 3735        |
|    policy_gradient_loss | -0.00825    |
|    value_loss           | 0.0845      |
-----------------------------------------
New best mean reward!
-------------------------------------
| mean_reward        | -0.017898196 |
| rollout/           |              |
|    ep_len_mean     



Eval num_timesteps=20000, episode_reward=-0.84 +/- 0.00
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | -0.836      |
| time/                   |             |
|    total_timesteps      | 20000       |
| train/                  |             |
|    approx_kl            | 0.015927542 |
|    clip_fraction        | 0.0909      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.54       |
|    explained_variance   | -0.0251     |
|    learning_rate        | 9.95e-05    |
|    loss                 | 0.0694      |
|    n_updates            | 4995        |
|    policy_gradient_loss | -0.0228     |
|    value_loss           | 0.324       |
-----------------------------------------
-------------------------------------
| mean_reward        | -0.029303767 |
| rollout/           |              |
|    ep_len_mean     | 15.1         |
|    



Eval num_timesteps=25000, episode_reward=-0.84 +/- 0.00
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | -0.838      |
| time/                   |             |
|    total_timesteps      | 25000       |
| train/                  |             |
|    approx_kl            | 0.016921887 |
|    clip_fraction        | 0.0528      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.43       |
|    explained_variance   | 0.0453      |
|    learning_rate        | 9.94e-05    |
|    loss                 | 0.0367      |
|    n_updates            | 6240        |
|    policy_gradient_loss | -0.0212     |
|    value_loss           | 0.252       |
-----------------------------------------
-------------------------------------
| mean_reward        | -0.019364154 |
| rollout/           |              |
|    ep_len_mean     | 15.1         |
|    



Eval num_timesteps=30000, episode_reward=-0.80 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | -0.802      |
| time/                   |             |
|    total_timesteps      | 30000       |
| train/                  |             |
|    approx_kl            | 0.018381042 |
|    clip_fraction        | 0.0583      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.36       |
|    explained_variance   | 0.0177      |
|    learning_rate        | 9.93e-05    |
|    loss                 | 0.0544      |
|    n_updates            | 7485        |
|    policy_gradient_loss | -0.0137     |
|    value_loss           | 0.27        |
-----------------------------------------
New best mean reward!
-------------------------------------
| mean_reward        | -0.011595634 |
| rollout/           |              |
|    ep_len_mean     



Eval num_timesteps=35000, episode_reward=-0.81 +/- 0.00
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | -0.815      |
| time/                   |             |
|    total_timesteps      | 35000       |
| train/                  |             |
|    approx_kl            | 0.018518217 |
|    clip_fraction        | 0.0917      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.4        |
|    explained_variance   | 0.0614      |
|    learning_rate        | 9.91e-05    |
|    loss                 | 0.0356      |
|    n_updates            | 8745        |
|    policy_gradient_loss | -0.00957    |
|    value_loss           | 0.184       |
-----------------------------------------
-------------------------------------
| mean_reward        | -0.024669282 |
| rollout/           |              |
|    ep_len_mean     | 15.1         |
|    



Eval num_timesteps=40000, episode_reward=-0.80 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | -0.797      |
| time/                   |             |
|    total_timesteps      | 40000       |
| train/                  |             |
|    approx_kl            | 0.015203836 |
|    clip_fraction        | 0.0689      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.53       |
|    explained_variance   | 0.13        |
|    learning_rate        | 9.9e-05     |
|    loss                 | -0.0111     |
|    n_updates            | 9990        |
|    policy_gradient_loss | -0.024      |
|    value_loss           | 0.175       |
-----------------------------------------
New best mean reward!
-------------------------------------
| mean_reward        | -0.009450016 |
| rollout/           |              |
|    ep_len_mean     



Eval num_timesteps=45000, episode_reward=-0.79 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | -0.794      |
| time/                   |             |
|    total_timesteps      | 45000       |
| train/                  |             |
|    approx_kl            | 0.017167214 |
|    clip_fraction        | 0.0704      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.34       |
|    explained_variance   | 0.0722      |
|    learning_rate        | 9.89e-05    |
|    loss                 | 0.0287      |
|    n_updates            | 11235       |
|    policy_gradient_loss | -0.0211     |
|    value_loss           | 0.232       |
-----------------------------------------
New best mean reward!
New best mean reward: 0.0027143375482410192. Model saved to ./logs/RNN-PPO_20240827-013859/best_model_20240827-013859
-----------------



Eval num_timesteps=50000, episode_reward=-0.71 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | -0.706      |
| time/                   |             |
|    total_timesteps      | 50000       |
| train/                  |             |
|    approx_kl            | 0.016819786 |
|    clip_fraction        | 0.0688      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.41       |
|    explained_variance   | 0.0234      |
|    learning_rate        | 9.88e-05    |
|    loss                 | 0.0375      |
|    n_updates            | 12495       |
|    policy_gradient_loss | -0.0189     |
|    value_loss           | 0.228       |
-----------------------------------------
New best mean reward!
-------------------------------------
| mean_reward        | -0.004246942 |
| rollout/           |              |
|    ep_len_mean     



Eval num_timesteps=55000, episode_reward=-0.81 +/- 0.03
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | -0.813      |
| time/                   |             |
|    total_timesteps      | 55000       |
| train/                  |             |
|    approx_kl            | 0.016387586 |
|    clip_fraction        | 0.0833      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.47       |
|    explained_variance   | 0.00169     |
|    learning_rate        | 9.86e-05    |
|    loss                 | 0.0386      |
|    n_updates            | 13740       |
|    policy_gradient_loss | -0.0123     |
|    value_loss           | 0.207       |
-----------------------------------------
-------------------------------------
| mean_reward        | -0.013918571 |
| rollout/           |              |
|    ep_len_mean     | 15.1         |
|    



Eval num_timesteps=60000, episode_reward=-0.16 +/- 0.70
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | -0.161      |
| time/                   |             |
|    total_timesteps      | 60000       |
| train/                  |             |
|    approx_kl            | 0.016279114 |
|    clip_fraction        | 0.0903      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.51       |
|    explained_variance   | 0.134       |
|    learning_rate        | 9.85e-05    |
|    loss                 | -0.00137    |
|    n_updates            | 14985       |
|    policy_gradient_loss | -0.0233     |
|    value_loss           | 0.186       |
-----------------------------------------
New best mean reward!
-------------------------------------
| mean_reward        | -0.012370598 |
| rollout/           |              |
|    ep_len_mean     



Eval num_timesteps=65000, episode_reward=-0.75 +/- 0.00
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | -0.745      |
| time/                   |             |
|    total_timesteps      | 65000       |
| train/                  |             |
|    approx_kl            | 0.011135856 |
|    clip_fraction        | 0.148       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.42       |
|    explained_variance   | -0.0207     |
|    learning_rate        | 9.84e-05    |
|    loss                 | 0.0479      |
|    n_updates            | 16245       |
|    policy_gradient_loss | -0.0147     |
|    value_loss           | 0.28        |
-----------------------------------------
-------------------------------------
| mean_reward        | -0.010052225 |
| rollout/           |              |
|    ep_len_mean     | 15.1         |
|    



Eval num_timesteps=70000, episode_reward=0.75 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.75        |
| time/                   |             |
|    total_timesteps      | 70000       |
| train/                  |             |
|    approx_kl            | 0.016923292 |
|    clip_fraction        | 0.075       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.52       |
|    explained_variance   | 0.573       |
|    learning_rate        | 9.83e-05    |
|    loss                 | -0.00795    |
|    n_updates            | 17490       |
|    policy_gradient_loss | -0.0195     |
|    value_loss           | 0.139       |
-----------------------------------------
New best mean reward!
-------------------------------------
| mean_reward        | -0.017726794 |
| rollout/           |              |
|    ep_len_mean     |



Eval num_timesteps=75000, episode_reward=0.48 +/- 0.56
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.483       |
| time/                   |             |
|    total_timesteps      | 75000       |
| train/                  |             |
|    approx_kl            | 0.018302644 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | -1.74       |
|    learning_rate        | 9.81e-05    |
|    loss                 | 0.0691      |
|    n_updates            | 18735       |
|    policy_gradient_loss | -0.011      |
|    value_loss           | 0.255       |
-----------------------------------------
------------------------------------
| mean_reward        | 0.008360417 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_re



Eval num_timesteps=80000, episode_reward=0.49 +/- 0.57
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.488       |
| time/                   |             |
|    total_timesteps      | 80000       |
| train/                  |             |
|    approx_kl            | 0.016105361 |
|    clip_fraction        | 0.0833      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.35       |
|    explained_variance   | 0.052       |
|    learning_rate        | 9.8e-05     |
|    loss                 | 0.0642      |
|    n_updates            | 19995       |
|    policy_gradient_loss | -0.0128     |
|    value_loss           | 0.262       |
-----------------------------------------
New best mean reward: 0.016253871843218803. Model saved to ./logs/RNN-PPO_20240827-013859/best_model_20240827-013859
------------------------------------
| me



Eval num_timesteps=85000, episode_reward=0.17 +/- 0.70
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.175       |
| time/                   |             |
|    total_timesteps      | 85000       |
| train/                  |             |
|    approx_kl            | 0.017470237 |
|    clip_fraction        | 0.0958      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.43       |
|    explained_variance   | 0.0412      |
|    learning_rate        | 9.79e-05    |
|    loss                 | 0.0328      |
|    n_updates            | 21240       |
|    policy_gradient_loss | -0.0228     |
|    value_loss           | 0.231       |
-----------------------------------------
-----------------------------------
| mean_reward        | 0.01004201 |
| rollout/           |            |
|    ep_len_mean     | 15.1       |
|    ep_rew_me



Eval num_timesteps=90000, episode_reward=0.71 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.715       |
| time/                   |             |
|    total_timesteps      | 90000       |
| train/                  |             |
|    approx_kl            | 0.015013791 |
|    clip_fraction        | 0.0636      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.42       |
|    explained_variance   | 0.0054      |
|    learning_rate        | 9.78e-05    |
|    loss                 | 0.018       |
|    n_updates            | 22485       |
|    policy_gradient_loss | -0.0187     |
|    value_loss           | 0.22        |
-----------------------------------------
--------------------------------------
| mean_reward        | -0.0013273438 |
| rollout/           |               |
|    ep_len_mean     | 15            |
| 



Eval num_timesteps=95000, episode_reward=0.15 +/- 0.69
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.151       |
| time/                   |             |
|    total_timesteps      | 95000       |
| train/                  |             |
|    approx_kl            | 0.015621218 |
|    clip_fraction        | 0.0857      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.37       |
|    explained_variance   | 0.0281      |
|    learning_rate        | 9.76e-05    |
|    loss                 | 0.0229      |
|    n_updates            | 23745       |
|    policy_gradient_loss | -0.0171     |
|    value_loss           | 0.196       |
-----------------------------------------
------------------------------------
| mean_reward        | 0.012495104 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_re



Eval num_timesteps=100000, episode_reward=0.72 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.721       |
| time/                   |             |
|    total_timesteps      | 100000      |
| train/                  |             |
|    approx_kl            | 0.015214946 |
|    clip_fraction        | 0.0845      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.22       |
|    explained_variance   | 0.0561      |
|    learning_rate        | 9.75e-05    |
|    loss                 | 0.00734     |
|    n_updates            | 24990       |
|    policy_gradient_loss | -0.0355     |
|    value_loss           | 0.235       |
-----------------------------------------
------------------------------------
| mean_reward        | 0.020698927 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=105000, episode_reward=0.48 +/- 0.57
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.476       |
| time/                   |             |
|    total_timesteps      | 105000      |
| train/                  |             |
|    approx_kl            | 0.013363501 |
|    clip_fraction        | 0.0856      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.22       |
|    explained_variance   | -0.774      |
|    learning_rate        | 9.74e-05    |
|    loss                 | -0.0595     |
|    n_updates            | 26235       |
|    policy_gradient_loss | -0.0293     |
|    value_loss           | 0.0947      |
-----------------------------------------
New best mean reward: 0.02434726059436798. Model saved to ./logs/RNN-PPO_20240827-013859/best_model_20240827-013859
-----------------------------------
| mea



Eval num_timesteps=110000, episode_reward=0.74 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.739       |
| time/                   |             |
|    total_timesteps      | 110000      |
| train/                  |             |
|    approx_kl            | 0.016177848 |
|    clip_fraction        | 0.0667      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.3        |
|    explained_variance   | 0.0588      |
|    learning_rate        | 9.73e-05    |
|    loss                 | 0.0173      |
|    n_updates            | 27495       |
|    policy_gradient_loss | -0.0194     |
|    value_loss           | 0.184       |
-----------------------------------------
------------------------------------
| mean_reward        | 0.026456166 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=115000, episode_reward=0.70 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.697       |
| time/                   |             |
|    total_timesteps      | 115000      |
| train/                  |             |
|    approx_kl            | 0.016567213 |
|    clip_fraction        | 0.104       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.23       |
|    explained_variance   | 0.0675      |
|    learning_rate        | 9.71e-05    |
|    loss                 | -0.00445    |
|    n_updates            | 28740       |
|    policy_gradient_loss | -0.0254     |
|    value_loss           | 0.18        |
-----------------------------------------
------------------------------------
| mean_reward        | 0.031235911 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=120000, episode_reward=0.74 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.741       |
| time/                   |             |
|    total_timesteps      | 120000      |
| train/                  |             |
|    approx_kl            | 0.017312564 |
|    clip_fraction        | 0.0667      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.24       |
|    explained_variance   | 0.0859      |
|    learning_rate        | 9.7e-05     |
|    loss                 | 0.104       |
|    n_updates            | 29985       |
|    policy_gradient_loss | -0.0196     |
|    value_loss           | 0.366       |
-----------------------------------------
------------------------------------
| mean_reward        | 0.035356134 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_r



Eval num_timesteps=125000, episode_reward=0.76 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.756       |
| time/                   |             |
|    total_timesteps      | 125000      |
| train/                  |             |
|    approx_kl            | 0.022602983 |
|    clip_fraction        | 0.133       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.22       |
|    explained_variance   | 0.934       |
|    learning_rate        | 9.69e-05    |
|    loss                 | -0.0597     |
|    n_updates            | 31245       |
|    policy_gradient_loss | -0.0154     |
|    value_loss           | 0.000668    |
-----------------------------------------
New best mean reward!
New best mean reward: 0.04821927472949028. Model saved to ./logs/RNN-PPO_20240827-013859/best_model_20240827-013859
-------------------



Eval num_timesteps=130000, episode_reward=0.77 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.775       |
| time/                   |             |
|    total_timesteps      | 130000      |
| train/                  |             |
|    approx_kl            | 0.015329361 |
|    clip_fraction        | 0.0667      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.11       |
|    explained_variance   | 0.64        |
|    learning_rate        | 9.68e-05    |
|    loss                 | -0.0492     |
|    n_updates            | 32490       |
|    policy_gradient_loss | -0.014      |
|    value_loss           | 0.00834     |
-----------------------------------------
New best mean reward!
------------------------------------
| mean_reward        | 0.038011156 |
| rollout/           |             |
|    ep_len_mean     | 1



Eval num_timesteps=135000, episode_reward=0.51 +/- 0.57
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.506       |
| time/                   |             |
|    total_timesteps      | 135000      |
| train/                  |             |
|    approx_kl            | 0.017555205 |
|    clip_fraction        | 0.0952      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.07       |
|    explained_variance   | 0.0984      |
|    learning_rate        | 9.66e-05    |
|    loss                 | 0.0491      |
|    n_updates            | 33735       |
|    policy_gradient_loss | -0.017      |
|    value_loss           | 0.231       |
-----------------------------------------
------------------------------------
| mean_reward        | 0.047189426 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_r



Eval num_timesteps=140000, episode_reward=0.79 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.793       |
| time/                   |             |
|    total_timesteps      | 140000      |
| train/                  |             |
|    approx_kl            | 0.017095935 |
|    clip_fraction        | 0.126       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.04       |
|    explained_variance   | 0.955       |
|    learning_rate        | 9.65e-05    |
|    loss                 | -0.0872     |
|    n_updates            | 34995       |
|    policy_gradient_loss | -0.026      |
|    value_loss           | 0.00193     |
-----------------------------------------
New best mean reward!
------------------------------------
| mean_reward        | 0.047201715 |
| rollout/           |             |
|    ep_len_mean     | 1



Eval num_timesteps=145000, episode_reward=0.78 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.777       |
| time/                   |             |
|    total_timesteps      | 145000      |
| train/                  |             |
|    approx_kl            | 0.015304471 |
|    clip_fraction        | 0.05        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.896      |
|    explained_variance   | 0.973       |
|    learning_rate        | 9.64e-05    |
|    loss                 | -0.0586     |
|    n_updates            | 36240       |
|    policy_gradient_loss | -0.011      |
|    value_loss           | 0.000306    |
-----------------------------------------
-----------------------------------
| mean_reward        | 0.04772204 |
| rollout/           |            |
|    ep_len_mean     | 15.1       |
|    ep_rew_m



Eval num_timesteps=150000, episode_reward=0.79 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.789       |
| time/                   |             |
|    total_timesteps      | 150000      |
| train/                  |             |
|    approx_kl            | 0.020662574 |
|    clip_fraction        | 0.0952      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.806      |
|    explained_variance   | 0.955       |
|    learning_rate        | 9.63e-05    |
|    loss                 | -0.0498     |
|    n_updates            | 37485       |
|    policy_gradient_loss | -0.0112     |
|    value_loss           | 0.00355     |
-----------------------------------------
------------------------------------
| mean_reward        | 0.050837547 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_r



Eval num_timesteps=155000, episode_reward=0.80 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.804       |
| time/                   |             |
|    total_timesteps      | 155000      |
| train/                  |             |
|    approx_kl            | 0.018647186 |
|    clip_fraction        | 0.0864      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.817      |
|    explained_variance   | 0.971       |
|    learning_rate        | 9.61e-05    |
|    loss                 | -0.0577     |
|    n_updates            | 38745       |
|    policy_gradient_loss | -0.0153     |
|    value_loss           | 0.000327    |
-----------------------------------------
New best mean reward!
------------------------------------
| mean_reward        | 0.052301735 |
| rollout/           |             |
|    ep_len_mean     | 1



Eval num_timesteps=160000, episode_reward=0.80 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.797       |
| time/                   |             |
|    total_timesteps      | 160000      |
| train/                  |             |
|    approx_kl            | 0.016670043 |
|    clip_fraction        | 0.0722      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.745      |
|    explained_variance   | 0.969       |
|    learning_rate        | 9.6e-05     |
|    loss                 | -0.0407     |
|    n_updates            | 39990       |
|    policy_gradient_loss | -0.00968    |
|    value_loss           | 0.000412    |
-----------------------------------------
-----------------------------------
| mean_reward        | 0.05110791 |
| rollout/           |            |
|    ep_len_mean     | 15.1       |
|    ep_rew_m



Eval num_timesteps=165000, episode_reward=0.80 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.804       |
| time/                   |             |
|    total_timesteps      | 165000      |
| train/                  |             |
|    approx_kl            | 0.018305065 |
|    clip_fraction        | 0.104       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.602      |
|    explained_variance   | 0.964       |
|    learning_rate        | 9.59e-05    |
|    loss                 | -0.0495     |
|    n_updates            | 41235       |
|    policy_gradient_loss | -0.0191     |
|    value_loss           | 0.000521    |
-----------------------------------------
New best mean reward!
------------------------------------
| mean_reward        | 0.052921135 |
| rollout/           |             |
|    ep_len_mean     | 1



Eval num_timesteps=170000, episode_reward=0.83 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.833       |
| time/                   |             |
|    total_timesteps      | 170000      |
| train/                  |             |
|    approx_kl            | 0.025302988 |
|    clip_fraction        | 0.108       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.699      |
|    explained_variance   | 0.968       |
|    learning_rate        | 9.58e-05    |
|    loss                 | -0.0542     |
|    n_updates            | 42495       |
|    policy_gradient_loss | -0.0183     |
|    value_loss           | 0.000354    |
-----------------------------------------
New best mean reward!
------------------------------------
| mean_reward        | 0.053563513 |
| rollout/           |             |
|    ep_len_mean     | 1



Eval num_timesteps=175000, episode_reward=0.81 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.806       |
| time/                   |             |
|    total_timesteps      | 175000      |
| train/                  |             |
|    approx_kl            | 0.016190315 |
|    clip_fraction        | 0.0917      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.617      |
|    explained_variance   | 0.982       |
|    learning_rate        | 9.56e-05    |
|    loss                 | -0.0478     |
|    n_updates            | 43740       |
|    policy_gradient_loss | -0.0151     |
|    value_loss           | 0.000236    |
-----------------------------------------
New best mean reward: 0.05385275185108185. Model saved to ./logs/RNN-PPO_20240827-013859/best_model_20240827-013859
-----------------------------------
| mea



Eval num_timesteps=180000, episode_reward=0.82 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.822       |
| time/                   |             |
|    total_timesteps      | 180000      |
| train/                  |             |
|    approx_kl            | 0.015534119 |
|    clip_fraction        | 0.0643      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.577      |
|    explained_variance   | 0.991       |
|    learning_rate        | 9.55e-05    |
|    loss                 | -0.0482     |
|    n_updates            | 44985       |
|    policy_gradient_loss | -0.017      |
|    value_loss           | 0.000126    |
-----------------------------------------
New best mean reward: 0.05416392534971237. Model saved to ./logs/RNN-PPO_20240827-013859/best_model_20240827-013859
------------------------------------
| me



Eval num_timesteps=185000, episode_reward=0.81 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.811       |
| time/                   |             |
|    total_timesteps      | 185000      |
| train/                  |             |
|    approx_kl            | 0.018124117 |
|    clip_fraction        | 0.117       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.503      |
|    explained_variance   | 0.993       |
|    learning_rate        | 9.54e-05    |
|    loss                 | -0.0349     |
|    n_updates            | 46245       |
|    policy_gradient_loss | -0.0139     |
|    value_loss           | 0.000104    |
-----------------------------------------
New best mean reward: 0.05424559861421585. Model saved to ./logs/RNN-PPO_20240827-013859/best_model_20240827-013859
----------------------------------
| mean



Eval num_timesteps=190000, episode_reward=0.81 +/- 0.02
Episode length: 15.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 15         |
|    mean_reward          | 0.807      |
| time/                   |            |
|    total_timesteps      | 190000     |
| train/                  |            |
|    approx_kl            | 0.01657237 |
|    clip_fraction        | 0.0583     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.499     |
|    explained_variance   | 0.984      |
|    learning_rate        | 9.53e-05   |
|    loss                 | -0.024     |
|    n_updates            | 47490      |
|    policy_gradient_loss | -0.004     |
|    value_loss           | 0.000302   |
----------------------------------------
------------------------------------
| mean_reward        | 0.052921835 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_rew_mean     | 0.80



Eval num_timesteps=195000, episode_reward=0.82 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.822       |
| time/                   |             |
|    total_timesteps      | 195000      |
| train/                  |             |
|    approx_kl            | 0.022639316 |
|    clip_fraction        | 0.0778      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.419      |
|    explained_variance   | 0.989       |
|    learning_rate        | 9.51e-05    |
|    loss                 | -0.0482     |
|    n_updates            | 48735       |
|    policy_gradient_loss | -0.0196     |
|    value_loss           | 0.000142    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.054357056 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_r



Eval num_timesteps=200000, episode_reward=0.84 +/- 0.00
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.839       |
| time/                   |             |
|    total_timesteps      | 200000      |
| train/                  |             |
|    approx_kl            | 0.016079234 |
|    clip_fraction        | 0.112       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.395      |
|    explained_variance   | 0.984       |
|    learning_rate        | 9.5e-05     |
|    loss                 | -0.0268     |
|    n_updates            | 49995       |
|    policy_gradient_loss | -0.0119     |
|    value_loss           | 0.013       |
-----------------------------------------
New best mean reward!
------------------------------------
| mean_reward        | 0.052859437 |
| rollout/           |             |
|    ep_len_mean     | 1



Eval num_timesteps=205000, episode_reward=0.82 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.823       |
| time/                   |             |
|    total_timesteps      | 205000      |
| train/                  |             |
|    approx_kl            | 0.015321744 |
|    clip_fraction        | 0.0796      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.36       |
|    explained_variance   | 0.996       |
|    learning_rate        | 9.49e-05    |
|    loss                 | -0.0446     |
|    n_updates            | 51240       |
|    policy_gradient_loss | -0.0147     |
|    value_loss           | 5.04e-05    |
-----------------------------------------
New best mean reward: 0.054848723113536835. Model saved to ./logs/RNN-PPO_20240827-013859/best_model_20240827-013859
------------------------------------
| m



Eval num_timesteps=210000, episode_reward=0.84 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.836       |
| time/                   |             |
|    total_timesteps      | 210000      |
| train/                  |             |
|    approx_kl            | 0.015412465 |
|    clip_fraction        | 0.081       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.355      |
|    explained_variance   | 0.983       |
|    learning_rate        | 9.48e-05    |
|    loss                 | -0.00928    |
|    n_updates            | 52485       |
|    policy_gradient_loss | 0.00638     |
|    value_loss           | 0.000222    |
-----------------------------------------
-----------------------------------
| mean_reward        | 0.05478273 |
| rollout/           |            |
|    ep_len_mean     | 15         |
|    ep_rew_m



Eval num_timesteps=215000, episode_reward=0.84 +/- 0.01
Episode length: 15.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 15           |
|    mean_reward          | 0.839        |
| time/                   |              |
|    total_timesteps      | 215000       |
| train/                  |              |
|    approx_kl            | 0.0156054245 |
|    clip_fraction        | 0.0767       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.409       |
|    explained_variance   | 0.997        |
|    learning_rate        | 9.46e-05     |
|    loss                 | -0.0328      |
|    n_updates            | 53745        |
|    policy_gradient_loss | -0.00731     |
|    value_loss           | 7.47e-05     |
------------------------------------------
New best mean reward!
New best mean reward: 0.05509958043694496. Model saved to ./logs/RNN-PPO_20240827-013859/best_model_20240827-013859
-



Eval num_timesteps=220000, episode_reward=0.83 +/- 0.01
Episode length: 15.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 15         |
|    mean_reward          | 0.831      |
| time/                   |            |
|    total_timesteps      | 220000     |
| train/                  |            |
|    approx_kl            | 0.01508276 |
|    clip_fraction        | 0.1        |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.367     |
|    explained_variance   | 0.998      |
|    learning_rate        | 9.45e-05   |
|    loss                 | -0.00649   |
|    n_updates            | 54990      |
|    policy_gradient_loss | -0.000247  |
|    value_loss           | 3.43e-05   |
----------------------------------------
------------------------------------
| mean_reward        | 0.055053193 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_rew_mean     | 0.82



Eval num_timesteps=225000, episode_reward=0.82 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.818       |
| time/                   |             |
|    total_timesteps      | 225000      |
| train/                  |             |
|    approx_kl            | 0.017144315 |
|    clip_fraction        | 0.0833      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.273      |
|    explained_variance   | 0.99        |
|    learning_rate        | 9.44e-05    |
|    loss                 | -0.0234     |
|    n_updates            | 56235       |
|    policy_gradient_loss | -0.0126     |
|    value_loss           | 0.000198    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.055065323 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_r



Eval num_timesteps=230000, episode_reward=0.81 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.81        |
| time/                   |             |
|    total_timesteps      | 230000      |
| train/                  |             |
|    approx_kl            | 0.020287748 |
|    clip_fraction        | 0.0524      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.294      |
|    explained_variance   | 0.976       |
|    learning_rate        | 9.43e-05    |
|    loss                 | -0.0242     |
|    n_updates            | 57495       |
|    policy_gradient_loss | -0.00998    |
|    value_loss           | 0.000446    |
-----------------------------------------
-----------------------------------
| mean_reward        | 0.05505064 |
| rollout/           |            |
|    ep_len_mean     | 15.1       |
|    ep_rew_m



Eval num_timesteps=235000, episode_reward=0.83 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.83        |
| time/                   |             |
|    total_timesteps      | 235000      |
| train/                  |             |
|    approx_kl            | 0.019396283 |
|    clip_fraction        | 0.0611      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.314      |
|    explained_variance   | 0.975       |
|    learning_rate        | 9.41e-05    |
|    loss                 | -0.0368     |
|    n_updates            | 58740       |
|    policy_gradient_loss | -0.0172     |
|    value_loss           | 0.000327    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.054578666 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=240000, episode_reward=0.83 +/- 0.02
Episode length: 15.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 15         |
|    mean_reward          | 0.826      |
| time/                   |            |
|    total_timesteps      | 240000     |
| train/                  |            |
|    approx_kl            | 0.01800988 |
|    clip_fraction        | 0.0569     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.23      |
|    explained_variance   | 0.991      |
|    learning_rate        | 9.4e-05    |
|    loss                 | -0.0277    |
|    n_updates            | 59985      |
|    policy_gradient_loss | -0.0115    |
|    value_loss           | 0.000158   |
----------------------------------------
------------------------------------
| mean_reward        | 0.055020433 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_rew_mean     | 0.82



Eval num_timesteps=245000, episode_reward=0.83 +/- 0.00
Episode length: 15.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 15         |
|    mean_reward          | 0.832      |
| time/                   |            |
|    total_timesteps      | 245000     |
| train/                  |            |
|    approx_kl            | 0.02217951 |
|    clip_fraction        | 0.047      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.254     |
|    explained_variance   | 0.997      |
|    learning_rate        | 9.39e-05   |
|    loss                 | -0.0581    |
|    n_updates            | 61245      |
|    policy_gradient_loss | -0.0202    |
|    value_loss           | 4.38e-05   |
----------------------------------------
------------------------------------
| mean_reward        | 0.054824926 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_rew_mean     | 0.80



Eval num_timesteps=250000, episode_reward=0.83 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.829       |
| time/                   |             |
|    total_timesteps      | 250000      |
| train/                  |             |
|    approx_kl            | 0.018219596 |
|    clip_fraction        | 0.0313      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.306      |
|    explained_variance   | 0.995       |
|    learning_rate        | 9.38e-05    |
|    loss                 | -0.0312     |
|    n_updates            | 62490       |
|    policy_gradient_loss | -0.00985    |
|    value_loss           | 7.83e-05    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.055090547 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=255000, episode_reward=0.83 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.829       |
| time/                   |             |
|    total_timesteps      | 255000      |
| train/                  |             |
|    approx_kl            | 0.023350852 |
|    clip_fraction        | 0.06        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.302      |
|    explained_variance   | 0.117       |
|    learning_rate        | 9.36e-05    |
|    loss                 | 0.0782      |
|    n_updates            | 63735       |
|    policy_gradient_loss | -0.0236     |
|    value_loss           | 0.246       |
-----------------------------------------
------------------------------------
| mean_reward        | 0.053077042 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_r



Eval num_timesteps=260000, episode_reward=0.82 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.818       |
| time/                   |             |
|    total_timesteps      | 260000      |
| train/                  |             |
|    approx_kl            | 0.007927756 |
|    clip_fraction        | 0.0233      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.185      |
|    explained_variance   | 0.986       |
|    learning_rate        | 9.35e-05    |
|    loss                 | -0.0232     |
|    n_updates            | 64995       |
|    policy_gradient_loss | -0.0132     |
|    value_loss           | 0.000201    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.051841468 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=265000, episode_reward=0.83 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.828       |
| time/                   |             |
|    total_timesteps      | 265000      |
| train/                  |             |
|    approx_kl            | 0.017034132 |
|    clip_fraction        | 0.0604      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.335      |
|    explained_variance   | 0.992       |
|    learning_rate        | 9.34e-05    |
|    loss                 | -0.0234     |
|    n_updates            | 66240       |
|    policy_gradient_loss | -0.00724    |
|    value_loss           | 0.000111    |
-----------------------------------------
-----------------------------------
| mean_reward        | 0.05513047 |
| rollout/           |            |
|    ep_len_mean     | 15.1       |
|    ep_rew_m



Eval num_timesteps=270000, episode_reward=0.82 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.821       |
| time/                   |             |
|    total_timesteps      | 270000      |
| train/                  |             |
|    approx_kl            | 0.015358603 |
|    clip_fraction        | 0.0278      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.341      |
|    explained_variance   | 0.986       |
|    learning_rate        | 9.33e-05    |
|    loss                 | -0.0388     |
|    n_updates            | 67485       |
|    policy_gradient_loss | -0.016      |
|    value_loss           | 0.000193    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.053089373 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_r



Eval num_timesteps=275000, episode_reward=0.83 +/- 0.00
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.828       |
| time/                   |             |
|    total_timesteps      | 275000      |
| train/                  |             |
|    approx_kl            | 0.015845831 |
|    clip_fraction        | 0.0367      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.232      |
|    explained_variance   | 0.996       |
|    learning_rate        | 9.31e-05    |
|    loss                 | -0.0431     |
|    n_updates            | 68745       |
|    policy_gradient_loss | -0.0163     |
|    value_loss           | 5.68e-05    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.054989845 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=280000, episode_reward=0.54 +/- 0.60
Episode length: 15.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 15         |
|    mean_reward          | 0.538      |
| time/                   |            |
|    total_timesteps      | 280000     |
| train/                  |            |
|    approx_kl            | 0.01525517 |
|    clip_fraction        | 0.085      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.23      |
|    explained_variance   | 0.128      |
|    learning_rate        | 9.3e-05    |
|    loss                 | 0.0734     |
|    n_updates            | 69990      |
|    policy_gradient_loss | -0.00915   |
|    value_loss           | 0.22       |
----------------------------------------
------------------------------------
| mean_reward        | 0.046081457 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_rew_mean     | 0.72



Eval num_timesteps=285000, episode_reward=0.79 +/- 0.03
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.795       |
| time/                   |             |
|    total_timesteps      | 285000      |
| train/                  |             |
|    approx_kl            | 0.015278955 |
|    clip_fraction        | 0.0909      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.52       |
|    explained_variance   | 0.979       |
|    learning_rate        | 9.29e-05    |
|    loss                 | -0.042      |
|    n_updates            | 71235       |
|    policy_gradient_loss | -0.00996    |
|    value_loss           | 0.000275    |
-----------------------------------------
-----------------------------------
| mean_reward        | 0.05375151 |
| rollout/           |            |
|    ep_len_mean     | 15         |
|    ep_rew_m



Eval num_timesteps=290000, episode_reward=0.83 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.833       |
| time/                   |             |
|    total_timesteps      | 290000      |
| train/                  |             |
|    approx_kl            | 0.016518923 |
|    clip_fraction        | 0.0438      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.333      |
|    explained_variance   | 0.994       |
|    learning_rate        | 9.28e-05    |
|    loss                 | -0.0204     |
|    n_updates            | 72495       |
|    policy_gradient_loss | -0.00719    |
|    value_loss           | 0.000183    |
-----------------------------------------
-----------------------------------
| mean_reward        | 0.05474368 |
| rollout/           |            |
|    ep_len_mean     | 15.1       |
|    ep_rew_m



Eval num_timesteps=295000, episode_reward=0.82 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.824       |
| time/                   |             |
|    total_timesteps      | 295000      |
| train/                  |             |
|    approx_kl            | 0.015117321 |
|    clip_fraction        | 0.0542      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.34       |
|    explained_variance   | 0.723       |
|    learning_rate        | 9.26e-05    |
|    loss                 | -0.0353     |
|    n_updates            | 73740       |
|    policy_gradient_loss | -0.0151     |
|    value_loss           | 0.00383     |
-----------------------------------------
------------------------------------
| mean_reward        | 0.048762493 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=300000, episode_reward=0.83 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.828       |
| time/                   |             |
|    total_timesteps      | 300000      |
| train/                  |             |
|    approx_kl            | 0.016307805 |
|    clip_fraction        | 0.0283      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.261      |
|    explained_variance   | 0.993       |
|    learning_rate        | 9.25e-05    |
|    loss                 | -0.0253     |
|    n_updates            | 74985       |
|    policy_gradient_loss | -0.00811    |
|    value_loss           | 0.000123    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.054878302 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_r



Eval num_timesteps=305000, episode_reward=0.84 +/- 0.00
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.836       |
| time/                   |             |
|    total_timesteps      | 305000      |
| train/                  |             |
|    approx_kl            | 0.015548444 |
|    clip_fraction        | 0.0591      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.284      |
|    explained_variance   | 0.977       |
|    learning_rate        | 9.24e-05    |
|    loss                 | -0.0341     |
|    n_updates            | 76245       |
|    policy_gradient_loss | -0.0156     |
|    value_loss           | 0.000294    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.054787822 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=310000, episode_reward=0.81 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.812       |
| time/                   |             |
|    total_timesteps      | 310000      |
| train/                  |             |
|    approx_kl            | 0.021947902 |
|    clip_fraction        | 0.0548      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.425      |
|    explained_variance   | 0.997       |
|    learning_rate        | 9.23e-05    |
|    loss                 | -0.0441     |
|    n_updates            | 77490       |
|    policy_gradient_loss | -0.0158     |
|    value_loss           | 6.1e-05     |
-----------------------------------------
------------------------------------
| mean_reward        | 0.054639947 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=315000, episode_reward=0.83 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.832       |
| time/                   |             |
|    total_timesteps      | 315000      |
| train/                  |             |
|    approx_kl            | 0.015156134 |
|    clip_fraction        | 0.0483      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.306      |
|    explained_variance   | 0.989       |
|    learning_rate        | 9.21e-05    |
|    loss                 | -0.034      |
|    n_updates            | 78735       |
|    policy_gradient_loss | -0.0155     |
|    value_loss           | 0.000165    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.054871153 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_r



Eval num_timesteps=320000, episode_reward=0.83 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.834       |
| time/                   |             |
|    total_timesteps      | 320000      |
| train/                  |             |
|    approx_kl            | 0.022234047 |
|    clip_fraction        | 0.0667      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.409      |
|    explained_variance   | 0.937       |
|    learning_rate        | 9.2e-05     |
|    loss                 | -0.0476     |
|    n_updates            | 79995       |
|    policy_gradient_loss | -0.0219     |
|    value_loss           | 0.000748    |
-----------------------------------------
-----------------------------------
| mean_reward        | 0.05326533 |
| rollout/           |            |
|    ep_len_mean     | 15.1       |
|    ep_rew_m



Eval num_timesteps=325000, episode_reward=0.77 +/- 0.03
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.772       |
| time/                   |             |
|    total_timesteps      | 325000      |
| train/                  |             |
|    approx_kl            | 0.019188438 |
|    clip_fraction        | 0.0455      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.293      |
|    explained_variance   | 0.968       |
|    learning_rate        | 9.19e-05    |
|    loss                 | -0.0386     |
|    n_updates            | 81240       |
|    policy_gradient_loss | -0.0123     |
|    value_loss           | 0.000493    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.053169645 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=330000, episode_reward=0.83 +/- 0.00
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.83        |
| time/                   |             |
|    total_timesteps      | 330000      |
| train/                  |             |
|    approx_kl            | 0.016331881 |
|    clip_fraction        | 0.0813      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.474      |
|    explained_variance   | 0.945       |
|    learning_rate        | 9.18e-05    |
|    loss                 | -0.0491     |
|    n_updates            | 82485       |
|    policy_gradient_loss | -0.0219     |
|    value_loss           | 0.00708     |
-----------------------------------------
------------------------------------
| mean_reward        | 0.051816765 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_r



Eval num_timesteps=335000, episode_reward=0.81 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.814       |
| time/                   |             |
|    total_timesteps      | 335000      |
| train/                  |             |
|    approx_kl            | 0.015031619 |
|    clip_fraction        | 0.0444      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.236      |
|    explained_variance   | 0.979       |
|    learning_rate        | 9.16e-05    |
|    loss                 | -0.0343     |
|    n_updates            | 83745       |
|    policy_gradient_loss | -0.0157     |
|    value_loss           | 0.000342    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.052173804 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=340000, episode_reward=0.83 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.826       |
| time/                   |             |
|    total_timesteps      | 340000      |
| train/                  |             |
|    approx_kl            | 0.016266216 |
|    clip_fraction        | 0.0481      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.205      |
|    explained_variance   | 0.985       |
|    learning_rate        | 9.15e-05    |
|    loss                 | -0.0277     |
|    n_updates            | 84990       |
|    policy_gradient_loss | -0.0155     |
|    value_loss           | 0.000235    |
-----------------------------------------
-----------------------------------
| mean_reward        | 0.05244018 |
| rollout/           |            |
|    ep_len_mean     | 15.1       |
|    ep_rew_m



Eval num_timesteps=345000, episode_reward=0.83 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.826       |
| time/                   |             |
|    total_timesteps      | 345000      |
| train/                  |             |
|    approx_kl            | 0.020094704 |
|    clip_fraction        | 0.0889      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.364      |
|    explained_variance   | 0.995       |
|    learning_rate        | 9.14e-05    |
|    loss                 | -0.0481     |
|    n_updates            | 86235       |
|    policy_gradient_loss | -0.0264     |
|    value_loss           | 8.26e-05    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.051434126 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_r



Eval num_timesteps=350000, episode_reward=0.83 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.834       |
| time/                   |             |
|    total_timesteps      | 350000      |
| train/                  |             |
|    approx_kl            | 0.006665925 |
|    clip_fraction        | 0.0211      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.283      |
|    explained_variance   | 0.991       |
|    learning_rate        | 9.13e-05    |
|    loss                 | -0.0198     |
|    n_updates            | 87495       |
|    policy_gradient_loss | -0.00553    |
|    value_loss           | 0.000139    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.054430194 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=355000, episode_reward=0.82 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.825       |
| time/                   |             |
|    total_timesteps      | 355000      |
| train/                  |             |
|    approx_kl            | 0.019111983 |
|    clip_fraction        | 0.0643      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.394      |
|    explained_variance   | 0.918       |
|    learning_rate        | 9.11e-05    |
|    loss                 | -0.0465     |
|    n_updates            | 88740       |
|    policy_gradient_loss | -0.0204     |
|    value_loss           | 0.00118     |
-----------------------------------------
------------------------------------
| mean_reward        | 0.050811034 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=360000, episode_reward=0.82 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.825       |
| time/                   |             |
|    total_timesteps      | 360000      |
| train/                  |             |
|    approx_kl            | 0.017312212 |
|    clip_fraction        | 0.0481      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.365      |
|    explained_variance   | 0.102       |
|    learning_rate        | 9.1e-05     |
|    loss                 | 0.0548      |
|    n_updates            | 89985       |
|    policy_gradient_loss | -0.0182     |
|    value_loss           | 0.205       |
-----------------------------------------
------------------------------------
| mean_reward        | 0.047347773 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_r



Eval num_timesteps=365000, episode_reward=0.83 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.828       |
| time/                   |             |
|    total_timesteps      | 365000      |
| train/                  |             |
|    approx_kl            | 0.016444078 |
|    clip_fraction        | 0.0712      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.26       |
|    explained_variance   | 0.99        |
|    learning_rate        | 9.09e-05    |
|    loss                 | -0.0234     |
|    n_updates            | 91245       |
|    policy_gradient_loss | -0.00862    |
|    value_loss           | 0.000138    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.054782666 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=370000, episode_reward=0.81 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.814       |
| time/                   |             |
|    total_timesteps      | 370000      |
| train/                  |             |
|    approx_kl            | 0.017798679 |
|    clip_fraction        | 0.0563      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.244      |
|    explained_variance   | 0.922       |
|    learning_rate        | 9.08e-05    |
|    loss                 | -0.0361     |
|    n_updates            | 92490       |
|    policy_gradient_loss | -0.0147     |
|    value_loss           | 0.00271     |
-----------------------------------------
------------------------------------
| mean_reward        | 0.047052376 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=375000, episode_reward=0.83 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.825       |
| time/                   |             |
|    total_timesteps      | 375000      |
| train/                  |             |
|    approx_kl            | 0.015228998 |
|    clip_fraction        | 0.0433      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.28       |
|    explained_variance   | 0.994       |
|    learning_rate        | 9.06e-05    |
|    loss                 | -0.0411     |
|    n_updates            | 93735       |
|    policy_gradient_loss | -0.0149     |
|    value_loss           | 0.000101    |
-----------------------------------------
-----------------------------------
| mean_reward        | 0.05469471 |
| rollout/           |            |
|    ep_len_mean     | 15         |
|    ep_rew_m



Eval num_timesteps=380000, episode_reward=0.83 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.828       |
| time/                   |             |
|    total_timesteps      | 380000      |
| train/                  |             |
|    approx_kl            | 0.015797082 |
|    clip_fraction        | 0.0639      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.388      |
|    explained_variance   | 0.99        |
|    learning_rate        | 9.05e-05    |
|    loss                 | -0.0333     |
|    n_updates            | 94995       |
|    policy_gradient_loss | -0.0118     |
|    value_loss           | 0.000141    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.054768607 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=385000, episode_reward=0.83 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.833       |
| time/                   |             |
|    total_timesteps      | 385000      |
| train/                  |             |
|    approx_kl            | 0.016324114 |
|    clip_fraction        | 0.0375      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.241      |
|    explained_variance   | 0.123       |
|    learning_rate        | 9.04e-05    |
|    loss                 | 0.0838      |
|    n_updates            | 96240       |
|    policy_gradient_loss | -0.0167     |
|    value_loss           | 0.245       |
-----------------------------------------
-----------------------------------
| mean_reward        | 0.04598895 |
| rollout/           |            |
|    ep_len_mean     | 15.1       |
|    ep_rew_m



Eval num_timesteps=390000, episode_reward=0.85 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.845       |
| time/                   |             |
|    total_timesteps      | 390000      |
| train/                  |             |
|    approx_kl            | 0.008607439 |
|    clip_fraction        | 0.0511      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.308      |
|    explained_variance   | 0.986       |
|    learning_rate        | 9.03e-05    |
|    loss                 | -0.0378     |
|    n_updates            | 97485       |
|    policy_gradient_loss | -0.017      |
|    value_loss           | 0.000221    |
-----------------------------------------
New best mean reward!
------------------------------------
| mean_reward        | 0.048903912 |
| rollout/           |             |
|    ep_len_mean     | 1



Eval num_timesteps=395000, episode_reward=0.82 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.819       |
| time/                   |             |
|    total_timesteps      | 395000      |
| train/                  |             |
|    approx_kl            | 0.015942305 |
|    clip_fraction        | 0.05        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.437      |
|    explained_variance   | 0.979       |
|    learning_rate        | 9.01e-05    |
|    loss                 | -0.0495     |
|    n_updates            | 98745       |
|    policy_gradient_loss | -0.0186     |
|    value_loss           | 0.000417    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.054680977 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=400000, episode_reward=0.83 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.828       |
| time/                   |             |
|    total_timesteps      | 400000      |
| train/                  |             |
|    approx_kl            | 0.017297653 |
|    clip_fraction        | 0.07        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.295      |
|    explained_variance   | 0.929       |
|    learning_rate        | 9e-05       |
|    loss                 | -0.0421     |
|    n_updates            | 99990       |
|    policy_gradient_loss | -0.0161     |
|    value_loss           | 0.00127     |
-----------------------------------------
------------------------------------
| mean_reward        | 0.051661886 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=405000, episode_reward=0.83 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.832       |
| time/                   |             |
|    total_timesteps      | 405000      |
| train/                  |             |
|    approx_kl            | 0.016663147 |
|    clip_fraction        | 0.0726      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.304      |
|    explained_variance   | 0.981       |
|    learning_rate        | 8.99e-05    |
|    loss                 | -0.0305     |
|    n_updates            | 101235      |
|    policy_gradient_loss | -0.0122     |
|    value_loss           | 0.000361    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.051660903 |
| rollout/           |             |
|    ep_len_mean     | 15          |
|    ep_r



Eval num_timesteps=410000, episode_reward=0.83 +/- 0.02
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.834       |
| time/                   |             |
|    total_timesteps      | 410000      |
| train/                  |             |
|    approx_kl            | 0.010602395 |
|    clip_fraction        | 0.0489      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.243      |
|    explained_variance   | 0.997       |
|    learning_rate        | 8.98e-05    |
|    loss                 | -0.0232     |
|    n_updates            | 102495      |
|    policy_gradient_loss | -0.0068     |
|    value_loss           | 4.53e-05    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.051899932 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=415000, episode_reward=0.83 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.833       |
| time/                   |             |
|    total_timesteps      | 415000      |
| train/                  |             |
|    approx_kl            | 0.005096938 |
|    clip_fraction        | 0.0278      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.292      |
|    explained_variance   | 0.995       |
|    learning_rate        | 8.96e-05    |
|    loss                 | -0.026      |
|    n_updates            | 103740      |
|    policy_gradient_loss | -0.00887    |
|    value_loss           | 7.03e-05    |
-----------------------------------------
------------------------------------
| mean_reward        | 0.053639594 |
| rollout/           |             |
|    ep_len_mean     | 15.1        |
|    ep_r



Eval num_timesteps=420000, episode_reward=0.83 +/- 0.01
Episode length: 15.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 15          |
|    mean_reward          | 0.828       |
| time/                   |             |
|    total_timesteps      | 420000      |
| train/                  |             |
|    approx_kl            | 0.015625969 |
|    clip_fraction        | 0.0405      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.29       |
|    explained_variance   | 0.989       |
|    learning_rate        | 8.95e-05    |
|    loss                 | -0.0432     |
|    n_updates            | 104985      |
|    policy_gradient_loss | -0.0209     |
|    value_loss           | 0.000139    |
-----------------------------------------
-----------------------------------
| mean_reward        | 0.05493687 |
| rollout/           |            |
|    ep_len_mean     | 15         |
|    ep_rew_m

KeyboardInterrupt: 