Dependencies, Imports and util functions

In [None]:
!pip install gymnasium[box2d]
!pip install swig
!pip install gymnasium[box2d] imageio

Collecting box2d==2.3.10 (from gymnasium[box2d])
  Downloading Box2D-2.3.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (573 bytes)
Collecting swig==4.* (from gymnasium[box2d])
  Downloading swig-4.4.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading Box2D-2.3.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m114.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading swig-4.4.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m103.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig, box2d
Successfully installed box2d-2.3.10 swig-4.4.1


In [None]:
import gymnasium as gym
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
import imageio
import os
from google.colab import files

In [None]:
# Util functions

def set_seed(seed=0):
    """Seed every random number generator this notebook uses.

    Applies `seed`, in order, to Python's `random` module, NumPy's global RNG,
    PyTorch's CPU generator, and the generators of all CUDA devices.

    Args:
        seed: Integer seed shared by all generators (default 0).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [None]:
# Actor Critic Network (shared backbone architecture)
class ActorCritic(nn.Module):
    """Actor-critic network with a shared MLP trunk and a tanh-squashed Gaussian policy.

    The trunk feeds two linear heads: `mu_head` (mean of the pre-squash Gaussian)
    and `value_head` (scalar state value). The standard deviation is a learned,
    state-independent parameter (`log_std`), clamped to [-20, 2] in log space.
    Actions are `tanh(u)` with `u ~ Normal(mu, std)`, so every component lies in (-1, 1).
    """

    def __init__(self, state_dim, action_dim, hidden=128):
        """Build the network.

        Args:
            state_dim: Size of the observation vector.
            action_dim: Number of action components.
            hidden: Width of both hidden layers in the shared trunk.
        """
        super().__init__()

        # Two-layer ReLU trunk shared by the policy and value heads.
        trunk_layers = [
            nn.Linear(state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
        ]
        self.shared = nn.Sequential(*trunk_layers)

        self.mu_head = nn.Linear(hidden, action_dim)
        # One log-std per action component, initialised to 0 (std = 1).
        self.log_std = nn.Parameter(torch.zeros(action_dim))
        self.value_head = nn.Linear(hidden, 1)

    @staticmethod
    def _squashed_log_prob(dist, pre_tanh, squashed):
        """Log-density of `squashed = tanh(pre_tanh)` under `dist`, summed over the last dim.

        Subtracts the tanh change-of-variables term log(1 - tanh(u)^2); the 1e-6
        keeps the log finite when an action component saturates.
        """
        per_component = dist.log_prob(pre_tanh) - torch.log(1 - squashed.pow(2) + 1e-6)
        return per_component.sum(-1)

    def forward(self, x):
        """Return `(mu, std, value)` for state(s) `x`.

        `value` has the trailing singleton dimension of the value head squeezed out.
        """
        features = self.shared(x)
        std = torch.exp(torch.clamp(self.log_std, -20, 2))
        return self.mu_head(features), std, self.value_head(features).squeeze(-1)

    def get_action(self, state):
        """Sample a squashed action.

        Returns:
            Tuple `(action, logp, entropy, value)`. `logp` is the log-probability
            of the squashed action; `entropy` is the entropy of the pre-squash
            Gaussian, summed over action components.
        """
        mean, std, value = self.forward(state)
        gaussian = torch.distributions.Normal(mean, std)

        pre_tanh = gaussian.rsample()
        action = torch.tanh(pre_tanh)

        logp = self._squashed_log_prob(gaussian, pre_tanh, action)
        entropy = gaussian.entropy().sum(-1)
        return action, logp, entropy, value

    def evaluate_actions(self, states, actions):
        """Score previously sampled squashed `actions` under the current policy.

        Returns:
            Tuple `(logp, entropy, value)` for the given states.
        """
        mean, std, value = self.forward(states)
        gaussian = torch.distributions.Normal(mean, std)

        # Keep actions strictly inside (-1, 1) so the inverse tanh below stays finite.
        eps = 1e-6
        squashed = torch.clamp(actions, -1 + eps, 1 - eps)
        # atanh(a) = 0.5 * log((1 + a) / (1 - a)) recovers the pre-squash sample.
        pre_tanh = 0.5 * torch.log((1 + squashed) / (1 - squashed))

        logp = self._squashed_log_prob(gaussian, pre_tanh, squashed)
        entropy = gaussian.entropy().sum(-1)
        return logp, entropy, value

    @torch.no_grad()
    def act_deterministic(self, state):
        """Return the greedy action `tanh(mu)` without tracking gradients."""
        mean, _, _ = self.forward(state)
        return torch.tanh(mean)


# Computing GAE

@torch.no_grad()
def compute_gae(rewards, dones, values, next_value, gamma=0.99, gae_lambda=0.95):
    """Compute Generalized Advantage Estimation (GAE) advantages and returns.

    Walks the rollout backwards, accumulating
        delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}

    Args:
        rewards: Tensor of shape (T,) or (T, ...) with per-step rewards.
        dones: Tensor shaped like `rewards`; non-zero / True where the episode
            ended at step t. Float and bool tensors are both accepted.
        values: Tensor shaped like `rewards` with V(s_t) for each step.
        next_value: V(s_T) for the state following the last step; a scalar, or
            broadcastable to `rewards[0]` for batched rollouts.
        gamma: Discount factor.
        gae_lambda: GAE smoothing parameter.

    Returns:
        Tuple `(advantages, returns)`, both shaped like `rewards` and carrying
        no gradient; `returns = advantages + values`.
    """
    T = rewards.shape[0]
    # Shaped like `rewards` (not just (T,)) so batched (T, N) rollouts work too.
    advantages = torch.zeros(rewards.shape, device=rewards.device)
    # Cast first: `1.0 - bool_tensor` is not supported by PyTorch.
    not_done = 1.0 - dones.to(values.dtype)

    gae = 0
    for t in reversed(range(T)):
        # The last step bootstraps from `next_value`; earlier steps use the stored value.
        v_next = next_value if t == T - 1 else values[t + 1]
        delta = rewards[t] + gamma * v_next * not_done[t] - values[t]
        # `not_done` also stops the advantage from leaking across episode boundaries.
        gae = delta + gamma * gae_lambda * not_done[t] * gae
        advantages[t] = gae

    returns = advantages + values
    return advantages, returns



PPO v1 (without LR annealing and value clipping)

In [None]:
# Training setup: seeds, device, environments, model/optimizer, and PPO hyperparameters.

seed = 0
set_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# `env` collects training rollouts; `eval_env` renders RGB frames for the eval videos.
env = gym.make("LunarLanderContinuous-v3")
eval_env = gym.make("LunarLanderContinuous-v3", render_mode="rgb_array")

os.makedirs("videos", exist_ok=True)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

model = ActorCritic(state_dim, action_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=3e-4)

# PPO hyperparameters
gamma = 0.99            # discount factor
gae_lambda = 0.95       # GAE smoothing parameter
clip_eps = 0.2          # clipping range for the probability ratio
vf_coef = 0.5           # weight of the value loss in the total loss
ent_coef = 0.005        # weight of the entropy bonus in the total loss
max_grad_norm = 0.5     # gradient-norm clipping threshold
ppo_epochs = 10         # passes over each rollout
mini_batch_size = 64    # samples per gradient step

rollout_len = 2048           # environment steps collected per update
total_timesteps = 1_000_000  # total environment steps to train for
eval_interval = 10_000       # approximate number of steps between evaluation episodes

global_step = 0
# Seed the environment's own RNG on the first reset; set_seed() above only covers
# Python / NumPy / PyTorch, so without this the run is not reproducible.
state, _ = env.reset(seed=seed)

# PPO v1 training loop
#
# Each iteration:
#   1. collects up to `rollout_len` transitions with the current policy,
#   2. computes GAE advantages and returns,
#   3. runs `ppo_epochs` passes of clipped-surrogate minibatch updates,
#   4. prints metrics and, roughly every `eval_interval` steps, records one
#      deterministic evaluation episode to an mp4 file.

while global_step < total_timesteps:

    states, actions, rewards, dones, values, logps = [], [], [], [], [], []
    rollout_rewards = []

    # Rollouts
    for _ in range(rollout_len):

        state_tensor = torch.tensor(state, dtype=torch.float32, device=device)

        # Only the sampled values are stored, so no autograd graph is needed here.
        with torch.no_grad():
            action, logp, _, value = model.get_action(state_tensor)

        next_state, reward, terminated, truncated, _ = env.step(
            action.cpu().numpy()
        )

        done = terminated or truncated

        # A time-limit truncation is not a true terminal state: the episode would have
        # continued, so the value of the successor state still counts. Fold the
        # discounted bootstrap value into the training reward; `done` stays 1 so GAE
        # does not propagate across the reset that follows.
        train_reward = float(reward)
        if truncated and not terminated:
            with torch.no_grad():
                next_state_tensor = torch.tensor(
                    next_state, dtype=torch.float32, device=device
                )
                _, _, truncated_value = model.forward(next_state_tensor)
            train_reward += gamma * truncated_value.item()

        states.append(state_tensor)
        actions.append(action)
        rewards.append(torch.tensor(train_reward, dtype=torch.float32, device=device))
        dones.append(torch.tensor(float(done), dtype=torch.float32, device=device))
        values.append(value)
        logps.append(logp)

        # Log the raw environment reward, not the bootstrapped training reward.
        rollout_rewards.append(reward)

        state = next_state
        global_step += 1

        if done:
            state, _ = env.reset()

        if global_step >= total_timesteps:
            break

    # Mean per-step reward over this rollout (not an episode return).
    mean_rollout_reward = np.mean(rollout_rewards)

    states = torch.stack(states)
    actions = torch.stack(actions)
    rewards = torch.stack(rewards)
    dones = torch.stack(dones)
    values = torch.stack(values)
    logps_old = torch.stack(logps)

    # Value of the state after the last collected step, used to bootstrap GAE.
    with torch.no_grad():
        last_state_tensor = torch.tensor(state, dtype=torch.float32, device=device)
        _, _, next_value = model.forward(last_state_tensor)

    advantages, returns = compute_gae(
        rewards, dones, values, next_value, gamma, gae_lambda
    )

    # Advantage normalisation

    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)


    dataset_size = states.size(0)

    # Logging metrics (averaged over all minibatch updates of this iteration)
    approx_kl_total = 0
    clip_frac_total = 0
    entropy_total = 0
    update_steps = 0

    # PPO update
    for _ in range(ppo_epochs):

        # Fresh shuffle of the rollout for every epoch
        indices = torch.randperm(dataset_size, device=device)

        for start in range(0, dataset_size, mini_batch_size):
            end = start + mini_batch_size
            batch_idx = indices[start:end]

            mb_states = states[batch_idx]
            mb_actions = actions[batch_idx]
            mb_advantages = advantages[batch_idx]
            mb_returns = returns[batch_idx]
            mb_logps_old = logps_old[batch_idx]

            logps_new, entropy, values_new = model.evaluate_actions(
                mb_states, mb_actions
            )

            # Probability ratio between the updated policy and the rollout policy
            ratio = torch.exp(logps_new - mb_logps_old)

            surr1 = ratio * mb_advantages
            surr2 = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * mb_advantages  # PPO clipping happens here

            policy_loss = -torch.min(surr1, surr2).mean()
            value_loss = 0.5 * (mb_returns - values_new).pow(2).mean()
            entropy_loss = entropy.mean()

            loss = policy_loss + vf_coef * value_loss - ent_coef * entropy_loss   # Loss formula (Actor loss + Critic loss + entropy term)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)     # Gradient clipping (for stable updates)
            optimizer.step()

            # Logging stats
            approx_kl = (mb_logps_old - logps_new).mean().item()
            clip_frac = ((ratio - 1.0).abs() > clip_eps).float().mean().item()

            approx_kl_total += approx_kl
            clip_frac_total += clip_frac
            entropy_total += entropy_loss.item()
            update_steps += 1

    # Print logged metrics
    print(
        f"Step: {global_step} | "
        f"Rollout Reward: {mean_rollout_reward:.2f} | "
        f"KL: {approx_kl_total/update_steps:.5f} | "
        f"ClipFrac: {clip_frac_total/update_steps:.3f} | "
        f"Entropy: {entropy_total/update_steps:.3f}"
    )

    # Evaluation for 1 episode.
    # True for the first rollout boundary that lands within `rollout_len` steps after a
    # multiple of `eval_interval`.
    if global_step % eval_interval < rollout_len:

        s, _ = eval_env.reset()
        done = False
        ep_return = 0
        frames = []

        while not done:
            s_tensor = torch.tensor(s, dtype=torch.float32, device=device)

            # Deterministic evaluation
            action = model.act_deterministic(s_tensor)

            s, r, terminated, truncated, _ = eval_env.step(
                action.cpu().numpy()
            )
            done = terminated or truncated
            ep_return += r

            frames.append(eval_env.render())

        print(f"[Eval @ {global_step}] Return: {ep_return:.2f}")

        # Saving eval videos
        video_path = f"videos/ppo_eval_{global_step}.mp4"
        imageio.mimsave(video_path, frames, fps=30)
        print(f"Saved video: {video_path}")

env.close()
eval_env.close()


Device: cuda
Step: 2048 | Rollout Reward: -2.02 | KL: 0.01227 | ClipFrac: 0.138 | Entropy: 2.845
Step: 4096 | Rollout Reward: -1.88 | KL: 0.01900 | ClipFrac: 0.191 | Entropy: 2.862
Step: 6144 | Rollout Reward: -2.22 | KL: 0.01231 | ClipFrac: 0.139 | Entropy: 2.861
Step: 8192 | Rollout Reward: -2.98 | KL: 0.01119 | ClipFrac: 0.142 | Entropy: 2.871
Step: 10240 | Rollout Reward: -2.89 | KL: 0.00890 | ClipFrac: 0.156 | Entropy: 2.862
[Eval @ 10240] Return: -323.45




Saved video: videos/ppo_eval_10240.mp4
Step: 12288 | Rollout Reward: -2.45 | KL: 0.01808 | ClipFrac: 0.173 | Entropy: 2.871
Step: 14336 | Rollout Reward: -1.78 | KL: 0.02211 | ClipFrac: 0.165 | Entropy: 2.885
Step: 16384 | Rollout Reward: -1.50 | KL: 0.01488 | ClipFrac: 0.153 | Entropy: 2.896
Step: 18432 | Rollout Reward: -0.96 | KL: 0.01794 | ClipFrac: 0.205 | Entropy: 2.907
Step: 20480 | Rollout Reward: -1.15 | KL: 0.01209 | ClipFrac: 0.112 | Entropy: 2.905




[Eval @ 20480] Return: -30.97
Saved video: videos/ppo_eval_20480.mp4
Step: 22528 | Rollout Reward: -0.77 | KL: 0.01121 | ClipFrac: 0.156 | Entropy: 2.900
Step: 24576 | Rollout Reward: -0.70 | KL: 0.02256 | ClipFrac: 0.216 | Entropy: 2.891
Step: 26624 | Rollout Reward: -0.52 | KL: 0.01297 | ClipFrac: 0.190 | Entropy: 2.875
Step: 28672 | Rollout Reward: -0.47 | KL: 0.01236 | ClipFrac: 0.166 | Entropy: 2.892
Step: 30720 | Rollout Reward: -0.53 | KL: 0.01436 | ClipFrac: 0.179 | Entropy: 2.877




[Eval @ 30720] Return: -42.02
Saved video: videos/ppo_eval_30720.mp4
Step: 32768 | Rollout Reward: -0.47 | KL: 0.01762 | ClipFrac: 0.181 | Entropy: 2.863
Step: 34816 | Rollout Reward: -0.11 | KL: 0.01489 | ClipFrac: 0.155 | Entropy: 2.833
Step: 36864 | Rollout Reward: -0.10 | KL: 0.00973 | ClipFrac: 0.120 | Entropy: 2.804
Step: 38912 | Rollout Reward: -0.12 | KL: 0.01049 | ClipFrac: 0.171 | Entropy: 2.774
Step: 40960 | Rollout Reward: -0.08 | KL: 0.01229 | ClipFrac: 0.155 | Entropy: 2.760




[Eval @ 40960] Return: -149.45
Saved video: videos/ppo_eval_40960.mp4
Step: 43008 | Rollout Reward: -0.18 | KL: 0.01108 | ClipFrac: 0.132 | Entropy: 2.748
Step: 45056 | Rollout Reward: -0.09 | KL: 0.01428 | ClipFrac: 0.119 | Entropy: 2.739
Step: 47104 | Rollout Reward: 0.00 | KL: 0.01018 | ClipFrac: 0.129 | Entropy: 2.749
Step: 49152 | Rollout Reward: -0.05 | KL: 0.00814 | ClipFrac: 0.116 | Entropy: 2.726
Step: 51200 | Rollout Reward: 0.04 | KL: 0.01508 | ClipFrac: 0.176 | Entropy: 2.736




[Eval @ 51200] Return: -136.69
Saved video: videos/ppo_eval_51200.mp4
Step: 53248 | Rollout Reward: -0.08 | KL: 0.01006 | ClipFrac: 0.137 | Entropy: 2.736
Step: 55296 | Rollout Reward: 0.07 | KL: 0.01095 | ClipFrac: 0.153 | Entropy: 2.694
Step: 57344 | Rollout Reward: -0.02 | KL: 0.02008 | ClipFrac: 0.196 | Entropy: 2.691
Step: 59392 | Rollout Reward: 0.01 | KL: 0.01450 | ClipFrac: 0.148 | Entropy: 2.708
Step: 61440 | Rollout Reward: 0.04 | KL: 0.01190 | ClipFrac: 0.150 | Entropy: 2.714




[Eval @ 61440] Return: 193.03
Saved video: videos/ppo_eval_61440.mp4
Step: 63488 | Rollout Reward: -0.01 | KL: 0.01057 | ClipFrac: 0.138 | Entropy: 2.703
Step: 65536 | Rollout Reward: 0.04 | KL: 0.01218 | ClipFrac: 0.142 | Entropy: 2.678
Step: 67584 | Rollout Reward: 0.06 | KL: 0.00877 | ClipFrac: 0.133 | Entropy: 2.658
Step: 69632 | Rollout Reward: 0.02 | KL: 0.01220 | ClipFrac: 0.131 | Entropy: 2.643
Step: 71680 | Rollout Reward: 0.05 | KL: 0.01072 | ClipFrac: 0.162 | Entropy: 2.629




[Eval @ 71680] Return: -64.93
Saved video: videos/ppo_eval_71680.mp4
Step: 73728 | Rollout Reward: 0.08 | KL: 0.00737 | ClipFrac: 0.144 | Entropy: 2.624
Step: 75776 | Rollout Reward: 0.05 | KL: 0.01869 | ClipFrac: 0.199 | Entropy: 2.628
Step: 77824 | Rollout Reward: 0.07 | KL: 0.01089 | ClipFrac: 0.108 | Entropy: 2.636
Step: 79872 | Rollout Reward: 0.03 | KL: 0.01589 | ClipFrac: 0.147 | Entropy: 2.626
Step: 81920 | Rollout Reward: 0.13 | KL: 0.00865 | ClipFrac: 0.135 | Entropy: 2.596




[Eval @ 81920] Return: 249.52
Saved video: videos/ppo_eval_81920.mp4
Step: 83968 | Rollout Reward: 0.16 | KL: 0.01867 | ClipFrac: 0.225 | Entropy: 2.593
Step: 86016 | Rollout Reward: 0.06 | KL: 0.00753 | ClipFrac: 0.158 | Entropy: 2.569
Step: 88064 | Rollout Reward: -0.09 | KL: 0.01635 | ClipFrac: 0.190 | Entropy: 2.570
Step: 90112 | Rollout Reward: 0.08 | KL: 0.00808 | ClipFrac: 0.095 | Entropy: 2.564




[Eval @ 90112] Return: -20.36
Saved video: videos/ppo_eval_90112.mp4
Step: 92160 | Rollout Reward: 0.05 | KL: 0.01205 | ClipFrac: 0.140 | Entropy: 2.555
Step: 94208 | Rollout Reward: 0.02 | KL: 0.01247 | ClipFrac: 0.170 | Entropy: 2.556
Step: 96256 | Rollout Reward: 0.04 | KL: 0.01858 | ClipFrac: 0.135 | Entropy: 2.562
Step: 98304 | Rollout Reward: 0.06 | KL: 0.01165 | ClipFrac: 0.134 | Entropy: 2.558
Step: 100352 | Rollout Reward: 0.14 | KL: 0.01060 | ClipFrac: 0.160 | Entropy: 2.556




[Eval @ 100352] Return: 183.11
Saved video: videos/ppo_eval_100352.mp4
Step: 102400 | Rollout Reward: 0.14 | KL: 0.01629 | ClipFrac: 0.189 | Entropy: 2.543
Step: 104448 | Rollout Reward: 0.16 | KL: 0.00959 | ClipFrac: 0.120 | Entropy: 2.550
Step: 106496 | Rollout Reward: 0.09 | KL: 0.00635 | ClipFrac: 0.094 | Entropy: 2.528
Step: 108544 | Rollout Reward: 0.01 | KL: 0.01272 | ClipFrac: 0.198 | Entropy: 2.526
Step: 110592 | Rollout Reward: 0.06 | KL: 0.01633 | ClipFrac: 0.162 | Entropy: 2.502




[Eval @ 110592] Return: -154.21
Saved video: videos/ppo_eval_110592.mp4
Step: 112640 | Rollout Reward: 0.04 | KL: 0.02129 | ClipFrac: 0.233 | Entropy: 2.503
Step: 114688 | Rollout Reward: 0.09 | KL: 0.01844 | ClipFrac: 0.187 | Entropy: 2.520
Step: 116736 | Rollout Reward: 0.01 | KL: 0.01041 | ClipFrac: 0.119 | Entropy: 2.501
Step: 118784 | Rollout Reward: 0.09 | KL: 0.01708 | ClipFrac: 0.179 | Entropy: 2.497
Step: 120832 | Rollout Reward: -0.14 | KL: 0.01570 | ClipFrac: 0.170 | Entropy: 2.503




[Eval @ 120832] Return: -145.22
Saved video: videos/ppo_eval_120832.mp4
Step: 122880 | Rollout Reward: 0.07 | KL: 0.01638 | ClipFrac: 0.174 | Entropy: 2.526
Step: 124928 | Rollout Reward: 0.08 | KL: 0.01654 | ClipFrac: 0.200 | Entropy: 2.521
Step: 126976 | Rollout Reward: 0.09 | KL: 0.01354 | ClipFrac: 0.194 | Entropy: 2.507
Step: 129024 | Rollout Reward: 0.10 | KL: 0.00844 | ClipFrac: 0.128 | Entropy: 2.493
Step: 131072 | Rollout Reward: 0.02 | KL: 0.01469 | ClipFrac: 0.184 | Entropy: 2.485




[Eval @ 131072] Return: -91.01
Saved video: videos/ppo_eval_131072.mp4
Step: 133120 | Rollout Reward: 0.08 | KL: 0.01325 | ClipFrac: 0.138 | Entropy: 2.497
Step: 135168 | Rollout Reward: 0.12 | KL: 0.01246 | ClipFrac: 0.215 | Entropy: 2.482
Step: 137216 | Rollout Reward: -0.03 | KL: 0.01247 | ClipFrac: 0.157 | Entropy: 2.514
Step: 139264 | Rollout Reward: 0.04 | KL: 0.00928 | ClipFrac: 0.145 | Entropy: 2.469
Step: 141312 | Rollout Reward: 0.03 | KL: 0.01242 | ClipFrac: 0.112 | Entropy: 2.472




[Eval @ 141312] Return: 208.32
Saved video: videos/ppo_eval_141312.mp4
Step: 143360 | Rollout Reward: 0.09 | KL: 0.01146 | ClipFrac: 0.161 | Entropy: 2.460
Step: 145408 | Rollout Reward: 0.21 | KL: 0.00645 | ClipFrac: 0.096 | Entropy: 2.451
Step: 147456 | Rollout Reward: 0.07 | KL: 0.01990 | ClipFrac: 0.247 | Entropy: 2.418
Step: 149504 | Rollout Reward: 0.01 | KL: 0.02404 | ClipFrac: 0.236 | Entropy: 2.410
Step: 151552 | Rollout Reward: 0.09 | KL: 0.02338 | ClipFrac: 0.219 | Entropy: 2.408




[Eval @ 151552] Return: 173.06
Saved video: videos/ppo_eval_151552.mp4
Step: 153600 | Rollout Reward: 0.08 | KL: 0.01038 | ClipFrac: 0.164 | Entropy: 2.405
Step: 155648 | Rollout Reward: 0.13 | KL: 0.00982 | ClipFrac: 0.138 | Entropy: 2.410
Step: 157696 | Rollout Reward: 0.17 | KL: 0.01173 | ClipFrac: 0.113 | Entropy: 2.417
Step: 159744 | Rollout Reward: 0.13 | KL: 0.01001 | ClipFrac: 0.119 | Entropy: 2.398
Step: 161792 | Rollout Reward: 0.11 | KL: 0.01493 | ClipFrac: 0.165 | Entropy: 2.387




[Eval @ 161792] Return: 191.11
Saved video: videos/ppo_eval_161792.mp4
Step: 163840 | Rollout Reward: 0.08 | KL: 0.01091 | ClipFrac: 0.108 | Entropy: 2.410
Step: 165888 | Rollout Reward: 0.13 | KL: 0.01008 | ClipFrac: 0.122 | Entropy: 2.383
Step: 167936 | Rollout Reward: 0.06 | KL: 0.01324 | ClipFrac: 0.222 | Entropy: 2.372
Step: 169984 | Rollout Reward: 0.04 | KL: 0.01577 | ClipFrac: 0.179 | Entropy: 2.379
Step: 172032 | Rollout Reward: 0.19 | KL: 0.01424 | ClipFrac: 0.139 | Entropy: 2.367




[Eval @ 172032] Return: 290.37
Saved video: videos/ppo_eval_172032.mp4
Step: 174080 | Rollout Reward: 0.16 | KL: 0.01207 | ClipFrac: 0.200 | Entropy: 2.358
Step: 176128 | Rollout Reward: 0.13 | KL: 0.00709 | ClipFrac: 0.144 | Entropy: 2.385
Step: 178176 | Rollout Reward: 0.15 | KL: 0.01635 | ClipFrac: 0.178 | Entropy: 2.365
Step: 180224 | Rollout Reward: 0.18 | KL: 0.01138 | ClipFrac: 0.161 | Entropy: 2.349




[Eval @ 180224] Return: -54.21
Saved video: videos/ppo_eval_180224.mp4
Step: 182272 | Rollout Reward: 0.11 | KL: 0.01002 | ClipFrac: 0.149 | Entropy: 2.348
Step: 184320 | Rollout Reward: 0.14 | KL: 0.01181 | ClipFrac: 0.154 | Entropy: 2.345
Step: 186368 | Rollout Reward: 0.14 | KL: 0.01943 | ClipFrac: 0.245 | Entropy: 2.335
Step: 188416 | Rollout Reward: 0.17 | KL: 0.01311 | ClipFrac: 0.169 | Entropy: 2.317
Step: 190464 | Rollout Reward: 0.14 | KL: 0.01553 | ClipFrac: 0.202 | Entropy: 2.302




[Eval @ 190464] Return: 245.99
Saved video: videos/ppo_eval_190464.mp4
Step: 192512 | Rollout Reward: 0.14 | KL: 0.01512 | ClipFrac: 0.182 | Entropy: 2.298
Step: 194560 | Rollout Reward: 0.05 | KL: 0.00924 | ClipFrac: 0.115 | Entropy: 2.285
Step: 196608 | Rollout Reward: 0.24 | KL: 0.00843 | ClipFrac: 0.092 | Entropy: 2.290
Step: 198656 | Rollout Reward: 0.12 | KL: 0.01515 | ClipFrac: 0.127 | Entropy: 2.290
Step: 200704 | Rollout Reward: 0.19 | KL: 0.00777 | ClipFrac: 0.149 | Entropy: 2.297




[Eval @ 200704] Return: 266.55
Saved video: videos/ppo_eval_200704.mp4
Step: 202752 | Rollout Reward: 0.14 | KL: 0.01757 | ClipFrac: 0.173 | Entropy: 2.270
Step: 204800 | Rollout Reward: 0.16 | KL: 0.01774 | ClipFrac: 0.175 | Entropy: 2.236
Step: 206848 | Rollout Reward: 0.11 | KL: 0.01102 | ClipFrac: 0.155 | Entropy: 2.227
Step: 208896 | Rollout Reward: 0.16 | KL: 0.01307 | ClipFrac: 0.183 | Entropy: 2.221
Step: 210944 | Rollout Reward: 0.10 | KL: 0.01201 | ClipFrac: 0.198 | Entropy: 2.196




[Eval @ 210944] Return: -209.30
Saved video: videos/ppo_eval_210944.mp4
Step: 212992 | Rollout Reward: 0.12 | KL: 0.01003 | ClipFrac: 0.155 | Entropy: 2.151
Step: 215040 | Rollout Reward: 0.12 | KL: 0.01501 | ClipFrac: 0.178 | Entropy: 2.164
Step: 217088 | Rollout Reward: 0.17 | KL: 0.01164 | ClipFrac: 0.113 | Entropy: 2.157
Step: 219136 | Rollout Reward: 0.16 | KL: 0.01183 | ClipFrac: 0.215 | Entropy: 2.143
Step: 221184 | Rollout Reward: 0.16 | KL: 0.00873 | ClipFrac: 0.156 | Entropy: 2.105




[Eval @ 221184] Return: -158.80
Saved video: videos/ppo_eval_221184.mp4
Step: 223232 | Rollout Reward: 0.14 | KL: 0.01856 | ClipFrac: 0.216 | Entropy: 2.067
Step: 225280 | Rollout Reward: 0.16 | KL: 0.01184 | ClipFrac: 0.148 | Entropy: 2.059
Step: 227328 | Rollout Reward: 0.15 | KL: 0.01099 | ClipFrac: 0.151 | Entropy: 2.044
Step: 229376 | Rollout Reward: 0.12 | KL: 0.01756 | ClipFrac: 0.202 | Entropy: 2.017
Step: 231424 | Rollout Reward: 0.14 | KL: 0.01185 | ClipFrac: 0.164 | Entropy: 1.994




[Eval @ 231424] Return: 257.10
Saved video: videos/ppo_eval_231424.mp4
Step: 233472 | Rollout Reward: 0.26 | KL: 0.01637 | ClipFrac: 0.183 | Entropy: 1.999
Step: 235520 | Rollout Reward: 0.15 | KL: 0.01409 | ClipFrac: 0.204 | Entropy: 2.000
Step: 237568 | Rollout Reward: 0.16 | KL: 0.01667 | ClipFrac: 0.217 | Entropy: 1.976
Step: 239616 | Rollout Reward: 0.16 | KL: 0.01809 | ClipFrac: 0.181 | Entropy: 1.955
Step: 241664 | Rollout Reward: 0.15 | KL: 0.01818 | ClipFrac: 0.215 | Entropy: 1.960




[Eval @ 241664] Return: 247.14
Saved video: videos/ppo_eval_241664.mp4
Step: 243712 | Rollout Reward: 0.15 | KL: 0.01190 | ClipFrac: 0.143 | Entropy: 1.937
Step: 245760 | Rollout Reward: 0.17 | KL: 0.01493 | ClipFrac: 0.138 | Entropy: 1.923
Step: 247808 | Rollout Reward: 0.20 | KL: 0.00608 | ClipFrac: 0.101 | Entropy: 1.918
Step: 249856 | Rollout Reward: 0.28 | KL: 0.01565 | ClipFrac: 0.161 | Entropy: 1.924
Step: 251904 | Rollout Reward: 0.15 | KL: 0.01166 | ClipFrac: 0.187 | Entropy: 1.929




[Eval @ 251904] Return: 289.51
Saved video: videos/ppo_eval_251904.mp4
Step: 253952 | Rollout Reward: 0.15 | KL: 0.01505 | ClipFrac: 0.195 | Entropy: 1.894
Step: 256000 | Rollout Reward: 0.17 | KL: 0.01176 | ClipFrac: 0.194 | Entropy: 1.868
Step: 258048 | Rollout Reward: 0.18 | KL: 0.01162 | ClipFrac: 0.183 | Entropy: 1.875
Step: 260096 | Rollout Reward: 0.26 | KL: 0.01864 | ClipFrac: 0.173 | Entropy: 1.880




[Eval @ 260096] Return: 220.29
Saved video: videos/ppo_eval_260096.mp4
Step: 262144 | Rollout Reward: 0.14 | KL: 0.01919 | ClipFrac: 0.220 | Entropy: 1.898
Step: 264192 | Rollout Reward: 0.23 | KL: 0.03170 | ClipFrac: 0.300 | Entropy: 1.894
Step: 266240 | Rollout Reward: 0.17 | KL: 0.01147 | ClipFrac: 0.152 | Entropy: 1.862
Step: 268288 | Rollout Reward: 0.31 | KL: 0.01759 | ClipFrac: 0.193 | Entropy: 1.861
Step: 270336 | Rollout Reward: 0.18 | KL: 0.01260 | ClipFrac: 0.135 | Entropy: 1.865




[Eval @ 270336] Return: 238.70
Saved video: videos/ppo_eval_270336.mp4
Step: 272384 | Rollout Reward: 0.16 | KL: 0.01604 | ClipFrac: 0.189 | Entropy: 1.877
Step: 274432 | Rollout Reward: 0.19 | KL: 0.01044 | ClipFrac: 0.137 | Entropy: 1.869
Step: 276480 | Rollout Reward: 0.24 | KL: 0.00932 | ClipFrac: 0.144 | Entropy: 1.875
Step: 278528 | Rollout Reward: 0.15 | KL: 0.01042 | ClipFrac: 0.150 | Entropy: 1.877
Step: 280576 | Rollout Reward: 0.16 | KL: 0.01042 | ClipFrac: 0.152 | Entropy: 1.883




[Eval @ 280576] Return: 143.49
Saved video: videos/ppo_eval_280576.mp4
Step: 282624 | Rollout Reward: 0.10 | KL: 0.02556 | ClipFrac: 0.286 | Entropy: 1.851
Step: 284672 | Rollout Reward: 0.26 | KL: 0.01759 | ClipFrac: 0.228 | Entropy: 1.846
Step: 286720 | Rollout Reward: 0.17 | KL: 0.01465 | ClipFrac: 0.214 | Entropy: 1.813
Step: 288768 | Rollout Reward: 0.07 | KL: 0.01191 | ClipFrac: 0.159 | Entropy: 1.824
Step: 290816 | Rollout Reward: 0.19 | KL: 0.01462 | ClipFrac: 0.192 | Entropy: 1.837




[Eval @ 290816] Return: 141.04
Saved video: videos/ppo_eval_290816.mp4
Step: 292864 | Rollout Reward: 0.20 | KL: 0.01970 | ClipFrac: 0.228 | Entropy: 1.828
Step: 294912 | Rollout Reward: 0.14 | KL: 0.02608 | ClipFrac: 0.238 | Entropy: 1.833
Step: 296960 | Rollout Reward: 0.22 | KL: 0.01342 | ClipFrac: 0.155 | Entropy: 1.842
Step: 299008 | Rollout Reward: 0.14 | KL: 0.02007 | ClipFrac: 0.142 | Entropy: 1.833
Step: 301056 | Rollout Reward: 0.15 | KL: 0.01728 | ClipFrac: 0.195 | Entropy: 1.830




[Eval @ 301056] Return: 251.81
Saved video: videos/ppo_eval_301056.mp4
Step: 303104 | Rollout Reward: 0.21 | KL: 0.01139 | ClipFrac: 0.116 | Entropy: 1.844
Step: 305152 | Rollout Reward: 0.15 | KL: 0.00997 | ClipFrac: 0.149 | Entropy: 1.836
Step: 307200 | Rollout Reward: 0.17 | KL: 0.01632 | ClipFrac: 0.196 | Entropy: 1.812
Step: 309248 | Rollout Reward: 0.17 | KL: 0.02309 | ClipFrac: 0.237 | Entropy: 1.809
Step: 311296 | Rollout Reward: 0.20 | KL: 0.01090 | ClipFrac: 0.172 | Entropy: 1.830




[Eval @ 311296] Return: 263.59
Saved video: videos/ppo_eval_311296.mp4
Step: 313344 | Rollout Reward: 0.13 | KL: 0.01586 | ClipFrac: 0.209 | Entropy: 1.833
Step: 315392 | Rollout Reward: 0.13 | KL: 0.01421 | ClipFrac: 0.184 | Entropy: 1.828
Step: 317440 | Rollout Reward: 0.15 | KL: 0.01324 | ClipFrac: 0.183 | Entropy: 1.816
Step: 319488 | Rollout Reward: 0.15 | KL: 0.01380 | ClipFrac: 0.169 | Entropy: 1.772
Step: 321536 | Rollout Reward: 0.17 | KL: 0.01339 | ClipFrac: 0.152 | Entropy: 1.776




[Eval @ 321536] Return: 221.22
Saved video: videos/ppo_eval_321536.mp4
Step: 323584 | Rollout Reward: 0.22 | KL: 0.01185 | ClipFrac: 0.143 | Entropy: 1.766
Step: 325632 | Rollout Reward: 0.34 | KL: 0.01945 | ClipFrac: 0.202 | Entropy: 1.765
Step: 327680 | Rollout Reward: 0.19 | KL: 0.01641 | ClipFrac: 0.172 | Entropy: 1.770
Step: 329728 | Rollout Reward: 0.28 | KL: 0.00544 | ClipFrac: 0.135 | Entropy: 1.769
Step: 331776 | Rollout Reward: 0.15 | KL: 0.01906 | ClipFrac: 0.208 | Entropy: 1.749




[Eval @ 331776] Return: 286.78
Saved video: videos/ppo_eval_331776.mp4
Step: 333824 | Rollout Reward: 0.16 | KL: 0.01618 | ClipFrac: 0.200 | Entropy: 1.726
Step: 335872 | Rollout Reward: 0.16 | KL: 0.01773 | ClipFrac: 0.193 | Entropy: 1.689
Step: 337920 | Rollout Reward: 0.15 | KL: 0.01907 | ClipFrac: 0.185 | Entropy: 1.677
Step: 339968 | Rollout Reward: 0.19 | KL: 0.01503 | ClipFrac: 0.150 | Entropy: 1.675
Step: 342016 | Rollout Reward: 0.20 | KL: 0.01211 | ClipFrac: 0.152 | Entropy: 1.672




[Eval @ 342016] Return: 253.12
Saved video: videos/ppo_eval_342016.mp4
Step: 344064 | Rollout Reward: 0.29 | KL: 0.00952 | ClipFrac: 0.149 | Entropy: 1.657
Step: 346112 | Rollout Reward: 0.13 | KL: 0.02552 | ClipFrac: 0.226 | Entropy: 1.663
Step: 348160 | Rollout Reward: 0.15 | KL: 0.01437 | ClipFrac: 0.189 | Entropy: 1.679
Step: 350208 | Rollout Reward: 0.18 | KL: 0.01132 | ClipFrac: 0.162 | Entropy: 1.667




[Eval @ 350208] Return: 233.86
Saved video: videos/ppo_eval_350208.mp4
Step: 352256 | Rollout Reward: 0.27 | KL: 0.01365 | ClipFrac: 0.136 | Entropy: 1.666
Step: 354304 | Rollout Reward: 0.32 | KL: 0.01252 | ClipFrac: 0.201 | Entropy: 1.665
Step: 356352 | Rollout Reward: 0.14 | KL: 0.02351 | ClipFrac: 0.235 | Entropy: 1.670
Step: 358400 | Rollout Reward: 0.27 | KL: 0.01248 | ClipFrac: 0.131 | Entropy: 1.679
Step: 360448 | Rollout Reward: 0.11 | KL: 0.01729 | ClipFrac: 0.200 | Entropy: 1.684




[Eval @ 360448] Return: 239.99
Saved video: videos/ppo_eval_360448.mp4
Step: 362496 | Rollout Reward: 0.20 | KL: 0.01404 | ClipFrac: 0.172 | Entropy: 1.693
Step: 364544 | Rollout Reward: 0.15 | KL: 0.01972 | ClipFrac: 0.235 | Entropy: 1.676
Step: 366592 | Rollout Reward: 0.14 | KL: 0.01981 | ClipFrac: 0.197 | Entropy: 1.681
Step: 368640 | Rollout Reward: 0.18 | KL: 0.01734 | ClipFrac: 0.200 | Entropy: 1.673
Step: 370688 | Rollout Reward: 0.16 | KL: 0.01739 | ClipFrac: 0.218 | Entropy: 1.688




[Eval @ 370688] Return: 257.59
Saved video: videos/ppo_eval_370688.mp4
Step: 372736 | Rollout Reward: 0.13 | KL: 0.02327 | ClipFrac: 0.246 | Entropy: 1.692
Step: 374784 | Rollout Reward: 0.18 | KL: 0.01789 | ClipFrac: 0.277 | Entropy: 1.679
Step: 376832 | Rollout Reward: 0.17 | KL: 0.01946 | ClipFrac: 0.197 | Entropy: 1.633
Step: 378880 | Rollout Reward: 0.13 | KL: 0.01730 | ClipFrac: 0.203 | Entropy: 1.664
Step: 380928 | Rollout Reward: 0.30 | KL: 0.01815 | ClipFrac: 0.177 | Entropy: 1.666




[Eval @ 380928] Return: 227.57
Saved video: videos/ppo_eval_380928.mp4
Step: 382976 | Rollout Reward: 0.52 | KL: 0.01207 | ClipFrac: 0.109 | Entropy: 1.673
Step: 385024 | Rollout Reward: 0.26 | KL: 0.01024 | ClipFrac: 0.153 | Entropy: 1.668
Step: 387072 | Rollout Reward: 0.14 | KL: 0.01743 | ClipFrac: 0.219 | Entropy: 1.657
Step: 389120 | Rollout Reward: 0.15 | KL: 0.02096 | ClipFrac: 0.240 | Entropy: 1.647
Step: 391168 | Rollout Reward: 0.16 | KL: 0.01796 | ClipFrac: 0.214 | Entropy: 1.664




[Eval @ 391168] Return: 189.62
Saved video: videos/ppo_eval_391168.mp4
Step: 393216 | Rollout Reward: 0.13 | KL: 0.03111 | ClipFrac: 0.283 | Entropy: 1.686
Step: 395264 | Rollout Reward: 0.33 | KL: 0.01858 | ClipFrac: 0.199 | Entropy: 1.702
Step: 397312 | Rollout Reward: 0.08 | KL: 0.01501 | ClipFrac: 0.165 | Entropy: 1.712
Step: 399360 | Rollout Reward: 0.21 | KL: 0.01685 | ClipFrac: 0.212 | Entropy: 1.736
Step: 401408 | Rollout Reward: 0.05 | KL: 0.02707 | ClipFrac: 0.265 | Entropy: 1.752




[Eval @ 401408] Return: 259.81
Saved video: videos/ppo_eval_401408.mp4
Step: 403456 | Rollout Reward: 0.25 | KL: 0.01193 | ClipFrac: 0.162 | Entropy: 1.752
Step: 405504 | Rollout Reward: 0.16 | KL: 0.01214 | ClipFrac: 0.153 | Entropy: 1.778
Step: 407552 | Rollout Reward: 0.18 | KL: 0.01593 | ClipFrac: 0.148 | Entropy: 1.781
Step: 409600 | Rollout Reward: 0.22 | KL: 0.01182 | ClipFrac: 0.177 | Entropy: 1.779
Step: 411648 | Rollout Reward: 0.14 | KL: 0.01722 | ClipFrac: 0.192 | Entropy: 1.828




[Eval @ 411648] Return: -19.35
Saved video: videos/ppo_eval_411648.mp4
Step: 413696 | Rollout Reward: 0.16 | KL: 0.01992 | ClipFrac: 0.193 | Entropy: 1.835
Step: 415744 | Rollout Reward: 0.18 | KL: 0.01459 | ClipFrac: 0.195 | Entropy: 1.819
Step: 417792 | Rollout Reward: 0.14 | KL: 0.01780 | ClipFrac: 0.203 | Entropy: 1.847
Step: 419840 | Rollout Reward: 0.32 | KL: 0.00597 | ClipFrac: 0.084 | Entropy: 1.844
Step: 421888 | Rollout Reward: 0.17 | KL: 0.01004 | ClipFrac: 0.073 | Entropy: 1.842




[Eval @ 421888] Return: 264.80
Saved video: videos/ppo_eval_421888.mp4
Step: 423936 | Rollout Reward: 0.17 | KL: 0.00889 | ClipFrac: 0.126 | Entropy: 1.832
Step: 425984 | Rollout Reward: 0.15 | KL: 0.01701 | ClipFrac: 0.230 | Entropy: 1.796
Step: 428032 | Rollout Reward: 0.13 | KL: 0.01689 | ClipFrac: 0.196 | Entropy: 1.767
Step: 430080 | Rollout Reward: 0.17 | KL: 0.02113 | ClipFrac: 0.237 | Entropy: 1.801




[Eval @ 430080] Return: 238.99
Saved video: videos/ppo_eval_430080.mp4
Step: 432128 | Rollout Reward: 0.17 | KL: 0.01797 | ClipFrac: 0.180 | Entropy: 1.824
Step: 434176 | Rollout Reward: 0.19 | KL: 0.02190 | ClipFrac: 0.240 | Entropy: 1.829
Step: 436224 | Rollout Reward: 0.15 | KL: 0.01695 | ClipFrac: 0.230 | Entropy: 1.835
Step: 438272 | Rollout Reward: 0.17 | KL: 0.02547 | ClipFrac: 0.171 | Entropy: 1.836
Step: 440320 | Rollout Reward: 0.17 | KL: 0.00838 | ClipFrac: 0.114 | Entropy: 1.824




[Eval @ 440320] Return: 220.64
Saved video: videos/ppo_eval_440320.mp4
Step: 442368 | Rollout Reward: 0.15 | KL: 0.02388 | ClipFrac: 0.288 | Entropy: 1.803
Step: 444416 | Rollout Reward: 0.26 | KL: 0.01870 | ClipFrac: 0.197 | Entropy: 1.795
Step: 446464 | Rollout Reward: 0.15 | KL: 0.02795 | ClipFrac: 0.279 | Entropy: 1.740
Step: 448512 | Rollout Reward: 0.18 | KL: 0.03696 | ClipFrac: 0.294 | Entropy: 1.737
Step: 450560 | Rollout Reward: 0.14 | KL: 0.02597 | ClipFrac: 0.238 | Entropy: 1.763




[Eval @ 450560] Return: 31.01
Saved video: videos/ppo_eval_450560.mp4
Step: 452608 | Rollout Reward: 0.25 | KL: 0.01443 | ClipFrac: 0.140 | Entropy: 1.761
Step: 454656 | Rollout Reward: 0.16 | KL: 0.02542 | ClipFrac: 0.295 | Entropy: 1.753
Step: 456704 | Rollout Reward: 0.24 | KL: 0.01075 | ClipFrac: 0.120 | Entropy: 1.762
Step: 458752 | Rollout Reward: 0.03 | KL: 0.07893 | ClipFrac: 0.377 | Entropy: 1.768
Step: 460800 | Rollout Reward: 0.24 | KL: 0.02009 | ClipFrac: 0.201 | Entropy: 1.768




[Eval @ 460800] Return: 196.94
Saved video: videos/ppo_eval_460800.mp4
Step: 462848 | Rollout Reward: 0.13 | KL: 0.02674 | ClipFrac: 0.277 | Entropy: 1.783
Step: 464896 | Rollout Reward: 0.13 | KL: 0.02983 | ClipFrac: 0.213 | Entropy: 1.791
Step: 466944 | Rollout Reward: 0.18 | KL: 0.03237 | ClipFrac: 0.273 | Entropy: 1.791
Step: 468992 | Rollout Reward: 0.17 | KL: 0.01811 | ClipFrac: 0.190 | Entropy: 1.777
Step: 471040 | Rollout Reward: 0.27 | KL: 0.01071 | ClipFrac: 0.149 | Entropy: 1.809




[Eval @ 471040] Return: 311.79
Saved video: videos/ppo_eval_471040.mp4
Step: 473088 | Rollout Reward: 0.17 | KL: 0.03672 | ClipFrac: 0.309 | Entropy: 1.864
Step: 475136 | Rollout Reward: 0.15 | KL: 0.02573 | ClipFrac: 0.290 | Entropy: 1.860
Step: 477184 | Rollout Reward: 0.15 | KL: 0.04634 | ClipFrac: 0.409 | Entropy: 1.862
Step: 479232 | Rollout Reward: 0.15 | KL: 0.01407 | ClipFrac: 0.133 | Entropy: 1.873
Step: 481280 | Rollout Reward: 0.12 | KL: 0.01315 | ClipFrac: 0.152 | Entropy: 1.871




[Eval @ 481280] Return: 284.60
Saved video: videos/ppo_eval_481280.mp4
Step: 483328 | Rollout Reward: 0.15 | KL: 0.02417 | ClipFrac: 0.247 | Entropy: 1.828
Step: 485376 | Rollout Reward: 0.17 | KL: 0.01582 | ClipFrac: 0.191 | Entropy: 1.815
Step: 487424 | Rollout Reward: 0.15 | KL: 0.01846 | ClipFrac: 0.200 | Entropy: 1.797
Step: 489472 | Rollout Reward: 0.13 | KL: 0.01408 | ClipFrac: 0.161 | Entropy: 1.779
Step: 491520 | Rollout Reward: 0.14 | KL: 0.01978 | ClipFrac: 0.222 | Entropy: 1.773




[Eval @ 491520] Return: -44.88
Saved video: videos/ppo_eval_491520.mp4
Step: 493568 | Rollout Reward: 0.11 | KL: 0.02547 | ClipFrac: 0.268 | Entropy: 1.780
Step: 495616 | Rollout Reward: 0.10 | KL: 0.04787 | ClipFrac: 0.175 | Entropy: 1.784
Step: 497664 | Rollout Reward: 0.26 | KL: 0.01377 | ClipFrac: 0.129 | Entropy: 1.777
Step: 499712 | Rollout Reward: 0.21 | KL: 0.01798 | ClipFrac: 0.230 | Entropy: 1.758
Step: 501760 | Rollout Reward: 0.09 | KL: 0.02707 | ClipFrac: 0.291 | Entropy: 1.761




[Eval @ 501760] Return: 290.64
Saved video: videos/ppo_eval_501760.mp4
Step: 503808 | Rollout Reward: 0.15 | KL: 0.01372 | ClipFrac: 0.177 | Entropy: 1.771
Step: 505856 | Rollout Reward: 0.40 | KL: 0.02525 | ClipFrac: 0.255 | Entropy: 1.769
Step: 507904 | Rollout Reward: 0.27 | KL: 0.01311 | ClipFrac: 0.169 | Entropy: 1.765
Step: 509952 | Rollout Reward: 0.07 | KL: 0.02686 | ClipFrac: 0.223 | Entropy: 1.769
Step: 512000 | Rollout Reward: 0.09 | KL: 0.02298 | ClipFrac: 0.268 | Entropy: 1.766




[Eval @ 512000] Return: -33.42
Saved video: videos/ppo_eval_512000.mp4
Step: 514048 | Rollout Reward: 0.14 | KL: 0.01800 | ClipFrac: 0.230 | Entropy: 1.734
Step: 516096 | Rollout Reward: 0.13 | KL: 0.01249 | ClipFrac: 0.172 | Entropy: 1.740
Step: 518144 | Rollout Reward: 0.21 | KL: 0.01241 | ClipFrac: 0.135 | Entropy: 1.730
Step: 520192 | Rollout Reward: 0.14 | KL: 0.02172 | ClipFrac: 0.236 | Entropy: 1.743




[Eval @ 520192] Return: 228.98
Saved video: videos/ppo_eval_520192.mp4
Step: 522240 | Rollout Reward: 0.15 | KL: 0.01581 | ClipFrac: 0.210 | Entropy: 1.734
Step: 524288 | Rollout Reward: 0.23 | KL: 0.01028 | ClipFrac: 0.135 | Entropy: 1.727
Step: 526336 | Rollout Reward: 0.17 | KL: 0.01797 | ClipFrac: 0.198 | Entropy: 1.703
Step: 528384 | Rollout Reward: 0.12 | KL: 0.01082 | ClipFrac: 0.164 | Entropy: 1.653
Step: 530432 | Rollout Reward: 0.21 | KL: 0.01079 | ClipFrac: 0.174 | Entropy: 1.654




[Eval @ 530432] Return: 234.40
Saved video: videos/ppo_eval_530432.mp4
Step: 532480 | Rollout Reward: 0.17 | KL: 0.01208 | ClipFrac: 0.152 | Entropy: 1.643
Step: 534528 | Rollout Reward: 0.34 | KL: 0.01016 | ClipFrac: 0.117 | Entropy: 1.633
Step: 536576 | Rollout Reward: 0.12 | KL: 0.02823 | ClipFrac: 0.244 | Entropy: 1.642
Step: 538624 | Rollout Reward: 0.16 | KL: 0.01668 | ClipFrac: 0.184 | Entropy: 1.655
Step: 540672 | Rollout Reward: 0.14 | KL: 0.01610 | ClipFrac: 0.212 | Entropy: 1.663




[Eval @ 540672] Return: -8.44
Saved video: videos/ppo_eval_540672.mp4
Step: 542720 | Rollout Reward: 0.14 | KL: 0.04436 | ClipFrac: 0.345 | Entropy: 1.690
Step: 544768 | Rollout Reward: 0.14 | KL: 0.02217 | ClipFrac: 0.238 | Entropy: 1.682
Step: 546816 | Rollout Reward: 0.13 | KL: 0.01630 | ClipFrac: 0.204 | Entropy: 1.688
Step: 548864 | Rollout Reward: 0.13 | KL: 0.02339 | ClipFrac: 0.158 | Entropy: 1.691
Step: 550912 | Rollout Reward: 0.13 | KL: 0.01521 | ClipFrac: 0.195 | Entropy: 1.695




[Eval @ 550912] Return: -35.01
Saved video: videos/ppo_eval_550912.mp4
Step: 552960 | Rollout Reward: 0.14 | KL: 0.02030 | ClipFrac: 0.238 | Entropy: 1.696
Step: 555008 | Rollout Reward: 0.11 | KL: 0.03085 | ClipFrac: 0.257 | Entropy: 1.708
Step: 557056 | Rollout Reward: 0.30 | KL: 0.04723 | ClipFrac: 0.141 | Entropy: 1.699
Step: 559104 | Rollout Reward: 0.15 | KL: 0.01993 | ClipFrac: 0.252 | Entropy: 1.690
Step: 561152 | Rollout Reward: 0.09 | KL: 0.03121 | ClipFrac: 0.239 | Entropy: 1.704




[Eval @ 561152] Return: 230.55
Saved video: videos/ppo_eval_561152.mp4
Step: 563200 | Rollout Reward: 0.12 | KL: 0.02019 | ClipFrac: 0.202 | Entropy: 1.731
Step: 565248 | Rollout Reward: 0.41 | KL: 0.01703 | ClipFrac: 0.249 | Entropy: 1.732
Step: 567296 | Rollout Reward: 0.19 | KL: 0.03612 | ClipFrac: 0.316 | Entropy: 1.739
Step: 569344 | Rollout Reward: 0.21 | KL: 0.01718 | ClipFrac: 0.193 | Entropy: 1.725
Step: 571392 | Rollout Reward: 0.26 | KL: 0.01264 | ClipFrac: 0.154 | Entropy: 1.732




[Eval @ 571392] Return: 195.79
Saved video: videos/ppo_eval_571392.mp4
Step: 573440 | Rollout Reward: 0.21 | KL: 0.01592 | ClipFrac: 0.193 | Entropy: 1.732
Step: 575488 | Rollout Reward: 0.26 | KL: 0.01443 | ClipFrac: 0.154 | Entropy: 1.733
Step: 577536 | Rollout Reward: 0.17 | KL: 0.01749 | ClipFrac: 0.228 | Entropy: 1.731
Step: 579584 | Rollout Reward: 0.14 | KL: 0.01284 | ClipFrac: 0.133 | Entropy: 1.720
Step: 581632 | Rollout Reward: 0.40 | KL: 0.01517 | ClipFrac: 0.181 | Entropy: 1.716




[Eval @ 581632] Return: 115.55
Saved video: videos/ppo_eval_581632.mp4
Step: 583680 | Rollout Reward: 0.14 | KL: 0.02281 | ClipFrac: 0.272 | Entropy: 1.735
Step: 585728 | Rollout Reward: 0.07 | KL: 0.03814 | ClipFrac: 0.352 | Entropy: 1.739
Step: 587776 | Rollout Reward: 0.30 | KL: 0.01999 | ClipFrac: 0.190 | Entropy: 1.736
Step: 589824 | Rollout Reward: 0.21 | KL: 0.00859 | ClipFrac: 0.110 | Entropy: 1.725
Step: 591872 | Rollout Reward: 0.19 | KL: 0.02111 | ClipFrac: 0.204 | Entropy: 1.730




[Eval @ 591872] Return: 250.45
Saved video: videos/ppo_eval_591872.mp4
Step: 593920 | Rollout Reward: 0.26 | KL: 0.01373 | ClipFrac: 0.155 | Entropy: 1.716
Step: 595968 | Rollout Reward: 0.26 | KL: 0.01847 | ClipFrac: 0.192 | Entropy: 1.744
Step: 598016 | Rollout Reward: 0.12 | KL: 0.02124 | ClipFrac: 0.259 | Entropy: 1.765
Step: 600064 | Rollout Reward: 0.17 | KL: 0.02473 | ClipFrac: 0.276 | Entropy: 1.764




[Eval @ 600064] Return: 246.94
Saved video: videos/ppo_eval_600064.mp4
Step: 602112 | Rollout Reward: 0.15 | KL: 0.01431 | ClipFrac: 0.217 | Entropy: 1.735
Step: 604160 | Rollout Reward: 0.15 | KL: 0.03174 | ClipFrac: 0.310 | Entropy: 1.718
Step: 606208 | Rollout Reward: 0.08 | KL: 0.01737 | ClipFrac: 0.139 | Entropy: 1.714
Step: 608256 | Rollout Reward: 0.41 | KL: 0.01295 | ClipFrac: 0.156 | Entropy: 1.703
Step: 610304 | Rollout Reward: 0.18 | KL: 0.03785 | ClipFrac: 0.324 | Entropy: 1.710




[Eval @ 610304] Return: 240.91
Saved video: videos/ppo_eval_610304.mp4
Step: 612352 | Rollout Reward: 0.24 | KL: 0.01492 | ClipFrac: 0.180 | Entropy: 1.705
Step: 614400 | Rollout Reward: 0.19 | KL: 0.02197 | ClipFrac: 0.242 | Entropy: 1.714
Step: 616448 | Rollout Reward: 0.11 | KL: 0.03845 | ClipFrac: 0.312 | Entropy: 1.697
Step: 618496 | Rollout Reward: 0.14 | KL: 0.01385 | ClipFrac: 0.170 | Entropy: 1.676
Step: 620544 | Rollout Reward: 0.16 | KL: 0.01275 | ClipFrac: 0.128 | Entropy: 1.680




[Eval @ 620544] Return: 15.82
Saved video: videos/ppo_eval_620544.mp4
Step: 622592 | Rollout Reward: 0.24 | KL: 0.01435 | ClipFrac: 0.190 | Entropy: 1.701
Step: 624640 | Rollout Reward: 0.27 | KL: 0.01205 | ClipFrac: 0.153 | Entropy: 1.708
Step: 626688 | Rollout Reward: 0.12 | KL: 0.02670 | ClipFrac: 0.244 | Entropy: 1.715
Step: 628736 | Rollout Reward: 0.16 | KL: 0.01341 | ClipFrac: 0.183 | Entropy: 1.704
Step: 630784 | Rollout Reward: 0.18 | KL: 0.03500 | ClipFrac: 0.300 | Entropy: 1.736




[Eval @ 630784] Return: 240.20
Saved video: videos/ppo_eval_630784.mp4
Step: 632832 | Rollout Reward: 0.35 | KL: 0.09318 | ClipFrac: 0.213 | Entropy: 1.721
Step: 634880 | Rollout Reward: 0.28 | KL: 0.06360 | ClipFrac: 0.264 | Entropy: 1.719
Step: 636928 | Rollout Reward: 0.28 | KL: 0.01852 | ClipFrac: 0.210 | Entropy: 1.750
Step: 638976 | Rollout Reward: 0.11 | KL: 0.06254 | ClipFrac: 0.233 | Entropy: 1.743
Step: 641024 | Rollout Reward: 0.15 | KL: 0.04194 | ClipFrac: 0.241 | Entropy: 1.750




[Eval @ 641024] Return: -50.05
Saved video: videos/ppo_eval_641024.mp4
Step: 643072 | Rollout Reward: 0.23 | KL: 0.01134 | ClipFrac: 0.171 | Entropy: 1.741
Step: 645120 | Rollout Reward: 0.22 | KL: 0.01381 | ClipFrac: 0.168 | Entropy: 1.735
Step: 647168 | Rollout Reward: 0.23 | KL: 0.02351 | ClipFrac: 0.297 | Entropy: 1.724
Step: 649216 | Rollout Reward: 0.09 | KL: 0.02823 | ClipFrac: 0.276 | Entropy: 1.755
Step: 651264 | Rollout Reward: 0.16 | KL: 0.02859 | ClipFrac: 0.197 | Entropy: 1.727




[Eval @ 651264] Return: -41.70
Saved video: videos/ppo_eval_651264.mp4
Step: 653312 | Rollout Reward: 0.17 | KL: 0.01139 | ClipFrac: 0.148 | Entropy: 1.714
Step: 655360 | Rollout Reward: 0.12 | KL: 0.02029 | ClipFrac: 0.240 | Entropy: 1.745
Step: 657408 | Rollout Reward: 0.11 | KL: 0.00946 | ClipFrac: 0.156 | Entropy: 1.728
Step: 659456 | Rollout Reward: 0.07 | KL: 0.01243 | ClipFrac: 0.169 | Entropy: 1.729
Step: 661504 | Rollout Reward: 0.17 | KL: 0.01583 | ClipFrac: 0.221 | Entropy: 1.732




[Eval @ 661504] Return: 253.95
Saved video: videos/ppo_eval_661504.mp4
Step: 663552 | Rollout Reward: 0.14 | KL: 0.04491 | ClipFrac: 0.194 | Entropy: 1.738
Step: 665600 | Rollout Reward: 0.20 | KL: 0.01298 | ClipFrac: 0.177 | Entropy: 1.729
Step: 667648 | Rollout Reward: 0.28 | KL: 0.01141 | ClipFrac: 0.149 | Entropy: 1.724
Step: 669696 | Rollout Reward: 0.04 | KL: 0.02692 | ClipFrac: 0.165 | Entropy: 1.725
Step: 671744 | Rollout Reward: -0.03 | KL: 0.02442 | ClipFrac: 0.251 | Entropy: 1.719




[Eval @ 671744] Return: -98.99
Saved video: videos/ppo_eval_671744.mp4
Step: 673792 | Rollout Reward: -0.07 | KL: 0.01958 | ClipFrac: 0.232 | Entropy: 1.704
Step: 675840 | Rollout Reward: 0.04 | KL: 0.02006 | ClipFrac: 0.224 | Entropy: 1.698
Step: 677888 | Rollout Reward: 0.01 | KL: 0.01463 | ClipFrac: 0.208 | Entropy: 1.704
Step: 679936 | Rollout Reward: 0.09 | KL: 0.01878 | ClipFrac: 0.236 | Entropy: 1.688
Step: 681984 | Rollout Reward: -0.05 | KL: 0.01940 | ClipFrac: 0.225 | Entropy: 1.687




[Eval @ 681984] Return: -118.02
Saved video: videos/ppo_eval_681984.mp4
Step: 684032 | Rollout Reward: 0.13 | KL: 0.01452 | ClipFrac: 0.178 | Entropy: 1.701
Step: 686080 | Rollout Reward: 0.20 | KL: 0.01211 | ClipFrac: 0.176 | Entropy: 1.698
Step: 688128 | Rollout Reward: 0.17 | KL: 0.02106 | ClipFrac: 0.231 | Entropy: 1.698
Step: 690176 | Rollout Reward: 0.14 | KL: 0.01462 | ClipFrac: 0.178 | Entropy: 1.745




[Eval @ 690176] Return: 237.03
Saved video: videos/ppo_eval_690176.mp4
Step: 692224 | Rollout Reward: 0.16 | KL: 0.01513 | ClipFrac: 0.194 | Entropy: 1.753
Step: 694272 | Rollout Reward: 0.15 | KL: 0.01133 | ClipFrac: 0.158 | Entropy: 1.726
Step: 696320 | Rollout Reward: 0.12 | KL: 0.02274 | ClipFrac: 0.200 | Entropy: 1.731
Step: 698368 | Rollout Reward: 0.16 | KL: 0.02468 | ClipFrac: 0.250 | Entropy: 1.724
Step: 700416 | Rollout Reward: 0.14 | KL: 0.01583 | ClipFrac: 0.157 | Entropy: 1.722




[Eval @ 700416] Return: 244.09
Saved video: videos/ppo_eval_700416.mp4
Step: 702464 | Rollout Reward: 0.12 | KL: 0.02141 | ClipFrac: 0.217 | Entropy: 1.708
Step: 704512 | Rollout Reward: 0.15 | KL: 0.01080 | ClipFrac: 0.173 | Entropy: 1.695
Step: 706560 | Rollout Reward: 0.27 | KL: 0.01728 | ClipFrac: 0.177 | Entropy: 1.693
Step: 708608 | Rollout Reward: 0.14 | KL: 0.01931 | ClipFrac: 0.214 | Entropy: 1.706
Step: 710656 | Rollout Reward: 0.16 | KL: 0.02168 | ClipFrac: 0.233 | Entropy: 1.700




[Eval @ 710656] Return: 254.74
Saved video: videos/ppo_eval_710656.mp4
Step: 712704 | Rollout Reward: 0.37 | KL: 0.01461 | ClipFrac: 0.151 | Entropy: 1.700
Step: 714752 | Rollout Reward: 0.12 | KL: 0.01720 | ClipFrac: 0.157 | Entropy: 1.704
Step: 716800 | Rollout Reward: 0.21 | KL: 0.01590 | ClipFrac: 0.190 | Entropy: 1.691
Step: 718848 | Rollout Reward: 0.17 | KL: 0.01794 | ClipFrac: 0.191 | Entropy: 1.690
Step: 720896 | Rollout Reward: 0.14 | KL: 0.02018 | ClipFrac: 0.203 | Entropy: 1.642




[Eval @ 720896] Return: 202.72
Saved video: videos/ppo_eval_720896.mp4
Step: 722944 | Rollout Reward: 0.16 | KL: 0.01846 | ClipFrac: 0.217 | Entropy: 1.628
Step: 724992 | Rollout Reward: 0.13 | KL: 0.01884 | ClipFrac: 0.234 | Entropy: 1.619
Step: 727040 | Rollout Reward: 0.15 | KL: 0.01602 | ClipFrac: 0.214 | Entropy: 1.620
Step: 729088 | Rollout Reward: 0.12 | KL: 0.00942 | ClipFrac: 0.106 | Entropy: 1.623
Step: 731136 | Rollout Reward: 0.13 | KL: 0.01148 | ClipFrac: 0.150 | Entropy: 1.611




[Eval @ 731136] Return: -78.94
Saved video: videos/ppo_eval_731136.mp4
Step: 733184 | Rollout Reward: 0.11 | KL: 0.01566 | ClipFrac: 0.179 | Entropy: 1.613
Step: 735232 | Rollout Reward: 0.20 | KL: 0.01402 | ClipFrac: 0.170 | Entropy: 1.615
Step: 737280 | Rollout Reward: 0.14 | KL: 0.01702 | ClipFrac: 0.186 | Entropy: 1.619
Step: 739328 | Rollout Reward: 0.13 | KL: 0.01918 | ClipFrac: 0.222 | Entropy: 1.611
Step: 741376 | Rollout Reward: 0.17 | KL: 0.01721 | ClipFrac: 0.167 | Entropy: 1.607




[Eval @ 741376] Return: 195.87
Saved video: videos/ppo_eval_741376.mp4
Step: 743424 | Rollout Reward: 0.07 | KL: 0.02715 | ClipFrac: 0.276 | Entropy: 1.621
Step: 745472 | Rollout Reward: 0.21 | KL: 0.02754 | ClipFrac: 0.260 | Entropy: 1.622
Step: 747520 | Rollout Reward: 0.14 | KL: 0.01662 | ClipFrac: 0.196 | Entropy: 1.613
Step: 749568 | Rollout Reward: 0.21 | KL: 0.01395 | ClipFrac: 0.179 | Entropy: 1.609
Step: 751616 | Rollout Reward: 0.09 | KL: 0.01841 | ClipFrac: 0.212 | Entropy: 1.640




[Eval @ 751616] Return: 264.26
Saved video: videos/ppo_eval_751616.mp4
Step: 753664 | Rollout Reward: 0.20 | KL: 0.02494 | ClipFrac: 0.229 | Entropy: 1.633
Step: 755712 | Rollout Reward: 0.14 | KL: 0.02813 | ClipFrac: 0.253 | Entropy: 1.655
Step: 757760 | Rollout Reward: 0.20 | KL: 0.01975 | ClipFrac: 0.181 | Entropy: 1.660
Step: 759808 | Rollout Reward: 0.28 | KL: 0.01326 | ClipFrac: 0.171 | Entropy: 1.647
Step: 761856 | Rollout Reward: 0.38 | KL: 0.01383 | ClipFrac: 0.196 | Entropy: 1.655




[Eval @ 761856] Return: 246.84
Saved video: videos/ppo_eval_761856.mp4
Step: 763904 | Rollout Reward: 0.24 | KL: 0.01779 | ClipFrac: 0.197 | Entropy: 1.620
Step: 765952 | Rollout Reward: 0.15 | KL: 0.03428 | ClipFrac: 0.308 | Entropy: 1.639
Step: 768000 | Rollout Reward: 0.26 | KL: 0.02575 | ClipFrac: 0.175 | Entropy: 1.636
Step: 770048 | Rollout Reward: 0.14 | KL: 0.01682 | ClipFrac: 0.204 | Entropy: 1.636




[Eval @ 770048] Return: 279.03
Saved video: videos/ppo_eval_770048.mp4
Step: 772096 | Rollout Reward: 0.23 | KL: 0.00897 | ClipFrac: 0.128 | Entropy: 1.610
Step: 774144 | Rollout Reward: 0.33 | KL: 0.01379 | ClipFrac: 0.111 | Entropy: 1.599
Step: 776192 | Rollout Reward: 0.23 | KL: 0.01390 | ClipFrac: 0.178 | Entropy: 1.593
Step: 778240 | Rollout Reward: 0.16 | KL: 0.01919 | ClipFrac: 0.242 | Entropy: 1.594
Step: 780288 | Rollout Reward: 0.12 | KL: 0.01098 | ClipFrac: 0.159 | Entropy: 1.612




[Eval @ 780288] Return: 271.17
Saved video: videos/ppo_eval_780288.mp4
Step: 782336 | Rollout Reward: 0.27 | KL: 0.01262 | ClipFrac: 0.158 | Entropy: 1.598
Step: 784384 | Rollout Reward: 0.17 | KL: 0.00848 | ClipFrac: 0.142 | Entropy: 1.598
Step: 786432 | Rollout Reward: 0.28 | KL: 0.00675 | ClipFrac: 0.143 | Entropy: 1.583
Step: 788480 | Rollout Reward: 0.15 | KL: 0.02533 | ClipFrac: 0.257 | Entropy: 1.582
Step: 790528 | Rollout Reward: 0.17 | KL: 0.01415 | ClipFrac: 0.202 | Entropy: 1.610




[Eval @ 790528] Return: 239.24
Saved video: videos/ppo_eval_790528.mp4
Step: 792576 | Rollout Reward: 0.14 | KL: 0.01381 | ClipFrac: 0.185 | Entropy: 1.585
Step: 794624 | Rollout Reward: 0.25 | KL: 0.01918 | ClipFrac: 0.194 | Entropy: 1.577
Step: 796672 | Rollout Reward: 0.11 | KL: 0.01435 | ClipFrac: 0.177 | Entropy: 1.576
Step: 798720 | Rollout Reward: 0.12 | KL: 0.02742 | ClipFrac: 0.264 | Entropy: 1.560
Step: 800768 | Rollout Reward: 0.13 | KL: 0.01633 | ClipFrac: 0.204 | Entropy: 1.529




[Eval @ 800768] Return: -11.98
Saved video: videos/ppo_eval_800768.mp4
Step: 802816 | Rollout Reward: 0.12 | KL: 0.00617 | ClipFrac: 0.081 | Entropy: 1.529
Step: 804864 | Rollout Reward: 0.17 | KL: 0.01302 | ClipFrac: 0.182 | Entropy: 1.510
Step: 806912 | Rollout Reward: 0.13 | KL: 0.01345 | ClipFrac: 0.183 | Entropy: 1.507
Step: 808960 | Rollout Reward: 0.12 | KL: 0.02418 | ClipFrac: 0.229 | Entropy: 1.506
Step: 811008 | Rollout Reward: 0.13 | KL: 0.01269 | ClipFrac: 0.169 | Entropy: 1.469




[Eval @ 811008] Return: 212.10
Saved video: videos/ppo_eval_811008.mp4
Step: 813056 | Rollout Reward: 0.12 | KL: 0.01317 | ClipFrac: 0.196 | Entropy: 1.447
Step: 815104 | Rollout Reward: 0.16 | KL: 0.01213 | ClipFrac: 0.177 | Entropy: 1.457
Step: 817152 | Rollout Reward: 0.15 | KL: 0.02453 | ClipFrac: 0.235 | Entropy: 1.466
Step: 819200 | Rollout Reward: 0.22 | KL: 0.01162 | ClipFrac: 0.146 | Entropy: 1.460
Step: 821248 | Rollout Reward: 0.25 | KL: 0.01981 | ClipFrac: 0.164 | Entropy: 1.458




[Eval @ 821248] Return: 13.87
Saved video: videos/ppo_eval_821248.mp4
Step: 823296 | Rollout Reward: 0.14 | KL: 0.02748 | ClipFrac: 0.283 | Entropy: 1.461
Step: 825344 | Rollout Reward: 0.17 | KL: 0.01571 | ClipFrac: 0.178 | Entropy: 1.465
Step: 827392 | Rollout Reward: 0.44 | KL: 0.00467 | ClipFrac: 0.173 | Entropy: 1.454
Step: 829440 | Rollout Reward: 0.14 | KL: 0.02978 | ClipFrac: 0.324 | Entropy: 1.463
Step: 831488 | Rollout Reward: 0.17 | KL: 0.03079 | ClipFrac: 0.304 | Entropy: 1.467




[Eval @ 831488] Return: 278.21
Saved video: videos/ppo_eval_831488.mp4
Step: 833536 | Rollout Reward: 0.32 | KL: 0.02394 | ClipFrac: 0.198 | Entropy: 1.463
Step: 835584 | Rollout Reward: 0.13 | KL: 0.02773 | ClipFrac: 0.258 | Entropy: 1.468
Step: 837632 | Rollout Reward: 0.13 | KL: 0.01717 | ClipFrac: 0.186 | Entropy: 1.449
Step: 839680 | Rollout Reward: 0.13 | KL: 0.02051 | ClipFrac: 0.236 | Entropy: 1.438
Step: 841728 | Rollout Reward: 0.15 | KL: 0.02215 | ClipFrac: 0.246 | Entropy: 1.439




[Eval @ 841728] Return: 234.10
Saved video: videos/ppo_eval_841728.mp4
Step: 843776 | Rollout Reward: 0.16 | KL: 0.01369 | ClipFrac: 0.172 | Entropy: 1.412
Step: 845824 | Rollout Reward: 0.20 | KL: 0.00649 | ClipFrac: 0.079 | Entropy: 1.402
Step: 847872 | Rollout Reward: 0.27 | KL: 0.00951 | ClipFrac: 0.155 | Entropy: 1.395
Step: 849920 | Rollout Reward: 0.21 | KL: 0.01635 | ClipFrac: 0.179 | Entropy: 1.406
Step: 851968 | Rollout Reward: 0.14 | KL: 0.03152 | ClipFrac: 0.259 | Entropy: 1.407




[Eval @ 851968] Return: 295.40
Saved video: videos/ppo_eval_851968.mp4
Step: 854016 | Rollout Reward: 0.17 | KL: 0.02596 | ClipFrac: 0.257 | Entropy: 1.384
Step: 856064 | Rollout Reward: 0.19 | KL: 0.01978 | ClipFrac: 0.239 | Entropy: 1.381
Step: 858112 | Rollout Reward: 0.16 | KL: 0.01587 | ClipFrac: 0.172 | Entropy: 1.392
Step: 860160 | Rollout Reward: 0.06 | KL: 0.01436 | ClipFrac: 0.171 | Entropy: 1.377




[Eval @ 860160] Return: 241.66
Saved video: videos/ppo_eval_860160.mp4
Step: 862208 | Rollout Reward: 0.29 | KL: 0.01247 | ClipFrac: 0.169 | Entropy: 1.369
Step: 864256 | Rollout Reward: 0.15 | KL: 0.02053 | ClipFrac: 0.223 | Entropy: 1.352
Step: 866304 | Rollout Reward: 0.30 | KL: 0.01825 | ClipFrac: 0.188 | Entropy: 1.344
Step: 868352 | Rollout Reward: 0.20 | KL: 0.00937 | ClipFrac: 0.129 | Entropy: 1.323
Step: 870400 | Rollout Reward: 0.34 | KL: 0.01819 | ClipFrac: 0.191 | Entropy: 1.303




[Eval @ 870400] Return: -32.79
Saved video: videos/ppo_eval_870400.mp4
Step: 872448 | Rollout Reward: 0.16 | KL: 0.02190 | ClipFrac: 0.229 | Entropy: 1.294
Step: 874496 | Rollout Reward: 0.14 | KL: 0.02552 | ClipFrac: 0.249 | Entropy: 1.280
Step: 876544 | Rollout Reward: 0.13 | KL: 0.01780 | ClipFrac: 0.237 | Entropy: 1.273
Step: 878592 | Rollout Reward: 0.20 | KL: 0.00803 | ClipFrac: 0.127 | Entropy: 1.273
Step: 880640 | Rollout Reward: 0.16 | KL: 0.01944 | ClipFrac: 0.203 | Entropy: 1.238




[Eval @ 880640] Return: 257.34
Saved video: videos/ppo_eval_880640.mp4
Step: 882688 | Rollout Reward: 0.16 | KL: 0.02032 | ClipFrac: 0.250 | Entropy: 1.223
Step: 884736 | Rollout Reward: 0.49 | KL: 0.00892 | ClipFrac: 0.142 | Entropy: 1.218
Step: 886784 | Rollout Reward: 0.51 | KL: 0.02301 | ClipFrac: 0.217 | Entropy: 1.222
Step: 888832 | Rollout Reward: 0.33 | KL: 0.01633 | ClipFrac: 0.211 | Entropy: 1.221
Step: 890880 | Rollout Reward: 0.34 | KL: 0.01308 | ClipFrac: 0.153 | Entropy: 1.218




[Eval @ 890880] Return: 249.85
Saved video: videos/ppo_eval_890880.mp4
Step: 892928 | Rollout Reward: 0.23 | KL: 0.01750 | ClipFrac: 0.219 | Entropy: 1.212
Step: 894976 | Rollout Reward: 0.32 | KL: 0.01445 | ClipFrac: 0.178 | Entropy: 1.206
Step: 897024 | Rollout Reward: 0.48 | KL: 0.02896 | ClipFrac: 0.228 | Entropy: 1.192
Step: 899072 | Rollout Reward: 0.33 | KL: 0.03307 | ClipFrac: 0.342 | Entropy: 1.196
Step: 901120 | Rollout Reward: 0.62 | KL: 0.01328 | ClipFrac: 0.155 | Entropy: 1.174




[Eval @ 901120] Return: 282.82
Saved video: videos/ppo_eval_901120.mp4
Step: 903168 | Rollout Reward: 0.33 | KL: 0.02099 | ClipFrac: 0.200 | Entropy: 1.153
Step: 905216 | Rollout Reward: 0.32 | KL: 0.01591 | ClipFrac: 0.198 | Entropy: 1.139
Step: 907264 | Rollout Reward: 0.48 | KL: 0.01553 | ClipFrac: 0.179 | Entropy: 1.138
Step: 909312 | Rollout Reward: 0.39 | KL: 0.02880 | ClipFrac: 0.256 | Entropy: 1.141
Step: 911360 | Rollout Reward: 0.24 | KL: 0.04563 | ClipFrac: 0.295 | Entropy: 1.144




[Eval @ 911360] Return: 253.47
Saved video: videos/ppo_eval_911360.mp4
Step: 913408 | Rollout Reward: 0.42 | KL: 0.02132 | ClipFrac: 0.248 | Entropy: 1.134
Step: 915456 | Rollout Reward: 0.46 | KL: 0.04371 | ClipFrac: 0.304 | Entropy: 1.127
Step: 917504 | Rollout Reward: 0.47 | KL: 0.01748 | ClipFrac: 0.209 | Entropy: 1.106
Step: 919552 | Rollout Reward: 0.38 | KL: 0.03087 | ClipFrac: 0.293 | Entropy: 1.107
Step: 921600 | Rollout Reward: 0.62 | KL: 0.02555 | ClipFrac: 0.265 | Entropy: 1.095




[Eval @ 921600] Return: 225.39
Saved video: videos/ppo_eval_921600.mp4
Step: 923648 | Rollout Reward: 0.35 | KL: 0.01584 | ClipFrac: 0.207 | Entropy: 1.062
Step: 925696 | Rollout Reward: 0.14 | KL: 0.04757 | ClipFrac: 0.341 | Entropy: 1.125
Step: 927744 | Rollout Reward: 0.45 | KL: 0.01122 | ClipFrac: 0.153 | Entropy: 1.114
Step: 929792 | Rollout Reward: 0.42 | KL: 0.02150 | ClipFrac: 0.197 | Entropy: 1.104
Step: 931840 | Rollout Reward: 0.22 | KL: 0.02285 | ClipFrac: 0.270 | Entropy: 1.109




[Eval @ 931840] Return: 278.13
Saved video: videos/ppo_eval_931840.mp4
Step: 933888 | Rollout Reward: 0.51 | KL: 0.02770 | ClipFrac: 0.269 | Entropy: 1.110
Step: 935936 | Rollout Reward: 0.13 | KL: 0.02793 | ClipFrac: 0.274 | Entropy: 1.073
Step: 937984 | Rollout Reward: 0.74 | KL: 0.01804 | ClipFrac: 0.200 | Entropy: 1.057
Step: 940032 | Rollout Reward: 0.43 | KL: 0.01963 | ClipFrac: 0.205 | Entropy: 1.065




[Eval @ 940032] Return: 299.28
Saved video: videos/ppo_eval_940032.mp4
Step: 942080 | Rollout Reward: 0.53 | KL: 0.01120 | ClipFrac: 0.179 | Entropy: 1.061
Step: 944128 | Rollout Reward: 0.57 | KL: 0.00848 | ClipFrac: 0.156 | Entropy: 1.025
Step: 946176 | Rollout Reward: 0.52 | KL: 0.01356 | ClipFrac: 0.176 | Entropy: 1.017
Step: 948224 | Rollout Reward: 0.61 | KL: 0.02071 | ClipFrac: 0.195 | Entropy: 0.987
Step: 950272 | Rollout Reward: 0.70 | KL: 0.02414 | ClipFrac: 0.251 | Entropy: 0.949




[Eval @ 950272] Return: 272.62
Saved video: videos/ppo_eval_950272.mp4
Step: 952320 | Rollout Reward: 0.51 | KL: 0.02122 | ClipFrac: 0.220 | Entropy: 0.910
Step: 954368 | Rollout Reward: 0.42 | KL: 0.01591 | ClipFrac: 0.217 | Entropy: 0.894
Step: 956416 | Rollout Reward: 0.58 | KL: 0.03297 | ClipFrac: 0.306 | Entropy: 0.895
Step: 958464 | Rollout Reward: 0.25 | KL: 0.01672 | ClipFrac: 0.204 | Entropy: 0.901
Step: 960512 | Rollout Reward: 0.52 | KL: 0.01474 | ClipFrac: 0.178 | Entropy: 0.855




[Eval @ 960512] Return: 286.10
Saved video: videos/ppo_eval_960512.mp4
Step: 962560 | Rollout Reward: 0.59 | KL: 0.02146 | ClipFrac: 0.215 | Entropy: 0.828
Step: 964608 | Rollout Reward: 0.58 | KL: 0.02892 | ClipFrac: 0.284 | Entropy: 0.825
Step: 966656 | Rollout Reward: 0.57 | KL: 0.02493 | ClipFrac: 0.243 | Entropy: 0.820
Step: 968704 | Rollout Reward: 0.44 | KL: 0.04417 | ClipFrac: 0.365 | Entropy: 0.827
Step: 970752 | Rollout Reward: 0.66 | KL: 0.02394 | ClipFrac: 0.226 | Entropy: 0.809




[Eval @ 970752] Return: 278.91
Saved video: videos/ppo_eval_970752.mp4
Step: 972800 | Rollout Reward: 0.78 | KL: 0.02072 | ClipFrac: 0.194 | Entropy: 0.804
Step: 974848 | Rollout Reward: 0.73 | KL: 0.03107 | ClipFrac: 0.245 | Entropy: 0.803
Step: 976896 | Rollout Reward: 0.52 | KL: 0.01933 | ClipFrac: 0.226 | Entropy: 0.790
Step: 978944 | Rollout Reward: 0.82 | KL: 0.01723 | ClipFrac: 0.218 | Entropy: 0.767
Step: 980992 | Rollout Reward: 1.02 | KL: 0.02655 | ClipFrac: 0.259 | Entropy: 0.764




[Eval @ 980992] Return: 281.88
Saved video: videos/ppo_eval_980992.mp4
Step: 983040 | Rollout Reward: 0.75 | KL: 0.01377 | ClipFrac: 0.203 | Entropy: 0.779
Step: 985088 | Rollout Reward: 0.26 | KL: 0.03750 | ClipFrac: 0.315 | Entropy: 0.787
Step: 987136 | Rollout Reward: 0.52 | KL: 0.03474 | ClipFrac: 0.273 | Entropy: 0.768
Step: 989184 | Rollout Reward: 0.83 | KL: 0.02513 | ClipFrac: 0.236 | Entropy: 0.775
Step: 991232 | Rollout Reward: 0.83 | KL: 0.02950 | ClipFrac: 0.245 | Entropy: 0.790




[Eval @ 991232] Return: 253.58
Saved video: videos/ppo_eval_991232.mp4
Step: 993280 | Rollout Reward: 0.51 | KL: 0.06855 | ClipFrac: 0.338 | Entropy: 0.789
Step: 995328 | Rollout Reward: 0.46 | KL: 0.05297 | ClipFrac: 0.286 | Entropy: 0.787
Step: 997376 | Rollout Reward: 1.05 | KL: 0.07104 | ClipFrac: 0.351 | Entropy: 0.797
Step: 999424 | Rollout Reward: 0.64 | KL: 0.03417 | ClipFrac: 0.319 | Entropy: 0.769
Step: 1000000 | Rollout Reward: -0.01 | KL: 0.02816 | ClipFrac: 0.351 | Entropy: 0.777




[Eval @ 1000000] Return: 263.20
Saved video: videos/ppo_eval_1000000.mp4


In [None]:
# Recursively bundle everything under /content/videos into videos.zip (written to the current working directory).
!zip -r videos.zip /content/videos

  adding: content/videos/ (stored 0%)
  adding: content/videos/ppo_eval_61440.mp4 (deflated 13%)
  adding: content/videos/ppo_eval_100352.mp4 (deflated 17%)
  adding: content/videos/ppo_eval_20480.mp4 (deflated 11%)
  adding: content/videos/ppo_eval_620544.mp4 (deflated 14%)
  adding: content/videos/ppo_eval_141312.mp4 (deflated 11%)
  adding: content/videos/ppo_eval_901120.mp4 (deflated 13%)
  adding: content/videos/ppo_eval_512000.mp4 (deflated 9%)
  adding: content/videos/ppo_eval_581632.mp4 (deflated 23%)
  adding: content/videos/ppo_eval_311296.mp4 (deflated 14%)
  adding: content/videos/ppo_eval_270336.mp4 (deflated 14%)
  adding: content/videos/ppo_eval_870400.mp4 (deflated 15%)
  adding: content/videos/ppo_eval_591872.mp4 (deflated 17%)
  adding: content/videos/ppo_eval_81920.mp4 (deflated 13%)
  adding: content/videos/ppo_eval_260096.mp4 (deflated 16%)
  adding: content/videos/ppo_eval_561152.mp4 (deflated 14%)
  adding: content/videos/ppo_eval_790528.mp4 (deflated 14%)
  addi

In [None]:
# Hand the zipped evaluation videos to the Colab `files` helper for download.
# NOTE(review): the zip cell writes `videos.zip` relative to the working directory; this
# absolute path assumes that directory is /content — confirm.
files.download('/content/videos.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

PPO v2 (with LR annealing, value clipping, and additional logging metrics)

In [None]:
# Training setup

seed = 0
set_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

env = gym.make("LunarLanderContinuous-v3")
eval_env = gym.make("LunarLanderContinuous-v3", render_mode="rgb_array")

os.makedirs("videos_v2", exist_ok=True)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

initial_lr = 3e-4   # LR annealing initial LR
model = ActorCritic(state_dim, action_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=initial_lr)

# PPO hyperparameters
gamma = 0.99
gae_lambda = 0.95
clip_eps = 0.2
vf_coef = 0.5
ent_coef = 0.005
max_grad_norm = 0.5
ppo_epochs = 10
mini_batch_size = 64

rollout_len = 2048
total_timesteps = 1_000_000
eval_interval = 10_000

global_step = 0
state, _ = env.reset()

# PPO v2 training loop

while global_step < total_timesteps:

    # LR Annealing
    frac = 1.0 - (global_step / total_timesteps)
    lr_now = initial_lr * frac
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr_now

    states, actions, rewards, dones, values, logps = [], [], [], [], [], []
    rollout_rewards = []

    # Rollouts
    for _ in range(rollout_len):

        state_tensor = torch.tensor(state, dtype=torch.float32, device=device)
        action, logp, entropy, value = model.get_action(state_tensor)

        next_state, reward, terminated, truncated, _ = env.step(
            action.detach().cpu().numpy()
        )

        done = terminated or truncated

        states.append(state_tensor)
        actions.append(action.detach())
        rewards.append(torch.tensor(reward, dtype=torch.float32, device=device))
        dones.append(torch.tensor(float(done), dtype=torch.float32, device=device))
        values.append(value.detach())
        logps.append(logp.detach())

        rollout_rewards.append(reward)

        state = next_state
        global_step += 1

        if done:
            state, _ = env.reset()

        if global_step >= total_timesteps:
            break

    mean_rollout_reward = np.mean(rollout_rewards)

    states = torch.stack(states)
    actions = torch.stack(actions)
    rewards = torch.stack(rewards)
    dones = torch.stack(dones)
    values = torch.stack(values)
    logps_old = torch.stack(logps)

    with torch.no_grad():
        last_state_tensor = torch.tensor(state, dtype=torch.float32, device=device)
        _, _, next_value = model.forward(last_state_tensor)

    advantages, returns = compute_gae(
        rewards, dones, values, next_value, gamma, gae_lambda
    )

    # Advantage Norm
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)


    dataset_size = states.size(0)

    # Running sums of the per-mini-batch diagnostics; they are divided by
    # `update_steps` when the summary line is printed below.
    approx_kl_total = clip_frac_total = explained_var_total = 0
    update_steps = 0

    # PPO V2 main training loop: `ppo_epochs` passes over the rollout, each
    # pass visiting the samples in a freshly shuffled order.
    for _ in range(ppo_epochs):
        indices = torch.randperm(dataset_size, device=device)

        for start in range(0, dataset_size, mini_batch_size):
            end = start + mini_batch_size
            batch_idx = indices[start:end]  # the final slice may be shorter

            # Gather this mini-batch from the rollout tensors.
            mb_states, mb_actions = states[batch_idx], actions[batch_idx]
            mb_advantages, mb_returns = advantages[batch_idx], returns[batch_idx]
            mb_logps_old, mb_values_old = logps_old[batch_idx], values[batch_idx]

            # Re-evaluate the stored actions under the current parameters.
            logps_new, entropy, values_new = model.evaluate_actions(mb_states, mb_actions)

            # Clipped surrogate objective (PPO clipping): take the smaller of
            # the unclipped and clipped ratio-weighted advantages.
            ratio = (logps_new - mb_logps_old).exp()
            surr1 = ratio * mb_advantages
            surr2 = ratio.clamp(1 - clip_eps, 1 + clip_eps) * mb_advantages
            policy_loss = -torch.minimum(surr1, surr2).mean()

            # Value function clipping: the new prediction may move at most
            # `clip_eps` away from the rollout-time value; the larger of the
            # two squared errors is penalised.
            value_clipped = mb_values_old + (values_new - mb_values_old).clamp(
                -clip_eps, clip_eps
            )
            v_loss1 = (values_new - mb_returns) ** 2
            v_loss2 = (value_clipped - mb_returns) ** 2
            value_loss = 0.5 * torch.maximum(v_loss1, v_loss2).mean()

            entropy_loss = entropy.mean()

            # PPO Loss: policy term + weighted value term - weighted entropy.
            loss = policy_loss + vf_coef * value_loss - ent_coef * entropy_loss

            optimizer.zero_grad()
            loss.backward()
            # Gradient clipping by global norm before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

            # Logging stats, all derived from the outputs computed before
            # this optimizer step (nothing is re-evaluated afterwards).
            approx_kl = (mb_logps_old - logps_new).mean().item()
            clip_frac = torch.abs(ratio - 1.0).gt(clip_eps).float().mean().item()
            explained_var = 1 - (mb_returns - values_new).var() / (
                mb_returns.var() + 1e-8  # epsilon keeps the division finite
            )

            approx_kl_total += approx_kl
            clip_frac_total += clip_frac
            explained_var_total += explained_var.item()
            update_steps += 1

    # One summary line per outer iteration, averaged over all mini-batch updates.
    print(
        f"Step: {global_step} | "
        f"LR: {lr_now:.6f} | "
        f"Reward: {mean_rollout_reward:.2f} | "
        f"KL: {approx_kl_total/update_steps:.5f} | "
        f"ClipFrac: {clip_frac_total/update_steps:.3f} | "
        f"ExplVar: {explained_var_total/update_steps:.3f}"
    )

    # Evaluation
    # Runs whenever `global_step` sits less than `rollout_len` past a multiple
    # of `eval_interval`. One episode is played on `eval_env` with the
    # deterministic policy, its return is printed, and the rendered frames
    # are written out as an mp4.
    if global_step % eval_interval < rollout_len:

        s, _ = eval_env.reset()
        done = False
        ep_return = 0
        frames = []

        while not done:
            s_tensor = torch.tensor(s, dtype=torch.float32, device=device)

            # Deterministic evaluation. Inference only, so autograd tracking
            # is switched off for the forward pass.
            with torch.no_grad():
                action = model.act_deterministic(s_tensor)

            s, r, terminated, truncated, _ = eval_env.step(
                action.cpu().numpy()
            )
            # The episode ends on either a terminal state or a truncation.
            done = terminated or truncated
            ep_return += r

            # Frame is captured after the step, so the reset frame is not recorded.
            frames.append(eval_env.render())

        print(f"[Eval @ {global_step}] Return: {ep_return:.2f}")

        video_path = f"videos_v2/ppo_eval_{global_step}.mp4"
        # Create the output directory on demand so the save cannot fail on a
        # fresh runtime; a no-op when the directory already exists.
        os.makedirs(os.path.dirname(video_path), exist_ok=True)
        imageio.mimsave(video_path, frames, fps=30)
        print(f"Saved video: {video_path}")

# Close both environments; these dedented statements run once the indented
# training block above has ended.
env.close()
eval_env.close()


Device: cuda
Step: 2048 | LR: 0.000300 | Reward: -1.38 | KL: 0.00497 | ClipFrac: 0.034 | ExplVar: 0.006
Step: 4096 | LR: 0.000299 | Reward: -1.28 | KL: 0.00492 | ClipFrac: 0.046 | ExplVar: 0.018
Step: 6144 | LR: 0.000299 | Reward: -0.48 | KL: 0.00471 | ClipFrac: 0.066 | ExplVar: 0.032
Step: 8192 | LR: 0.000298 | Reward: -1.00 | KL: 0.00739 | ClipFrac: 0.039 | ExplVar: 0.035
Step: 10240 | LR: 0.000298 | Reward: -0.77 | KL: 0.00129 | ClipFrac: 0.026 | ExplVar: 0.043




[Eval @ 10240] Return: -116.26
Saved video: videos_v2/ppo_eval_10240.mp4
Step: 12288 | LR: 0.000297 | Reward: -0.69 | KL: 0.01074 | ClipFrac: 0.059 | ExplVar: 0.064
Step: 14336 | LR: 0.000296 | Reward: -0.68 | KL: 0.00373 | ClipFrac: 0.050 | ExplVar: 0.081
Step: 16384 | LR: 0.000296 | Reward: -0.62 | KL: 0.00788 | ClipFrac: 0.066 | ExplVar: 0.107
Step: 18432 | LR: 0.000295 | Reward: -0.43 | KL: 0.01254 | ClipFrac: 0.078 | ExplVar: 0.148
Step: 20480 | LR: 0.000294 | Reward: -0.57 | KL: 0.01079 | ClipFrac: 0.056 | ExplVar: 0.128




[Eval @ 20480] Return: -130.31
Saved video: videos_v2/ppo_eval_20480.mp4
Step: 22528 | LR: 0.000294 | Reward: -0.14 | KL: 0.00166 | ClipFrac: 0.041 | ExplVar: 0.167
Step: 24576 | LR: 0.000293 | Reward: -0.12 | KL: 0.00186 | ClipFrac: 0.080 | ExplVar: 0.187
Step: 26624 | LR: 0.000293 | Reward: -0.29 | KL: 0.00689 | ClipFrac: 0.047 | ExplVar: 0.204
Step: 28672 | LR: 0.000292 | Reward: -0.01 | KL: 0.00623 | ClipFrac: 0.063 | ExplVar: 0.214
Step: 30720 | LR: 0.000291 | Reward: -0.03 | KL: 0.00727 | ClipFrac: 0.065 | ExplVar: 0.337




[Eval @ 30720] Return: -161.09
Saved video: videos_v2/ppo_eval_30720.mp4
Step: 32768 | LR: 0.000291 | Reward: 0.01 | KL: 0.00261 | ClipFrac: 0.060 | ExplVar: 0.370
Step: 34816 | LR: 0.000290 | Reward: -0.04 | KL: 0.00516 | ClipFrac: 0.090 | ExplVar: 0.367
Step: 36864 | LR: 0.000290 | Reward: 0.00 | KL: 0.00884 | ClipFrac: 0.085 | ExplVar: 0.461
Step: 38912 | LR: 0.000289 | Reward: -0.03 | KL: 0.00944 | ClipFrac: 0.094 | ExplVar: 0.366
Step: 40960 | LR: 0.000288 | Reward: 0.05 | KL: 0.00195 | ClipFrac: 0.066 | ExplVar: 0.618




[Eval @ 40960] Return: 114.11
Saved video: videos_v2/ppo_eval_40960.mp4
Step: 43008 | LR: 0.000288 | Reward: 0.08 | KL: 0.00531 | ClipFrac: 0.086 | ExplVar: 0.664
Step: 45056 | LR: 0.000287 | Reward: 0.10 | KL: 0.00363 | ClipFrac: 0.094 | ExplVar: 0.680
Step: 47104 | LR: 0.000286 | Reward: 0.10 | KL: 0.00393 | ClipFrac: 0.064 | ExplVar: 0.631
Step: 49152 | LR: 0.000286 | Reward: 0.12 | KL: 0.00803 | ClipFrac: 0.064 | ExplVar: 0.520
Step: 51200 | LR: 0.000285 | Reward: 0.10 | KL: 0.00449 | ClipFrac: 0.061 | ExplVar: 0.551




[Eval @ 51200] Return: 235.26
Saved video: videos_v2/ppo_eval_51200.mp4
Step: 53248 | LR: 0.000285 | Reward: 0.14 | KL: 0.00950 | ClipFrac: 0.074 | ExplVar: 0.500
Step: 55296 | LR: 0.000284 | Reward: 0.11 | KL: 0.00683 | ClipFrac: 0.089 | ExplVar: 0.541
Step: 57344 | LR: 0.000283 | Reward: 0.05 | KL: 0.01403 | ClipFrac: 0.115 | ExplVar: 0.665
Step: 59392 | LR: 0.000283 | Reward: 0.09 | KL: 0.00570 | ClipFrac: 0.070 | ExplVar: 0.636
Step: 61440 | LR: 0.000282 | Reward: 0.06 | KL: 0.00704 | ClipFrac: 0.111 | ExplVar: 0.663




[Eval @ 61440] Return: 209.71
Saved video: videos_v2/ppo_eval_61440.mp4
Step: 63488 | LR: 0.000282 | Reward: 0.12 | KL: 0.01056 | ClipFrac: 0.100 | ExplVar: 0.762
Step: 65536 | LR: 0.000281 | Reward: 0.10 | KL: 0.00364 | ClipFrac: 0.082 | ExplVar: 0.851
Step: 67584 | LR: 0.000280 | Reward: 0.07 | KL: 0.00669 | ClipFrac: 0.097 | ExplVar: 0.799
Step: 69632 | LR: 0.000280 | Reward: 0.07 | KL: 0.01287 | ClipFrac: 0.115 | ExplVar: 0.762
Step: 71680 | LR: 0.000279 | Reward: 0.06 | KL: 0.00265 | ClipFrac: 0.048 | ExplVar: 0.830




[Eval @ 71680] Return: 219.06
Saved video: videos_v2/ppo_eval_71680.mp4
Step: 73728 | LR: 0.000278 | Reward: 0.09 | KL: 0.01030 | ClipFrac: 0.112 | ExplVar: 0.629
Step: 75776 | LR: 0.000278 | Reward: 0.09 | KL: 0.00318 | ClipFrac: 0.050 | ExplVar: 0.834
Step: 77824 | LR: 0.000277 | Reward: -0.04 | KL: 0.00722 | ClipFrac: 0.078 | ExplVar: 0.306
Step: 79872 | LR: 0.000277 | Reward: 0.12 | KL: 0.00482 | ClipFrac: 0.083 | ExplVar: 0.870
Step: 81920 | LR: 0.000276 | Reward: 0.07 | KL: 0.00638 | ClipFrac: 0.078 | ExplVar: 0.711




[Eval @ 81920] Return: 143.15
Saved video: videos_v2/ppo_eval_81920.mp4
Step: 83968 | LR: 0.000275 | Reward: 0.07 | KL: 0.01060 | ClipFrac: 0.086 | ExplVar: 0.838
Step: 86016 | LR: 0.000275 | Reward: 0.13 | KL: 0.00585 | ClipFrac: 0.098 | ExplVar: 0.845
Step: 88064 | LR: 0.000274 | Reward: 0.02 | KL: 0.00548 | ClipFrac: 0.090 | ExplVar: 0.739
Step: 90112 | LR: 0.000274 | Reward: 0.13 | KL: 0.00446 | ClipFrac: 0.066 | ExplVar: 0.670




[Eval @ 90112] Return: 120.07
Saved video: videos_v2/ppo_eval_90112.mp4
Step: 92160 | LR: 0.000273 | Reward: 0.07 | KL: 0.00749 | ClipFrac: 0.096 | ExplVar: 0.889
Step: 94208 | LR: 0.000272 | Reward: 0.07 | KL: 0.00791 | ClipFrac: 0.082 | ExplVar: 0.837
Step: 96256 | LR: 0.000272 | Reward: 0.11 | KL: 0.00495 | ClipFrac: 0.080 | ExplVar: 0.549
Step: 98304 | LR: 0.000271 | Reward: 0.07 | KL: 0.00588 | ClipFrac: 0.093 | ExplVar: 0.594
Step: 100352 | LR: 0.000271 | Reward: 0.08 | KL: 0.00542 | ClipFrac: 0.105 | ExplVar: 0.949




[Eval @ 100352] Return: 170.20
Saved video: videos_v2/ppo_eval_100352.mp4
Step: 102400 | LR: 0.000270 | Reward: 0.17 | KL: 0.01218 | ClipFrac: 0.127 | ExplVar: 0.708
Step: 104448 | LR: 0.000269 | Reward: 0.11 | KL: 0.00293 | ClipFrac: 0.076 | ExplVar: 0.912
Step: 106496 | LR: 0.000269 | Reward: 0.12 | KL: 0.00440 | ClipFrac: 0.094 | ExplVar: 0.887
Step: 108544 | LR: 0.000268 | Reward: 0.12 | KL: 0.00810 | ClipFrac: 0.099 | ExplVar: 0.948
Step: 110592 | LR: 0.000267 | Reward: 0.15 | KL: 0.00443 | ClipFrac: 0.084 | ExplVar: 0.583




[Eval @ 110592] Return: 223.45
Saved video: videos_v2/ppo_eval_110592.mp4
Step: 112640 | LR: 0.000267 | Reward: 0.07 | KL: 0.00791 | ClipFrac: 0.125 | ExplVar: 0.760
Step: 114688 | LR: 0.000266 | Reward: 0.09 | KL: 0.00906 | ClipFrac: 0.124 | ExplVar: 0.728
Step: 116736 | LR: 0.000266 | Reward: 0.08 | KL: 0.00610 | ClipFrac: 0.085 | ExplVar: 0.693
Step: 118784 | LR: 0.000265 | Reward: 0.06 | KL: 0.00988 | ClipFrac: 0.108 | ExplVar: 0.839
Step: 120832 | LR: 0.000264 | Reward: 0.30 | KL: 0.00880 | ClipFrac: 0.087 | ExplVar: 0.506




[Eval @ 120832] Return: -120.98
Saved video: videos_v2/ppo_eval_120832.mp4
Step: 122880 | LR: 0.000264 | Reward: 0.06 | KL: 0.00508 | ClipFrac: 0.087 | ExplVar: 0.883
Step: 124928 | LR: 0.000263 | Reward: -0.00 | KL: 0.00913 | ClipFrac: 0.098 | ExplVar: 0.561
Step: 126976 | LR: 0.000263 | Reward: 0.10 | KL: 0.00553 | ClipFrac: 0.111 | ExplVar: 0.943
Step: 129024 | LR: 0.000262 | Reward: 0.07 | KL: 0.00904 | ClipFrac: 0.106 | ExplVar: 0.950
Step: 131072 | LR: 0.000261 | Reward: 0.10 | KL: 0.00477 | ClipFrac: 0.051 | ExplVar: 0.742




[Eval @ 131072] Return: -84.61
Saved video: videos_v2/ppo_eval_131072.mp4
Step: 133120 | LR: 0.000261 | Reward: 0.16 | KL: 0.01362 | ClipFrac: 0.114 | ExplVar: 0.698
Step: 135168 | LR: 0.000260 | Reward: -0.06 | KL: 0.01157 | ClipFrac: 0.094 | ExplVar: 0.649
Step: 137216 | LR: 0.000259 | Reward: 0.21 | KL: 0.00439 | ClipFrac: 0.082 | ExplVar: 0.622
Step: 139264 | LR: 0.000259 | Reward: 0.12 | KL: 0.00804 | ClipFrac: 0.084 | ExplVar: 0.790
Step: 141312 | LR: 0.000258 | Reward: 0.12 | KL: 0.00596 | ClipFrac: 0.110 | ExplVar: 0.946




[Eval @ 141312] Return: 208.02
Saved video: videos_v2/ppo_eval_141312.mp4
Step: 143360 | LR: 0.000258 | Reward: 0.12 | KL: 0.00570 | ClipFrac: 0.059 | ExplVar: 0.537
Step: 145408 | LR: 0.000257 | Reward: 0.10 | KL: 0.01110 | ClipFrac: 0.118 | ExplVar: 0.942
Step: 147456 | LR: 0.000256 | Reward: 0.24 | KL: 0.00672 | ClipFrac: 0.080 | ExplVar: 0.636
Step: 149504 | LR: 0.000256 | Reward: 0.05 | KL: 0.00539 | ClipFrac: 0.089 | ExplVar: 0.966
Step: 151552 | LR: 0.000255 | Reward: 0.14 | KL: 0.00784 | ClipFrac: 0.103 | ExplVar: 0.708




[Eval @ 151552] Return: -104.78
Saved video: videos_v2/ppo_eval_151552.mp4
Step: 153600 | LR: 0.000255 | Reward: 0.08 | KL: 0.00736 | ClipFrac: 0.124 | ExplVar: 0.943
Step: 155648 | LR: 0.000254 | Reward: 0.03 | KL: 0.00875 | ClipFrac: 0.099 | ExplVar: 0.813
Step: 157696 | LR: 0.000253 | Reward: 0.11 | KL: 0.01283 | ClipFrac: 0.130 | ExplVar: 0.707
Step: 159744 | LR: 0.000253 | Reward: 0.16 | KL: 0.00571 | ClipFrac: 0.067 | ExplVar: 0.782
Step: 161792 | LR: 0.000252 | Reward: 0.07 | KL: 0.01336 | ClipFrac: 0.086 | ExplVar: 0.739




[Eval @ 161792] Return: 138.56
Saved video: videos_v2/ppo_eval_161792.mp4
Step: 163840 | LR: 0.000251 | Reward: 0.17 | KL: 0.01056 | ClipFrac: 0.110 | ExplVar: 0.826
Step: 165888 | LR: 0.000251 | Reward: 0.04 | KL: 0.01831 | ClipFrac: 0.161 | ExplVar: 0.753
Step: 167936 | LR: 0.000250 | Reward: 0.17 | KL: 0.01146 | ClipFrac: 0.122 | ExplVar: 0.851
Step: 169984 | LR: 0.000250 | Reward: 0.10 | KL: 0.00940 | ClipFrac: 0.103 | ExplVar: 0.950
Step: 172032 | LR: 0.000249 | Reward: 0.08 | KL: 0.00623 | ClipFrac: 0.125 | ExplVar: 0.699




[Eval @ 172032] Return: 157.71
Saved video: videos_v2/ppo_eval_172032.mp4
Step: 174080 | LR: 0.000248 | Reward: 0.10 | KL: 0.00672 | ClipFrac: 0.066 | ExplVar: 0.734
Step: 176128 | LR: 0.000248 | Reward: -0.02 | KL: 0.00573 | ClipFrac: 0.089 | ExplVar: 0.861
Step: 178176 | LR: 0.000247 | Reward: 0.09 | KL: 0.00574 | ClipFrac: 0.121 | ExplVar: 0.959
Step: 180224 | LR: 0.000247 | Reward: 0.00 | KL: 0.00294 | ClipFrac: 0.077 | ExplVar: 0.796




[Eval @ 180224] Return: 143.68
Saved video: videos_v2/ppo_eval_180224.mp4
Step: 182272 | LR: 0.000246 | Reward: 0.01 | KL: 0.00530 | ClipFrac: 0.067 | ExplVar: 0.754
Step: 184320 | LR: 0.000245 | Reward: 0.12 | KL: 0.01074 | ClipFrac: 0.118 | ExplVar: 0.821
Step: 186368 | LR: 0.000245 | Reward: 0.22 | KL: 0.01049 | ClipFrac: 0.110 | ExplVar: 0.685
Step: 188416 | LR: 0.000244 | Reward: 0.07 | KL: 0.00762 | ClipFrac: 0.095 | ExplVar: 0.961
Step: 190464 | LR: 0.000243 | Reward: 0.07 | KL: 0.00659 | ClipFrac: 0.116 | ExplVar: 0.966




[Eval @ 190464] Return: 168.72
Saved video: videos_v2/ppo_eval_190464.mp4
Step: 192512 | LR: 0.000243 | Reward: 0.11 | KL: 0.00772 | ClipFrac: 0.095 | ExplVar: 0.964
Step: 194560 | LR: 0.000242 | Reward: 0.20 | KL: 0.00563 | ClipFrac: 0.074 | ExplVar: 0.775
Step: 196608 | LR: 0.000242 | Reward: 0.20 | KL: 0.00973 | ClipFrac: 0.087 | ExplVar: 0.662
Step: 198656 | LR: 0.000241 | Reward: 0.33 | KL: 0.00446 | ClipFrac: 0.044 | ExplVar: 0.562
Step: 200704 | LR: 0.000240 | Reward: 0.02 | KL: 0.01340 | ClipFrac: 0.111 | ExplVar: 0.808




[Eval @ 200704] Return: 155.49
Saved video: videos_v2/ppo_eval_200704.mp4
Step: 202752 | LR: 0.000240 | Reward: 0.25 | KL: 0.00876 | ClipFrac: 0.120 | ExplVar: 0.766
Step: 204800 | LR: 0.000239 | Reward: 0.12 | KL: 0.01425 | ClipFrac: 0.142 | ExplVar: 0.966
Step: 206848 | LR: 0.000239 | Reward: 0.16 | KL: 0.01715 | ClipFrac: 0.148 | ExplVar: 0.876
Step: 208896 | LR: 0.000238 | Reward: 0.14 | KL: 0.00983 | ClipFrac: 0.113 | ExplVar: 0.828
Step: 210944 | LR: 0.000237 | Reward: 0.29 | KL: 0.00740 | ClipFrac: 0.106 | ExplVar: 0.640




[Eval @ 210944] Return: 201.65
Saved video: videos_v2/ppo_eval_210944.mp4
Step: 212992 | LR: 0.000237 | Reward: 0.19 | KL: 0.01047 | ClipFrac: 0.122 | ExplVar: 0.857
Step: 215040 | LR: 0.000236 | Reward: 0.03 | KL: 0.01892 | ClipFrac: 0.163 | ExplVar: 0.804
Step: 217088 | LR: 0.000235 | Reward: 0.05 | KL: 0.00463 | ClipFrac: 0.108 | ExplVar: 0.668
Step: 219136 | LR: 0.000235 | Reward: 0.02 | KL: 0.00902 | ClipFrac: 0.101 | ExplVar: 0.717
Step: 221184 | LR: 0.000234 | Reward: 0.03 | KL: 0.00823 | ClipFrac: 0.081 | ExplVar: 0.818




[Eval @ 221184] Return: -87.58
Saved video: videos_v2/ppo_eval_221184.mp4
Step: 223232 | LR: 0.000234 | Reward: 0.04 | KL: 0.00411 | ClipFrac: 0.065 | ExplVar: 0.851
Step: 225280 | LR: 0.000233 | Reward: 0.05 | KL: 0.00573 | ClipFrac: 0.082 | ExplVar: 0.811
Step: 227328 | LR: 0.000232 | Reward: 0.13 | KL: 0.00426 | ClipFrac: 0.110 | ExplVar: 0.978
Step: 229376 | LR: 0.000232 | Reward: 0.10 | KL: 0.00857 | ClipFrac: 0.060 | ExplVar: 0.811
Step: 231424 | LR: 0.000231 | Reward: 0.09 | KL: 0.00890 | ClipFrac: 0.093 | ExplVar: 0.850




[Eval @ 231424] Return: -98.15
Saved video: videos_v2/ppo_eval_231424.mp4
Step: 233472 | LR: 0.000231 | Reward: -0.01 | KL: 0.01327 | ClipFrac: 0.134 | ExplVar: 0.670
Step: 235520 | LR: 0.000230 | Reward: 0.02 | KL: 0.00744 | ClipFrac: 0.096 | ExplVar: 0.938
Step: 237568 | LR: 0.000229 | Reward: 0.13 | KL: 0.00800 | ClipFrac: 0.092 | ExplVar: 0.804
Step: 239616 | LR: 0.000229 | Reward: 0.22 | KL: 0.00958 | ClipFrac: 0.100 | ExplVar: 0.821
Step: 241664 | LR: 0.000228 | Reward: 0.12 | KL: 0.00670 | ClipFrac: 0.109 | ExplVar: 0.983




[Eval @ 241664] Return: 229.36
Saved video: videos_v2/ppo_eval_241664.mp4
Step: 243712 | LR: 0.000228 | Reward: 0.20 | KL: 0.01036 | ClipFrac: 0.106 | ExplVar: 0.854
Step: 245760 | LR: 0.000227 | Reward: 0.21 | KL: 0.00391 | ClipFrac: 0.057 | ExplVar: 0.755
Step: 247808 | LR: 0.000226 | Reward: 0.05 | KL: 0.00697 | ClipFrac: 0.106 | ExplVar: 0.943
Step: 249856 | LR: 0.000226 | Reward: 0.21 | KL: 0.00471 | ClipFrac: 0.041 | ExplVar: 0.744
Step: 251904 | LR: 0.000225 | Reward: 0.22 | KL: 0.00640 | ClipFrac: 0.084 | ExplVar: 0.899




[Eval @ 251904] Return: 141.52
Saved video: videos_v2/ppo_eval_251904.mp4
Step: 253952 | LR: 0.000224 | Reward: 0.10 | KL: 0.00574 | ClipFrac: 0.069 | ExplVar: 0.858
Step: 256000 | LR: 0.000224 | Reward: 0.08 | KL: 0.00906 | ClipFrac: 0.095 | ExplVar: 0.747
Step: 258048 | LR: 0.000223 | Reward: 0.12 | KL: 0.00907 | ClipFrac: 0.123 | ExplVar: 0.987
Step: 260096 | LR: 0.000223 | Reward: 0.21 | KL: 0.00786 | ClipFrac: 0.085 | ExplVar: 0.891




[Eval @ 260096] Return: 237.29
Saved video: videos_v2/ppo_eval_260096.mp4
Step: 262144 | LR: 0.000222 | Reward: 0.12 | KL: 0.00737 | ClipFrac: 0.092 | ExplVar: 0.985
Step: 264192 | LR: 0.000221 | Reward: 0.23 | KL: 0.01168 | ClipFrac: 0.126 | ExplVar: 0.838
Step: 266240 | LR: 0.000221 | Reward: 0.08 | KL: 0.00534 | ClipFrac: 0.060 | ExplVar: 0.797
Step: 268288 | LR: 0.000220 | Reward: 0.37 | KL: 0.01000 | ClipFrac: 0.108 | ExplVar: 0.732
Step: 270336 | LR: 0.000220 | Reward: 0.13 | KL: 0.00648 | ClipFrac: 0.106 | ExplVar: 0.972




[Eval @ 270336] Return: 224.81
Saved video: videos_v2/ppo_eval_270336.mp4
Step: 272384 | LR: 0.000219 | Reward: 0.34 | KL: 0.00867 | ClipFrac: 0.090 | ExplVar: 0.860
Step: 274432 | LR: 0.000218 | Reward: 0.15 | KL: 0.00415 | ClipFrac: 0.083 | ExplVar: 0.995
Step: 276480 | LR: 0.000218 | Reward: 0.29 | KL: 0.01176 | ClipFrac: 0.125 | ExplVar: 0.863
Step: 278528 | LR: 0.000217 | Reward: 0.25 | KL: 0.01092 | ClipFrac: 0.130 | ExplVar: 0.926
Step: 280576 | LR: 0.000216 | Reward: 0.12 | KL: 0.00544 | ClipFrac: 0.103 | ExplVar: 0.996




[Eval @ 280576] Return: 216.02
Saved video: videos_v2/ppo_eval_280576.mp4
Step: 282624 | LR: 0.000216 | Reward: 0.28 | KL: 0.00660 | ClipFrac: 0.092 | ExplVar: 0.880
Step: 284672 | LR: 0.000215 | Reward: 0.28 | KL: 0.01158 | ClipFrac: 0.133 | ExplVar: 0.832
Step: 286720 | LR: 0.000215 | Reward: 0.10 | KL: 0.00905 | ClipFrac: 0.109 | ExplVar: 0.992
Step: 288768 | LR: 0.000214 | Reward: 0.18 | KL: 0.00859 | ClipFrac: 0.100 | ExplVar: 0.883
Step: 290816 | LR: 0.000213 | Reward: 0.13 | KL: 0.00583 | ClipFrac: 0.086 | ExplVar: 0.882




[Eval @ 290816] Return: 239.22
Saved video: videos_v2/ppo_eval_290816.mp4
Step: 292864 | LR: 0.000213 | Reward: 0.10 | KL: 0.01067 | ClipFrac: 0.112 | ExplVar: 0.995
Step: 294912 | LR: 0.000212 | Reward: 0.36 | KL: 0.01002 | ClipFrac: 0.089 | ExplVar: 0.790
Step: 296960 | LR: 0.000212 | Reward: 0.25 | KL: 0.00651 | ClipFrac: 0.060 | ExplVar: 0.940
Step: 299008 | LR: 0.000211 | Reward: 0.24 | KL: 0.01058 | ClipFrac: 0.122 | ExplVar: 0.928
Step: 301056 | LR: 0.000210 | Reward: 0.19 | KL: 0.00802 | ClipFrac: 0.077 | ExplVar: 0.944




[Eval @ 301056] Return: 263.48
Saved video: videos_v2/ppo_eval_301056.mp4
Step: 303104 | LR: 0.000210 | Reward: 0.32 | KL: 0.00282 | ClipFrac: 0.078 | ExplVar: 0.888
Step: 305152 | LR: 0.000209 | Reward: 0.20 | KL: 0.00457 | ClipFrac: 0.107 | ExplVar: 0.921
Step: 307200 | LR: 0.000208 | Reward: 0.18 | KL: 0.00536 | ClipFrac: 0.100 | ExplVar: 0.997
Step: 309248 | LR: 0.000208 | Reward: 0.23 | KL: 0.00889 | ClipFrac: 0.089 | ExplVar: 0.787
Step: 311296 | LR: 0.000207 | Reward: 0.20 | KL: 0.00716 | ClipFrac: 0.122 | ExplVar: 0.885




[Eval @ 311296] Return: 262.05
Saved video: videos_v2/ppo_eval_311296.mp4
Step: 313344 | LR: 0.000207 | Reward: 0.13 | KL: 0.00667 | ClipFrac: 0.082 | ExplVar: 0.998
Step: 315392 | LR: 0.000206 | Reward: 0.32 | KL: 0.00888 | ClipFrac: 0.132 | ExplVar: 0.730
Step: 317440 | LR: 0.000205 | Reward: 0.13 | KL: 0.00582 | ClipFrac: 0.071 | ExplVar: 0.994
Step: 319488 | LR: 0.000205 | Reward: 0.18 | KL: 0.00395 | ClipFrac: 0.056 | ExplVar: 0.928
Step: 321536 | LR: 0.000204 | Reward: 0.15 | KL: 0.00380 | ClipFrac: 0.074 | ExplVar: 0.997




[Eval @ 321536] Return: 7.01
Saved video: videos_v2/ppo_eval_321536.mp4
Step: 323584 | LR: 0.000204 | Reward: 0.14 | KL: 0.00550 | ClipFrac: 0.095 | ExplVar: 0.998
Step: 325632 | LR: 0.000203 | Reward: 0.36 | KL: 0.01096 | ClipFrac: 0.125 | ExplVar: 0.928
Step: 327680 | LR: 0.000202 | Reward: 0.27 | KL: 0.01175 | ClipFrac: 0.140 | ExplVar: 0.936
Step: 329728 | LR: 0.000202 | Reward: 0.27 | KL: 0.00874 | ClipFrac: 0.100 | ExplVar: 0.874
Step: 331776 | LR: 0.000201 | Reward: 0.22 | KL: 0.01207 | ClipFrac: 0.097 | ExplVar: 0.844




[Eval @ 331776] Return: 253.19
Saved video: videos_v2/ppo_eval_331776.mp4
Step: 333824 | LR: 0.000200 | Reward: 0.30 | KL: 0.00962 | ClipFrac: 0.122 | ExplVar: 0.947
Step: 335872 | LR: 0.000200 | Reward: 0.11 | KL: 0.00632 | ClipFrac: 0.100 | ExplVar: 0.894
Step: 337920 | LR: 0.000199 | Reward: 0.37 | KL: 0.01161 | ClipFrac: 0.107 | ExplVar: 0.828
Step: 339968 | LR: 0.000199 | Reward: 0.33 | KL: 0.00986 | ClipFrac: 0.120 | ExplVar: 0.785
Step: 342016 | LR: 0.000198 | Reward: 0.29 | KL: 0.01007 | ClipFrac: 0.117 | ExplVar: 0.901




[Eval @ 342016] Return: 211.23
Saved video: videos_v2/ppo_eval_342016.mp4
Step: 344064 | LR: 0.000197 | Reward: 0.28 | KL: 0.00624 | ClipFrac: 0.096 | ExplVar: 0.874
Step: 346112 | LR: 0.000197 | Reward: 0.16 | KL: 0.01329 | ClipFrac: 0.127 | ExplVar: 0.997
Step: 348160 | LR: 0.000196 | Reward: 0.49 | KL: 0.00507 | ClipFrac: 0.081 | ExplVar: 0.863
Step: 350208 | LR: 0.000196 | Reward: 0.41 | KL: 0.00969 | ClipFrac: 0.101 | ExplVar: 0.850




[Eval @ 350208] Return: 230.15
Saved video: videos_v2/ppo_eval_350208.mp4
Step: 352256 | LR: 0.000195 | Reward: 0.37 | KL: 0.00759 | ClipFrac: 0.117 | ExplVar: 0.902
Step: 354304 | LR: 0.000194 | Reward: 0.18 | KL: 0.00866 | ClipFrac: 0.090 | ExplVar: 0.910
Step: 356352 | LR: 0.000194 | Reward: 0.67 | KL: 0.01754 | ClipFrac: 0.154 | ExplVar: 0.715
Step: 358400 | LR: 0.000193 | Reward: 0.18 | KL: 0.01237 | ClipFrac: 0.171 | ExplVar: 0.928
Step: 360448 | LR: 0.000192 | Reward: 0.25 | KL: 0.01214 | ClipFrac: 0.138 | ExplVar: 0.842




[Eval @ 360448] Return: 258.80
Saved video: videos_v2/ppo_eval_360448.mp4
Step: 362496 | LR: 0.000192 | Reward: 0.23 | KL: 0.01059 | ClipFrac: 0.124 | ExplVar: 0.944
Step: 364544 | LR: 0.000191 | Reward: 0.17 | KL: 0.00698 | ClipFrac: 0.064 | ExplVar: 0.924
Step: 366592 | LR: 0.000191 | Reward: 0.34 | KL: 0.00353 | ClipFrac: 0.039 | ExplVar: 0.861
Step: 368640 | LR: 0.000190 | Reward: 0.14 | KL: 0.00730 | ClipFrac: 0.104 | ExplVar: 0.918
Step: 370688 | LR: 0.000189 | Reward: 0.40 | KL: 0.01082 | ClipFrac: 0.103 | ExplVar: 0.829




[Eval @ 370688] Return: 258.17
Saved video: videos_v2/ppo_eval_370688.mp4
Step: 372736 | LR: 0.000189 | Reward: 0.11 | KL: 0.00690 | ClipFrac: 0.117 | ExplVar: 0.887
Step: 374784 | LR: 0.000188 | Reward: 0.19 | KL: 0.00714 | ClipFrac: 0.107 | ExplVar: 0.917
Step: 376832 | LR: 0.000188 | Reward: 0.34 | KL: 0.01263 | ClipFrac: 0.071 | ExplVar: 0.780
Step: 378880 | LR: 0.000187 | Reward: 0.16 | KL: 0.00868 | ClipFrac: 0.080 | ExplVar: 0.999
Step: 380928 | LR: 0.000186 | Reward: 0.55 | KL: 0.01180 | ClipFrac: 0.100 | ExplVar: 0.880




[Eval @ 380928] Return: 236.40
Saved video: videos_v2/ppo_eval_380928.mp4
Step: 382976 | LR: 0.000186 | Reward: 0.24 | KL: 0.00929 | ClipFrac: 0.109 | ExplVar: 0.964
Step: 385024 | LR: 0.000185 | Reward: 0.31 | KL: 0.00969 | ClipFrac: 0.102 | ExplVar: 0.907
Step: 387072 | LR: 0.000184 | Reward: 0.09 | KL: 0.00752 | ClipFrac: 0.078 | ExplVar: 0.714
Step: 389120 | LR: 0.000184 | Reward: 0.28 | KL: 0.00353 | ClipFrac: 0.061 | ExplVar: 0.816
Step: 391168 | LR: 0.000183 | Reward: 0.24 | KL: 0.01011 | ClipFrac: 0.108 | ExplVar: 0.873




[Eval @ 391168] Return: 266.28
Saved video: videos_v2/ppo_eval_391168.mp4
Step: 393216 | LR: 0.000183 | Reward: 0.22 | KL: 0.01063 | ClipFrac: 0.134 | ExplVar: 0.822
Step: 395264 | LR: 0.000182 | Reward: 0.14 | KL: 0.01267 | ClipFrac: 0.095 | ExplVar: 0.811
Step: 397312 | LR: 0.000181 | Reward: 0.24 | KL: 0.01068 | ClipFrac: 0.086 | ExplVar: 0.804
Step: 399360 | LR: 0.000181 | Reward: 0.52 | KL: 0.00907 | ClipFrac: 0.108 | ExplVar: 0.756
Step: 401408 | LR: 0.000180 | Reward: 0.37 | KL: 0.00352 | ClipFrac: 0.083 | ExplVar: 0.804




[Eval @ 401408] Return: 256.09
Saved video: videos_v2/ppo_eval_401408.mp4
Step: 403456 | LR: 0.000180 | Reward: 0.26 | KL: 0.00577 | ClipFrac: 0.105 | ExplVar: 0.837
Step: 405504 | LR: 0.000179 | Reward: 0.30 | KL: 0.01325 | ClipFrac: 0.148 | ExplVar: 0.881
Step: 407552 | LR: 0.000178 | Reward: 0.33 | KL: 0.00575 | ClipFrac: 0.101 | ExplVar: 0.839
Step: 409600 | LR: 0.000178 | Reward: 0.36 | KL: 0.00865 | ClipFrac: 0.088 | ExplVar: 0.769
Step: 411648 | LR: 0.000177 | Reward: 0.40 | KL: 0.00890 | ClipFrac: 0.115 | ExplVar: 0.651




[Eval @ 411648] Return: 262.29
Saved video: videos_v2/ppo_eval_411648.mp4
Step: 413696 | LR: 0.000177 | Reward: 0.41 | KL: 0.01189 | ClipFrac: 0.105 | ExplVar: 0.830
Step: 415744 | LR: 0.000176 | Reward: 0.15 | KL: 0.01318 | ClipFrac: 0.123 | ExplVar: 0.864
Step: 417792 | LR: 0.000175 | Reward: 0.24 | KL: 0.00583 | ClipFrac: 0.091 | ExplVar: 0.791
Step: 419840 | LR: 0.000175 | Reward: 0.56 | KL: 0.00296 | ClipFrac: 0.114 | ExplVar: 0.795
Step: 421888 | LR: 0.000174 | Reward: 0.72 | KL: 0.01261 | ClipFrac: 0.101 | ExplVar: 0.714




[Eval @ 421888] Return: 250.23
Saved video: videos_v2/ppo_eval_421888.mp4
Step: 423936 | LR: 0.000173 | Reward: 0.21 | KL: 0.00558 | ClipFrac: 0.100 | ExplVar: 0.892
Step: 425984 | LR: 0.000173 | Reward: 0.49 | KL: 0.00640 | ClipFrac: 0.061 | ExplVar: 0.792
Step: 428032 | LR: 0.000172 | Reward: 0.43 | KL: 0.00720 | ClipFrac: 0.076 | ExplVar: 0.744
Step: 430080 | LR: 0.000172 | Reward: 0.28 | KL: 0.01408 | ClipFrac: 0.166 | ExplVar: 0.922




[Eval @ 430080] Return: 234.07
Saved video: videos_v2/ppo_eval_430080.mp4
Step: 432128 | LR: 0.000171 | Reward: 0.31 | KL: 0.00933 | ClipFrac: 0.117 | ExplVar: 0.829
Step: 434176 | LR: 0.000170 | Reward: 0.62 | KL: 0.00567 | ClipFrac: 0.087 | ExplVar: 0.840
Step: 436224 | LR: 0.000170 | Reward: 0.41 | KL: 0.01371 | ClipFrac: 0.242 | ExplVar: 0.821
Step: 438272 | LR: 0.000169 | Reward: 0.24 | KL: 0.01048 | ClipFrac: 0.100 | ExplVar: 0.888
Step: 440320 | LR: 0.000169 | Reward: 0.21 | KL: 0.00657 | ClipFrac: 0.043 | ExplVar: 0.893




[Eval @ 440320] Return: 38.56
Saved video: videos_v2/ppo_eval_440320.mp4
Step: 442368 | LR: 0.000168 | Reward: 0.27 | KL: 0.01555 | ClipFrac: 0.165 | ExplVar: 0.953
Step: 444416 | LR: 0.000167 | Reward: 0.15 | KL: 0.00686 | ClipFrac: 0.080 | ExplVar: 0.997
Step: 446464 | LR: 0.000167 | Reward: 0.01 | KL: 0.00470 | ClipFrac: 0.094 | ExplVar: 0.987
Step: 448512 | LR: 0.000166 | Reward: 0.05 | KL: 0.00581 | ClipFrac: 0.138 | ExplVar: 0.993
Step: 450560 | LR: 0.000165 | Reward: 0.36 | KL: 0.01207 | ClipFrac: 0.127 | ExplVar: 0.929




[Eval @ 450560] Return: 244.79
Saved video: videos_v2/ppo_eval_450560.mp4
Step: 452608 | LR: 0.000165 | Reward: 0.34 | KL: 0.00970 | ClipFrac: 0.096 | ExplVar: 0.829
Step: 454656 | LR: 0.000164 | Reward: 0.35 | KL: 0.00576 | ClipFrac: 0.083 | ExplVar: 0.880
Step: 456704 | LR: 0.000164 | Reward: 0.16 | KL: 0.00286 | ClipFrac: 0.067 | ExplVar: 0.994
Step: 458752 | LR: 0.000163 | Reward: 0.51 | KL: 0.00512 | ClipFrac: 0.058 | ExplVar: 0.914
Step: 460800 | LR: 0.000162 | Reward: 0.52 | KL: 0.00563 | ClipFrac: 0.088 | ExplVar: 0.825




[Eval @ 460800] Return: 289.73
Saved video: videos_v2/ppo_eval_460800.mp4
Step: 462848 | LR: 0.000162 | Reward: 0.26 | KL: 0.00723 | ClipFrac: 0.057 | ExplVar: 0.945
Step: 464896 | LR: 0.000161 | Reward: 0.12 | KL: 0.00802 | ClipFrac: 0.100 | ExplVar: 0.998
Step: 466944 | LR: 0.000161 | Reward: 0.58 | KL: 0.01085 | ClipFrac: 0.089 | ExplVar: 0.897
Step: 468992 | LR: 0.000160 | Reward: 0.13 | KL: 0.01174 | ClipFrac: 0.131 | ExplVar: 0.994
Step: 471040 | LR: 0.000159 | Reward: 0.17 | KL: 0.01016 | ClipFrac: 0.086 | ExplVar: 0.992




[Eval @ 471040] Return: 248.71
Saved video: videos_v2/ppo_eval_471040.mp4
Step: 473088 | LR: 0.000159 | Reward: 0.13 | KL: 0.00717 | ClipFrac: 0.091 | ExplVar: 0.997
Step: 475136 | LR: 0.000158 | Reward: 0.24 | KL: 0.00683 | ClipFrac: 0.090 | ExplVar: 0.973
Step: 477184 | LR: 0.000157 | Reward: 0.23 | KL: 0.00246 | ClipFrac: 0.034 | ExplVar: 0.955
Step: 479232 | LR: 0.000157 | Reward: 0.28 | KL: 0.00867 | ClipFrac: 0.128 | ExplVar: 0.851
Step: 481280 | LR: 0.000156 | Reward: 0.20 | KL: 0.00902 | ClipFrac: 0.067 | ExplVar: 0.899




[Eval @ 481280] Return: 263.04
Saved video: videos_v2/ppo_eval_481280.mp4
Step: 483328 | LR: 0.000156 | Reward: 0.40 | KL: 0.00525 | ClipFrac: 0.048 | ExplVar: 0.951
Step: 485376 | LR: 0.000155 | Reward: 0.58 | KL: 0.00692 | ClipFrac: 0.080 | ExplVar: 0.873
Step: 487424 | LR: 0.000154 | Reward: 0.28 | KL: 0.01549 | ClipFrac: 0.190 | ExplVar: 0.922
Step: 489472 | LR: 0.000154 | Reward: 0.22 | KL: 0.01256 | ClipFrac: 0.126 | ExplVar: 0.866
Step: 491520 | LR: 0.000153 | Reward: 0.53 | KL: 0.00779 | ClipFrac: 0.108 | ExplVar: 0.916




[Eval @ 491520] Return: 254.36
Saved video: videos_v2/ppo_eval_491520.mp4
Step: 493568 | LR: 0.000153 | Reward: 0.44 | KL: 0.01240 | ClipFrac: 0.138 | ExplVar: 0.833
Step: 495616 | LR: 0.000152 | Reward: 0.21 | KL: 0.01267 | ClipFrac: 0.127 | ExplVar: 0.877
Step: 497664 | LR: 0.000151 | Reward: 0.27 | KL: 0.00679 | ClipFrac: 0.129 | ExplVar: 0.897
Step: 499712 | LR: 0.000151 | Reward: 0.29 | KL: 0.00753 | ClipFrac: 0.133 | ExplVar: 0.877
Step: 501760 | LR: 0.000150 | Reward: 0.61 | KL: 0.00577 | ClipFrac: 0.078 | ExplVar: 0.883




[Eval @ 501760] Return: 210.73
Saved video: videos_v2/ppo_eval_501760.mp4
Step: 503808 | LR: 0.000149 | Reward: 0.17 | KL: 0.00267 | ClipFrac: 0.052 | ExplVar: 0.933
Step: 505856 | LR: 0.000149 | Reward: 0.45 | KL: 0.00737 | ClipFrac: 0.100 | ExplVar: 0.865
Step: 507904 | LR: 0.000148 | Reward: 0.45 | KL: 0.00459 | ClipFrac: 0.064 | ExplVar: 0.870
Step: 509952 | LR: 0.000148 | Reward: 0.60 | KL: 0.00903 | ClipFrac: 0.105 | ExplVar: 0.859
Step: 512000 | LR: 0.000147 | Reward: 0.45 | KL: 0.00637 | ClipFrac: 0.102 | ExplVar: 0.881




[Eval @ 512000] Return: 234.13
Saved video: videos_v2/ppo_eval_512000.mp4
Step: 514048 | LR: 0.000146 | Reward: 0.62 | KL: 0.00977 | ClipFrac: 0.077 | ExplVar: 0.914
Step: 516096 | LR: 0.000146 | Reward: 0.35 | KL: 0.01144 | ClipFrac: 0.126 | ExplVar: 0.949
Step: 518144 | LR: 0.000145 | Reward: 0.31 | KL: 0.01430 | ClipFrac: 0.156 | ExplVar: 0.826
Step: 520192 | LR: 0.000145 | Reward: 0.17 | KL: 0.00751 | ClipFrac: 0.081 | ExplVar: 0.939




[Eval @ 520192] Return: 263.76
Saved video: videos_v2/ppo_eval_520192.mp4
Step: 522240 | LR: 0.000144 | Reward: 0.45 | KL: 0.00806 | ClipFrac: 0.133 | ExplVar: 0.887
Step: 524288 | LR: 0.000143 | Reward: 0.28 | KL: 0.01330 | ClipFrac: 0.136 | ExplVar: 0.966
Step: 526336 | LR: 0.000143 | Reward: 0.39 | KL: 0.01003 | ClipFrac: 0.158 | ExplVar: 0.851
Step: 528384 | LR: 0.000142 | Reward: 0.53 | KL: 0.00486 | ClipFrac: 0.100 | ExplVar: 0.857
Step: 530432 | LR: 0.000141 | Reward: 0.44 | KL: 0.00897 | ClipFrac: 0.113 | ExplVar: 0.765




[Eval @ 530432] Return: 276.11
Saved video: videos_v2/ppo_eval_530432.mp4
Step: 532480 | LR: 0.000141 | Reward: 0.54 | KL: 0.00885 | ClipFrac: 0.117 | ExplVar: 0.890
Step: 534528 | LR: 0.000140 | Reward: 0.22 | KL: 0.00591 | ClipFrac: 0.066 | ExplVar: 0.849
Step: 536576 | LR: 0.000140 | Reward: 0.48 | KL: 0.00774 | ClipFrac: 0.072 | ExplVar: 0.806
Step: 538624 | LR: 0.000139 | Reward: 0.62 | KL: 0.00466 | ClipFrac: 0.041 | ExplVar: 0.799
Step: 540672 | LR: 0.000138 | Reward: 0.43 | KL: 0.00830 | ClipFrac: 0.113 | ExplVar: 0.893




[Eval @ 540672] Return: 34.55
Saved video: videos_v2/ppo_eval_540672.mp4
Step: 542720 | LR: 0.000138 | Reward: 0.51 | KL: 0.01221 | ClipFrac: 0.098 | ExplVar: 0.813
Step: 544768 | LR: 0.000137 | Reward: 0.47 | KL: 0.00372 | ClipFrac: 0.047 | ExplVar: 0.912
Step: 546816 | LR: 0.000137 | Reward: 0.59 | KL: 0.01694 | ClipFrac: 0.134 | ExplVar: 0.892
Step: 548864 | LR: 0.000136 | Reward: 0.29 | KL: 0.01154 | ClipFrac: 0.109 | ExplVar: 0.919
Step: 550912 | LR: 0.000135 | Reward: 0.43 | KL: 0.01270 | ClipFrac: 0.165 | ExplVar: 0.846




[Eval @ 550912] Return: -11.21
Saved video: videos_v2/ppo_eval_550912.mp4
Step: 552960 | LR: 0.000135 | Reward: 0.42 | KL: 0.00980 | ClipFrac: 0.151 | ExplVar: 0.937
Step: 555008 | LR: 0.000134 | Reward: 0.82 | KL: 0.00957 | ClipFrac: 0.071 | ExplVar: 0.870
Step: 557056 | LR: 0.000133 | Reward: 0.26 | KL: 0.00695 | ClipFrac: 0.085 | ExplVar: 0.978
Step: 559104 | LR: 0.000133 | Reward: 0.65 | KL: 0.00774 | ClipFrac: 0.110 | ExplVar: 0.808
Step: 561152 | LR: 0.000132 | Reward: 0.35 | KL: 0.01230 | ClipFrac: 0.115 | ExplVar: 0.824




[Eval @ 561152] Return: 297.35
Saved video: videos_v2/ppo_eval_561152.mp4
Step: 563200 | LR: 0.000132 | Reward: 0.46 | KL: 0.00603 | ClipFrac: 0.080 | ExplVar: 0.866
Step: 565248 | LR: 0.000131 | Reward: 0.23 | KL: 0.01093 | ClipFrac: 0.071 | ExplVar: 0.916
Step: 567296 | LR: 0.000130 | Reward: 0.42 | KL: 0.00924 | ClipFrac: 0.101 | ExplVar: 0.771
Step: 569344 | LR: 0.000130 | Reward: 0.43 | KL: 0.01440 | ClipFrac: 0.147 | ExplVar: 0.919
Step: 571392 | LR: 0.000129 | Reward: 0.26 | KL: 0.00823 | ClipFrac: 0.103 | ExplVar: 0.868




[Eval @ 571392] Return: 279.20
Saved video: videos_v2/ppo_eval_571392.mp4
Step: 573440 | LR: 0.000129 | Reward: 0.59 | KL: 0.01071 | ClipFrac: 0.139 | ExplVar: 0.904
Step: 575488 | LR: 0.000128 | Reward: 0.58 | KL: 0.01015 | ClipFrac: 0.102 | ExplVar: 0.778
Step: 577536 | LR: 0.000127 | Reward: 0.57 | KL: 0.00624 | ClipFrac: 0.055 | ExplVar: 0.891
Step: 579584 | LR: 0.000127 | Reward: 0.66 | KL: 0.00657 | ClipFrac: 0.106 | ExplVar: 0.698
Step: 581632 | LR: 0.000126 | Reward: 0.63 | KL: 0.00732 | ClipFrac: 0.090 | ExplVar: 0.827




[Eval @ 581632] Return: 253.79
Saved video: videos_v2/ppo_eval_581632.mp4
Step: 583680 | LR: 0.000126 | Reward: 0.57 | KL: 0.01123 | ClipFrac: 0.101 | ExplVar: 0.852
Step: 585728 | LR: 0.000125 | Reward: 0.32 | KL: 0.00756 | ClipFrac: 0.096 | ExplVar: 0.936
Step: 587776 | LR: 0.000124 | Reward: 0.73 | KL: 0.00868 | ClipFrac: 0.127 | ExplVar: 0.799
Step: 589824 | LR: 0.000124 | Reward: 0.23 | KL: 0.01080 | ClipFrac: 0.119 | ExplVar: 0.892
Step: 591872 | LR: 0.000123 | Reward: 0.48 | KL: 0.01540 | ClipFrac: 0.171 | ExplVar: 0.900




[Eval @ 591872] Return: 268.80
Saved video: videos_v2/ppo_eval_591872.mp4
Step: 593920 | LR: 0.000122 | Reward: 0.60 | KL: 0.00938 | ClipFrac: 0.096 | ExplVar: 0.875
Step: 595968 | LR: 0.000122 | Reward: 0.20 | KL: 0.00827 | ClipFrac: 0.103 | ExplVar: 0.892
Step: 598016 | LR: 0.000121 | Reward: 0.56 | KL: 0.00805 | ClipFrac: 0.117 | ExplVar: 0.859
Step: 600064 | LR: 0.000121 | Reward: 0.30 | KL: 0.00702 | ClipFrac: 0.081 | ExplVar: 0.889




[Eval @ 600064] Return: 293.71
Saved video: videos_v2/ppo_eval_600064.mp4
Step: 602112 | LR: 0.000120 | Reward: 0.74 | KL: 0.01024 | ClipFrac: 0.077 | ExplVar: 0.890
Step: 604160 | LR: 0.000119 | Reward: 0.20 | KL: 0.00639 | ClipFrac: 0.097 | ExplVar: 0.991
Step: 606208 | LR: 0.000119 | Reward: 0.36 | KL: 0.00673 | ClipFrac: 0.104 | ExplVar: 0.862
Step: 608256 | LR: 0.000118 | Reward: 0.37 | KL: 0.00867 | ClipFrac: 0.108 | ExplVar: 0.935
Step: 610304 | LR: 0.000118 | Reward: 0.57 | KL: 0.00866 | ClipFrac: 0.082 | ExplVar: 0.915




[Eval @ 610304] Return: 263.63
Saved video: videos_v2/ppo_eval_610304.mp4
Step: 612352 | LR: 0.000117 | Reward: 0.45 | KL: 0.00564 | ClipFrac: 0.077 | ExplVar: 0.902
Step: 614400 | LR: 0.000116 | Reward: 0.17 | KL: 0.01161 | ClipFrac: 0.110 | ExplVar: 0.969
Step: 616448 | LR: 0.000116 | Reward: 0.42 | KL: 0.00799 | ClipFrac: 0.120 | ExplVar: 0.870
Step: 618496 | LR: 0.000115 | Reward: 0.59 | KL: 0.01002 | ClipFrac: 0.137 | ExplVar: 0.889
Step: 620544 | LR: 0.000114 | Reward: 0.59 | KL: 0.00743 | ClipFrac: 0.059 | ExplVar: 0.884




[Eval @ 620544] Return: 241.55
Saved video: videos_v2/ppo_eval_620544.mp4
Step: 622592 | LR: 0.000114 | Reward: 0.66 | KL: 0.00753 | ClipFrac: 0.076 | ExplVar: 0.844
Step: 624640 | LR: 0.000113 | Reward: 0.80 | KL: 0.00770 | ClipFrac: 0.087 | ExplVar: 0.832
Step: 626688 | LR: 0.000113 | Reward: 0.95 | KL: 0.00558 | ClipFrac: 0.084 | ExplVar: 0.840
Step: 628736 | LR: 0.000112 | Reward: 0.39 | KL: 0.01146 | ClipFrac: 0.127 | ExplVar: 0.831
Step: 630784 | LR: 0.000111 | Reward: 0.61 | KL: 0.00787 | ClipFrac: 0.063 | ExplVar: 0.829




[Eval @ 630784] Return: 265.04
Saved video: videos_v2/ppo_eval_630784.mp4
Step: 632832 | LR: 0.000111 | Reward: 0.50 | KL: 0.00325 | ClipFrac: 0.065 | ExplVar: 0.851
Step: 634880 | LR: 0.000110 | Reward: 0.56 | KL: 0.00702 | ClipFrac: 0.100 | ExplVar: 0.878
Step: 636928 | LR: 0.000110 | Reward: 0.82 | KL: 0.00552 | ClipFrac: 0.103 | ExplVar: 0.839
Step: 638976 | LR: 0.000109 | Reward: 0.43 | KL: 0.00942 | ClipFrac: 0.110 | ExplVar: 0.850
Step: 641024 | LR: 0.000108 | Reward: 0.61 | KL: 0.00583 | ClipFrac: 0.114 | ExplVar: 0.900




[Eval @ 641024] Return: 228.36
Saved video: videos_v2/ppo_eval_641024.mp4
Step: 643072 | LR: 0.000108 | Reward: 0.78 | KL: 0.01438 | ClipFrac: 0.127 | ExplVar: 0.785
Step: 645120 | LR: 0.000107 | Reward: 0.27 | KL: 0.01096 | ClipFrac: 0.133 | ExplVar: 0.968
Step: 647168 | LR: 0.000106 | Reward: 0.61 | KL: 0.00799 | ClipFrac: 0.095 | ExplVar: 0.834
Step: 649216 | LR: 0.000106 | Reward: 0.63 | KL: 0.00676 | ClipFrac: 0.086 | ExplVar: 0.919
Step: 651264 | LR: 0.000105 | Reward: 1.16 | KL: 0.01055 | ClipFrac: 0.089 | ExplVar: 0.715




[Eval @ 651264] Return: 264.93
Saved video: videos_v2/ppo_eval_651264.mp4
Step: 653312 | LR: 0.000105 | Reward: 0.27 | KL: 0.00722 | ClipFrac: 0.041 | ExplVar: 0.837
Step: 655360 | LR: 0.000104 | Reward: 0.38 | KL: 0.00519 | ClipFrac: 0.047 | ExplVar: 0.941
Step: 657408 | LR: 0.000103 | Reward: 0.61 | KL: 0.01779 | ClipFrac: 0.129 | ExplVar: 0.823
Step: 659456 | LR: 0.000103 | Reward: 0.65 | KL: 0.00707 | ClipFrac: 0.091 | ExplVar: 0.828
Step: 661504 | LR: 0.000102 | Reward: 0.36 | KL: 0.00512 | ClipFrac: 0.090 | ExplVar: 0.891




[Eval @ 661504] Return: 236.53
Saved video: videos_v2/ppo_eval_661504.mp4
Step: 663552 | LR: 0.000102 | Reward: 0.29 | KL: 0.01537 | ClipFrac: 0.143 | ExplVar: 0.926
Step: 665600 | LR: 0.000101 | Reward: 0.40 | KL: 0.00390 | ClipFrac: 0.064 | ExplVar: 0.849
Step: 667648 | LR: 0.000100 | Reward: 0.95 | KL: 0.00775 | ClipFrac: 0.082 | ExplVar: 0.842
Step: 669696 | LR: 0.000100 | Reward: 0.53 | KL: 0.01295 | ClipFrac: 0.113 | ExplVar: 0.913
Step: 671744 | LR: 0.000099 | Reward: 0.68 | KL: 0.01005 | ClipFrac: 0.096 | ExplVar: 0.841




[Eval @ 671744] Return: 246.96
Saved video: videos_v2/ppo_eval_671744.mp4
Step: 673792 | LR: 0.000098 | Reward: 0.58 | KL: 0.00568 | ClipFrac: 0.049 | ExplVar: 0.913
Step: 675840 | LR: 0.000098 | Reward: 0.85 | KL: 0.01076 | ClipFrac: 0.094 | ExplVar: 0.840
Step: 677888 | LR: 0.000097 | Reward: 0.85 | KL: 0.00866 | ClipFrac: 0.065 | ExplVar: 0.874
Step: 679936 | LR: 0.000097 | Reward: 0.31 | KL: 0.00716 | ClipFrac: 0.083 | ExplVar: 0.931
Step: 681984 | LR: 0.000096 | Reward: 0.12 | KL: 0.00854 | ClipFrac: 0.103 | ExplVar: 0.996




[Eval @ 681984] Return: 251.14
Saved video: videos_v2/ppo_eval_681984.mp4
Step: 684032 | LR: 0.000095 | Reward: 0.68 | KL: 0.00634 | ClipFrac: 0.055 | ExplVar: 0.894
Step: 686080 | LR: 0.000095 | Reward: 0.60 | KL: 0.00653 | ClipFrac: 0.057 | ExplVar: 0.863
Step: 688128 | LR: 0.000094 | Reward: 0.21 | KL: 0.00640 | ClipFrac: 0.106 | ExplVar: 0.884
Step: 690176 | LR: 0.000094 | Reward: 0.67 | KL: 0.00812 | ClipFrac: 0.094 | ExplVar: 0.871




[Eval @ 690176] Return: 242.19
Saved video: videos_v2/ppo_eval_690176.mp4
Step: 692224 | LR: 0.000093 | Reward: 0.41 | KL: 0.00602 | ClipFrac: 0.035 | ExplVar: 0.904
Step: 694272 | LR: 0.000092 | Reward: 0.16 | KL: 0.00680 | ClipFrac: 0.088 | ExplVar: 0.974
Step: 696320 | LR: 0.000092 | Reward: 0.81 | KL: 0.00284 | ClipFrac: 0.035 | ExplVar: 0.806
Step: 698368 | LR: 0.000091 | Reward: 0.55 | KL: 0.00466 | ClipFrac: 0.061 | ExplVar: 0.903
Step: 700416 | LR: 0.000090 | Reward: 0.77 | KL: 0.00381 | ClipFrac: 0.059 | ExplVar: 0.862




[Eval @ 700416] Return: 253.17
Saved video: videos_v2/ppo_eval_700416.mp4
Step: 702464 | LR: 0.000090 | Reward: 0.75 | KL: 0.00565 | ClipFrac: 0.064 | ExplVar: 0.876
Step: 704512 | LR: 0.000089 | Reward: 0.49 | KL: 0.00612 | ClipFrac: 0.064 | ExplVar: 0.903
Step: 706560 | LR: 0.000089 | Reward: 0.25 | KL: 0.00241 | ClipFrac: 0.059 | ExplVar: 0.976
Step: 708608 | LR: 0.000088 | Reward: 0.70 | KL: 0.00645 | ClipFrac: 0.034 | ExplVar: 0.896
Step: 710656 | LR: 0.000087 | Reward: 0.65 | KL: 0.00404 | ClipFrac: 0.043 | ExplVar: 0.899




[Eval @ 710656] Return: 231.77
Saved video: videos_v2/ppo_eval_710656.mp4
Step: 712704 | LR: 0.000087 | Reward: 0.22 | KL: 0.00734 | ClipFrac: 0.089 | ExplVar: 0.895
Step: 714752 | LR: 0.000086 | Reward: 0.79 | KL: 0.00216 | ClipFrac: 0.025 | ExplVar: 0.857
Step: 716800 | LR: 0.000086 | Reward: 0.37 | KL: 0.01044 | ClipFrac: 0.083 | ExplVar: 0.871
Step: 718848 | LR: 0.000085 | Reward: 0.55 | KL: 0.00721 | ClipFrac: 0.046 | ExplVar: 0.901
Step: 720896 | LR: 0.000084 | Reward: 0.65 | KL: 0.00557 | ClipFrac: 0.067 | ExplVar: 0.902




[Eval @ 720896] Return: -4.10
Saved video: videos_v2/ppo_eval_720896.mp4
Step: 722944 | LR: 0.000084 | Reward: 0.89 | KL: 0.01080 | ClipFrac: 0.065 | ExplVar: 0.850
Step: 724992 | LR: 0.000083 | Reward: 0.66 | KL: 0.00661 | ClipFrac: 0.082 | ExplVar: 0.904
Step: 727040 | LR: 0.000083 | Reward: 0.37 | KL: 0.00494 | ClipFrac: 0.065 | ExplVar: 0.844
Step: 729088 | LR: 0.000082 | Reward: 0.82 | KL: 0.00439 | ClipFrac: 0.029 | ExplVar: 0.846
Step: 731136 | LR: 0.000081 | Reward: 0.47 | KL: 0.01203 | ClipFrac: 0.098 | ExplVar: 0.891




[Eval @ 731136] Return: 249.76
Saved video: videos_v2/ppo_eval_731136.mp4
Step: 733184 | LR: 0.000081 | Reward: 0.61 | KL: 0.00752 | ClipFrac: 0.062 | ExplVar: 0.873
Step: 735232 | LR: 0.000080 | Reward: 0.97 | KL: 0.00315 | ClipFrac: 0.026 | ExplVar: 0.840
Step: 737280 | LR: 0.000079 | Reward: 0.19 | KL: 0.00221 | ClipFrac: 0.105 | ExplVar: 0.929
Step: 739328 | LR: 0.000079 | Reward: 0.58 | KL: 0.00417 | ClipFrac: 0.031 | ExplVar: 0.852
Step: 741376 | LR: 0.000078 | Reward: 0.56 | KL: 0.00603 | ClipFrac: 0.064 | ExplVar: 0.921




[Eval @ 741376] Return: 255.02
Saved video: videos_v2/ppo_eval_741376.mp4
Step: 743424 | LR: 0.000078 | Reward: 0.52 | KL: 0.00316 | ClipFrac: 0.052 | ExplVar: 0.835
Step: 745472 | LR: 0.000077 | Reward: 0.41 | KL: 0.01023 | ClipFrac: 0.105 | ExplVar: 0.927
Step: 747520 | LR: 0.000076 | Reward: 0.69 | KL: 0.00442 | ClipFrac: 0.059 | ExplVar: 0.881
Step: 749568 | LR: 0.000076 | Reward: 0.65 | KL: 0.00679 | ClipFrac: 0.073 | ExplVar: 0.908
Step: 751616 | LR: 0.000075 | Reward: 0.63 | KL: 0.00620 | ClipFrac: 0.055 | ExplVar: 0.858




[Eval @ 751616] Return: 287.31
Saved video: videos_v2/ppo_eval_751616.mp4
Step: 753664 | LR: 0.000075 | Reward: 0.97 | KL: 0.00936 | ClipFrac: 0.069 | ExplVar: 0.838
Step: 755712 | LR: 0.000074 | Reward: 0.60 | KL: 0.01015 | ClipFrac: 0.081 | ExplVar: 0.907
Step: 757760 | LR: 0.000073 | Reward: 0.95 | KL: 0.00675 | ClipFrac: 0.071 | ExplVar: 0.865
Step: 759808 | LR: 0.000073 | Reward: 0.30 | KL: 0.00696 | ClipFrac: 0.087 | ExplVar: 0.972
Step: 761856 | LR: 0.000072 | Reward: 0.58 | KL: 0.00912 | ClipFrac: 0.085 | ExplVar: 0.902




[Eval @ 761856] Return: 243.58
Saved video: videos_v2/ppo_eval_761856.mp4
Step: 763904 | LR: 0.000071 | Reward: 0.69 | KL: 0.00388 | ClipFrac: 0.039 | ExplVar: 0.867
Step: 765952 | LR: 0.000071 | Reward: 1.17 | KL: 0.00861 | ClipFrac: 0.081 | ExplVar: 0.809
Step: 768000 | LR: 0.000070 | Reward: 0.86 | KL: 0.00859 | ClipFrac: 0.087 | ExplVar: 0.909
Step: 770048 | LR: 0.000070 | Reward: 1.04 | KL: 0.00481 | ClipFrac: 0.035 | ExplVar: 0.825




[Eval @ 770048] Return: 243.58
Saved video: videos_v2/ppo_eval_770048.mp4
Step: 772096 | LR: 0.000069 | Reward: 0.62 | KL: 0.00597 | ClipFrac: 0.075 | ExplVar: 0.899
Step: 774144 | LR: 0.000068 | Reward: 0.97 | KL: 0.00695 | ClipFrac: 0.056 | ExplVar: 0.855
Step: 776192 | LR: 0.000068 | Reward: 0.95 | KL: 0.00720 | ClipFrac: 0.055 | ExplVar: 0.865
Step: 778240 | LR: 0.000067 | Reward: 0.46 | KL: 0.01303 | ClipFrac: 0.071 | ExplVar: 0.916
Step: 780288 | LR: 0.000067 | Reward: 0.34 | KL: 0.00888 | ClipFrac: 0.097 | ExplVar: 0.885




[Eval @ 780288] Return: 247.41
Saved video: videos_v2/ppo_eval_780288.mp4
Step: 782336 | LR: 0.000066 | Reward: 0.31 | KL: 0.00541 | ClipFrac: 0.068 | ExplVar: 0.966
Step: 784384 | LR: 0.000065 | Reward: 1.02 | KL: 0.00857 | ClipFrac: 0.117 | ExplVar: 0.737
Step: 786432 | LR: 0.000065 | Reward: 0.54 | KL: 0.00336 | ClipFrac: 0.074 | ExplVar: 0.880
Step: 788480 | LR: 0.000064 | Reward: 0.89 | KL: 0.00560 | ClipFrac: 0.036 | ExplVar: 0.893
Step: 790528 | LR: 0.000063 | Reward: 0.15 | KL: 0.00235 | ClipFrac: 0.092 | ExplVar: 0.991




[Eval @ 790528] Return: 231.35
Saved video: videos_v2/ppo_eval_790528.mp4
Step: 792576 | LR: 0.000063 | Reward: 0.59 | KL: 0.01244 | ClipFrac: 0.121 | ExplVar: 0.909
Step: 794624 | LR: 0.000062 | Reward: 0.87 | KL: 0.00693 | ClipFrac: 0.062 | ExplVar: 0.861
Step: 796672 | LR: 0.000062 | Reward: 0.62 | KL: 0.00319 | ClipFrac: 0.056 | ExplVar: 0.876
Step: 798720 | LR: 0.000061 | Reward: 0.42 | KL: 0.00305 | ClipFrac: 0.065 | ExplVar: 0.866
Step: 800768 | LR: 0.000060 | Reward: 0.87 | KL: 0.00778 | ClipFrac: 0.038 | ExplVar: 0.893




[Eval @ 800768] Return: 253.08
Saved video: videos_v2/ppo_eval_800768.mp4
Step: 802816 | LR: 0.000060 | Reward: 0.39 | KL: 0.01072 | ClipFrac: 0.084 | ExplVar: 0.952
Step: 804864 | LR: 0.000059 | Reward: 1.13 | KL: 0.00928 | ClipFrac: 0.070 | ExplVar: 0.760
Step: 806912 | LR: 0.000059 | Reward: 0.32 | KL: 0.00182 | ClipFrac: 0.059 | ExplVar: 0.936
Step: 808960 | LR: 0.000058 | Reward: 0.70 | KL: 0.00227 | ClipFrac: 0.033 | ExplVar: 0.893
Step: 811008 | LR: 0.000057 | Reward: 1.04 | KL: 0.00755 | ClipFrac: 0.043 | ExplVar: 0.839




[Eval @ 811008] Return: 239.24
Saved video: videos_v2/ppo_eval_811008.mp4
Step: 813056 | LR: 0.000057 | Reward: 0.93 | KL: 0.00864 | ClipFrac: 0.087 | ExplVar: 0.832
Step: 815104 | LR: 0.000056 | Reward: 0.53 | KL: 0.00301 | ClipFrac: 0.041 | ExplVar: 0.936
Step: 817152 | LR: 0.000055 | Reward: 0.67 | KL: 0.00721 | ClipFrac: 0.052 | ExplVar: 0.938
Step: 819200 | LR: 0.000055 | Reward: 0.33 | KL: 0.00885 | ClipFrac: 0.104 | ExplVar: 0.899
Step: 821248 | LR: 0.000054 | Reward: 0.42 | KL: 0.01053 | ClipFrac: 0.125 | ExplVar: 0.918




[Eval @ 821248] Return: 223.05
Saved video: videos_v2/ppo_eval_821248.mp4
Step: 823296 | LR: 0.000054 | Reward: 0.34 | KL: 0.00740 | ClipFrac: 0.095 | ExplVar: 0.895
Step: 825344 | LR: 0.000053 | Reward: 0.45 | KL: 0.00408 | ClipFrac: 0.046 | ExplVar: 0.858
Step: 827392 | LR: 0.000052 | Reward: 0.82 | KL: 0.00466 | ClipFrac: 0.018 | ExplVar: 0.857
Step: 829440 | LR: 0.000052 | Reward: 1.11 | KL: 0.01395 | ClipFrac: 0.098 | ExplVar: 0.822
Step: 831488 | LR: 0.000051 | Reward: 0.47 | KL: 0.00545 | ClipFrac: 0.069 | ExplVar: 0.844




[Eval @ 831488] Return: 245.81
Saved video: videos_v2/ppo_eval_831488.mp4
Step: 833536 | LR: 0.000051 | Reward: 0.18 | KL: 0.00922 | ClipFrac: 0.065 | ExplVar: 0.944
Step: 835584 | LR: 0.000050 | Reward: 0.51 | KL: 0.00319 | ClipFrac: 0.051 | ExplVar: 0.943
Step: 837632 | LR: 0.000049 | Reward: 0.64 | KL: 0.00327 | ClipFrac: 0.039 | ExplVar: 0.939
Step: 839680 | LR: 0.000049 | Reward: 0.77 | KL: 0.00664 | ClipFrac: 0.050 | ExplVar: 0.919
Step: 841728 | LR: 0.000048 | Reward: 0.96 | KL: 0.01578 | ClipFrac: 0.120 | ExplVar: 0.857




[Eval @ 841728] Return: 263.97
Saved video: videos_v2/ppo_eval_841728.mp4
Step: 843776 | LR: 0.000047 | Reward: 0.47 | KL: 0.00988 | ClipFrac: 0.072 | ExplVar: 0.932
Step: 845824 | LR: 0.000047 | Reward: 0.67 | KL: 0.00519 | ClipFrac: 0.050 | ExplVar: 0.908
Step: 847872 | LR: 0.000046 | Reward: 1.02 | KL: 0.00552 | ClipFrac: 0.061 | ExplVar: 0.784
Step: 849920 | LR: 0.000046 | Reward: 1.01 | KL: 0.00688 | ClipFrac: 0.047 | ExplVar: 0.860
Step: 851968 | LR: 0.000045 | Reward: 0.33 | KL: 0.00613 | ClipFrac: 0.070 | ExplVar: 0.939




[Eval @ 851968] Return: 263.15
Saved video: videos_v2/ppo_eval_851968.mp4
Step: 854016 | LR: 0.000044 | Reward: 0.68 | KL: 0.01224 | ClipFrac: 0.090 | ExplVar: 0.807
Step: 856064 | LR: 0.000044 | Reward: 0.56 | KL: 0.00727 | ClipFrac: 0.062 | ExplVar: 0.956
Step: 858112 | LR: 0.000043 | Reward: 0.49 | KL: 0.00644 | ClipFrac: 0.047 | ExplVar: 0.897
Step: 860160 | LR: 0.000043 | Reward: 0.46 | KL: 0.00918 | ClipFrac: 0.096 | ExplVar: 0.952




[Eval @ 860160] Return: 276.46
Saved video: videos_v2/ppo_eval_860160.mp4
Step: 862208 | LR: 0.000042 | Reward: 0.98 | KL: 0.00668 | ClipFrac: 0.056 | ExplVar: 0.875
Step: 864256 | LR: 0.000041 | Reward: 0.83 | KL: 0.00095 | ClipFrac: 0.042 | ExplVar: 0.918
Step: 866304 | LR: 0.000041 | Reward: 0.37 | KL: 0.00685 | ClipFrac: 0.071 | ExplVar: 0.917
Step: 868352 | LR: 0.000040 | Reward: 0.59 | KL: 0.00483 | ClipFrac: 0.044 | ExplVar: 0.879
Step: 870400 | LR: 0.000039 | Reward: 0.59 | KL: 0.00445 | ClipFrac: 0.044 | ExplVar: 0.886




[Eval @ 870400] Return: 250.40
Saved video: videos_v2/ppo_eval_870400.mp4
Step: 872448 | LR: 0.000039 | Reward: 0.59 | KL: 0.00261 | ClipFrac: 0.049 | ExplVar: 0.888
Step: 874496 | LR: 0.000038 | Reward: 0.65 | KL: 0.00634 | ClipFrac: 0.041 | ExplVar: 0.939
Step: 876544 | LR: 0.000038 | Reward: 0.78 | KL: 0.00474 | ClipFrac: 0.018 | ExplVar: 0.857
Step: 878592 | LR: 0.000037 | Reward: 0.78 | KL: 0.00541 | ClipFrac: 0.032 | ExplVar: 0.864
Step: 880640 | LR: 0.000036 | Reward: 0.37 | KL: 0.00716 | ClipFrac: 0.057 | ExplVar: 0.860




[Eval @ 880640] Return: 263.53
Saved video: videos_v2/ppo_eval_880640.mp4
Step: 882688 | LR: 0.000036 | Reward: 0.63 | KL: 0.00396 | ClipFrac: 0.050 | ExplVar: 0.842
Step: 884736 | LR: 0.000035 | Reward: 0.76 | KL: 0.00450 | ClipFrac: 0.047 | ExplVar: 0.900
Step: 886784 | LR: 0.000035 | Reward: 0.71 | KL: 0.00520 | ClipFrac: 0.056 | ExplVar: 0.889
Step: 888832 | LR: 0.000034 | Reward: 0.58 | KL: 0.00359 | ClipFrac: 0.023 | ExplVar: 0.919
Step: 890880 | LR: 0.000033 | Reward: 0.43 | KL: 0.00149 | ClipFrac: 0.035 | ExplVar: 0.929




[Eval @ 890880] Return: 242.12
Saved video: videos_v2/ppo_eval_890880.mp4
Step: 892928 | LR: 0.000033 | Reward: 0.72 | KL: 0.00410 | ClipFrac: 0.029 | ExplVar: 0.856
Step: 894976 | LR: 0.000032 | Reward: 0.64 | KL: 0.00355 | ClipFrac: 0.038 | ExplVar: 0.930
Step: 897024 | LR: 0.000032 | Reward: 0.57 | KL: 0.00181 | ClipFrac: 0.028 | ExplVar: 0.921
Step: 899072 | LR: 0.000031 | Reward: 0.63 | KL: 0.00258 | ClipFrac: 0.034 | ExplVar: 0.887
Step: 901120 | LR: 0.000030 | Reward: 0.49 | KL: 0.00564 | ClipFrac: 0.040 | ExplVar: 0.946




[Eval @ 901120] Return: 274.94
Saved video: videos_v2/ppo_eval_901120.mp4
Step: 903168 | LR: 0.000030 | Reward: 0.96 | KL: 0.00346 | ClipFrac: 0.024 | ExplVar: 0.873
Step: 905216 | LR: 0.000029 | Reward: 0.44 | KL: 0.00753 | ClipFrac: 0.042 | ExplVar: 0.845
Step: 907264 | LR: 0.000028 | Reward: 0.92 | KL: 0.00558 | ClipFrac: 0.034 | ExplVar: 0.868
Step: 909312 | LR: 0.000028 | Reward: 0.77 | KL: 0.00068 | ClipFrac: 0.023 | ExplVar: 0.910
Step: 911360 | LR: 0.000027 | Reward: 0.73 | KL: 0.01346 | ClipFrac: 0.110 | ExplVar: 0.855




[Eval @ 911360] Return: 264.45
Saved video: videos_v2/ppo_eval_911360.mp4
Step: 913408 | LR: 0.000027 | Reward: 0.62 | KL: 0.00139 | ClipFrac: 0.022 | ExplVar: 0.870
Step: 915456 | LR: 0.000026 | Reward: 0.61 | KL: 0.00399 | ClipFrac: 0.040 | ExplVar: 0.832
Step: 917504 | LR: 0.000025 | Reward: 0.78 | KL: 0.00268 | ClipFrac: 0.021 | ExplVar: 0.754
Step: 919552 | LR: 0.000025 | Reward: 0.50 | KL: 0.00406 | ClipFrac: 0.037 | ExplVar: 0.886
Step: 921600 | LR: 0.000024 | Reward: 0.63 | KL: 0.00232 | ClipFrac: 0.012 | ExplVar: 0.898




[Eval @ 921600] Return: 237.75
Saved video: videos_v2/ppo_eval_921600.mp4
Step: 923648 | LR: 0.000024 | Reward: 0.40 | KL: 0.00440 | ClipFrac: 0.054 | ExplVar: 0.967
Step: 925696 | LR: 0.000023 | Reward: 0.64 | KL: 0.00571 | ClipFrac: 0.043 | ExplVar: 0.930
Step: 927744 | LR: 0.000022 | Reward: 0.80 | KL: 0.00274 | ClipFrac: 0.016 | ExplVar: 0.928
Step: 929792 | LR: 0.000022 | Reward: 1.02 | KL: 0.00223 | ClipFrac: 0.004 | ExplVar: 0.848
Step: 931840 | LR: 0.000021 | Reward: 0.62 | KL: 0.00458 | ClipFrac: 0.037 | ExplVar: 0.939




[Eval @ 931840] Return: 253.25
Saved video: videos_v2/ppo_eval_931840.mp4
Step: 933888 | LR: 0.000020 | Reward: 0.31 | KL: 0.00089 | ClipFrac: 0.018 | ExplVar: 0.943
Step: 935936 | LR: 0.000020 | Reward: 0.98 | KL: 0.00141 | ClipFrac: 0.005 | ExplVar: 0.817
Step: 937984 | LR: 0.000019 | Reward: 1.21 | KL: 0.00250 | ClipFrac: 0.039 | ExplVar: 0.811
Step: 940032 | LR: 0.000019 | Reward: 0.72 | KL: 0.00723 | ClipFrac: 0.055 | ExplVar: 0.901




[Eval @ 940032] Return: 267.89
Saved video: videos_v2/ppo_eval_940032.mp4
Step: 942080 | LR: 0.000018 | Reward: 0.98 | KL: 0.00314 | ClipFrac: 0.034 | ExplVar: 0.795
Step: 944128 | LR: 0.000017 | Reward: 0.65 | KL: 0.00461 | ClipFrac: 0.042 | ExplVar: 0.856
Step: 946176 | LR: 0.000017 | Reward: 0.71 | KL: 0.00708 | ClipFrac: 0.046 | ExplVar: 0.876
Step: 948224 | LR: 0.000016 | Reward: 1.22 | KL: 0.00610 | ClipFrac: 0.046 | ExplVar: 0.785
Step: 950272 | LR: 0.000016 | Reward: 1.03 | KL: 0.00147 | ClipFrac: 0.010 | ExplVar: 0.804




[Eval @ 950272] Return: 260.39
Saved video: videos_v2/ppo_eval_950272.mp4
Step: 952320 | LR: 0.000015 | Reward: 0.77 | KL: 0.00046 | ClipFrac: 0.008 | ExplVar: 0.824
Step: 954368 | LR: 0.000014 | Reward: 0.76 | KL: 0.00176 | ClipFrac: 0.003 | ExplVar: 0.888
Step: 956416 | LR: 0.000014 | Reward: 0.57 | KL: 0.00204 | ClipFrac: 0.018 | ExplVar: 0.927
Step: 958464 | LR: 0.000013 | Reward: 0.80 | KL: 0.00458 | ClipFrac: 0.033 | ExplVar: 0.942
Step: 960512 | LR: 0.000012 | Reward: 1.10 | KL: 0.00785 | ClipFrac: 0.055 | ExplVar: 0.843




[Eval @ 960512] Return: 262.03
Saved video: videos_v2/ppo_eval_960512.mp4
Step: 962560 | LR: 0.000012 | Reward: 1.21 | KL: 0.00217 | ClipFrac: 0.014 | ExplVar: 0.880
Step: 964608 | LR: 0.000011 | Reward: 0.77 | KL: 0.00191 | ClipFrac: 0.038 | ExplVar: 0.915
Step: 966656 | LR: 0.000011 | Reward: 0.68 | KL: 0.00086 | ClipFrac: 0.022 | ExplVar: 0.788
Step: 968704 | LR: 0.000010 | Reward: 1.23 | KL: 0.00292 | ClipFrac: 0.006 | ExplVar: 0.824
Step: 970752 | LR: 0.000009 | Reward: 0.63 | KL: -0.00099 | ClipFrac: 0.010 | ExplVar: 0.861




[Eval @ 970752] Return: 294.51
Saved video: videos_v2/ppo_eval_970752.mp4
Step: 972800 | LR: 0.000009 | Reward: 0.42 | KL: 0.00029 | ClipFrac: 0.001 | ExplVar: 0.940
Step: 974848 | LR: 0.000008 | Reward: 0.24 | KL: 0.00262 | ClipFrac: 0.021 | ExplVar: 0.846
Step: 976896 | LR: 0.000008 | Reward: 0.61 | KL: -0.00000 | ClipFrac: 0.000 | ExplVar: 0.777
Step: 978944 | LR: 0.000007 | Reward: 0.78 | KL: 0.00054 | ClipFrac: 0.000 | ExplVar: 0.776
Step: 980992 | LR: 0.000006 | Reward: 0.64 | KL: 0.00026 | ClipFrac: 0.000 | ExplVar: 0.798




[Eval @ 980992] Return: 287.29
Saved video: videos_v2/ppo_eval_980992.mp4
Step: 983040 | LR: 0.000006 | Reward: 0.84 | KL: 0.00115 | ClipFrac: 0.001 | ExplVar: 0.799
Step: 985088 | LR: 0.000005 | Reward: 1.00 | KL: 0.00011 | ClipFrac: 0.000 | ExplVar: 0.721
Step: 987136 | LR: 0.000004 | Reward: 0.66 | KL: 0.00288 | ClipFrac: 0.019 | ExplVar: 0.812
Step: 989184 | LR: 0.000004 | Reward: 0.90 | KL: 0.00321 | ClipFrac: 0.010 | ExplVar: 0.895
Step: 991232 | LR: 0.000003 | Reward: 0.51 | KL: -0.00018 | ClipFrac: 0.004 | ExplVar: 0.858




[Eval @ 991232] Return: 228.75
Saved video: videos_v2/ppo_eval_991232.mp4
Step: 993280 | LR: 0.000003 | Reward: 0.58 | KL: -0.00036 | ClipFrac: 0.000 | ExplVar: 0.881
Step: 995328 | LR: 0.000002 | Reward: 0.93 | KL: 0.00042 | ClipFrac: 0.000 | ExplVar: 0.881
Step: 997376 | LR: 0.000001 | Reward: 0.57 | KL: 0.00062 | ClipFrac: 0.000 | ExplVar: 0.885
Step: 999424 | LR: 0.000001 | Reward: 0.30 | KL: 0.00029 | ClipFrac: 0.000 | ExplVar: 0.932
Step: 1000000 | LR: 0.000000 | Reward: 1.34 | KL: -0.00006 | ClipFrac: 0.000 | ExplVar: 0.879




[Eval @ 1000000] Return: 255.50
Saved video: videos_v2/ppo_eval_1000000.mp4


In [None]:
!zip -r videos_v2.zip /content/videos_v2

In [None]:
# Trigger a browser download of the video archive. The path is relative
# because the zip step writes `videos_v2.zip` into the current working
# directory; a hard-coded `/content/...` path only matches when cwd is /content.
files.download('videos_v2.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>