PPO solving 2D Walker

In [None]:
!pip install "gymnasium[box2d]"

Collecting box2d==2.3.10 (from gymnasium[box2d])
  Downloading Box2D-2.3.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (573 bytes)
Collecting swig==4.* (from gymnasium[box2d])
  Downloading swig-4.4.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading Box2D-2.3.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading swig-4.4.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig, box2d
Successfully installed box2d-2.3.10 swig-4.4.1


In [None]:
import gymnasium as gym
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
import imageio
import os
from google.colab import files

In [None]:
def set_seed(seed=0):
    """Seed every random number generator this notebook draws from.

    The same integer is handed, in order, to Python's ``random`` module,
    NumPy's legacy global generator, ``torch.manual_seed`` and
    ``torch.cuda.manual_seed_all``.

    Args:
        seed: Integer seed passed to each generator. Defaults to 0.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [None]:
# Actor Critic Network (shared backbone architecture)

class ActorCritic(nn.Module):
    """Tanh-squashed Gaussian policy and state-value estimator on a shared trunk.

    A two-layer Tanh MLP feeds two linear heads: one produces the mean of a
    diagonal Gaussian over pre-squash actions, the other a scalar state value.
    The log standard deviation is a learned parameter that does not depend on
    the state. Sampled actions are passed through ``tanh``.

    Args:
        state_dim: Size of the observation vector.
        action_dim: Number of action components.
        hidden: Width of both trunk layers. Defaults to 256.
    """

    def __init__(self, state_dim, action_dim, hidden=256):
        super().__init__()

        # Trunk shared by the policy head and the value head.
        trunk_layers = [
            nn.Linear(state_dim, hidden),
            nn.Tanh(),
            nn.Linear(hidden, hidden),
            nn.Tanh(),
        ]
        self.shared = nn.Sequential(*trunk_layers)

        self.mu_head = nn.Linear(hidden, action_dim)
        # One log-std per action component, initialised to 0 (std = 1).
        self.log_std = nn.Parameter(torch.zeros(action_dim))
        self.value_head = nn.Linear(hidden, 1)

    @staticmethod
    def _squashed_log_prob(dist, pre_tanh, squashed):
        """Log-density of a tanh-squashed sample, summed over the last axis.

        Subtracts ``log(1 - squashed**2 + 1e-6)`` (the tanh change-of-variables
        term, with a small constant inside the log) from the Gaussian
        log-density of ``pre_tanh``.
        """
        per_dim = dist.log_prob(pre_tanh) - torch.log(1 - squashed.pow(2) + 1e-6)
        return per_dim.sum(-1)

    def forward(self, x):
        """Return ``(mu, std, value)`` for observation(s) ``x``.

        ``std`` is ``exp`` of ``log_std`` clamped to ``[-20, 2]``; ``value`` has
        its trailing singleton axis squeezed away.
        """
        features = self.shared(x)
        mean = self.mu_head(features)
        scale = self.log_std.clamp(-20, 2).exp()
        state_value = self.value_head(features).squeeze(-1)
        return mean, scale, state_value

    def get_action(self, state):
        """Sample a squashed action for ``state``.

        Returns:
            Tuple ``(action, logp, entropy, value)``: the tanh-squashed sample,
            its log-probability summed over action components, the entropy of
            the unsquashed Gaussian summed over action components, and the
            value estimate.
        """
        mean, scale, state_value = self.forward(state)
        policy = torch.distributions.Normal(mean, scale)

        pre_tanh = policy.rsample()
        squashed = torch.tanh(pre_tanh)

        return (
            squashed,
            self._squashed_log_prob(policy, pre_tanh, squashed),
            policy.entropy().sum(-1),
            state_value,
        )

    def evaluate_actions(self, states, actions):
        """Score previously taken squashed ``actions`` under the current policy.

        The actions are clamped to ``[-1 + 1e-6, 1 - 1e-6]`` and mapped back to
        pre-squash space with the inverse-tanh formula before the Gaussian
        log-density is evaluated.

        Returns:
            Tuple ``(logp, entropy, value)`` with the same conventions as
            ``get_action``.
        """
        mean, scale, state_value = self.forward(states)
        policy = torch.distributions.Normal(mean, scale)

        # Keep the inverse tanh finite for actions sitting exactly on +/-1.
        margin = 1e-6
        bounded = actions.clamp(-1 + margin, 1 - margin)
        pre_tanh = 0.5 * torch.log((1 + bounded) / (1 - bounded))

        log_prob = self._squashed_log_prob(policy, pre_tanh, bounded)
        return log_prob, policy.entropy().sum(-1), state_value

    def act_deterministic(self, state):
        """Return ``tanh`` of the policy mean for ``state``, computed without gradients."""
        with torch.no_grad():
            mean = self.forward(state)[0]
            return mean.tanh()


# Computing GAE

@torch.no_grad()
def compute_gae(rewards, dones, values, next_value, gamma=0.99, gae_lambda=0.95):
    """Generalized Advantage Estimation over one rollout.

    Walks the rollout backwards, accumulating discounted TD residuals. A step
    whose ``dones`` entry is 1 contributes neither a bootstrap value nor any
    advantage carried over from later steps.

    Args:
        rewards: Per-step rewards, indexed along the first axis.
        dones: Per-step done flags (1.0 where the episode ended on that step).
        values: Per-step value estimates for the visited states.
        next_value: Value estimate for the state following the last step.
        gamma: Discount factor.
        gae_lambda: GAE smoothing coefficient.

    Returns:
        Tuple ``(advantages, returns)`` where ``returns = advantages + values``.
    """
    horizon = rewards.size(0)
    advantages = torch.zeros(horizon, device=rewards.device)
    decay = gamma * gae_lambda

    running = 0
    v_next = next_value  # value of the state after the step being processed
    for t in range(horizon - 1, -1, -1):
        not_done = 1.0 - dones[t]
        td_error = rewards[t] + gamma * v_next * not_done - values[t]
        running = td_error + decay * not_done * running
        advantages[t] = running
        v_next = values[t]

    return advantages.detach(), (advantages + values).detach()


# Training setup
#
# Creates the training and evaluation environments, the actor-critic model,
# its optimizer, and the PPO hyperparameters read by the training loop below.

seed = 0
set_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# `env` collects training rollouts; `eval_env` is created with
# render_mode="rgb_array" so evaluation episodes can be rendered to frames.
env = gym.make("BipedalWalker-v3")
eval_env = gym.make("BipedalWalker-v3", render_mode="rgb_array")

os.makedirs("videos", exist_ok=True)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

initial_lr = 2.5e-4
model = ActorCritic(state_dim, action_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=initial_lr)

# PPO hyperparameters
gamma = 0.99            # discount factor
gae_lambda = 0.95       # GAE smoothing coefficient
clip_eps = 0.2          # clip range for the policy ratio and the value update
vf_coef = 0.5           # weight of the value loss in the total loss
ent_coef = 0.0          # weight of the entropy bonus (disabled)
max_grad_norm = 0.5     # global gradient-norm clip
ppo_epochs = 10         # optimisation passes over each rollout
mini_batch_size = 64

rollout_len = 2048          # environment steps collected per update
total_timesteps = 1_000_000
eval_interval = 20_000      # approximate spacing, in steps, between evaluations

global_step = 0

# set_seed() only seeds Python/NumPy/PyTorch; Gymnasium environments own a
# separate PRNG that is seeded through reset(seed=...). Seed each environment
# once here so that later plain reset() calls continue from a seeded stream.
state, _ = env.reset(seed=seed)
eval_env.reset(seed=seed + 1)

# PPO training loop
#
# Each iteration: anneal the learning rate, collect up to `rollout_len`
# environment steps with the current policy, compute GAE advantages, run
# `ppo_epochs` passes of clipped-objective minibatch updates, print averaged
# diagnostics, and periodically run one deterministic evaluation episode that
# is saved as a video.

while global_step < total_timesteps:

    # LR Annealing: linear decay from initial_lr towards 0 over total_timesteps
    frac = 1.0 - (global_step / total_timesteps)
    lr_now = initial_lr * frac
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr_now

    states, actions, rewards, dones, values, logps = [], [], [], [], [], []
    rollout_rewards = []

    # Rollouts
    for _ in range(rollout_len):

        state_tensor = torch.tensor(state, dtype=torch.float32, device=device)

        # Sampling is data collection only; skip building an autograd graph
        # that would be discarded immediately.
        with torch.no_grad():
            action, logp, _, value = model.get_action(state_tensor)

        next_state, reward, terminated, truncated, _ = env.step(
            action.cpu().numpy()
        )

        done = terminated or truncated

        # A truncated episode (e.g. time limit reached) did not reach a true
        # terminal state, so its remaining return is not zero. Fold the
        # discounted value of the final observation into the stored reward;
        # `done` still cuts the GAE recursion at the episode boundary.
        update_reward = float(reward)
        if truncated and not terminated:
            with torch.no_grad():
                timeout_tensor = torch.tensor(
                    next_state, dtype=torch.float32, device=device
                )
                _, _, timeout_value = model.forward(timeout_tensor)
            update_reward += gamma * timeout_value.item()

        states.append(state_tensor)
        actions.append(action)
        rewards.append(torch.tensor(update_reward, dtype=torch.float32, device=device))
        dones.append(torch.tensor(float(done), dtype=torch.float32, device=device))
        values.append(value)
        logps.append(logp)

        # Logged reward stays the raw environment reward.
        rollout_rewards.append(reward)

        state = next_state
        global_step += 1

        if done:
            state, _ = env.reset()

        if global_step >= total_timesteps:
            break

    # Mean per-step reward over this rollout (not an episode return).
    mean_rollout_reward = np.mean(rollout_rewards)

    states = torch.stack(states)
    actions = torch.stack(actions)
    rewards = torch.stack(rewards)
    dones = torch.stack(dones)
    values = torch.stack(values)
    logps_old = torch.stack(logps)

    # Bootstrap value for the state following the last collected step.
    with torch.no_grad():
        last_state_tensor = torch.tensor(state, dtype=torch.float32, device=device)
        _, _, next_value = model.forward(last_state_tensor)

    advantages, returns = compute_gae(
        rewards, dones, values, next_value, gamma, gae_lambda
    )

    # Advantage Norm (over the whole rollout)
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)


    dataset_size = states.size(0)

    # Logging metrics, summed over minibatch updates and averaged when printed
    approx_kl_total = 0
    clip_frac_total = 0
    explained_var_total = 0
    update_steps = 0

    # PPO main training loop
    for _ in range(ppo_epochs):

        # Fresh shuffle of the rollout for every epoch
        indices = torch.randperm(dataset_size, device=device)

        for start in range(0, dataset_size, mini_batch_size):
            end = start + mini_batch_size
            batch_idx = indices[start:end]

            mb_states = states[batch_idx]
            mb_actions = actions[batch_idx]
            mb_advantages = advantages[batch_idx]
            mb_returns = returns[batch_idx]
            mb_logps_old = logps_old[batch_idx]
            mb_values_old = values[batch_idx]

            logps_new, entropy, values_new = model.evaluate_actions(
                mb_states, mb_actions
            )

            # Probability ratio between the current and the rollout policy
            ratio = torch.exp(logps_new - mb_logps_old)

            surr1 = ratio * mb_advantages
            surr2 = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * mb_advantages    # PPO clipping

            policy_loss = -torch.min(surr1, surr2).mean()

            # Value function clipping: limit how far the new value estimate may
            # move from the rollout-time estimate, and take the larger loss.
            value_clipped = mb_values_old + torch.clamp(
                values_new - mb_values_old,
                -clip_eps,
                clip_eps,
            )

            v_loss1 = (values_new - mb_returns).pow(2)
            v_loss2 = (value_clipped - mb_returns).pow(2)
            value_loss = 0.5 * torch.max(v_loss1, v_loss2).mean()

            entropy_loss = entropy.mean()

            # PPO Loss
            loss = policy_loss + vf_coef * value_loss - ent_coef * entropy_loss

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)      # Gradient clipping
            optimizer.step()

            # Logging stats (diagnostics only, so no gradients are tracked)
            with torch.no_grad():
                approx_kl = (mb_logps_old - logps_new).mean().item()
                clip_frac = ((ratio - 1.0).abs() > clip_eps).float().mean().item()
                explained_var = 1 - torch.var(mb_returns - values_new) / (
                    torch.var(mb_returns) + 1e-8
                )

            approx_kl_total += approx_kl
            clip_frac_total += clip_frac
            explained_var_total += explained_var.item()
            update_steps += 1

    print(
        f"Step: {global_step} | "
        f"LR: {lr_now:.6f} | "
        f"Reward: {mean_rollout_reward:.2f} | "
        f"KL: {approx_kl_total/update_steps:.5f} | "
        f"ClipFrac: {clip_frac_total/update_steps:.3f} | "
        f"ExplVar: {explained_var_total/update_steps:.3f}"
    )

    # Evaluation: runs when the step counter has passed a multiple of
    # eval_interval by fewer than rollout_len steps.
    if global_step % eval_interval < rollout_len:

        s, _ = eval_env.reset()
        done = False
        ep_return = 0
        frames = []

        while not done:
            s_tensor = torch.tensor(s, dtype=torch.float32, device=device)

            # Deterministic evaluation: act with tanh of the policy mean
            action = model.act_deterministic(s_tensor)

            s, r, terminated, truncated, _ = eval_env.step(
                action.cpu().numpy()
            )
            done = terminated or truncated
            ep_return += r

            frame = eval_env.render()
            if frame is not None:
                frames.append(frame)

        print(f"[Eval @ {global_step}] Return: {ep_return:.2f}")

        video_path = f"videos/ppo_bipedal_eval_{global_step}.mp4"
        imageio.mimsave(video_path, frames, fps=30)
        print(f"Saved video: {video_path}")

env.close()
eval_env.close()


Device: cuda
Step: 2048 | LR: 0.000250 | Reward: -0.23 | KL: 0.02304 | ClipFrac: 0.235 | ExplVar: -0.000
Step: 4096 | LR: 0.000249 | Reward: -0.05 | KL: 0.01894 | ClipFrac: 0.209 | ExplVar: 0.170
Step: 6144 | LR: 0.000249 | Reward: -0.06 | KL: 0.02325 | ClipFrac: 0.240 | ExplVar: 0.210
Step: 8192 | LR: 0.000248 | Reward: -0.05 | KL: 0.01584 | ClipFrac: 0.208 | ExplVar: 0.319
Step: 10240 | LR: 0.000248 | Reward: -0.10 | KL: 0.01594 | ClipFrac: 0.172 | ExplVar: 0.014
Step: 12288 | LR: 0.000247 | Reward: -0.44 | KL: 0.01262 | ClipFrac: 0.177 | ExplVar: 0.015
Step: 14336 | LR: 0.000247 | Reward: -0.09 | KL: 0.01221 | ClipFrac: 0.156 | ExplVar: 0.130
Step: 16384 | LR: 0.000246 | Reward: -0.54 | KL: 0.01381 | ClipFrac: 0.160 | ExplVar: 0.034
Step: 18432 | LR: 0.000246 | Reward: -0.56 | KL: 0.00609 | ClipFrac: 0.171 | ExplVar: 0.019
Step: 20480 | LR: 0.000245 | Reward: -0.20 | KL: 0.01577 | ClipFrac: 0.182 | ExplVar: 0.051
[Eval @ 20480] Return: -95.56




Saved video: videos/ppo_bipedal_eval_20480.mp4
Step: 22528 | LR: 0.000245 | Reward: -0.05 | KL: 0.01056 | ClipFrac: 0.202 | ExplVar: 0.430
Step: 24576 | LR: 0.000244 | Reward: -0.09 | KL: 0.00944 | ClipFrac: 0.152 | ExplVar: 0.227
Step: 26624 | LR: 0.000244 | Reward: -0.18 | KL: 0.01009 | ClipFrac: 0.142 | ExplVar: 0.121
Step: 28672 | LR: 0.000243 | Reward: -0.04 | KL: 0.00915 | ClipFrac: 0.161 | ExplVar: 0.741
Step: 30720 | LR: 0.000243 | Reward: -0.05 | KL: 0.01667 | ClipFrac: 0.211 | ExplVar: 0.792
Step: 32768 | LR: 0.000242 | Reward: -0.15 | KL: 0.00566 | ClipFrac: 0.155 | ExplVar: 0.189
Step: 34816 | LR: 0.000242 | Reward: -0.14 | KL: 0.00936 | ClipFrac: 0.119 | ExplVar: 0.145
Step: 36864 | LR: 0.000241 | Reward: -0.05 | KL: 0.01305 | ClipFrac: 0.199 | ExplVar: 0.643
Step: 38912 | LR: 0.000241 | Reward: -0.04 | KL: 0.01315 | ClipFrac: 0.190 | ExplVar: 0.567
Step: 40960 | LR: 0.000240 | Reward: -0.09 | KL: 0.00915 | ClipFrac: 0.120 | ExplVar: 0.314




[Eval @ 40960] Return: -47.32
Saved video: videos/ppo_bipedal_eval_40960.mp4
Step: 43008 | LR: 0.000240 | Reward: -0.04 | KL: 0.01217 | ClipFrac: 0.183 | ExplVar: 0.592
Step: 45056 | LR: 0.000239 | Reward: -0.04 | KL: 0.00998 | ClipFrac: 0.145 | ExplVar: 0.861
Step: 47104 | LR: 0.000239 | Reward: -0.13 | KL: 0.00950 | ClipFrac: 0.105 | ExplVar: 0.285
Step: 49152 | LR: 0.000238 | Reward: -0.04 | KL: 0.01856 | ClipFrac: 0.204 | ExplVar: 0.842
Step: 51200 | LR: 0.000238 | Reward: -0.04 | KL: 0.01312 | ClipFrac: 0.175 | ExplVar: 0.874
Step: 53248 | LR: 0.000237 | Reward: -0.06 | KL: 0.01027 | ClipFrac: 0.137 | ExplVar: 0.822
Step: 55296 | LR: 0.000237 | Reward: -0.08 | KL: 0.00462 | ClipFrac: 0.145 | ExplVar: 0.415
Step: 57344 | LR: 0.000236 | Reward: -0.08 | KL: 0.01078 | ClipFrac: 0.141 | ExplVar: -0.000
Step: 59392 | LR: 0.000236 | Reward: -0.04 | KL: 0.01048 | ClipFrac: 0.170 | ExplVar: 0.760
Step: 61440 | LR: 0.000235 | Reward: -0.03 | KL: 0.01300 | ClipFrac: 0.178 | ExplVar: 0.903




[Eval @ 61440] Return: -27.27
Saved video: videos/ppo_bipedal_eval_61440.mp4
Step: 63488 | LR: 0.000235 | Reward: -0.09 | KL: 0.00149 | ClipFrac: 0.102 | ExplVar: 0.461
Step: 65536 | LR: 0.000234 | Reward: -0.03 | KL: 0.00608 | ClipFrac: 0.143 | ExplVar: 0.773
Step: 67584 | LR: 0.000234 | Reward: -0.02 | KL: 0.01257 | ClipFrac: 0.180 | ExplVar: 0.857
Step: 69632 | LR: 0.000233 | Reward: -0.02 | KL: 0.01288 | ClipFrac: 0.181 | ExplVar: 0.780
Step: 71680 | LR: 0.000233 | Reward: -0.01 | KL: 0.00810 | ClipFrac: 0.186 | ExplVar: 0.664
Step: 73728 | LR: 0.000232 | Reward: -0.01 | KL: 0.01480 | ClipFrac: 0.168 | ExplVar: 0.833
Step: 75776 | LR: 0.000232 | Reward: -0.01 | KL: 0.01177 | ClipFrac: 0.189 | ExplVar: 0.665
Step: 77824 | LR: 0.000231 | Reward: -0.01 | KL: 0.01634 | ClipFrac: 0.148 | ExplVar: 0.776
Step: 79872 | LR: 0.000231 | Reward: -0.00 | KL: 0.01815 | ClipFrac: 0.192 | ExplVar: 0.709
Step: 81920 | LR: 0.000230 | Reward: 0.01 | KL: 0.01244 | ClipFrac: 0.192 | ExplVar: 0.789




[Eval @ 81920] Return: 80.12
Saved video: videos/ppo_bipedal_eval_81920.mp4
Step: 83968 | LR: 0.000230 | Reward: -0.04 | KL: 0.01631 | ClipFrac: 0.150 | ExplVar: 0.396
Step: 86016 | LR: 0.000229 | Reward: 0.01 | KL: 0.01126 | ClipFrac: 0.186 | ExplVar: 0.810
Step: 88064 | LR: 0.000228 | Reward: 0.02 | KL: 0.01201 | ClipFrac: 0.189 | ExplVar: 0.783
Step: 90112 | LR: 0.000228 | Reward: 0.03 | KL: 0.01190 | ClipFrac: 0.164 | ExplVar: 0.826
Step: 92160 | LR: 0.000227 | Reward: 0.03 | KL: 0.01999 | ClipFrac: 0.215 | ExplVar: 0.823
Step: 94208 | LR: 0.000227 | Reward: 0.04 | KL: 0.01466 | ClipFrac: 0.199 | ExplVar: 0.880
Step: 96256 | LR: 0.000226 | Reward: 0.04 | KL: 0.01056 | ClipFrac: 0.181 | ExplVar: 0.797
Step: 98304 | LR: 0.000226 | Reward: -0.06 | KL: 0.01331 | ClipFrac: 0.175 | ExplVar: 0.320
Step: 100352 | LR: 0.000225 | Reward: 0.04 | KL: 0.01444 | ClipFrac: 0.185 | ExplVar: 0.862




[Eval @ 100352] Return: 157.13
Saved video: videos/ppo_bipedal_eval_100352.mp4
Step: 102400 | LR: 0.000225 | Reward: 0.05 | KL: 0.01262 | ClipFrac: 0.189 | ExplVar: 0.811
Step: 104448 | LR: 0.000224 | Reward: 0.04 | KL: 0.01154 | ClipFrac: 0.206 | ExplVar: 0.845
Step: 106496 | LR: 0.000224 | Reward: 0.05 | KL: 0.01487 | ClipFrac: 0.180 | ExplVar: 0.865
Step: 108544 | LR: 0.000223 | Reward: 0.06 | KL: 0.01997 | ClipFrac: 0.220 | ExplVar: 0.886
Step: 110592 | LR: 0.000223 | Reward: -0.05 | KL: 0.01136 | ClipFrac: 0.227 | ExplVar: 0.152
Step: 112640 | LR: 0.000222 | Reward: 0.06 | KL: 0.01269 | ClipFrac: 0.180 | ExplVar: 0.859
Step: 114688 | LR: 0.000222 | Reward: 0.02 | KL: 0.01202 | ClipFrac: 0.194 | ExplVar: 0.347
Step: 116736 | LR: 0.000221 | Reward: 0.07 | KL: 0.01802 | ClipFrac: 0.193 | ExplVar: 0.876
Step: 118784 | LR: 0.000221 | Reward: 0.07 | KL: 0.01686 | ClipFrac: 0.205 | ExplVar: 0.803
Step: 120832 | LR: 0.000220 | Reward: 0.02 | KL: 0.01485 | ClipFrac: 0.198 | ExplVar: 0.512




[Eval @ 120832] Return: 183.33
Saved video: videos/ppo_bipedal_eval_120832.mp4
Step: 122880 | LR: 0.000220 | Reward: 0.02 | KL: 0.01005 | ClipFrac: 0.165 | ExplVar: 0.356
Step: 124928 | LR: 0.000219 | Reward: 0.01 | KL: 0.02114 | ClipFrac: 0.204 | ExplVar: 0.373
Step: 126976 | LR: 0.000219 | Reward: 0.01 | KL: 0.01094 | ClipFrac: 0.197 | ExplVar: 0.326
Step: 129024 | LR: 0.000218 | Reward: 0.07 | KL: 0.01869 | ClipFrac: 0.241 | ExplVar: 0.787
Step: 131072 | LR: 0.000218 | Reward: -0.03 | KL: 0.01145 | ClipFrac: 0.209 | ExplVar: 0.210
Step: 133120 | LR: 0.000217 | Reward: -0.04 | KL: 0.02309 | ClipFrac: 0.205 | ExplVar: 0.398
Step: 135168 | LR: 0.000217 | Reward: 0.01 | KL: 0.01307 | ClipFrac: 0.213 | ExplVar: 0.558
Step: 137216 | LR: 0.000216 | Reward: -0.27 | KL: 0.01747 | ClipFrac: 0.216 | ExplVar: 0.315
Step: 139264 | LR: 0.000216 | Reward: -0.04 | KL: 0.01384 | ClipFrac: 0.176 | ExplVar: 0.337
Step: 141312 | LR: 0.000215 | Reward: -0.18 | KL: 0.00704 | ClipFrac: 0.155 | ExplVar: 0.



[Eval @ 141312] Return: -103.17
Saved video: videos/ppo_bipedal_eval_141312.mp4
Step: 143360 | LR: 0.000215 | Reward: 0.00 | KL: 0.01026 | ClipFrac: 0.211 | ExplVar: 0.684
Step: 145408 | LR: 0.000214 | Reward: -0.10 | KL: 0.02518 | ClipFrac: 0.254 | ExplVar: -0.087
Step: 147456 | LR: 0.000214 | Reward: -0.02 | KL: 0.01195 | ClipFrac: 0.187 | ExplVar: 0.750
Step: 149504 | LR: 0.000213 | Reward: -0.19 | KL: 0.01421 | ClipFrac: 0.192 | ExplVar: 0.451
Step: 151552 | LR: 0.000213 | Reward: 0.09 | KL: 0.02202 | ClipFrac: 0.229 | ExplVar: 0.857
Step: 153600 | LR: 0.000212 | Reward: -0.28 | KL: 0.01941 | ClipFrac: 0.214 | ExplVar: 0.353
Step: 155648 | LR: 0.000212 | Reward: -0.52 | KL: 0.01189 | ClipFrac: 0.180 | ExplVar: 0.324
Step: 157696 | LR: 0.000211 | Reward: 0.03 | KL: 0.01847 | ClipFrac: 0.197 | ExplVar: 0.677
Step: 159744 | LR: 0.000211 | Reward: -0.21 | KL: 0.01850 | ClipFrac: 0.175 | ExplVar: 0.429
Step: 161792 | LR: 0.000210 | Reward: -0.10 | KL: 0.01426 | ClipFrac: 0.240 | ExplVar



[Eval @ 161792] Return: -104.80
Saved video: videos/ppo_bipedal_eval_161792.mp4
Step: 163840 | LR: 0.000210 | Reward: -0.89 | KL: 0.01511 | ClipFrac: 0.199 | ExplVar: 0.105
Step: 165888 | LR: 0.000209 | Reward: -0.35 | KL: 0.02161 | ClipFrac: 0.213 | ExplVar: 0.044
Step: 167936 | LR: 0.000209 | Reward: -0.60 | KL: 0.01992 | ClipFrac: 0.185 | ExplVar: 0.112
Step: 169984 | LR: 0.000208 | Reward: -0.16 | KL: 0.01387 | ClipFrac: 0.169 | ExplVar: 0.451
Step: 172032 | LR: 0.000208 | Reward: 0.02 | KL: 0.01272 | ClipFrac: 0.190 | ExplVar: 0.664
Step: 174080 | LR: 0.000207 | Reward: 0.03 | KL: 0.00888 | ClipFrac: 0.190 | ExplVar: 0.542
Step: 176128 | LR: 0.000206 | Reward: 0.08 | KL: 0.01375 | ClipFrac: 0.219 | ExplVar: 0.824
Step: 178176 | LR: 0.000206 | Reward: -0.19 | KL: 0.01952 | ClipFrac: 0.176 | ExplVar: 0.502
Step: 180224 | LR: 0.000205 | Reward: -0.08 | KL: 0.00643 | ClipFrac: 0.171 | ExplVar: 0.527




[Eval @ 180224] Return: -108.11
Saved video: videos/ppo_bipedal_eval_180224.mp4
Step: 182272 | LR: 0.000205 | Reward: 0.05 | KL: 0.01691 | ClipFrac: 0.217 | ExplVar: 0.513
Step: 184320 | LR: 0.000204 | Reward: -0.40 | KL: 0.01962 | ClipFrac: 0.194 | ExplVar: 0.508
Step: 186368 | LR: 0.000204 | Reward: -0.13 | KL: 0.01137 | ClipFrac: 0.182 | ExplVar: 0.499
Step: 188416 | LR: 0.000203 | Reward: -0.40 | KL: 0.01341 | ClipFrac: 0.179 | ExplVar: 0.540
Step: 190464 | LR: 0.000203 | Reward: -0.24 | KL: 0.01397 | ClipFrac: 0.165 | ExplVar: 0.568
Step: 192512 | LR: 0.000202 | Reward: -0.30 | KL: 0.01255 | ClipFrac: 0.170 | ExplVar: 0.442
Step: 194560 | LR: 0.000202 | Reward: -0.13 | KL: 0.01416 | ClipFrac: 0.150 | ExplVar: 0.614
Step: 196608 | LR: 0.000201 | Reward: -0.07 | KL: 0.02065 | ClipFrac: 0.161 | ExplVar: 0.641
Step: 198656 | LR: 0.000201 | Reward: 0.04 | KL: 0.01615 | ClipFrac: 0.191 | ExplVar: 0.592
Step: 200704 | LR: 0.000200 | Reward: -0.13 | KL: 0.01053 | ClipFrac: 0.140 | ExplVar



[Eval @ 200704] Return: -101.59
Saved video: videos/ppo_bipedal_eval_200704.mp4
Step: 202752 | LR: 0.000200 | Reward: -0.24 | KL: 0.01213 | ClipFrac: 0.164 | ExplVar: 0.444
Step: 204800 | LR: 0.000199 | Reward: 0.09 | KL: 0.01775 | ClipFrac: 0.243 | ExplVar: 0.798
Step: 206848 | LR: 0.000199 | Reward: -0.24 | KL: 0.01262 | ClipFrac: 0.146 | ExplVar: 0.491
Step: 208896 | LR: 0.000198 | Reward: -0.28 | KL: 0.01103 | ClipFrac: 0.155 | ExplVar: 0.498
Step: 210944 | LR: 0.000198 | Reward: -0.53 | KL: 0.00788 | ClipFrac: 0.154 | ExplVar: 0.512
Step: 212992 | LR: 0.000197 | Reward: -0.29 | KL: 0.02446 | ClipFrac: 0.171 | ExplVar: 0.501
Step: 215040 | LR: 0.000197 | Reward: -0.02 | KL: 0.01730 | ClipFrac: 0.168 | ExplVar: 0.658
Step: 217088 | LR: 0.000196 | Reward: 0.03 | KL: 0.01123 | ClipFrac: 0.202 | ExplVar: 0.736
Step: 219136 | LR: 0.000196 | Reward: -0.24 | KL: 0.01677 | ClipFrac: 0.169 | ExplVar: 0.645
Step: 221184 | LR: 0.000195 | Reward: 0.10 | KL: 0.02081 | ClipFrac: 0.242 | ExplVar:



[Eval @ 221184] Return: -114.26
Saved video: videos/ppo_bipedal_eval_221184.mp4
Step: 223232 | LR: 0.000195 | Reward: -0.18 | KL: 0.01606 | ClipFrac: 0.164 | ExplVar: 0.545
Step: 225280 | LR: 0.000194 | Reward: -0.18 | KL: 0.01364 | ClipFrac: 0.175 | ExplVar: 0.534
Step: 227328 | LR: 0.000194 | Reward: -0.34 | KL: 0.01040 | ClipFrac: 0.162 | ExplVar: 0.571
Step: 229376 | LR: 0.000193 | Reward: -0.06 | KL: 0.01224 | ClipFrac: 0.165 | ExplVar: 0.509
Step: 231424 | LR: 0.000193 | Reward: -0.07 | KL: 0.01824 | ClipFrac: 0.164 | ExplVar: 0.707
Step: 233472 | LR: 0.000192 | Reward: 0.04 | KL: 0.00999 | ClipFrac: 0.162 | ExplVar: 0.784
Step: 235520 | LR: 0.000192 | Reward: -0.06 | KL: 0.01123 | ClipFrac: 0.151 | ExplVar: 0.713
Step: 237568 | LR: 0.000191 | Reward: 0.05 | KL: 0.01500 | ClipFrac: 0.178 | ExplVar: 0.774
Step: 239616 | LR: 0.000191 | Reward: 0.01 | KL: 0.01136 | ClipFrac: 0.162 | ExplVar: 0.759
Step: 241664 | LR: 0.000190 | Reward: -0.07 | KL: 0.01922 | ClipFrac: 0.174 | ExplVar:



[Eval @ 241664] Return: -102.98
Saved video: videos/ppo_bipedal_eval_241664.mp4
Step: 243712 | LR: 0.000190 | Reward: -0.12 | KL: 0.01594 | ClipFrac: 0.192 | ExplVar: 0.602
Step: 245760 | LR: 0.000189 | Reward: -0.02 | KL: 0.01167 | ClipFrac: 0.184 | ExplVar: 0.371
Step: 247808 | LR: 0.000189 | Reward: -0.14 | KL: 0.00715 | ClipFrac: 0.165 | ExplVar: 0.528
Step: 249856 | LR: 0.000188 | Reward: 0.10 | KL: 0.01717 | ClipFrac: 0.217 | ExplVar: 0.733
Step: 251904 | LR: 0.000188 | Reward: 0.10 | KL: 0.01223 | ClipFrac: 0.196 | ExplVar: 0.781
Step: 253952 | LR: 0.000187 | Reward: -0.02 | KL: 0.01368 | ClipFrac: 0.148 | ExplVar: 0.781
Step: 256000 | LR: 0.000187 | Reward: -0.22 | KL: 0.01040 | ClipFrac: 0.147 | ExplVar: 0.603
Step: 258048 | LR: 0.000186 | Reward: -0.39 | KL: 0.01229 | ClipFrac: 0.131 | ExplVar: 0.597
Step: 260096 | LR: 0.000185 | Reward: -0.06 | KL: 0.01569 | ClipFrac: 0.169 | ExplVar: 0.614




[Eval @ 260096] Return: -105.94
Saved video: videos/ppo_bipedal_eval_260096.mp4
Step: 262144 | LR: 0.000185 | Reward: -0.01 | KL: 0.00813 | ClipFrac: 0.131 | ExplVar: 0.717
Step: 264192 | LR: 0.000184 | Reward: -0.11 | KL: 0.01089 | ClipFrac: 0.146 | ExplVar: 0.747
Step: 266240 | LR: 0.000184 | Reward: -0.01 | KL: 0.01283 | ClipFrac: 0.138 | ExplVar: 0.588
Step: 268288 | LR: 0.000183 | Reward: -0.41 | KL: 0.01189 | ClipFrac: 0.154 | ExplVar: 0.545
Step: 270336 | LR: 0.000183 | Reward: -0.17 | KL: 0.01067 | ClipFrac: 0.125 | ExplVar: 0.712
Step: 272384 | LR: 0.000182 | Reward: -0.02 | KL: 0.00849 | ClipFrac: 0.143 | ExplVar: 0.724
Step: 274432 | LR: 0.000182 | Reward: -0.08 | KL: 0.01756 | ClipFrac: 0.136 | ExplVar: 0.391
Step: 276480 | LR: 0.000181 | Reward: 0.10 | KL: 0.01276 | ClipFrac: 0.202 | ExplVar: 0.787
Step: 278528 | LR: 0.000181 | Reward: 0.09 | KL: 0.01651 | ClipFrac: 0.203 | ExplVar: 0.792
Step: 280576 | LR: 0.000180 | Reward: 0.11 | KL: 0.01407 | ClipFrac: 0.203 | ExplVar:



[Eval @ 280576] Return: -87.79
Saved video: videos/ppo_bipedal_eval_280576.mp4
Step: 282624 | LR: 0.000180 | Reward: 0.10 | KL: 0.01710 | ClipFrac: 0.232 | ExplVar: 0.758
Step: 284672 | LR: 0.000179 | Reward: 0.07 | KL: 0.01497 | ClipFrac: 0.165 | ExplVar: 0.640
Step: 286720 | LR: 0.000179 | Reward: 0.11 | KL: 0.01520 | ClipFrac: 0.211 | ExplVar: 0.817
Step: 288768 | LR: 0.000178 | Reward: 0.02 | KL: 0.01639 | ClipFrac: 0.140 | ExplVar: 0.713
Step: 290816 | LR: 0.000178 | Reward: -0.05 | KL: 0.00887 | ClipFrac: 0.135 | ExplVar: 0.780
Step: 292864 | LR: 0.000177 | Reward: 0.00 | KL: 0.01749 | ClipFrac: 0.192 | ExplVar: 0.220
Step: 294912 | LR: 0.000177 | Reward: 0.06 | KL: 0.01795 | ClipFrac: 0.163 | ExplVar: 0.730
Step: 296960 | LR: 0.000176 | Reward: 0.00 | KL: 0.01661 | ClipFrac: 0.161 | ExplVar: 0.587
Step: 299008 | LR: 0.000176 | Reward: 0.12 | KL: 0.01922 | ClipFrac: 0.223 | ExplVar: 0.834
Step: 301056 | LR: 0.000175 | Reward: 0.07 | KL: 0.00783 | ClipFrac: 0.172 | ExplVar: 0.305




[Eval @ 301056] Return: -106.34
Saved video: videos/ppo_bipedal_eval_301056.mp4
Step: 303104 | LR: 0.000175 | Reward: 0.06 | KL: 0.01547 | ClipFrac: 0.136 | ExplVar: 0.721
Step: 305152 | LR: 0.000174 | Reward: 0.11 | KL: 0.01251 | ClipFrac: 0.192 | ExplVar: 0.740
Step: 307200 | LR: 0.000174 | Reward: 0.06 | KL: 0.00815 | ClipFrac: 0.151 | ExplVar: 0.371
Step: 309248 | LR: 0.000173 | Reward: 0.11 | KL: 0.01299 | ClipFrac: 0.225 | ExplVar: 0.824
Step: 311296 | LR: 0.000173 | Reward: 0.12 | KL: 0.01391 | ClipFrac: 0.174 | ExplVar: 0.823
Step: 313344 | LR: 0.000172 | Reward: 0.11 | KL: 0.01426 | ClipFrac: 0.197 | ExplVar: 0.754
Step: 315392 | LR: 0.000172 | Reward: 0.13 | KL: 0.01450 | ClipFrac: 0.177 | ExplVar: 0.852
Step: 317440 | LR: 0.000171 | Reward: 0.11 | KL: 0.01135 | ClipFrac: 0.184 | ExplVar: 0.773
Step: 319488 | LR: 0.000171 | Reward: 0.06 | KL: 0.00822 | ClipFrac: 0.147 | ExplVar: 0.614
Step: 321536 | LR: 0.000170 | Reward: 0.13 | KL: 0.01628 | ClipFrac: 0.208 | ExplVar: 0.837




[Eval @ 321536] Return: -122.76
Saved video: videos/ppo_bipedal_eval_321536.mp4
Step: 323584 | LR: 0.000170 | Reward: 0.07 | KL: 0.01082 | ClipFrac: 0.134 | ExplVar: 0.745
Step: 325632 | LR: 0.000169 | Reward: 0.13 | KL: 0.01850 | ClipFrac: 0.212 | ExplVar: 0.850
Step: 327680 | LR: 0.000169 | Reward: 0.14 | KL: 0.02088 | ClipFrac: 0.221 | ExplVar: 0.822
Step: 329728 | LR: 0.000168 | Reward: 0.13 | KL: 0.01422 | ClipFrac: 0.177 | ExplVar: 0.866
Step: 331776 | LR: 0.000168 | Reward: 0.15 | KL: 0.01132 | ClipFrac: 0.181 | ExplVar: 0.812
Step: 333824 | LR: 0.000167 | Reward: 0.09 | KL: 0.01157 | ClipFrac: 0.169 | ExplVar: 0.656
Step: 335872 | LR: 0.000167 | Reward: 0.09 | KL: 0.01064 | ClipFrac: 0.142 | ExplVar: 0.769
Step: 337920 | LR: 0.000166 | Reward: 0.04 | KL: 0.01519 | ClipFrac: 0.184 | ExplVar: 0.477
Step: 339968 | LR: 0.000166 | Reward: 0.15 | KL: 0.01367 | ClipFrac: 0.183 | ExplVar: 0.767
Step: 342016 | LR: 0.000165 | Reward: 0.14 | KL: 0.01022 | ClipFrac: 0.183 | ExplVar: 0.769




[Eval @ 342016] Return: 109.51
Saved video: videos/ppo_bipedal_eval_342016.mp4
Step: 344064 | LR: 0.000164 | Reward: 0.15 | KL: 0.01265 | ClipFrac: 0.186 | ExplVar: 0.850
Step: 346112 | LR: 0.000164 | Reward: 0.14 | KL: 0.01520 | ClipFrac: 0.210 | ExplVar: 0.819
Step: 348160 | LR: 0.000163 | Reward: 0.14 | KL: 0.01205 | ClipFrac: 0.208 | ExplVar: 0.663
Step: 350208 | LR: 0.000163 | Reward: -0.01 | KL: 0.01605 | ClipFrac: 0.190 | ExplVar: 0.534
Step: 352256 | LR: 0.000162 | Reward: 0.04 | KL: 0.01630 | ClipFrac: 0.146 | ExplVar: 0.736
Step: 354304 | LR: 0.000162 | Reward: 0.13 | KL: 0.01284 | ClipFrac: 0.176 | ExplVar: 0.708
Step: 356352 | LR: 0.000161 | Reward: 0.14 | KL: 0.01380 | ClipFrac: 0.198 | ExplVar: 0.827
Step: 358400 | LR: 0.000161 | Reward: 0.16 | KL: 0.01527 | ClipFrac: 0.177 | ExplVar: 0.753
Step: 360448 | LR: 0.000160 | Reward: 0.15 | KL: 0.00986 | ClipFrac: 0.196 | ExplVar: 0.766




[Eval @ 360448] Return: 249.46
Saved video: videos/ppo_bipedal_eval_360448.mp4
Step: 362496 | LR: 0.000160 | Reward: 0.13 | KL: 0.01840 | ClipFrac: 0.201 | ExplVar: 0.774
Step: 364544 | LR: 0.000159 | Reward: 0.03 | KL: 0.01496 | ClipFrac: 0.188 | ExplVar: 0.552
Step: 366592 | LR: 0.000159 | Reward: 0.09 | KL: 0.01234 | ClipFrac: 0.122 | ExplVar: 0.762
Step: 368640 | LR: 0.000158 | Reward: 0.15 | KL: 0.01487 | ClipFrac: 0.212 | ExplVar: 0.713
Step: 370688 | LR: 0.000158 | Reward: 0.15 | KL: 0.01465 | ClipFrac: 0.198 | ExplVar: 0.828
Step: 372736 | LR: 0.000157 | Reward: -0.13 | KL: 0.01501 | ClipFrac: 0.163 | ExplVar: 0.302
Step: 374784 | LR: 0.000157 | Reward: 0.03 | KL: 0.00929 | ClipFrac: 0.171 | ExplVar: 0.290
Step: 376832 | LR: 0.000156 | Reward: 0.16 | KL: 0.00569 | ClipFrac: 0.169 | ExplVar: 0.744
Step: 378880 | LR: 0.000156 | Reward: 0.10 | KL: 0.01032 | ClipFrac: 0.139 | ExplVar: 0.777
Step: 380928 | LR: 0.000155 | Reward: 0.10 | KL: 0.01174 | ClipFrac: 0.172 | ExplVar: 0.365




[Eval @ 380928] Return: 248.04
Saved video: videos/ppo_bipedal_eval_380928.mp4
Step: 382976 | LR: 0.000155 | Reward: 0.15 | KL: 0.00934 | ClipFrac: 0.201 | ExplVar: 0.812
Step: 385024 | LR: 0.000154 | Reward: 0.09 | KL: 0.00876 | ClipFrac: 0.158 | ExplVar: 0.415
Step: 387072 | LR: 0.000154 | Reward: 0.04 | KL: 0.01110 | ClipFrac: 0.177 | ExplVar: 0.511
Step: 389120 | LR: 0.000153 | Reward: 0.04 | KL: 0.01170 | ClipFrac: 0.149 | ExplVar: 0.592
Step: 391168 | LR: 0.000153 | Reward: 0.04 | KL: 0.01943 | ClipFrac: 0.163 | ExplVar: 0.517
Step: 393216 | LR: 0.000152 | Reward: 0.09 | KL: 0.00854 | ClipFrac: 0.152 | ExplVar: 0.797
Step: 395264 | LR: 0.000152 | Reward: 0.15 | KL: 0.01168 | ClipFrac: 0.185 | ExplVar: 0.731
Step: 397312 | LR: 0.000151 | Reward: 0.16 | KL: 0.01419 | ClipFrac: 0.193 | ExplVar: 0.743
Step: 399360 | LR: 0.000151 | Reward: 0.04 | KL: 0.01255 | ClipFrac: 0.154 | ExplVar: 0.535
Step: 401408 | LR: 0.000150 | Reward: 0.16 | KL: 0.01556 | ClipFrac: 0.217 | ExplVar: 0.801




[Eval @ 401408] Return: 254.73
Saved video: videos/ppo_bipedal_eval_401408.mp4
Step: 403456 | LR: 0.000150 | Reward: 0.15 | KL: 0.01222 | ClipFrac: 0.184 | ExplVar: 0.799
Step: 405504 | LR: 0.000149 | Reward: 0.15 | KL: 0.01016 | ClipFrac: 0.168 | ExplVar: 0.771
Step: 407552 | LR: 0.000149 | Reward: 0.15 | KL: 0.01037 | ClipFrac: 0.197 | ExplVar: 0.792
Step: 409600 | LR: 0.000148 | Reward: 0.13 | KL: 0.01630 | ClipFrac: 0.168 | ExplVar: 0.819
Step: 411648 | LR: 0.000148 | Reward: 0.07 | KL: 0.00790 | ClipFrac: 0.132 | ExplVar: 0.433
Step: 413696 | LR: 0.000147 | Reward: 0.16 | KL: 0.00970 | ClipFrac: 0.181 | ExplVar: 0.852
Step: 415744 | LR: 0.000147 | Reward: 0.15 | KL: 0.01862 | ClipFrac: 0.218 | ExplVar: 0.882
Step: 417792 | LR: 0.000146 | Reward: 0.15 | KL: 0.00937 | ClipFrac: 0.209 | ExplVar: 0.673
Step: 419840 | LR: 0.000146 | Reward: 0.10 | KL: 0.00927 | ClipFrac: 0.165 | ExplVar: 0.694
Step: 421888 | LR: 0.000145 | Reward: 0.15 | KL: 0.01196 | ClipFrac: 0.180 | ExplVar: 0.772




[Eval @ 421888] Return: 254.91
Saved video: videos/ppo_bipedal_eval_421888.mp4
Step: 423936 | LR: 0.000145 | Reward: 0.15 | KL: 0.01383 | ClipFrac: 0.195 | ExplVar: 0.803
Step: 425984 | LR: 0.000144 | Reward: 0.17 | KL: 0.01379 | ClipFrac: 0.212 | ExplVar: 0.734
Step: 428032 | LR: 0.000144 | Reward: 0.15 | KL: 0.01186 | ClipFrac: 0.195 | ExplVar: 0.721
Step: 430080 | LR: 0.000143 | Reward: 0.15 | KL: 0.01531 | ClipFrac: 0.181 | ExplVar: 0.858
Step: 432128 | LR: 0.000142 | Reward: 0.09 | KL: 0.01346 | ClipFrac: 0.154 | ExplVar: 0.412
Step: 434176 | LR: 0.000142 | Reward: 0.11 | KL: 0.01154 | ClipFrac: 0.151 | ExplVar: 0.713
Step: 436224 | LR: 0.000141 | Reward: 0.16 | KL: 0.01735 | ClipFrac: 0.196 | ExplVar: 0.755
Step: 438272 | LR: 0.000141 | Reward: 0.16 | KL: 0.01602 | ClipFrac: 0.178 | ExplVar: 0.831
Step: 440320 | LR: 0.000140 | Reward: 0.17 | KL: 0.01392 | ClipFrac: 0.184 | ExplVar: 0.671




[Eval @ 440320] Return: 263.08
Saved video: videos/ppo_bipedal_eval_440320.mp4
Step: 442368 | LR: 0.000140 | Reward: 0.05 | KL: 0.01597 | ClipFrac: 0.164 | ExplVar: 0.691
Step: 444416 | LR: 0.000139 | Reward: 0.17 | KL: 0.00905 | ClipFrac: 0.190 | ExplVar: 0.836
Step: 446464 | LR: 0.000139 | Reward: 0.16 | KL: 0.01500 | ClipFrac: 0.216 | ExplVar: 0.748
Step: 448512 | LR: 0.000138 | Reward: 0.06 | KL: 0.01551 | ClipFrac: 0.160 | ExplVar: 0.747
Step: 450560 | LR: 0.000138 | Reward: 0.06 | KL: 0.00880 | ClipFrac: 0.148 | ExplVar: 0.707
Step: 452608 | LR: 0.000137 | Reward: 0.11 | KL: 0.01110 | ClipFrac: 0.159 | ExplVar: 0.715
Step: 454656 | LR: 0.000137 | Reward: 0.00 | KL: 0.01289 | ClipFrac: 0.123 | ExplVar: 0.769
Step: 456704 | LR: 0.000136 | Reward: 0.11 | KL: 0.00519 | ClipFrac: 0.141 | ExplVar: 0.658
Step: 458752 | LR: 0.000136 | Reward: 0.05 | KL: 0.01172 | ClipFrac: 0.131 | ExplVar: 0.762
Step: 460800 | LR: 0.000135 | Reward: 0.18 | KL: 0.01590 | ClipFrac: 0.188 | ExplVar: 0.850




[Eval @ 460800] Return: 254.90
Saved video: videos/ppo_bipedal_eval_460800.mp4
Step: 462848 | LR: 0.000135 | Reward: 0.12 | KL: 0.00484 | ClipFrac: 0.166 | ExplVar: 0.726
Step: 464896 | LR: 0.000134 | Reward: 0.12 | KL: 0.01007 | ClipFrac: 0.156 | ExplVar: 0.728
Step: 466944 | LR: 0.000134 | Reward: 0.12 | KL: 0.00947 | ClipFrac: 0.143 | ExplVar: 0.808
Step: 468992 | LR: 0.000133 | Reward: 0.02 | KL: 0.01030 | ClipFrac: 0.117 | ExplVar: 0.777
Step: 471040 | LR: 0.000133 | Reward: 0.11 | KL: 0.01172 | ClipFrac: 0.126 | ExplVar: 0.850
Step: 473088 | LR: 0.000132 | Reward: 0.11 | KL: 0.00711 | ClipFrac: 0.166 | ExplVar: 0.793
Step: 475136 | LR: 0.000132 | Reward: 0.13 | KL: 0.00629 | ClipFrac: 0.132 | ExplVar: 0.730
Step: 477184 | LR: 0.000131 | Reward: 0.11 | KL: 0.00964 | ClipFrac: 0.145 | ExplVar: 0.342
Step: 479232 | LR: 0.000131 | Reward: 0.01 | KL: 0.00804 | ClipFrac: 0.138 | ExplVar: 0.607
Step: 481280 | LR: 0.000130 | Reward: -0.18 | KL: 0.00870 | ClipFrac: 0.118 | ExplVar: 0.706




[Eval @ 481280] Return: 31.15
Saved video: videos/ppo_bipedal_eval_481280.mp4
Step: 483328 | LR: 0.000130 | Reward: 0.06 | KL: 0.01082 | ClipFrac: 0.142 | ExplVar: 0.778
Step: 485376 | LR: 0.000129 | Reward: -0.07 | KL: 0.00684 | ClipFrac: 0.125 | ExplVar: 0.783
Step: 487424 | LR: 0.000129 | Reward: 0.05 | KL: 0.00844 | ClipFrac: 0.137 | ExplVar: 0.593
Step: 489472 | LR: 0.000128 | Reward: 0.16 | KL: 0.01357 | ClipFrac: 0.178 | ExplVar: 0.776
Step: 491520 | LR: 0.000128 | Reward: -0.06 | KL: 0.00831 | ClipFrac: 0.124 | ExplVar: 0.689
Step: 493568 | LR: 0.000127 | Reward: 0.12 | KL: 0.00575 | ClipFrac: 0.127 | ExplVar: 0.780
Step: 495616 | LR: 0.000127 | Reward: 0.11 | KL: 0.01137 | ClipFrac: 0.129 | ExplVar: 0.739
Step: 497664 | LR: 0.000126 | Reward: 0.12 | KL: 0.00719 | ClipFrac: 0.113 | ExplVar: 0.816
Step: 499712 | LR: 0.000126 | Reward: -0.02 | KL: 0.01216 | ClipFrac: 0.118 | ExplVar: 0.640
Step: 501760 | LR: 0.000125 | Reward: 0.17 | KL: 0.01299 | ClipFrac: 0.191 | ExplVar: 0.804



[Eval @ 501760] Return: -124.95
Saved video: videos/ppo_bipedal_eval_501760.mp4
Step: 503808 | LR: 0.000125 | Reward: 0.10 | KL: 0.00696 | ClipFrac: 0.153 | ExplVar: 0.725
Step: 505856 | LR: 0.000124 | Reward: -0.18 | KL: 0.01033 | ClipFrac: 0.122 | ExplVar: 0.813
Step: 507904 | LR: 0.000124 | Reward: 0.06 | KL: 0.00717 | ClipFrac: 0.095 | ExplVar: 0.806
Step: 509952 | LR: 0.000123 | Reward: 0.16 | KL: 0.01685 | ClipFrac: 0.155 | ExplVar: 0.789
Step: 512000 | LR: 0.000123 | Reward: 0.16 | KL: 0.00815 | ClipFrac: 0.164 | ExplVar: 0.773
Step: 514048 | LR: 0.000122 | Reward: 0.09 | KL: 0.00692 | ClipFrac: 0.126 | ExplVar: 0.279
Step: 516096 | LR: 0.000121 | Reward: -0.06 | KL: 0.00619 | ClipFrac: 0.136 | ExplVar: 0.695
Step: 518144 | LR: 0.000121 | Reward: -0.12 | KL: 0.00495 | ClipFrac: 0.114 | ExplVar: 0.812
Step: 520192 | LR: 0.000120 | Reward: 0.11 | KL: 0.01378 | ClipFrac: 0.143 | ExplVar: 0.873




[Eval @ 520192] Return: 262.03
Saved video: videos/ppo_bipedal_eval_520192.mp4
Step: 522240 | LR: 0.000120 | Reward: 0.13 | KL: 0.01418 | ClipFrac: 0.130 | ExplVar: 0.811
Step: 524288 | LR: 0.000119 | Reward: -0.01 | KL: 0.01025 | ClipFrac: 0.127 | ExplVar: 0.851
Step: 526336 | LR: 0.000119 | Reward: 0.13 | KL: 0.01367 | ClipFrac: 0.131 | ExplVar: 0.804
Step: 528384 | LR: 0.000118 | Reward: 0.12 | KL: 0.01538 | ClipFrac: 0.132 | ExplVar: 0.852
Step: 530432 | LR: 0.000118 | Reward: 0.10 | KL: 0.01187 | ClipFrac: 0.134 | ExplVar: 0.822
Step: 532480 | LR: 0.000117 | Reward: 0.02 | KL: 0.00843 | ClipFrac: 0.114 | ExplVar: 0.825
Step: 534528 | LR: 0.000117 | Reward: 0.06 | KL: 0.00628 | ClipFrac: 0.102 | ExplVar: 0.802
Step: 536576 | LR: 0.000116 | Reward: 0.17 | KL: 0.00389 | ClipFrac: 0.143 | ExplVar: 0.762
Step: 538624 | LR: 0.000116 | Reward: 0.06 | KL: 0.00980 | ClipFrac: 0.131 | ExplVar: 0.816
Step: 540672 | LR: 0.000115 | Reward: 0.11 | KL: 0.00895 | ClipFrac: 0.146 | ExplVar: 0.780




[Eval @ 540672] Return: 263.08
Saved video: videos/ppo_bipedal_eval_540672.mp4
Step: 542720 | LR: 0.000115 | Reward: 0.16 | KL: 0.01292 | ClipFrac: 0.157 | ExplVar: 0.658
Step: 544768 | LR: 0.000114 | Reward: 0.11 | KL: 0.00973 | ClipFrac: 0.131 | ExplVar: 0.211
Step: 546816 | LR: 0.000114 | Reward: 0.17 | KL: 0.01562 | ClipFrac: 0.201 | ExplVar: 0.636
Step: 548864 | LR: 0.000113 | Reward: 0.17 | KL: 0.01886 | ClipFrac: 0.205 | ExplVar: 0.672
Step: 550912 | LR: 0.000113 | Reward: 0.18 | KL: 0.00548 | ClipFrac: 0.137 | ExplVar: 0.804
Step: 552960 | LR: 0.000112 | Reward: 0.12 | KL: 0.00683 | ClipFrac: 0.127 | ExplVar: 0.722
Step: 555008 | LR: 0.000112 | Reward: 0.18 | KL: 0.01101 | ClipFrac: 0.148 | ExplVar: 0.606
Step: 557056 | LR: 0.000111 | Reward: 0.17 | KL: 0.00820 | ClipFrac: 0.153 | ExplVar: 0.781
Step: 559104 | LR: 0.000111 | Reward: 0.12 | KL: 0.00616 | ClipFrac: 0.141 | ExplVar: 0.756
Step: 561152 | LR: 0.000110 | Reward: 0.06 | KL: 0.00382 | ClipFrac: 0.131 | ExplVar: 0.759




[Eval @ 561152] Return: -9.51
Saved video: videos/ppo_bipedal_eval_561152.mp4
Step: 563200 | LR: 0.000110 | Reward: 0.13 | KL: 0.01129 | ClipFrac: 0.136 | ExplVar: 0.743
Step: 565248 | LR: 0.000109 | Reward: -0.19 | KL: 0.01456 | ClipFrac: 0.131 | ExplVar: 0.528
Step: 567296 | LR: 0.000109 | Reward: 0.10 | KL: 0.00662 | ClipFrac: 0.129 | ExplVar: 0.379
Step: 569344 | LR: 0.000108 | Reward: 0.18 | KL: 0.01488 | ClipFrac: 0.173 | ExplVar: 0.640
Step: 571392 | LR: 0.000108 | Reward: 0.01 | KL: 0.00802 | ClipFrac: 0.114 | ExplVar: 0.803
Step: 573440 | LR: 0.000107 | Reward: 0.13 | KL: 0.01053 | ClipFrac: 0.103 | ExplVar: 0.750
Step: 575488 | LR: 0.000107 | Reward: 0.20 | KL: 0.01160 | ClipFrac: 0.144 | ExplVar: 0.788
Step: 577536 | LR: 0.000106 | Reward: -0.00 | KL: 0.00696 | ClipFrac: 0.133 | ExplVar: 0.503
Step: 579584 | LR: 0.000106 | Reward: 0.19 | KL: 0.00688 | ClipFrac: 0.132 | ExplVar: 0.585
Step: 581632 | LR: 0.000105 | Reward: 0.06 | KL: 0.00604 | ClipFrac: 0.108 | ExplVar: 0.545




[Eval @ 581632] Return: -24.90
Saved video: videos/ppo_bipedal_eval_581632.mp4
Step: 583680 | LR: 0.000105 | Reward: 0.13 | KL: 0.00965 | ClipFrac: 0.116 | ExplVar: 0.739
Step: 585728 | LR: 0.000104 | Reward: 0.19 | KL: 0.01115 | ClipFrac: 0.132 | ExplVar: 0.771
Step: 587776 | LR: 0.000104 | Reward: 0.13 | KL: 0.01202 | ClipFrac: 0.120 | ExplVar: 0.779
Step: 589824 | LR: 0.000103 | Reward: 0.07 | KL: 0.01024 | ClipFrac: 0.107 | ExplVar: 0.838
Step: 591872 | LR: 0.000103 | Reward: 0.13 | KL: 0.01289 | ClipFrac: 0.111 | ExplVar: 0.820
Step: 593920 | LR: 0.000102 | Reward: 0.08 | KL: 0.00629 | ClipFrac: 0.109 | ExplVar: 0.833
Step: 595968 | LR: 0.000102 | Reward: 0.15 | KL: 0.00586 | ClipFrac: 0.100 | ExplVar: 0.740
Step: 598016 | LR: 0.000101 | Reward: 0.17 | KL: 0.01001 | ClipFrac: 0.115 | ExplVar: 0.504
Step: 600064 | LR: 0.000100 | Reward: 0.12 | KL: 0.00516 | ClipFrac: 0.131 | ExplVar: 0.838




[Eval @ 600064] Return: 273.46
Saved video: videos/ppo_bipedal_eval_600064.mp4
Step: 602112 | LR: 0.000100 | Reward: 0.14 | KL: 0.00654 | ClipFrac: 0.084 | ExplVar: 0.774
Step: 604160 | LR: 0.000099 | Reward: 0.15 | KL: 0.00992 | ClipFrac: 0.124 | ExplVar: 0.736
Step: 606208 | LR: 0.000099 | Reward: 0.08 | KL: 0.01301 | ClipFrac: 0.126 | ExplVar: 0.876
Step: 608256 | LR: 0.000098 | Reward: 0.19 | KL: 0.01013 | ClipFrac: 0.139 | ExplVar: 0.808
Step: 610304 | LR: 0.000098 | Reward: 0.20 | KL: 0.01040 | ClipFrac: 0.150 | ExplVar: 0.682
Step: 612352 | LR: 0.000097 | Reward: 0.08 | KL: 0.00870 | ClipFrac: 0.125 | ExplVar: 0.828
Step: 614400 | LR: 0.000097 | Reward: 0.13 | KL: 0.00912 | ClipFrac: 0.112 | ExplVar: 0.748
Step: 616448 | LR: 0.000096 | Reward: 0.17 | KL: 0.01050 | ClipFrac: 0.131 | ExplVar: 0.554
Step: 618496 | LR: 0.000096 | Reward: 0.15 | KL: 0.00708 | ClipFrac: 0.115 | ExplVar: 0.782
Step: 620544 | LR: 0.000095 | Reward: 0.07 | KL: 0.01278 | ClipFrac: 0.106 | ExplVar: 0.559




[Eval @ 620544] Return: -16.13
Saved video: videos/ppo_bipedal_eval_620544.mp4
Step: 622592 | LR: 0.000095 | Reward: -0.16 | KL: 0.00990 | ClipFrac: 0.099 | ExplVar: 0.684
Step: 624640 | LR: 0.000094 | Reward: 0.10 | KL: 0.00844 | ClipFrac: 0.104 | ExplVar: 0.559
Step: 626688 | LR: 0.000094 | Reward: 0.14 | KL: 0.00964 | ClipFrac: 0.125 | ExplVar: 0.738
Step: 628736 | LR: 0.000093 | Reward: 0.10 | KL: 0.01310 | ClipFrac: 0.113 | ExplVar: 0.822
Step: 630784 | LR: 0.000093 | Reward: 0.19 | KL: 0.00893 | ClipFrac: 0.124 | ExplVar: 0.501
Step: 632832 | LR: 0.000092 | Reward: 0.09 | KL: 0.01161 | ClipFrac: 0.122 | ExplVar: 0.805
Step: 634880 | LR: 0.000092 | Reward: 0.08 | KL: 0.00998 | ClipFrac: 0.152 | ExplVar: 0.800
Step: 636928 | LR: 0.000091 | Reward: 0.13 | KL: 0.00797 | ClipFrac: 0.104 | ExplVar: 0.746
Step: 638976 | LR: 0.000091 | Reward: 0.15 | KL: 0.00779 | ClipFrac: 0.103 | ExplVar: 0.627
Step: 641024 | LR: 0.000090 | Reward: 0.20 | KL: 0.01057 | ClipFrac: 0.115 | ExplVar: 0.748




[Eval @ 641024] Return: 132.64
Saved video: videos/ppo_bipedal_eval_641024.mp4
Step: 643072 | LR: 0.000090 | Reward: 0.19 | KL: 0.00296 | ClipFrac: 0.116 | ExplVar: 0.594
Step: 645120 | LR: 0.000089 | Reward: 0.21 | KL: 0.00841 | ClipFrac: 0.126 | ExplVar: 0.338
Step: 647168 | LR: 0.000089 | Reward: 0.20 | KL: 0.00391 | ClipFrac: 0.142 | ExplVar: 0.659
Step: 649216 | LR: 0.000088 | Reward: 0.20 | KL: 0.00812 | ClipFrac: 0.158 | ExplVar: 0.533
Step: 651264 | LR: 0.000088 | Reward: 0.15 | KL: 0.00282 | ClipFrac: 0.116 | ExplVar: 0.787
Step: 653312 | LR: 0.000087 | Reward: 0.19 | KL: 0.01110 | ClipFrac: 0.155 | ExplVar: 0.724
Step: 655360 | LR: 0.000087 | Reward: 0.13 | KL: 0.01082 | ClipFrac: 0.096 | ExplVar: 0.704
Step: 657408 | LR: 0.000086 | Reward: 0.19 | KL: 0.00490 | ClipFrac: 0.124 | ExplVar: 0.730
Step: 659456 | LR: 0.000086 | Reward: 0.07 | KL: 0.00977 | ClipFrac: 0.111 | ExplVar: 0.810
Step: 661504 | LR: 0.000085 | Reward: -0.01 | KL: 0.00482 | ClipFrac: 0.098 | ExplVar: 0.819




[Eval @ 661504] Return: 278.62
Saved video: videos/ppo_bipedal_eval_661504.mp4
Step: 663552 | LR: 0.000085 | Reward: 0.15 | KL: 0.00931 | ClipFrac: 0.092 | ExplVar: 0.704
Step: 665600 | LR: 0.000084 | Reward: 0.14 | KL: 0.00752 | ClipFrac: 0.106 | ExplVar: 0.782
Step: 667648 | LR: 0.000084 | Reward: 0.09 | KL: 0.00698 | ClipFrac: 0.094 | ExplVar: 0.826
Step: 669696 | LR: 0.000083 | Reward: 0.20 | KL: 0.00691 | ClipFrac: 0.125 | ExplVar: 0.737
Step: 671744 | LR: 0.000083 | Reward: 0.19 | KL: 0.00336 | ClipFrac: 0.128 | ExplVar: 0.584
Step: 673792 | LR: 0.000082 | Reward: 0.09 | KL: 0.00733 | ClipFrac: 0.087 | ExplVar: 0.845
Step: 675840 | LR: 0.000082 | Reward: 0.14 | KL: 0.00527 | ClipFrac: 0.111 | ExplVar: 0.767
Step: 677888 | LR: 0.000081 | Reward: 0.08 | KL: 0.00819 | ClipFrac: 0.109 | ExplVar: 0.810
Step: 679936 | LR: 0.000081 | Reward: 0.13 | KL: 0.00671 | ClipFrac: 0.103 | ExplVar: 0.729
Step: 681984 | LR: 0.000080 | Reward: 0.09 | KL: 0.01701 | ClipFrac: 0.140 | ExplVar: 0.838




[Eval @ 681984] Return: 273.54
Saved video: videos/ppo_bipedal_eval_681984.mp4
Step: 684032 | LR: 0.000080 | Reward: 0.13 | KL: 0.00600 | ClipFrac: 0.113 | ExplVar: 0.674
Step: 686080 | LR: 0.000079 | Reward: 0.02 | KL: 0.00727 | ClipFrac: 0.106 | ExplVar: 0.835
Step: 688128 | LR: 0.000078 | Reward: 0.02 | KL: 0.00491 | ClipFrac: 0.107 | ExplVar: 0.873
Step: 690176 | LR: 0.000078 | Reward: -0.10 | KL: 0.00888 | ClipFrac: 0.123 | ExplVar: 0.880
Step: 692224 | LR: 0.000077 | Reward: 0.08 | KL: 0.00752 | ClipFrac: 0.115 | ExplVar: 0.846
Step: 694272 | LR: 0.000077 | Reward: 0.01 | KL: 0.00619 | ClipFrac: 0.082 | ExplVar: 0.891
Step: 696320 | LR: 0.000076 | Reward: 0.08 | KL: 0.00492 | ClipFrac: 0.070 | ExplVar: 0.846
Step: 698368 | LR: 0.000076 | Reward: 0.13 | KL: 0.00324 | ClipFrac: 0.106 | ExplVar: 0.803
Step: 700416 | LR: 0.000075 | Reward: -0.10 | KL: 0.00660 | ClipFrac: 0.081 | ExplVar: 0.896




[Eval @ 700416] Return: -54.51
Saved video: videos/ppo_bipedal_eval_700416.mp4
Step: 702464 | LR: 0.000075 | Reward: 0.14 | KL: 0.00668 | ClipFrac: 0.151 | ExplVar: 0.796
Step: 704512 | LR: 0.000074 | Reward: 0.06 | KL: 0.00388 | ClipFrac: 0.075 | ExplVar: 0.665
Step: 706560 | LR: 0.000074 | Reward: 0.08 | KL: 0.00569 | ClipFrac: 0.093 | ExplVar: 0.857
Step: 708608 | LR: 0.000073 | Reward: 0.10 | KL: 0.01076 | ClipFrac: 0.090 | ExplVar: 0.822
Step: 710656 | LR: 0.000073 | Reward: 0.01 | KL: 0.00439 | ClipFrac: 0.082 | ExplVar: 0.866
Step: 712704 | LR: 0.000072 | Reward: 0.08 | KL: 0.00306 | ClipFrac: 0.074 | ExplVar: 0.864
Step: 714752 | LR: 0.000072 | Reward: 0.08 | KL: 0.01127 | ClipFrac: 0.108 | ExplVar: 0.834
Step: 716800 | LR: 0.000071 | Reward: 0.14 | KL: 0.00768 | ClipFrac: 0.107 | ExplVar: 0.862
Step: 718848 | LR: 0.000071 | Reward: 0.14 | KL: 0.00875 | ClipFrac: 0.113 | ExplVar: 0.754
Step: 720896 | LR: 0.000070 | Reward: 0.20 | KL: 0.00683 | ClipFrac: 0.123 | ExplVar: 0.660




[Eval @ 720896] Return: 275.30
Saved video: videos/ppo_bipedal_eval_720896.mp4
Step: 722944 | LR: 0.000070 | Reward: 0.09 | KL: 0.00583 | ClipFrac: 0.112 | ExplVar: 0.860
Step: 724992 | LR: 0.000069 | Reward: 0.15 | KL: 0.00622 | ClipFrac: 0.095 | ExplVar: 0.765
Step: 727040 | LR: 0.000069 | Reward: 0.09 | KL: 0.00827 | ClipFrac: 0.074 | ExplVar: 0.847
Step: 729088 | LR: 0.000068 | Reward: 0.21 | KL: 0.00580 | ClipFrac: 0.095 | ExplVar: 0.656
Step: 731136 | LR: 0.000068 | Reward: 0.21 | KL: 0.00620 | ClipFrac: 0.117 | ExplVar: 0.642
Step: 733184 | LR: 0.000067 | Reward: 0.08 | KL: 0.00699 | ClipFrac: 0.070 | ExplVar: 0.830
Step: 735232 | LR: 0.000067 | Reward: 0.21 | KL: 0.00570 | ClipFrac: 0.111 | ExplVar: 0.730
Step: 737280 | LR: 0.000066 | Reward: 0.21 | KL: 0.00313 | ClipFrac: 0.119 | ExplVar: 0.451
Step: 739328 | LR: 0.000066 | Reward: 0.16 | KL: 0.00761 | ClipFrac: 0.101 | ExplVar: 0.828
Step: 741376 | LR: 0.000065 | Reward: 0.15 | KL: 0.00913 | ClipFrac: 0.106 | ExplVar: 0.823




[Eval @ 741376] Return: 125.17
Saved video: videos/ppo_bipedal_eval_741376.mp4
Step: 743424 | LR: 0.000065 | Reward: 0.05 | KL: 0.00811 | ClipFrac: 0.080 | ExplVar: 0.865
Step: 745472 | LR: 0.000064 | Reward: 0.22 | KL: 0.00390 | ClipFrac: 0.097 | ExplVar: 0.553
Step: 747520 | LR: 0.000064 | Reward: 0.09 | KL: 0.00393 | ClipFrac: 0.097 | ExplVar: 0.893
Step: 749568 | LR: 0.000063 | Reward: 0.10 | KL: 0.00301 | ClipFrac: 0.116 | ExplVar: 0.815
Step: 751616 | LR: 0.000063 | Reward: 0.04 | KL: 0.00525 | ClipFrac: 0.085 | ExplVar: 0.864
Step: 753664 | LR: 0.000062 | Reward: 0.17 | KL: 0.00407 | ClipFrac: 0.112 | ExplVar: 0.803
Step: 755712 | LR: 0.000062 | Reward: 0.21 | KL: 0.00490 | ClipFrac: 0.110 | ExplVar: 0.690
Step: 757760 | LR: 0.000061 | Reward: 0.03 | KL: 0.00824 | ClipFrac: 0.079 | ExplVar: 0.884
Step: 759808 | LR: 0.000061 | Reward: 0.21 | KL: 0.01092 | ClipFrac: 0.104 | ExplVar: 0.656
Step: 761856 | LR: 0.000060 | Reward: 0.13 | KL: 0.00696 | ClipFrac: 0.069 | ExplVar: 0.387




[Eval @ 761856] Return: 35.49
Saved video: videos/ppo_bipedal_eval_761856.mp4
Step: 763904 | LR: 0.000060 | Reward: 0.16 | KL: 0.00741 | ClipFrac: 0.090 | ExplVar: 0.845
Step: 765952 | LR: 0.000059 | Reward: 0.16 | KL: 0.00666 | ClipFrac: 0.102 | ExplVar: 0.793
Step: 768000 | LR: 0.000059 | Reward: 0.06 | KL: 0.00551 | ClipFrac: 0.071 | ExplVar: 0.872
Step: 770048 | LR: 0.000058 | Reward: 0.16 | KL: 0.00587 | ClipFrac: 0.098 | ExplVar: 0.776
Step: 772096 | LR: 0.000057 | Reward: 0.10 | KL: 0.00400 | ClipFrac: 0.124 | ExplVar: 0.828
Step: 774144 | LR: 0.000057 | Reward: 0.22 | KL: 0.01129 | ClipFrac: 0.154 | ExplVar: 0.560
Step: 776192 | LR: 0.000056 | Reward: 0.22 | KL: 0.00333 | ClipFrac: 0.098 | ExplVar: 0.730
Step: 778240 | LR: 0.000056 | Reward: 0.05 | KL: 0.00766 | ClipFrac: 0.073 | ExplVar: 0.881
Step: 780288 | LR: 0.000055 | Reward: 0.10 | KL: 0.00550 | ClipFrac: 0.087 | ExplVar: 0.832




[Eval @ 780288] Return: 272.07
Saved video: videos/ppo_bipedal_eval_780288.mp4
Step: 782336 | LR: 0.000055 | Reward: 0.12 | KL: 0.00652 | ClipFrac: 0.110 | ExplVar: 0.892
Step: 784384 | LR: 0.000054 | Reward: 0.23 | KL: 0.00750 | ClipFrac: 0.116 | ExplVar: 0.566
Step: 786432 | LR: 0.000054 | Reward: 0.16 | KL: 0.00524 | ClipFrac: 0.071 | ExplVar: 0.870
Step: 788480 | LR: 0.000053 | Reward: 0.12 | KL: 0.00406 | ClipFrac: 0.071 | ExplVar: 0.886
Step: 790528 | LR: 0.000053 | Reward: 0.23 | KL: 0.00538 | ClipFrac: 0.100 | ExplVar: 0.682
Step: 792576 | LR: 0.000052 | Reward: 0.11 | KL: 0.00720 | ClipFrac: 0.079 | ExplVar: 0.896
Step: 794624 | LR: 0.000052 | Reward: 0.18 | KL: 0.00673 | ClipFrac: 0.062 | ExplVar: 0.883
Step: 796672 | LR: 0.000051 | Reward: 0.25 | KL: 0.00811 | ClipFrac: 0.126 | ExplVar: 0.581
Step: 798720 | LR: 0.000051 | Reward: 0.07 | KL: 0.00640 | ClipFrac: 0.081 | ExplVar: 0.872
Step: 800768 | LR: 0.000050 | Reward: 0.18 | KL: 0.00552 | ClipFrac: 0.112 | ExplVar: 0.783




[Eval @ 800768] Return: 272.68
Saved video: videos/ppo_bipedal_eval_800768.mp4
Step: 802816 | LR: 0.000050 | Reward: 0.24 | KL: 0.00185 | ClipFrac: 0.079 | ExplVar: 0.697
Step: 804864 | LR: 0.000049 | Reward: 0.08 | KL: 0.00949 | ClipFrac: 0.078 | ExplVar: 0.699
Step: 806912 | LR: 0.000049 | Reward: 0.23 | KL: 0.00727 | ClipFrac: 0.078 | ExplVar: 0.763
Step: 808960 | LR: 0.000048 | Reward: 0.19 | KL: 0.00936 | ClipFrac: 0.084 | ExplVar: 0.765
Step: 811008 | LR: 0.000048 | Reward: 0.25 | KL: 0.00611 | ClipFrac: 0.108 | ExplVar: 0.570
Step: 813056 | LR: 0.000047 | Reward: 0.25 | KL: 0.00813 | ClipFrac: 0.085 | ExplVar: 0.577
Step: 815104 | LR: 0.000047 | Reward: 0.12 | KL: 0.00846 | ClipFrac: 0.089 | ExplVar: 0.843
Step: 817152 | LR: 0.000046 | Reward: 0.18 | KL: 0.00589 | ClipFrac: 0.078 | ExplVar: 0.853
Step: 819200 | LR: 0.000046 | Reward: 0.18 | KL: 0.00626 | ClipFrac: 0.070 | ExplVar: 0.790
Step: 821248 | LR: 0.000045 | Reward: 0.10 | KL: 0.00418 | ClipFrac: 0.044 | ExplVar: 0.849




[Eval @ 821248] Return: 282.71
Saved video: videos/ppo_bipedal_eval_821248.mp4
Step: 823296 | LR: 0.000045 | Reward: 0.17 | KL: 0.00493 | ClipFrac: 0.075 | ExplVar: 0.795
Step: 825344 | LR: 0.000044 | Reward: 0.13 | KL: 0.00339 | ClipFrac: 0.097 | ExplVar: 0.880
Step: 827392 | LR: 0.000044 | Reward: 0.24 | KL: 0.00782 | ClipFrac: 0.086 | ExplVar: 0.578
Step: 829440 | LR: 0.000043 | Reward: 0.02 | KL: 0.00456 | ClipFrac: 0.106 | ExplVar: 0.885
Step: 831488 | LR: 0.000043 | Reward: 0.18 | KL: 0.00661 | ClipFrac: 0.068 | ExplVar: 0.807
Step: 833536 | LR: 0.000042 | Reward: 0.18 | KL: 0.00705 | ClipFrac: 0.096 | ExplVar: 0.773
Step: 835584 | LR: 0.000042 | Reward: 0.24 | KL: 0.00526 | ClipFrac: 0.086 | ExplVar: 0.709
Step: 837632 | LR: 0.000041 | Reward: 0.24 | KL: 0.00732 | ClipFrac: 0.136 | ExplVar: 0.504
Step: 839680 | LR: 0.000041 | Reward: 0.25 | KL: 0.00725 | ClipFrac: 0.117 | ExplVar: 0.540
Step: 841728 | LR: 0.000040 | Reward: 0.18 | KL: 0.00461 | ClipFrac: 0.061 | ExplVar: 0.819




[Eval @ 841728] Return: 279.48
Saved video: videos/ppo_bipedal_eval_841728.mp4
Step: 843776 | LR: 0.000040 | Reward: 0.23 | KL: 0.00630 | ClipFrac: 0.079 | ExplVar: 0.624
Step: 845824 | LR: 0.000039 | Reward: 0.24 | KL: 0.00896 | ClipFrac: 0.082 | ExplVar: 0.638
Step: 847872 | LR: 0.000039 | Reward: 0.20 | KL: 0.00578 | ClipFrac: 0.052 | ExplVar: 0.826
Step: 849920 | LR: 0.000038 | Reward: 0.24 | KL: 0.00762 | ClipFrac: 0.060 | ExplVar: 0.631
Step: 851968 | LR: 0.000038 | Reward: 0.14 | KL: 0.00466 | ClipFrac: 0.056 | ExplVar: 0.862
Step: 854016 | LR: 0.000037 | Reward: 0.11 | KL: 0.00353 | ClipFrac: 0.057 | ExplVar: 0.831
Step: 856064 | LR: 0.000036 | Reward: 0.14 | KL: 0.00748 | ClipFrac: 0.085 | ExplVar: 0.825
Step: 858112 | LR: 0.000036 | Reward: 0.19 | KL: 0.00307 | ClipFrac: 0.077 | ExplVar: 0.802
Step: 860160 | LR: 0.000035 | Reward: 0.19 | KL: 0.00307 | ClipFrac: 0.058 | ExplVar: 0.750




[Eval @ 860160] Return: 285.40
Saved video: videos/ppo_bipedal_eval_860160.mp4
Step: 862208 | LR: 0.000035 | Reward: 0.20 | KL: 0.00756 | ClipFrac: 0.059 | ExplVar: 0.682
Step: 864256 | LR: 0.000034 | Reward: 0.24 | KL: 0.00679 | ClipFrac: 0.072 | ExplVar: 0.447
Step: 866304 | LR: 0.000034 | Reward: 0.25 | KL: 0.00788 | ClipFrac: 0.076 | ExplVar: 0.595
Step: 868352 | LR: 0.000033 | Reward: 0.25 | KL: 0.00527 | ClipFrac: 0.068 | ExplVar: 0.606
Step: 870400 | LR: 0.000033 | Reward: 0.24 | KL: 0.00701 | ClipFrac: 0.081 | ExplVar: 0.697
Step: 872448 | LR: 0.000032 | Reward: 0.18 | KL: 0.00527 | ClipFrac: 0.060 | ExplVar: 0.794
Step: 874496 | LR: 0.000032 | Reward: 0.19 | KL: 0.00290 | ClipFrac: 0.065 | ExplVar: 0.675
Step: 876544 | LR: 0.000031 | Reward: 0.25 | KL: 0.01084 | ClipFrac: 0.083 | ExplVar: 0.607
Step: 878592 | LR: 0.000031 | Reward: 0.25 | KL: 0.00851 | ClipFrac: 0.059 | ExplVar: 0.733
Step: 880640 | LR: 0.000030 | Reward: 0.25 | KL: 0.00662 | ClipFrac: 0.095 | ExplVar: 0.575




[Eval @ 880640] Return: 278.96
Saved video: videos/ppo_bipedal_eval_880640.mp4
Step: 882688 | LR: 0.000030 | Reward: 0.19 | KL: 0.00473 | ClipFrac: 0.060 | ExplVar: 0.794
Step: 884736 | LR: 0.000029 | Reward: 0.26 | KL: 0.00537 | ClipFrac: 0.110 | ExplVar: 0.545
Step: 886784 | LR: 0.000029 | Reward: 0.13 | KL: 0.00685 | ClipFrac: 0.062 | ExplVar: 0.830
Step: 888832 | LR: 0.000028 | Reward: 0.26 | KL: 0.00320 | ClipFrac: 0.087 | ExplVar: 0.720
Step: 890880 | LR: 0.000028 | Reward: 0.19 | KL: 0.00734 | ClipFrac: 0.090 | ExplVar: 0.792
Step: 892928 | LR: 0.000027 | Reward: 0.20 | KL: 0.00556 | ClipFrac: 0.046 | ExplVar: 0.825
Step: 894976 | LR: 0.000027 | Reward: 0.21 | KL: 0.00269 | ClipFrac: 0.042 | ExplVar: 0.747
Step: 897024 | LR: 0.000026 | Reward: 0.18 | KL: 0.00524 | ClipFrac: 0.065 | ExplVar: 0.836
Step: 899072 | LR: 0.000026 | Reward: 0.25 | KL: 0.00086 | ClipFrac: 0.058 | ExplVar: 0.574
Step: 901120 | LR: 0.000025 | Reward: 0.26 | KL: 0.00487 | ClipFrac: 0.065 | ExplVar: 0.641




[Eval @ 901120] Return: 279.36
Saved video: videos/ppo_bipedal_eval_901120.mp4
Step: 903168 | LR: 0.000025 | Reward: 0.26 | KL: 0.00431 | ClipFrac: 0.073 | ExplVar: 0.610
Step: 905216 | LR: 0.000024 | Reward: 0.25 | KL: 0.00687 | ClipFrac: 0.076 | ExplVar: 0.703
Step: 907264 | LR: 0.000024 | Reward: 0.15 | KL: 0.00526 | ClipFrac: 0.063 | ExplVar: 0.841
Step: 909312 | LR: 0.000023 | Reward: 0.06 | KL: 0.00710 | ClipFrac: 0.059 | ExplVar: 0.707
Step: 911360 | LR: 0.000023 | Reward: 0.27 | KL: 0.00789 | ClipFrac: 0.063 | ExplVar: 0.614
Step: 913408 | LR: 0.000022 | Reward: 0.26 | KL: 0.00344 | ClipFrac: 0.053 | ExplVar: 0.613
Step: 915456 | LR: 0.000022 | Reward: 0.27 | KL: 0.00691 | ClipFrac: 0.061 | ExplVar: 0.626
Step: 917504 | LR: 0.000021 | Reward: 0.26 | KL: 0.00825 | ClipFrac: 0.075 | ExplVar: 0.783
Step: 919552 | LR: 0.000021 | Reward: 0.15 | KL: 0.00726 | ClipFrac: 0.073 | ExplVar: 0.834
Step: 921600 | LR: 0.000020 | Reward: 0.25 | KL: 0.00370 | ClipFrac: 0.023 | ExplVar: 0.552




[Eval @ 921600] Return: -55.34
Saved video: videos/ppo_bipedal_eval_921600.mp4
Step: 923648 | LR: 0.000020 | Reward: 0.25 | KL: 0.00472 | ClipFrac: 0.064 | ExplVar: 0.562
Step: 925696 | LR: 0.000019 | Reward: 0.27 | KL: 0.00660 | ClipFrac: 0.097 | ExplVar: 0.652
Step: 927744 | LR: 0.000019 | Reward: 0.21 | KL: 0.00290 | ClipFrac: 0.038 | ExplVar: 0.858
Step: 929792 | LR: 0.000018 | Reward: 0.21 | KL: 0.00493 | ClipFrac: 0.036 | ExplVar: 0.696
Step: 931840 | LR: 0.000018 | Reward: 0.26 | KL: 0.00584 | ClipFrac: 0.050 | ExplVar: 0.666
Step: 933888 | LR: 0.000017 | Reward: 0.28 | KL: 0.00578 | ClipFrac: 0.075 | ExplVar: 0.579
Step: 935936 | LR: 0.000017 | Reward: 0.20 | KL: 0.00430 | ClipFrac: 0.072 | ExplVar: 0.721
Step: 937984 | LR: 0.000016 | Reward: 0.28 | KL: 0.00917 | ClipFrac: 0.040 | ExplVar: 0.495
Step: 940032 | LR: 0.000016 | Reward: 0.26 | KL: 0.00617 | ClipFrac: 0.085 | ExplVar: 0.585




[Eval @ 940032] Return: 66.89
Saved video: videos/ppo_bipedal_eval_940032.mp4
Step: 942080 | LR: 0.000015 | Reward: 0.22 | KL: -0.00026 | ClipFrac: 0.038 | ExplVar: 0.736
Step: 944128 | LR: 0.000014 | Reward: 0.27 | KL: 0.00402 | ClipFrac: 0.097 | ExplVar: 0.605
Step: 946176 | LR: 0.000014 | Reward: 0.26 | KL: 0.01220 | ClipFrac: 0.104 | ExplVar: 0.557
Step: 948224 | LR: 0.000013 | Reward: 0.21 | KL: 0.00369 | ClipFrac: 0.047 | ExplVar: 0.816
Step: 950272 | LR: 0.000013 | Reward: 0.27 | KL: 0.00101 | ClipFrac: 0.036 | ExplVar: 0.630
Step: 952320 | LR: 0.000012 | Reward: 0.26 | KL: 0.00405 | ClipFrac: 0.036 | ExplVar: 0.597
Step: 954368 | LR: 0.000012 | Reward: 0.29 | KL: 0.00346 | ClipFrac: 0.042 | ExplVar: 0.634
Step: 956416 | LR: 0.000011 | Reward: 0.19 | KL: 0.00250 | ClipFrac: 0.018 | ExplVar: 0.736
Step: 958464 | LR: 0.000011 | Reward: 0.27 | KL: 0.00170 | ClipFrac: 0.022 | ExplVar: 0.606
Step: 960512 | LR: 0.000010 | Reward: 0.28 | KL: 0.00553 | ClipFrac: 0.093 | ExplVar: 0.641




[Eval @ 960512] Return: 24.49
Saved video: videos/ppo_bipedal_eval_960512.mp4
Step: 962560 | LR: 0.000010 | Reward: 0.16 | KL: 0.00161 | ClipFrac: 0.028 | ExplVar: 0.855
Step: 964608 | LR: 0.000009 | Reward: 0.27 | KL: 0.00635 | ClipFrac: 0.021 | ExplVar: 0.568
Step: 966656 | LR: 0.000009 | Reward: 0.28 | KL: 0.00521 | ClipFrac: 0.055 | ExplVar: 0.605
Step: 968704 | LR: 0.000008 | Reward: 0.27 | KL: 0.00106 | ClipFrac: 0.048 | ExplVar: 0.579
Step: 970752 | LR: 0.000008 | Reward: 0.22 | KL: 0.00270 | ClipFrac: 0.023 | ExplVar: 0.718
Step: 972800 | LR: 0.000007 | Reward: 0.12 | KL: 0.00166 | ClipFrac: 0.012 | ExplVar: 0.845
Step: 974848 | LR: 0.000007 | Reward: 0.26 | KL: 0.00258 | ClipFrac: 0.049 | ExplVar: 0.483
Step: 976896 | LR: 0.000006 | Reward: 0.10 | KL: 0.00710 | ClipFrac: 0.030 | ExplVar: 0.868
Step: 978944 | LR: 0.000006 | Reward: 0.28 | KL: 0.00242 | ClipFrac: 0.013 | ExplVar: 0.500
Step: 980992 | LR: 0.000005 | Reward: 0.14 | KL: 0.00117 | ClipFrac: 0.001 | ExplVar: 0.858




[Eval @ 980992] Return: 83.95
Saved video: videos/ppo_bipedal_eval_980992.mp4
Step: 983040 | LR: 0.000005 | Reward: 0.21 | KL: 0.00443 | ClipFrac: 0.010 | ExplVar: 0.412
Step: 985088 | LR: 0.000004 | Reward: 0.15 | KL: 0.00181 | ClipFrac: 0.002 | ExplVar: 0.873
Step: 987136 | LR: 0.000004 | Reward: 0.21 | KL: 0.00129 | ClipFrac: 0.015 | ExplVar: 0.767
Step: 989184 | LR: 0.000003 | Reward: 0.21 | KL: -0.00006 | ClipFrac: 0.011 | ExplVar: 0.761
Step: 991232 | LR: 0.000003 | Reward: 0.27 | KL: 0.00080 | ClipFrac: 0.000 | ExplVar: 0.641
Step: 993280 | LR: 0.000002 | Reward: 0.20 | KL: 0.00128 | ClipFrac: 0.001 | ExplVar: 0.723
Step: 995328 | LR: 0.000002 | Reward: 0.27 | KL: -0.00012 | ClipFrac: 0.000 | ExplVar: 0.604
Step: 997376 | LR: 0.000001 | Reward: 0.27 | KL: -0.00014 | ClipFrac: 0.000 | ExplVar: 0.512
Step: 999424 | LR: 0.000001 | Reward: 0.15 | KL: -0.00008 | ClipFrac: 0.000 | ExplVar: 0.854
Step: 1000000 | LR: 0.000000 | Reward: 0.27 | KL: -0.00001 | ClipFrac: 0.000 | ExplVar: 0.



[Eval @ 1000000] Return: 4.07
Saved video: videos/ppo_bipedal_eval_1000000.mp4


In [None]:
!zip -r videos.zip /content/videos

  adding: content/videos/ (stored 0%)
  adding: content/videos/ppo_bipedal_eval_120832.mp4 (deflated 3%)
  adding: content/videos/ppo_bipedal_eval_921600.mp4 (deflated 3%)
  adding: content/videos/ppo_bipedal_eval_520192.mp4 (deflated 3%)
  adding: content/videos/ppo_bipedal_eval_901120.mp4 (deflated 3%)
  adding: content/videos/ppo_bipedal_eval_821248.mp4 (deflated 3%)
  adding: content/videos/ppo_bipedal_eval_481280.mp4 (deflated 4%)
  adding: content/videos/ppo_bipedal_eval_241664.mp4 (deflated 4%)
  adding: content/videos/ppo_bipedal_eval_40960.mp4 (deflated 38%)
  adding: content/videos/ppo_bipedal_eval_161792.mp4 (deflated 5%)
  adding: content/videos/ppo_bipedal_eval_221184.mp4 (deflated 5%)
  adding: content/videos/ppo_bipedal_eval_1000000.mp4 (deflated 3%)
  adding: content/videos/ppo_bipedal_eval_860160.mp4 (deflated 3%)
  adding: content/videos/ppo_bipedal_eval_841728.mp4 (deflated 3%)
  adding: content/videos/ppo_bipedal_eval_380928.mp4 (deflated 3%)
  adding: content/video

In [None]:
# Hand the zipped evaluation videos to google.colab's `files` helper for
# download. Expects the archive at /content/videos.zip, i.e. the zip cell
# above must have been run first.
files.download('/content/videos.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>