In [None]:
import argparse
import os
import random
import time
from distutils.util import strtobool

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.normal import Normal
from torch.distributions
from torch.utils.tensorboard import SummaryWriter

In [2]:
PERIOD = 7

# Env
import gym, json
from gym import spaces
from epipolicy.core.epidemic import construct_epidemic
from epipolicy.obj.act import construct_act

class EpiEnv(gym.Env):
    """Custom Environment that follows gym interface"""
    metadata = {'render.modes': ['human']}

    def __init__(self, session):
        super(EpiEnv, self).__init__()
        self.epi = construct_epidemic(session)
        total_population = np.sum(self.epi.static.default_state.obs.current_comp)
        obs_count = self.epi.static.compartment_count * self.epi.static.locale_count * self.epi.static.group_count
        action_count = 0
        action_param_count =  0
        for itv in self.epi.static.interventions:
            if not itv.is_cost:
                action_count += 1
                action_param_count += len(itv.cp_list)
        self.act_domain = np.zeros((action_param_count, 2), dtype=np.float64)
        index = 0
        for itv in self.epi.static.interventions:
            if not itv.is_cost:
                for cp in itv.cp_list:
                    self.act_domain[index, 0] = cp.min_value
                    self.act_domain[index, 1] = cp.max_value
                    index += 1
        # Define action and observation space
        # They must be gym.spaces objects
        # Example when using discrete actions:
        self.action_space = spaces.Box(low=0, high=1, shape=(action_count,), dtype=np.float64)
        # Example for using image as input:
        self.observation_space = spaces.Box(low=0, high=total_population, shape=(obs_count,), dtype=np.float64)

    def step(self, action):
        expanded_action = np.zeros(len(self.act_domain), dtype=np.float64)
        index = 0
        for i in range(len(self.act_domain)):
            if self.act_domain[i, 0] == self.act_domain[i, 1]:
                expanded_action[i] = self.act_domain[i, 0]
            else:
                expanded_action[i] = action[index]
                index += 1

        epi_action = []
        index = 0
        for itv_id, itv in enumerate(self.epi.static.interventions):
            if not itv.is_cost:
                epi_action.append(construct_act(itv_id, expanded_action[index:index+len(itv.cp_list)]))
                index += len(itv.cp_list)

        total_r = 0
        for i in range(PERIOD):
            state, r, done = self.epi.step(epi_action)
            total_r += r
            if done:
                break
        return state.obs.current_comp.flatten(), total_r, done, dict()

    def reset(self):
        state = self.epi.reset()
        return state.obs.current_comp.flatten()  # reward, done, info can't be included
    def render(self, mode='human'):
        pass
    def close(self):
        pass

In [3]:
def parse_args(main_args = None):
    # fmt: off
    parser = argparse.ArgumentParser()
    parser.add_argument("--exp-name", type=str, default="PPO",
        help="the name of this experiment")
    parser.add_argument("--gym-id", type=str, default="HalfCheetahBulletEnv-v0",
        help="the id of the gym environment")
    parser.add_argument("--learning-rate", type=float, default=3e-4,
        help="the learning rate of the optimizer")
    parser.add_argument("--seed", type=int, default=1,
        help="seed of the experiment")
    parser.add_argument("--total-timesteps", type=int, default=700000,
        help="total timesteps of the experiments")
    parser.add_argument("--torch-deterministic", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="if toggled, `torch.backends.cudnn.deterministic=False`")
    parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--track", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
        help="if toggled, this experiment will be tracked with Weights and Biases")
    parser.add_argument("--wandb-project-name", type=str, default="ppo-implementation-details",
        help="the wandb's project name")
    parser.add_argument("--wandb-entity", type=str, default=None,
        help="the entity (team) of wandb's project")
    parser.add_argument("--capture-video", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
        help="weather to capture videos of the agent performances (check out `videos` folder)")

    parser.add_argument("--policy_plot_interval", type=int, default=1,
        help="seed of the experiment")

    # Algorithm specific arguments
    parser.add_argument("--num-envs", type=int, default=1,
        help="the number of parallel game environments")
    parser.add_argument("--num-steps", type=int, default=2048,
        help="the number of steps to run in each environment per policy rollout")
    parser.add_argument("--anneal-lr", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="Toggle learning rate annealing for policy and value networks")
    parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="Use GAE for advantage computation")
    parser.add_argument("--gamma", type=float, default=0.99,
        help="the discount factor gamma")
    parser.add_argument("--gae-lambda", type=float, default=0.95,
        help="the lambda for the general advantage estimation")
    parser.add_argument("--num-minibatches", type=int, default=32,
        help="the number of mini-batches")
    parser.add_argument("--update-epochs", type=int, default=10,
        help="the K epochs to update the policy")
    parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="Toggles advantages normalization")
    parser.add_argument("--clip-coef", type=float, default=0.2,
        help="the surrogate clipping coefficient")
    parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
    parser.add_argument("--ent-coef", type=float, default=0.0,
        help="coefficient of the entropy")
    parser.add_argument("--vf-coef", type=float, default=0.5,
        help="coefficient of the value function")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
        help="the maximum norm for the gradient clipping")
    parser.add_argument("--target-kl", type=float, default=None,
        help="the target KL divergence threshold")
    if main_args is not None:
        args = parser.parse_args(main_args.split())
    else:
        args = parser.parse_args()
    args.num_steps //= PERIOD
    args.total_timesteps //= PERIOD
    args.batch_size = int(args.num_envs * args.num_steps)
    args.minibatch_size = int(args.batch_size // args.num_minibatches)
    # fmt: on
    return args

In [4]:
epi_ids = ["SIR_A"]#, "SIR_B", "SIRV_A", "SIRV_B", "COVID_A", "COVID_B", "COVID_C"]

def make_env(gym_id, seed, idx, capture_video, run_name):
    def thunk():
        if gym_id in epi_ids:
            fp = open('jsons/{}.json'.format(gym_id), 'r')
            session = json.load(fp)
            env = EpiEnv(session)
        else:
            env = gym.make(gym_id)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        if capture_video:
            if idx == 0:
                env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
        env = gym.wrappers.ClipAction(env)
        env = gym.wrappers.NormalizeObservation(env)
        env = gym.wrappers.TransformObservation(env, lambda obs: np.clip(obs, -10, 10))
        env = gym.wrappers.NormalizeReward(env)
        env = gym.wrappers.TransformReward(env, lambda reward: np.clip(reward, -10, 10))
        # Our env is deterministic
        # env.seed(seed)
        env.action_space.seed(seed)
        env.observation_space.seed(seed)
        return env

    return thunk

def make_primal_env(gym_id):
    def thunk():
        if gym_id in epi_ids:
            fp = open('jsons/{}.json'.format(gym_id), 'r')
            session = json.load(fp)
            env = EpiEnv(session)
        else:
            env = gym.make(gym_id)
        return env
    return thunk

In [None]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer


class Agent(nn.Module):
    def __init__(self, envs):
        super(Agent, self).__init__()
        self.actor_mean_sigma = nn.Sequential(
            layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 2*np.prod(envs.single_action_space.shape)), std=0.01),
        )
        self.m = MultivariateNormal(
            torch.zeros(np.prod(envs.single_action_space.shape)), 
            torch.eye(np.prod(envs.single_action_space.shape))
        )

    def get_action(self, x):
        action_mean = self.actor_mean_sigma(x)[:np.prod(envs.single_action_space.shape)]
        action_sigma = self.actor_mean_sigma(x)[np.prod(envs.single_action_space.shape):]
        epsilon = m.sample()
        
        action = action_mean + action_sigma * epsilon 
        # TODO: ensure action is within boundaries
        return action, None, None, None


In [None]:
from tqdm import tqdm
args = parse_args("--gym-id SIR_A --seed 1")
run_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
print("Running", run_name)
writer = SummaryWriter(f"runs/{run_name}")
writer.add_text(
    "hyperparameters",
    "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
)

# TRY NOT TO MODIFY: seeding
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic

device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

# env setup
envs = gym.vector.SyncVectorEnv(
    [make_env(args.gym_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)]
)
# envs = gym.vector.AsyncVectorEnv(
#     [make_env(args.gym_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)]
# )
test_env = make_primal_env(args.gym_id)()
# test_env = make_env(args.gym_id, args.seed, 0, args.capture_video, run_name)()
assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"

agent = Agent(envs).to(device)
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)

# ALGO Logic: Storage setup
obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device)
rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
values = torch.zeros((args.num_steps, args.num_envs)).to(device)

# TRY NOT TO MODIFY: start the game
global_step = 0
start_time = time.time()
next_obs = torch.Tensor(envs.reset()).to(device)
next_done = torch.zeros(args.num_envs).to(device)
num_updates = args.total_timesteps // args.batch_size
ROLLOUT_HORIZON = 35

csv_file = open('runs/{}/records.csv'.format(run_name), 'w')

for update in tqdm(range(1, num_updates + 1)):
    # Annealing the rate if instructed to do so.
    if args.anneal_lr:
        frac = 1.0 - (update - 1.0) / num_updates
        lrnow = frac * args.learning_rate
        optimizer.param_groups[0]["lr"] = lrnow

    for step in range(0, args.num_steps):
        global_step += 1 * args.num_envs
        obs[step] = next_obs
        dones[step] = next_done

        # ALGO LOGIC: action logic
        with torch.no_grad():
            action, _, _, _ = agent.get_action_and_value(next_obs)
        actions[step] = action

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, reward, done, info = envs.step(action.cpu().numpy())
        rewards[step] = torch.tensor(reward).to(device).view(-1)
        next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)

        for item in info:
            if "episode" in item.keys():
                print(f"global_step={global_step}, episodic_return={item['episode']['r']}")
                writer.add_scalar("charts/episodic_return", item["episode"]["r"], global_step)
                writer.add_scalar("charts/episodic_length", item["episode"]["l"], global_step)
                break

    # bootstrap value if not done
    with torch.no_grad():
        returns = torch.zeros_like(rewards).to(device)
        for t in reversed(range(args.num_steps)):
            if t == args.num_steps - 1:
                nextnonterminal = 1.0 - next_done
            else:
                nextnonterminal = 1.0 - dones[t + 1]
            next_return = returns[t + 1]
            returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return

    # flatten the batch
    b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
    b_logprobs = logprobs.reshape(-1)
    b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
    b_returns = returns.reshape(-1)

    # Optimizing the policy and value network
    b_inds = np.arange(args.batch_size)
    clipfracs = []
    for epoch in tqdm(range(max(1, args.update_epochs))): # eventually remove epoch as we look at trajectories only once
        np.random.shuffle(b_inds)
        for start in range(0, args.batch_size, args.minibatch_size):
            # TODO: do we actually need mini-batches as we look at all trajectories at once?
            end = start + args.minibatch_size
            mb_inds = b_inds[start:end]

            mb_returns = b_returns[mb_inds]
            pg_loss = -mb_returns
            
            loss = pg_loss 

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
            optimizer.step()


    y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
    var_y = np.var(y_true)
    explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

    # TRY NOT TO MODIFY: record rewards for plotting purposes
    writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
    writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
    writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
    writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
    writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
    writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
    writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
    writer.add_scalar("losses/explained_variance", explained_var, global_step)
    print("SPS:", int(global_step / (time.time() - start_time)))
    writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

    # PLOT POLICY
    if args.gym_id in epi_ids and (update - 1) % args.policy_plot_interval == 0:
        test_obs = torch.Tensor(test_env.reset())
        env_obs = torch.Tensor(envs.reset()).to(device)
        timestep = 0
        total_r = 0
        done = False
        itv_line = []
        while not done:
            with torch.no_grad():
                action_mean = agent.get_action_mean(env_obs)
                test_action_mean = torch.mean(action_mean, 0)
                test_action_mean = torch.clamp(test_action_mean, 0, 1)
             
            test_obs, r, done, _ = test_env.step(test_action_mean.cpu().numpy())
            test_obs = torch.Tensor(test_obs)
            itv_index = 0
            itv_array = []
            for itv in test_env.epi.static.interventions:
                if not itv.is_cost:
                    v = float(test_action_mean[itv_index])
                    writer.add_scalar('charts/policy_{}/{}'.format(global_step, itv.name), v, timestep)
                    itv_array.append(v)
                    itv_index += 1
            itv_line.append(itv_array)
            
            env_obs, _, _, _ = envs.step(action_mean.cpu().numpy())
            env_obs = torch.Tensor(env_obs).to(device)

            total_r += r
            timestep += PERIOD
            
        line = '|'.join([str(global_step), str(total_r), str(itv_line)]) + '\n'
        csv_file.write(line)
        writer.add_scalar('charts/learning_curve', total_r, global_step)
        print("At global step {}, total_rewards={}".format(global_step, total_r))

csv_file.close()
envs.close()
writer.close()

Running SIR_A__PPO__1__1734057822


  0%|          | 0/342 [00:00<?, ?it/s]

global_step=53, episodic_return=-133795024.0
global_step=106, episodic_return=-249479552.0
global_step=159, episodic_return=-330727008.0
global_step=212, episodic_return=-109690464.0
global_step=265, episodic_return=-127306376.0


100%|██████████| 10/10 [00:02<00:00,  4.22it/s]


SPS: 16


  0%|          | 1/342 [00:23<2:12:57, 23.40s/it]

At global step 292, total_rewards=-2099613583.1634524
global_step=345, episodic_return=-146012256.0
global_step=398, episodic_return=-339601056.0
global_step=451, episodic_return=-665188416.0
global_step=504, episodic_return=-726868480.0
global_step=557, episodic_return=-147837248.0


100%|██████████| 10/10 [00:02<00:00,  4.16it/s]


SPS: 14


  1%|          | 2/342 [00:46<2:11:52, 23.27s/it]

At global step 584, total_rewards=-2040265843.0619414
global_step=637, episodic_return=-307461856.0
global_step=690, episodic_return=-99795864.0
global_step=743, episodic_return=-158665840.0
global_step=796, episodic_return=-114541504.0
global_step=849, episodic_return=-901531840.0


100%|██████████| 10/10 [00:02<00:00,  4.30it/s]


SPS: 13


  1%|          | 3/342 [01:09<2:09:33, 22.93s/it]

At global step 876, total_rewards=-1734867601.4887385
global_step=929, episodic_return=-445313216.0
global_step=982, episodic_return=-83600248.0
global_step=1035, episodic_return=-204367056.0
global_step=1088, episodic_return=-86524936.0
global_step=1141, episodic_return=-98369328.0


100%|██████████| 10/10 [00:02<00:00,  4.29it/s]


SPS: 13


  1%|          | 4/342 [01:31<2:08:22, 22.79s/it]

At global step 1168, total_rewards=-74881429.28981504
global_step=1221, episodic_return=-178455440.0
global_step=1274, episodic_return=-87397936.0
global_step=1327, episodic_return=-171950560.0
global_step=1380, episodic_return=-104134096.0
global_step=1433, episodic_return=-83350528.0


100%|██████████| 10/10 [00:02<00:00,  4.57it/s]


SPS: 13


  1%|▏         | 5/342 [01:54<2:07:55, 22.78s/it]

At global step 1460, total_rewards=-77379707.01438811
global_step=1513, episodic_return=-84824904.0
global_step=1566, episodic_return=-88166232.0
global_step=1619, episodic_return=-94956096.0
global_step=1672, episodic_return=-102244672.0
global_step=1725, episodic_return=-91697304.0


100%|██████████| 10/10 [00:02<00:00,  4.45it/s]


SPS: 13


  2%|▏         | 6/342 [02:18<2:10:36, 23.32s/it]

At global step 1752, total_rewards=-79578263.65854432
global_step=1805, episodic_return=-116219592.0
global_step=1858, episodic_return=-95018544.0
global_step=1911, episodic_return=-87980648.0
global_step=1964, episodic_return=-88118280.0
global_step=2017, episodic_return=-84181952.0


100%|██████████| 10/10 [00:02<00:00,  4.01it/s]


SPS: 13


  2%|▏         | 7/342 [02:42<2:11:46, 23.60s/it]

At global step 2044, total_rewards=-76738271.50424446
global_step=2097, episodic_return=-84659872.0
global_step=2150, episodic_return=-91678368.0
global_step=2203, episodic_return=-95625480.0
global_step=2256, episodic_return=-90566856.0
global_step=2309, episodic_return=-89653352.0


100%|██████████| 10/10 [00:02<00:00,  4.22it/s]


SPS: 12


  2%|▏         | 8/342 [03:06<2:11:41, 23.66s/it]

At global step 2336, total_rewards=-75981280.01219437
global_step=2389, episodic_return=-81829624.0
global_step=2442, episodic_return=-77562032.0
global_step=2495, episodic_return=-88878536.0
global_step=2548, episodic_return=-79239176.0
global_step=2601, episodic_return=-84469976.0


100%|██████████| 10/10 [00:02<00:00,  4.35it/s]


SPS: 12


  3%|▎         | 9/342 [03:29<2:10:07, 23.45s/it]

At global step 2628, total_rewards=-70506108.15418589
global_step=2681, episodic_return=-77444712.0
global_step=2734, episodic_return=-87007728.0
global_step=2787, episodic_return=-86875888.0
global_step=2840, episodic_return=-77934544.0
global_step=2893, episodic_return=-74851952.0


100%|██████████| 10/10 [00:02<00:00,  3.97it/s]


SPS: 12


  3%|▎         | 10/342 [03:52<2:09:16, 23.36s/it]

At global step 2920, total_rewards=-67381805.80293567
global_step=2973, episodic_return=-78005424.0
global_step=3026, episodic_return=-79989776.0
global_step=3079, episodic_return=-81408256.0
global_step=3132, episodic_return=-72861464.0
global_step=3185, episodic_return=-74645584.0


100%|██████████| 10/10 [00:02<00:00,  4.32it/s]


SPS: 12


  3%|▎         | 11/342 [04:15<2:07:19, 23.08s/it]

At global step 3212, total_rewards=-64335856.063965864
global_step=3265, episodic_return=-67815864.0
global_step=3318, episodic_return=-72752288.0
global_step=3371, episodic_return=-71911888.0
global_step=3424, episodic_return=-71556880.0
global_step=3477, episodic_return=-81386176.0


100%|██████████| 10/10 [00:02<00:00,  4.53it/s]


SPS: 12


  4%|▎         | 12/342 [04:37<2:05:49, 22.88s/it]

At global step 3504, total_rewards=-62402387.85231479
global_step=3557, episodic_return=-76875008.0
global_step=3610, episodic_return=-72770112.0
global_step=3663, episodic_return=-68624648.0
global_step=3716, episodic_return=-70534200.0
global_step=3769, episodic_return=-70005352.0


100%|██████████| 10/10 [00:02<00:00,  4.50it/s]


SPS: 12


  4%|▍         | 13/342 [05:01<2:06:34, 23.08s/it]

At global step 3796, total_rewards=-60686762.79950226
global_step=3849, episodic_return=-72333864.0
global_step=3902, episodic_return=-67380064.0
global_step=3955, episodic_return=-67931592.0
global_step=4008, episodic_return=-84924960.0
global_step=4061, episodic_return=-71303152.0


100%|██████████| 10/10 [00:02<00:00,  4.58it/s]


SPS: 12


  4%|▍         | 14/342 [05:23<2:04:59, 22.87s/it]

At global step 4088, total_rewards=-62711694.538359955
global_step=4141, episodic_return=-73808248.0
global_step=4194, episodic_return=-76212280.0
global_step=4247, episodic_return=-70205912.0
global_step=4300, episodic_return=-67460720.0
global_step=4353, episodic_return=-70003136.0


100%|██████████| 10/10 [00:02<00:00,  4.39it/s]


SPS: 12


  4%|▍         | 15/342 [05:46<2:04:01, 22.76s/it]

At global step 4380, total_rewards=-58569773.920522265
global_step=4433, episodic_return=-72629384.0
global_step=4486, episodic_return=-64818832.0
global_step=4539, episodic_return=-70458664.0
global_step=4592, episodic_return=-64300480.0
global_step=4645, episodic_return=-72019528.0


100%|██████████| 10/10 [00:02<00:00,  4.28it/s]


SPS: 12


  5%|▍         | 16/342 [06:09<2:04:08, 22.85s/it]

At global step 4672, total_rewards=-59394951.08730896
global_step=4725, episodic_return=-68378952.0
global_step=4778, episodic_return=-64129992.0
global_step=4831, episodic_return=-64226284.0
global_step=4884, episodic_return=-70788824.0
global_step=4937, episodic_return=-63923412.0


100%|██████████| 10/10 [00:02<00:00,  4.32it/s]


SPS: 12


  5%|▍         | 17/342 [06:31<2:03:09, 22.74s/it]

At global step 4964, total_rewards=-61656280.554615654
global_step=5017, episodic_return=-68673680.0
global_step=5070, episodic_return=-63926916.0
global_step=5123, episodic_return=-66350820.0
global_step=5176, episodic_return=-66612476.0
global_step=5229, episodic_return=-68279512.0


100%|██████████| 10/10 [00:02<00:00,  4.10it/s]


SPS: 12


  5%|▌         | 18/342 [06:55<2:03:48, 22.93s/it]

At global step 5256, total_rewards=-57133070.249809675
global_step=5309, episodic_return=-74974008.0
global_step=5362, episodic_return=-63797024.0
global_step=5415, episodic_return=-64817332.0
global_step=5468, episodic_return=-61595000.0
global_step=5521, episodic_return=-63766048.0


100%|██████████| 10/10 [00:02<00:00,  4.34it/s]


SPS: 12


  6%|▌         | 19/342 [07:17<2:02:53, 22.83s/it]

At global step 5548, total_rewards=-56398052.46691203
global_step=5601, episodic_return=-62686956.0
global_step=5654, episodic_return=-62832724.0
global_step=5707, episodic_return=-61992280.0
global_step=5760, episodic_return=-58220200.0
global_step=5813, episodic_return=-68500144.0


100%|██████████| 10/10 [00:02<00:00,  3.64it/s]


SPS: 12


  6%|▌         | 20/342 [07:41<2:04:44, 23.24s/it]

At global step 5840, total_rewards=-52527271.392345324
global_step=5893, episodic_return=-57496456.0
global_step=5946, episodic_return=-63416596.0
global_step=5999, episodic_return=-60338556.0
global_step=6052, episodic_return=-63159872.0
global_step=6105, episodic_return=-63926980.0


100%|██████████| 10/10 [00:02<00:00,  3.56it/s]


SPS: 12


  6%|▌         | 21/342 [08:04<2:03:21, 23.06s/it]

At global step 6132, total_rewards=-50772770.67273957
global_step=6185, episodic_return=-65014184.0
global_step=6238, episodic_return=-60727692.0
global_step=6291, episodic_return=-57287460.0
global_step=6344, episodic_return=-63917208.0
global_step=6397, episodic_return=-63224068.0


100%|██████████| 10/10 [00:02<00:00,  3.92it/s]


SPS: 12


  6%|▋         | 22/342 [08:26<2:01:54, 22.86s/it]

At global step 6424, total_rewards=-46126588.3365019
global_step=6477, episodic_return=-58712124.0
global_step=6530, episodic_return=-56496916.0
global_step=6583, episodic_return=-55983256.0
global_step=6636, episodic_return=-60844784.0
global_step=6689, episodic_return=-61305432.0


100%|██████████| 10/10 [00:02<00:00,  4.40it/s]


SPS: 12


  7%|▋         | 23/342 [08:49<2:00:43, 22.71s/it]

At global step 6716, total_rewards=-43624394.94834028
global_step=6769, episodic_return=-57083516.0
global_step=6822, episodic_return=-59604772.0
global_step=6875, episodic_return=-62715324.0
global_step=6928, episodic_return=-55597404.0
global_step=6981, episodic_return=-56456508.0


100%|██████████| 10/10 [00:02<00:00,  4.36it/s]


SPS: 12


  7%|▋         | 24/342 [09:12<2:01:44, 22.97s/it]

At global step 7008, total_rewards=-43891715.69008464
global_step=7061, episodic_return=-55910000.0
global_step=7114, episodic_return=-63495888.0
global_step=7167, episodic_return=-60106960.0
global_step=7220, episodic_return=-54380316.0
global_step=7273, episodic_return=-55668144.0


100%|██████████| 10/10 [00:02<00:00,  4.23it/s]


SPS: 12


  7%|▋         | 25/342 [09:35<2:01:03, 22.91s/it]

At global step 7300, total_rewards=-44049822.720060065
global_step=7353, episodic_return=-55698352.0
global_step=7406, episodic_return=-55994464.0
global_step=7459, episodic_return=-49088484.0
global_step=7512, episodic_return=-53138228.0
global_step=7565, episodic_return=-56414444.0


100%|██████████| 10/10 [00:02<00:00,  4.36it/s]


SPS: 12


  8%|▊         | 26/342 [09:58<2:00:29, 22.88s/it]

At global step 7592, total_rewards=-46584822.970904656
global_step=7645, episodic_return=-57426632.0
global_step=7698, episodic_return=-55533768.0
global_step=7751, episodic_return=-59041956.0
global_step=7804, episodic_return=-56720716.0
global_step=7857, episodic_return=-54987312.0


100%|██████████| 10/10 [00:02<00:00,  3.99it/s]


SPS: 12


  8%|▊         | 27/342 [10:22<2:01:53, 23.22s/it]

At global step 7884, total_rewards=-48816291.19711381
global_step=7937, episodic_return=-55528116.0
global_step=7990, episodic_return=-54061548.0
global_step=8043, episodic_return=-53545924.0
global_step=8096, episodic_return=-53677144.0
global_step=8149, episodic_return=-53739072.0


100%|██████████| 10/10 [00:02<00:00,  4.31it/s]


SPS: 12


  8%|▊         | 28/342 [10:44<1:59:53, 22.91s/it]

At global step 8176, total_rewards=-45585212.819101445
global_step=8229, episodic_return=-54453132.0
global_step=8282, episodic_return=-52782240.0
global_step=8335, episodic_return=-53756364.0
global_step=8388, episodic_return=-52698432.0
global_step=8441, episodic_return=-52214416.0


100%|██████████| 10/10 [00:02<00:00,  4.27it/s]


SPS: 12


  8%|▊         | 29/342 [11:07<1:58:39, 22.75s/it]

At global step 8468, total_rewards=-44471375.82020828
global_step=8521, episodic_return=-57264964.0
global_step=8574, episodic_return=-55219548.0
global_step=8627, episodic_return=-54101740.0
global_step=8680, episodic_return=-52382356.0
global_step=8733, episodic_return=-52053896.0


100%|██████████| 10/10 [00:02<00:00,  4.47it/s]


SPS: 12


  9%|▉         | 30/342 [11:30<1:58:43, 22.83s/it]

At global step 8760, total_rewards=-45806104.54661074
global_step=8813, episodic_return=-57861564.0
global_step=8866, episodic_return=-52430804.0
global_step=8919, episodic_return=-51889528.0
global_step=8972, episodic_return=-51273380.0
global_step=9025, episodic_return=-52959388.0


100%|██████████| 10/10 [00:02<00:00,  4.56it/s]


SPS: 12


  9%|▉         | 31/342 [11:52<1:58:14, 22.81s/it]

At global step 9052, total_rewards=-45439997.79261839
global_step=9105, episodic_return=-53672604.0
global_step=9158, episodic_return=-53851952.0
global_step=9211, episodic_return=-54898312.0
global_step=9264, episodic_return=-52499888.0
global_step=9317, episodic_return=-51427676.0


100%|██████████| 10/10 [00:02<00:00,  3.91it/s]


SPS: 12


  9%|▉         | 32/342 [12:15<1:57:15, 22.70s/it]

At global step 9344, total_rewards=-49086586.62170096
global_step=9397, episodic_return=-54347072.0
global_step=9450, episodic_return=-55274460.0
global_step=9503, episodic_return=-50691476.0
global_step=9556, episodic_return=-49646076.0
global_step=9609, episodic_return=-56541536.0


100%|██████████| 10/10 [00:04<00:00,  2.49it/s]


SPS: 12


 10%|▉         | 33/342 [12:42<2:04:25, 24.16s/it]

At global step 9636, total_rewards=-53855021.737694204
global_step=9689, episodic_return=-54022264.0
global_step=9742, episodic_return=-50207368.0
global_step=9795, episodic_return=-53143744.0
global_step=9848, episodic_return=-54503508.0
global_step=9901, episodic_return=-49656420.0


100%|██████████| 10/10 [00:03<00:00,  2.69it/s]


SPS: 12


 10%|▉         | 34/342 [13:10<2:09:24, 25.21s/it]

At global step 9928, total_rewards=-53137225.77185479
global_step=9981, episodic_return=-52339012.0
global_step=10034, episodic_return=-49936476.0
global_step=10087, episodic_return=-53060108.0
global_step=10140, episodic_return=-46798888.0
global_step=10193, episodic_return=-56195652.0


100%|██████████| 10/10 [00:02<00:00,  4.04it/s]


SPS: 12


 10%|█         | 35/342 [13:44<2:22:28, 27.85s/it]

At global step 10220, total_rewards=-53185416.54949258
global_step=10273, episodic_return=-48516876.0
global_step=10326, episodic_return=-48999888.0
global_step=10379, episodic_return=-56298220.0
global_step=10432, episodic_return=-52533684.0
global_step=10485, episodic_return=-53379628.0


100%|██████████| 10/10 [00:03<00:00,  2.57it/s]


SPS: 12


 11%|█         | 36/342 [14:09<2:18:05, 27.08s/it]

At global step 10512, total_rewards=-47100381.44439803
global_step=10565, episodic_return=-54314132.0
global_step=10618, episodic_return=-47492928.0
global_step=10671, episodic_return=-50886320.0
global_step=10724, episodic_return=-47876848.0
global_step=10777, episodic_return=-48905232.0


100%|██████████| 10/10 [00:02<00:00,  4.05it/s]


SPS: 12


 11%|█         | 37/342 [14:36<2:16:25, 26.84s/it]

At global step 10804, total_rewards=-48007064.55710171
global_step=10857, episodic_return=-50189580.0
global_step=10910, episodic_return=-47586332.0
global_step=10963, episodic_return=-49825056.0
global_step=11016, episodic_return=-50066172.0
global_step=11069, episodic_return=-48960016.0


100%|██████████| 10/10 [00:02<00:00,  4.01it/s]


SPS: 12


 11%|█         | 38/342 [15:06<2:21:45, 27.98s/it]

At global step 11096, total_rewards=-46363218.65324462
global_step=11149, episodic_return=-59491360.0
global_step=11202, episodic_return=-62188644.0
global_step=11255, episodic_return=-46890900.0
global_step=11308, episodic_return=-48425652.0
global_step=11361, episodic_return=-47930188.0


100%|██████████| 10/10 [00:02<00:00,  3.94it/s]


SPS: 12


 11%|█▏        | 39/342 [15:36<2:24:38, 28.64s/it]

At global step 11388, total_rewards=-46226250.25802873
global_step=11441, episodic_return=-52877752.0
global_step=11494, episodic_return=-49557028.0
global_step=11547, episodic_return=-47128444.0
global_step=11600, episodic_return=-48455184.0
global_step=11653, episodic_return=-51297872.0


100%|██████████| 10/10 [00:03<00:00,  2.74it/s]


SPS: 12


 12%|█▏        | 40/342 [16:04<2:21:58, 28.21s/it]

At global step 11680, total_rewards=-45879422.072195515
global_step=11733, episodic_return=-56921572.0
global_step=11786, episodic_return=-50882424.0
global_step=11839, episodic_return=-49350592.0
global_step=11892, episodic_return=-47142480.0
global_step=11945, episodic_return=-47647960.0


100%|██████████| 10/10 [00:02<00:00,  4.59it/s]


SPS: 12


 12%|█▏        | 41/342 [16:30<2:18:47, 27.67s/it]

At global step 11972, total_rewards=-44715964.006441206
global_step=12025, episodic_return=-60272560.0
global_step=12078, episodic_return=-45574256.0
global_step=12131, episodic_return=-49165588.0
global_step=12184, episodic_return=-47125136.0
global_step=12237, episodic_return=-52258168.0


100%|██████████| 10/10 [00:02<00:00,  3.35it/s]


SPS: 12


 12%|█▏        | 42/342 [16:56<2:16:01, 27.21s/it]

At global step 12264, total_rewards=-44206595.9860551
global_step=12317, episodic_return=-51284428.0
global_step=12370, episodic_return=-49159856.0
global_step=12423, episodic_return=-46140484.0
global_step=12476, episodic_return=-49758736.0
global_step=12529, episodic_return=-47168432.0


100%|██████████| 10/10 [00:02<00:00,  3.44it/s]


SPS: 12


 13%|█▎        | 43/342 [17:21<2:11:40, 26.42s/it]

At global step 12556, total_rewards=-43655797.32124931
global_step=12609, episodic_return=-49859296.0
global_step=12662, episodic_return=-47713124.0
global_step=12715, episodic_return=-46323676.0
global_step=12768, episodic_return=-47953068.0
global_step=12821, episodic_return=-47020724.0


100%|██████████| 10/10 [00:02<00:00,  3.52it/s]


SPS: 12


 13%|█▎        | 44/342 [17:47<2:10:19, 26.24s/it]

At global step 12848, total_rewards=-43914766.042894915
global_step=12901, episodic_return=-51268144.0
global_step=12954, episodic_return=-50072824.0
global_step=13007, episodic_return=-48291044.0
global_step=13060, episodic_return=-50581336.0
global_step=13113, episodic_return=-47913820.0


100%|██████████| 10/10 [00:03<00:00,  3.11it/s]


SPS: 12


 13%|█▎        | 45/342 [18:13<2:09:58, 26.26s/it]

At global step 13140, total_rewards=-43549887.85791444
global_step=13193, episodic_return=-53323212.0
global_step=13246, episodic_return=-53984352.0


 13%|█▎        | 45/342 [18:21<2:01:09, 24.48s/it]


KeyboardInterrupt: 