In [28]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
import parallel
import torch
import movable_wall_parallel
import torch.nn as nn
import numpy as np
from torch.distributions import Categorical
import wandb
RANDOM = 42

import torch.nn.functional as F

In [29]:


############################ HIGHLY IMPORTANT VARIABLES TO SET######################################
GRID_SIZE = 7
NUM_THINGS = 5 # number of things in the grid wall, pred1, pred2, h1, h2, movablewall


INITIALIZATIONS = [
    './models/PRED_GROUP_POLICY_BOTH_INDIV_RWD_2700.ckpt', # pred_policy
    './models/PRED_GROUP_POLICY_BOTH_INDIV_RWD_2700.ckpt',    # hider_policy
    './models/HIDER_GROUP_POLICY_BOTH_INDIV_RWD_1800.ckpt',
    './models/HIDER_GROUP_POLICY_BOTH_INDIV_RWD_1800.ckpt',
    ]    # hider_2
#should be either RANDOM ; or a path to a pretrained checkpoint (a String)



IS_TRAINING =   [
    True, #pred1
    True, #pred2
    False, #hider1
    False #hider2
]
#either True or False, if False, weights are frozen (or if random it will stay random)


envname = 'mparallel-walls' #just for wandb logging
CUSTOMENV = movable_wall_parallel.parallel_env(grid_size=GRID_SIZE,walls=True)
# change architecture if needed

ent_coef = 0.4
vf_coef = 0.2
clip_coef = 0.1
gamma = 0.975
batch_size = 64
max_cycles = 200
total_episodes = 2000
PPO_STEPS = 3

reminder = '''DONT FORGET TO ADD CODE TO SAVE CHECKPOINTS IF YOU WANT TO DO THAT'''
ckpt_name= "GROUP_POLICY_BOTH_INDIV_RWD_ROUND2"
SAVE_PRED_POL = True
SAVE_HIDER_POL = False
print(reminder)

##################################################################################################


Using WALLS
DONT FORGET TO ADD CODE TO SAVE CHECKPOINTS IF YOU WANT TO DO THAT


In [30]:
class Super_Agent(nn.Module):
    #Common agent class for all hiders/seekers
    
    def __init__(self, num_actions, num_agents):
        super().__init__()

        # CNN architecture inspired by DQN for Atari
        self.network = nn.Sequential(
            nn.Conv2d(NUM_THINGS, 32, kernel_size=3, stride=1, padding=1),  # Output: 32 x 7 x 7
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),  # Output: 64 x 7 x 7
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),  # Output: 64 x 7 x 7
            nn.ReLU(),
            nn.Flatten(),  # Output: 64 * 7 * 7 = 3136
        )
        self.actor = self._layer_init(nn.Linear(64*GRID_SIZE**2, num_actions), std=0.01) #TODO depends on GRID_SIZE
        self.critic = self._layer_init(nn.Linear(64*GRID_SIZE**2, 1))

    def _layer_init(self, layer, std=np.sqrt(2), bias_const=0.0):
        torch.nn.init.orthogonal_(layer.weight, std)
        torch.nn.init.constant_(layer.bias, bias_const)
        return layer

    def get_value(self, x):
        return self.critic(self.network(x / 1.0))  # Normalize input to [0, 1]

    def get_action_and_value(self, x, action=None):
        hidden = self.network(x / 1.0)  # Normalize input to [0, 1]
        
        logits = self.actor(hidden)
        probs = Categorical(logits=logits)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action), probs.entropy(), self.critic(hidden)

    

def batchify_obs(obs, device):
    """Converts PZ style observations to batch of torch arrays."""
    # convert to list of np arrays
    obs = np.stack([obs[a] for a in obs], axis=0)
    # convert to torch
    obs = torch.tensor(obs).to(device)

    return obs


def batchify(x, device):
    """Converts PZ style returns to batch of torch arrays."""
    # convert to list of np arrays
    x = np.stack([x[a] for a in x], axis=0)
    # convert to torch
    x = torch.tensor(x).to(device)

    return x


def unbatchify(x, env):
    """Converts np array to PZ style arguments."""
    x = x.cpu().numpy()
    x = {a: x[i] for i, a in enumerate(env.possible_agents)}

    return x

def reshape_obs(observations, env):
    modified_observations = {}
    for self_name, obs in observations.items():
        self_layer = env.agent_layers[self_name]
        enemy_layers = []
        for name, layer_idx in env.agent_layers.items():
            
            if name == self_name:
                self_layer = obs[layer_idx]
            elif name.startswith(self_name[:4]): #starts with the same 4 letters: pred or hide
                friend_layer = obs[layer_idx]
            else:
                enemy_layers.append(obs[layer_idx])
         
        new_obs = [obs[0]] #walls
        new_obs.append(self_layer) #self
        new_obs.append(friend_layer) #friend
        new_obs.append(sum(enemy_layers)) #enemies
        new_obs.append(obs[-1]) #movable walls

        modified_observations[self_name] = np.stack(new_obs, axis = 0)
        
    return modified_observations
        


In [32]:
    wandb.init(
            project="multi-agent-ppo",  # Set your project name
            name=ckpt_name,
            config={
                "env": envname,
                "GRID_SIZE": GRID_SIZE,
                "NUM_THINGS": NUM_THINGS,
                "INITIALIZATIONS": INITIALIZATIONS,
                "IS_TRAINING": IS_TRAINING,
                "ent_coef": ent_coef,
                "vf_coef": vf_coef,
                "clip_coef": clip_coef,
                "gamma": gamma,
                "batch_size": batch_size,
                "max_cycles": max_cycles,
                "total_episodes": total_episodes,
                "PPO_STEPS": PPO_STEPS,
            }
    )
    """ALGO PARAMS"""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    lr = 0.0001
    

    """ ENV SETUP """
    env = CUSTOMENV

    num_agents = len(env.possible_agents)
    num_actions = env.action_space(env.possible_agents[0]).n
    observation_size = env.observation_space(env.possible_agents[0]).shape
    
    """ LEARNER SETUP """
    # Create a list of agents, one for each training agent
    training_agents = []
    optimizers = []
    training_agent_indices = [i for i, training in enumerate(IS_TRAINING) if training]
    frozen_agent_indices = [i for i, training in enumerate(IS_TRAINING) if not training and INITIALIZATIONS[i] != RANDOM]
    
    hiders = [2,3]
    preds = [0,1]

    pred_policy = Super_Agent(num_actions, num_agents).to(device)
    pred_optimizer = optim.Adam(pred_policy.parameters(), lr=lr, eps=1e-5)
    if INITIALIZATIONS[0] != RANDOM:
        agent.load_state_dict(torch.load(INITIALIZATIONS[0]))
        print(f'loaded from {INITIALIZATIONS[0]}')
        
    hider_policy = Super_Agent(num_actions, num_agents).to(device)
    hider_optimizer = optim.Adam(hider_policy.parameters(), lr=lr, eps=1e-5)
    if INITIALIZATIONS[2] != RANDOM:
        agent.load_state_dict(torch.load(INITIALIZATIONS[1]))
        print(f'loaded from {INITIALIZATIONS[1]}')
        
    # a frozen is one that is NOT TRAINING and NOT RANDOM
    for idx in training_agent_indices:
        if idx in hiders:
            agent = hider_policy
            optimizer = hider_optimizer
        elif idx in preds:
            agent = pred_policy
            optimizer = pred_optimizer

        training_agents.append(agent)
        optimizers.append(optimizer)
    
    frozen_agents = [] # These agents are not random, but are NOT TRAINING ; initialized with a checkpoint
    for idx, init in enumerate(INITIALIZATIONS):
        if init != RANDOM and not IS_TRAINING[idx]:
            agent = Super_Agent(num_actions=num_actions, num_agents=num_agents).to(device)
            agent.load_state_dict(torch.load(init))
            agent.eval()
            #freeze weights
            for param in agent.parameters():
                param.requires_grad = False
            print(f' loaded from {init}')
            frozen_agents.append(agent)

    """ ALGO LOGIC: EPISODE STORAGE"""
    end_step = 0
    total_episodic_return = 0
    rb_obs = torch.zeros((max_cycles, num_agents, NUM_THINGS,GRID_SIZE,GRID_SIZE)).to(device)
    rb_actions = torch.zeros((max_cycles, num_agents)).to(device)
    rb_logprobs = torch.zeros((max_cycles, num_agents)).to(device)
    rb_rewards = torch.zeros((max_cycles, num_agents)).to(device)
    rb_terms = torch.zeros((max_cycles, num_agents)).to(device)
    rb_values = torch.zeros((max_cycles, num_agents)).to(device)

    """ TRAINING LOGIC """
    # Track returns for all agents
    all_returns = [[] for _ in range(num_agents)]

    for episode in range(1,total_episodes+1):
        # collect an episode
        with torch.no_grad():
            # collect observations and convert to batch of torch tensors
            next_obs, info = env.reset(seed=None)
            # reset the episodic return
            total_episodic_return = 0

            # each episode has num_steps
            for step in range(0, max_cycles):
                #modify observation to get self, friends, enemy position layers
                obs = reshape_obs(next_obs, env)
                
                # rollover the observation
                obs = batchify_obs(obs, device)

                # get action for first agent from the trained agents
                # get random actions for other agents
                actions = torch.zeros(num_agents, dtype=torch.long).to(device)
                logprobs = torch.zeros(num_agents).to(device)
                values = torch.zeros(num_agents).to(device)

                # Process each agent
                for i in range(num_agents):
                    if IS_TRAINING[i]:
                        # Find the index of this training agent among training agents
                        train_idx = training_agent_indices.index(i)
                        # Get action and value for training agent
                        agent_obs = obs[i].unsqueeze(0)
                        actions[i], logprobs[i], _, values[i] = training_agents[train_idx].get_action_and_value(agent_obs)
                    elif INITIALIZATIONS[i] != RANDOM:
                        #this is a frozen agent (not training, but not random because it has a checkpoint)
                        frozen_idx = frozen_agent_indices.index(i)
                        agent_obs = obs[i].unsqueeze(0)
                        actions[i], logprobs[i], _, values[i] = frozen_agents[frozen_idx].get_action_and_value(agent_obs)

                        logprobs[i] = torch.log(torch.tensor(1.0/num_actions))
                        values[i] = 0.0  # No value estimation for frozen agents
                    else:
                        # Random action for random agents
                        actions[i] = torch.randint(0, num_actions, (1,)).to(device)
                        logprobs[i] = torch.log(torch.tensor(1.0/num_actions))
                        values[i] = 0.0  # No value estimation for random agents

                # execute the environment and log data
                next_obs, rewards, terms, truncs, infos = env.step(
                    unbatchify(actions, env)
                )

                # add to episode storage
                rb_obs[step] = obs
                rb_rewards[step] = batchify(rewards, device)
                rb_terms[step] = batchify(terms, device)
                rb_actions[step] = actions
                rb_logprobs[step] = logprobs
                rb_values[step] = values

                # compute episodic return
                total_episodic_return += rb_rewards[step].cpu().numpy()

                # if we reach termination or truncation, end
                if any([terms[a] for a in terms]) or any([truncs[a] for a in truncs]):
                    end_step = step
                    break

        # Train only the specified agents
        for train_idx, agent_idx in enumerate(training_agent_indices):
            # Bootstrap value and advantages only for the training agent
            with torch.no_grad():
                rb_advantages = torch.zeros_like(rb_rewards).to(device)
                for t in reversed(range(end_step)):
                    delta = (
                        rb_rewards[t, agent_idx]  # Only specific agent's reward
                        + gamma * rb_values[t + 1, agent_idx] * rb_terms[t + 1, agent_idx]
                        - rb_values[t, agent_idx]
                    )
                    rb_advantages[t, agent_idx] = delta + gamma * gamma * rb_advantages[t + 1, agent_idx]
                rb_returns = rb_advantages + rb_values

            # convert our episodes to batch of individual transitions (only for specific agent)
            b_obs = rb_obs[:end_step, agent_idx]
            b_logprobs = rb_logprobs[:end_step, agent_idx]
            b_actions = rb_actions[:end_step, agent_idx]
            b_returns = rb_returns[:end_step, agent_idx]
            b_values = rb_values[:end_step, agent_idx]
            b_advantages = rb_advantages[:end_step, agent_idx]

            # Optimizing the policy and value network
            b_index = np.arange(len(b_obs))
            clip_fracs = []
            for repeat in range(PPO_STEPS):
                # shuffle the indices we use to access the data
                np.random.shuffle(b_index)
                for start in range(0, len(b_obs), batch_size):
                    # select the indices we want to train on
                    end = start + batch_size
                    batch_index = b_index[start:end]

                    _, newlogprob, entropy, value = training_agents[train_idx].get_action_and_value(
                        b_obs[batch_index], b_actions.long()[batch_index]
                    )
                    logratio = newlogprob - b_logprobs[batch_index]
                    ratio = logratio.exp()

                    with torch.no_grad():
                        # calculate approx_kl http://joschu.net/blog/kl-approx.html
                        old_approx_kl = (-logratio).mean()
                        approx_kl = ((ratio - 1) - logratio).mean()
                        clip_fracs += [
                            ((ratio - 1.0).abs() > clip_coef).float().mean().item()
                        ]

                    # normalize advantages
                    advantages = b_advantages[batch_index]
                    advantages = (advantages - advantages.mean()) / (
                        advantages.std() + 1e-8
                    )

                    # Policy loss
                    pg_loss1 = -b_advantages[batch_index] * ratio
                    pg_loss2 = -b_advantages[batch_index] * torch.clamp(
                        ratio, 1 - clip_coef, 1 + clip_coef
                    )
                    pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                    # Value loss
                    value = value.flatten()
                    v_loss_unclipped = (value - b_returns[batch_index]) ** 2
                    v_clipped = b_values[batch_index] + torch.clamp(
                        value - b_values[batch_index],
                        -clip_coef,
                        clip_coef,
                    )
                    v_loss_clipped = (v_clipped - b_returns[batch_index]) ** 2
                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                    v_loss = 0.5 * v_loss_max.mean()

                    entropy_loss = entropy.mean()
                    loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef

                    optimizers[train_idx].zero_grad()
                    loss.backward()
                    optimizers[train_idx].step()

            # Store returns for the training agents
            y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
            var_y = np.var(y_true)
            explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

        # Accumulate returns for all agents
        for i in range(num_agents):
            all_returns[i].append(total_episodic_return[i])

        if episode % 10 == 0:
            print(f"Training episode {episode}")
            print(f"Episodic Return: {(total_episodic_return)}")

            # Print smoothed returns for each agent
            for i in range(num_agents):
                status = None

                if IS_TRAINING[i]:
                    status = "Training"
                elif INITIALIZATIONS[i] == RANDOM:
                    status = "Random"
                else:
                    status = "Frozen"
                    
                print(f"Smoothed Returns for agent_{i} ({status}): {np.mean(all_returns[i][-20:])}")

            print(f"Episode Length: {end_step}")
            print("")
            print(f"Value Loss: {v_loss.item()}")
            print(f"Policy Loss: {pg_loss.item()}")
            print(f"Old Approx KL: {old_approx_kl.item()}")
            print(f"Approx KL: {approx_kl.item()}")
            print(f"Clip Fraction: {np.mean(clip_fracs)}")
            print(f"Explained Variance: {explained_var.item()}")
            print("\n-------------------------------------------\n")

        #log all with wandb
        wandb.log({
            "Ep return pred1": total_episodic_return[0],
            "Ep return pred2": total_episodic_return[1],
            "Ep return hider1": total_episodic_return[2],
            "Ep return hider2": total_episodic_return[3],
            "Episode Length": end_step,
            "Value Loss": v_loss.item(),
            "Policy Loss": pg_loss.item(),
            "Old Approx KL": old_approx_kl.item(),
            "Approx KL": approx_kl.item(),
            "Clip Fraction": np.mean(clip_fracs),
            "Explained Variance": explained_var.item()
        })

        #if for pred_1 (index 0) episode return and smoothed are greater than -200, save the model

        # if total_episodic_return[0] > -210 and np.mean(all_returns[0][-20:]) > -210:
        #     #create dir
        #     import os
        #     if not os.path.exists('./models'):
        #         os.makedirs('./models')f
        #     #save just state dict for 0
        #     torch.save(agents[0].state_dict(), f'./models/agentwalls_{episode}.ckpt')
        #     exit(1)



        #if reward greater than 600 for hider_1 both for last and smoothed for last 5
        #every 100 epochs save the 2 models

        if episode % 100 == 0:
            if SAVE_PRED_POL:
                torch.save(pred_policy.state_dict(), f'./models/PRED_{ckpt_name}_{episode}.ckpt')
            if SAVE_HIDER_POL:
                torch.save(hider_policy.state_dict(), f'./models/HIDER_{ckpt_name}_{episode}.ckpt')

0,1
Approx KL,▂▂▁▁▁▂▁▁▂█▁▂▃▁▂▁▁▁▂▁▃▁▂▃▄▂▁▂▂▁▁▆▁▁▁▁▁▁▅▂
Clip Fraction,▆▂█▃▂█▅▁▄▃▅▂▄▅▄▃▁▄▄▂▅▃▃▄▅▃▂▂▅▂▄▆▅▃▆▄▃▃▃▁
Ep return hider1,▇▇▇██▆█▆▆▇▅▇▅█▆▄▅▇▆▆▆▆▃▄▅▇▄▂▄▃▄▃▂▄▃▂▁▂▂▂
Ep return hider2,▆█▆▆▅█▅▆▇▅▆██▅▅▂█▅▆▅▇▇█▅▄▆▄▅▇▂█▄▅▅▆▃▂█▁▃
Ep return pred1,▂▁▂▁▃▂▂▂▂▃▂▂▂▂▁▂▂▂▂▁▃▂▂▂▂█▃▃▂▃▃▃▃▅▃▄▅▄▄▄
Ep return pred2,▃▃▂▃▃▃▄▃▃▂▃▂▃▄▂▃▃▄▃▂▄▅▅▃▄▄▅▆▅▅▅▅█▁▇█▆▆█▃
Episode Length,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Explained Variance,▅▅▅▅▅▆▅▅▅▆▆▆▆▆▆▆▆▅▆█▆▇▇▆▆▇▇▇▃▇▅▅▃▇▁▄▅▇▅▆
Old Approx KL,▄▆▅▃▃▅▂▅▃▂▅▂▄▄▃▄▃▅▄▄▅▄▄▄▄▃▄█▂▃▃▁▂▄▃▄▄▃▅▄
Policy Loss,█▆▂▄▆▄▄▅▆▇▅▆▅▅▅▄▆▆▅▅▇▇▁▃▅▅▆▆▄▆▃▅▆▄▇▅▅▄▄▅

0,1
Approx KL,0.00012
Clip Fraction,0.0638
Ep return hider1,446.29742
Ep return hider2,-736.66669
Ep return pred1,144.72395
Ep return pred2,-213.58389
Episode Length,199.0
Explained Variance,0.05426
Old Approx KL,0.01161
Policy Loss,1.13616


  agent.load_state_dict(torch.load(INITIALIZATIONS[0]))
  agent.load_state_dict(torch.load(INITIALIZATIONS[1]))
  agent.load_state_dict(torch.load(init))


loaded from ./models/PRED_GROUP_POLICY_BOTH_INDIV_RWD_2700.ckpt
loaded from ./models/PRED_GROUP_POLICY_BOTH_INDIV_RWD_2700.ckpt
 loaded from ./models/HIDER_GROUP_POLICY_BOTH_INDIV_RWD_1800.ckpt
 loaded from ./models/HIDER_GROUP_POLICY_BOTH_INDIV_RWD_1800.ckpt
Training episode 10
Episodic Return: [-321.58313 -795.39465  439.53745  310.74405]
Smoothed Returns for agent_0 (Training): -607.5576782226562
Smoothed Returns for agent_1 (Training): -575.2372436523438
Smoothed Returns for agent_2 (Frozen): 412.5675354003906
Smoothed Returns for agent_3 (Frozen): 399.1104736328125
Episode Length: 199

Value Loss: 3048.09423828125
Policy Loss: 17.68434715270996
Old Approx KL: 0.018755769357085228
Approx KL: 0.002177843125537038
Clip Fraction: 0.5522693457702795
Explained Variance: 0.0006704926490783691

-------------------------------------------

Training episode 20
Episodic Return: [ 153.04854  111.45593 -584.3666  -157.32454]
Smoothed Returns for agent_0 (Training): -280.62139892578125
Smoothed

Training episode 150
Episodic Return: [-752.7502  -742.9088   832.32874  589.23724]
Smoothed Returns for agent_0 (Training): 306.48834228515625
Smoothed Returns for agent_1 (Training): 617.4381713867188
Smoothed Returns for agent_2 (Frozen): -592.8497924804688
Smoothed Returns for agent_3 (Frozen): -212.4578094482422
Episode Length: 199

Value Loss: 558.926025390625
Policy Loss: 27.576251983642578
Old Approx KL: -0.023229174315929413
Approx KL: 0.0017623135354369879
Clip Fraction: 0.09393601243694623
Explained Variance: -0.11678802967071533

-------------------------------------------

Training episode 160
Episodic Return: [  682.8243    705.02795   768.54614 -1753.0159 ]
Smoothed Returns for agent_0 (Training): 464.863525390625
Smoothed Returns for agent_1 (Training): 814.304931640625
Smoothed Returns for agent_2 (Frozen): -665.8410034179688
Smoothed Returns for agent_3 (Frozen): -466.177978515625
Episode Length: 199

Value Loss: 1802.0709228515625
Policy Loss: -19.52788734436035
Old 

Training episode 290
Episodic Return: [  938.75836  4826.305   -2127.785   -4530.53   ]
Smoothed Returns for agent_0 (Training): 2142.53369140625
Smoothed Returns for agent_1 (Training): 3923.475341796875
Smoothed Returns for agent_2 (Frozen): -3698.028564453125
Smoothed Returns for agent_3 (Frozen): -2722.1806640625
Episode Length: 199

Value Loss: 3893.01416015625
Policy Loss: -37.39149856567383
Old Approx KL: 0.011045558378100395
Approx KL: 0.00019931794668082148
Clip Fraction: 0.09375
Explained Variance: 0.07444632053375244

-------------------------------------------

Training episode 300
Episodic Return: [ 1907.6195  2429.5693 -1287.141  -3141.1992]
Smoothed Returns for agent_0 (Training): 2545.05224609375
Smoothed Returns for agent_1 (Training): 3882.990966796875
Smoothed Returns for agent_2 (Frozen): -3798.568359375
Smoothed Returns for agent_3 (Frozen): -3033.57861328125
Episode Length: 199

Value Loss: 3027.47607421875
Policy Loss: 64.99368286132812
Old Approx KL: -0.11052557

Training episode 430
Episodic Return: [ -828.9431  3165.7952  -519.4048 -2415.4702]
Smoothed Returns for agent_0 (Training): -687.6026611328125
Smoothed Returns for agent_1 (Training): -505.74249267578125
Smoothed Returns for agent_2 (Frozen): 530.1954956054688
Smoothed Returns for agent_3 (Frozen): 388.11529541015625
Episode Length: 199

Value Loss: 11571.7880859375
Policy Loss: -120.22271728515625
Old Approx KL: -0.04420610889792442
Approx KL: 0.006666362751275301
Clip Fraction: 0.0848214291036129
Explained Variance: 0.12443464994430542

-------------------------------------------

Training episode 440
Episodic Return: [-865.5096  -369.11667  367.8737   571.3036 ]
Smoothed Returns for agent_0 (Training): -778.3743896484375
Smoothed Returns for agent_1 (Training): 784.9898681640625
Smoothed Returns for agent_2 (Frozen): -15.448710441589355
Smoothed Returns for agent_3 (Frozen): -388.8085021972656
Episode Length: 199

Value Loss: 74.86509704589844
Policy Loss: -4.148743629455566
Old Ap

Training episode 580
Episodic Return: [  110.695015  2008.7056   -1854.9377    -995.3377  ]
Smoothed Returns for agent_0 (Training): -800.592041015625
Smoothed Returns for agent_1 (Training): -594.6871337890625
Smoothed Returns for agent_2 (Frozen): 492.54217529296875
Smoothed Returns for agent_3 (Frozen): 542.5738525390625
Episode Length: 199

Value Loss: 3774.332763671875
Policy Loss: -36.95780944824219
Old Approx KL: -0.052536334842443466
Approx KL: 0.0020460912492126226
Clip Fraction: 0.23642113308111826
Explained Variance: 0.1285862922668457

-------------------------------------------

Training episode 590
Episodic Return: [  298.44592  4177.3696   -664.3136  -4316.713  ]
Smoothed Returns for agent_0 (Training): -424.97589111328125
Smoothed Returns for agent_1 (Training): 811.9912109375
Smoothed Returns for agent_2 (Frozen): -299.4462585449219
Smoothed Returns for agent_3 (Frozen): -467.04315185546875
Episode Length: 199

Value Loss: 1504.5167236328125
Policy Loss: -11.1200742721

Training episode 720
Episodic Return: [ -869.6142  -1123.4858    890.56354   881.6673 ]
Smoothed Returns for agent_0 (Training): 83.25618743896484
Smoothed Returns for agent_1 (Training): 11.543475151062012
Smoothed Returns for agent_2 (Frozen): -274.5625915527344
Smoothed Returns for agent_3 (Frozen): -209.87350463867188
Episode Length: 199

Value Loss: 32.984291076660156
Policy Loss: 7.284465789794922
Old Approx KL: 0.01804334856569767
Approx KL: 0.003523537190631032
Clip Fraction: 0.10621279974778493
Explained Variance: 0.0007756352424621582

-------------------------------------------

Training episode 730
Episodic Return: [ 2742.9587  2200.2397 -3704.4675 -2376.8914]
Smoothed Returns for agent_0 (Training): 891.4905395507812
Smoothed Returns for agent_1 (Training): 885.7154541015625
Smoothed Returns for agent_2 (Frozen): -1295.199951171875
Smoothed Returns for agent_3 (Frozen): -929.3883056640625
Episode Length: 199

Value Loss: 1208.90234375
Policy Loss: -34.08380889892578
Old Ap

Training episode 860
Episodic Return: [ -875.1385  -1156.3005    886.22656   888.94086]
Smoothed Returns for agent_0 (Training): -300.25823974609375
Smoothed Returns for agent_1 (Training): -215.1400604248047
Smoothed Returns for agent_2 (Frozen): 239.84579467773438
Smoothed Returns for agent_3 (Frozen): -33.472660064697266
Episode Length: 199

Value Loss: 47.880455017089844
Policy Loss: 9.183050155639648
Old Approx KL: -0.0014356885803863406
Approx KL: 1.4645713690697448e-06
Clip Fraction: 0.022135416666666668
Explained Variance: -0.2806360721588135

-------------------------------------------

Training episode 870
Episodic Return: [  167.30226   365.89017   606.48584 -1253.4596 ]
Smoothed Returns for agent_0 (Training): -368.10943603515625
Smoothed Returns for agent_1 (Training): -529.6005859375
Smoothed Returns for agent_2 (Frozen): 341.45025634765625
Smoothed Returns for agent_3 (Frozen): 268.10076904296875
Episode Length: 199

Value Loss: 459.59918212890625
Policy Loss: 30.2060108

Training episode 1000
Episodic Return: [ -870.6874 -1121.3702   885.8896   884.8268]
Smoothed Returns for agent_0 (Training): -670.37158203125
Smoothed Returns for agent_1 (Training): -1049.84423828125
Smoothed Returns for agent_2 (Frozen): 698.9071044921875
Smoothed Returns for agent_3 (Frozen): 692.3676147460938
Episode Length: 199

Value Loss: 52.426856994628906
Policy Loss: -10.175975799560547
Old Approx KL: 0.0017337800236418843
Approx KL: 3.899846888089087e-06
Clip Fraction: 0.00390625
Explained Variance: -0.12842059135437012

-------------------------------------------

Training episode 1010
Episodic Return: [ -858.842   -1288.0125    875.14307   881.3475 ]
Smoothed Returns for agent_0 (Training): 198.89761352539062
Smoothed Returns for agent_1 (Training): -86.25946807861328
Smoothed Returns for agent_2 (Frozen): 134.97000122070312
Smoothed Returns for agent_3 (Frozen): -518.2606201171875
Episode Length: 199

Value Loss: 7.35326623916626
Policy Loss: 0.4612472355365753
Old Appro

Training episode 1140
Episodic Return: [ -878.7333 -1177.4946   890.6223   889.9944]
Smoothed Returns for agent_0 (Training): 613.29736328125
Smoothed Returns for agent_1 (Training): 1516.802978515625
Smoothed Returns for agent_2 (Frozen): -1723.504150390625
Smoothed Returns for agent_3 (Frozen): -724.690185546875
Episode Length: 199

Value Loss: 107.18485260009766
Policy Loss: -14.001142501831055
Old Approx KL: 0.05385085940361023
Approx KL: 0.03182181343436241
Clip Fraction: 0.542782741288344
Explained Variance: -0.14066946506500244

-------------------------------------------

Training episode 1150
Episodic Return: [ -869.46924 -1228.5609    889.0368    879.99756]
Smoothed Returns for agent_0 (Training): -82.87213134765625
Smoothed Returns for agent_1 (Training): 656.6347045898438
Smoothed Returns for agent_2 (Frozen): -664.8201904296875
Smoothed Returns for agent_3 (Frozen): -233.4114532470703
Episode Length: 199

Value Loss: 19.97218132019043
Policy Loss: -5.566396713256836
Old Ap

Training episode 1280
Episodic Return: [ -358.20557  3007.7334    108.1548  -2987.6724 ]
Smoothed Returns for agent_0 (Training): -128.33724975585938
Smoothed Returns for agent_1 (Training): -182.902099609375
Smoothed Returns for agent_2 (Frozen): -132.17120361328125
Smoothed Returns for agent_3 (Frozen): 77.89765930175781
Episode Length: 199

Value Loss: 1238.887939453125
Policy Loss: -37.36002731323242
Old Approx KL: 0.008259705267846584
Approx KL: 0.0011269961250945926
Clip Fraction: 0.0652901791036129
Explained Variance: 0.25410860776901245

-------------------------------------------

Training episode 1290
Episodic Return: [ -845.40594   2036.2117      91.830795 -1636.3624  ]
Smoothed Returns for agent_0 (Training): -444.56793212890625
Smoothed Returns for agent_1 (Training): 369.84619140625
Smoothed Returns for agent_2 (Frozen): -237.2029571533203
Smoothed Returns for agent_3 (Frozen): 38.66210174560547
Episode Length: 199

Value Loss: 2875.941650390625
Policy Loss: 15.6307086944

Training episode 1420
Episodic Return: [ -867.6076  -1195.569     874.42664   890.42804]
Smoothed Returns for agent_0 (Training): -350.7993469238281
Smoothed Returns for agent_1 (Training): -732.0870361328125
Smoothed Returns for agent_2 (Frozen): 483.7811584472656
Smoothed Returns for agent_3 (Frozen): 323.5356140136719
Episode Length: 199

Value Loss: 14.423738479614258
Policy Loss: 4.895479679107666
Old Approx KL: -0.0016825540224090219
Approx KL: 0.00020112311176490039
Clip Fraction: 0.049851191540559135
Explained Variance: 0.3011478781700134

-------------------------------------------

Training episode 1430
Episodic Return: [ 1057.3108  2428.6523 -1466.7745 -2090.605 ]
Smoothed Returns for agent_0 (Training): -111.13069915771484
Smoothed Returns for agent_1 (Training): -727.1298828125
Smoothed Returns for agent_2 (Frozen): 433.0518493652344
Smoothed Returns for agent_3 (Frozen): 29.648670196533203
Episode Length: 199

Value Loss: 8218.046875
Policy Loss: -108.32699584960938
Old A

Training episode 1560
Episodic Return: [ 2471.2822  3791.0334 -4313.1787 -2575.1843]
Smoothed Returns for agent_0 (Training): 3536.219482421875
Smoothed Returns for agent_1 (Training): 3723.66845703125
Smoothed Returns for agent_2 (Frozen): -3441.180419921875
Smoothed Returns for agent_3 (Frozen): -4170.81640625
Episode Length: 199

Value Loss: 1219.390625
Policy Loss: -36.63357162475586
Old Approx KL: -0.05380242317914963
Approx KL: 0.0056876796297729015
Clip Fraction: 0.09691220397750537
Explained Variance: 0.3733801245689392

-------------------------------------------

Training episode 1570
Episodic Return: [ 4740.1855  3250.8499 -6149.3413 -2126.8772]
Smoothed Returns for agent_0 (Training): 4441.58984375
Smoothed Returns for agent_1 (Training): 4338.43017578125
Smoothed Returns for agent_2 (Frozen): -4631.01611328125
Smoothed Returns for agent_3 (Frozen): -4474.802734375
Episode Length: 199

Value Loss: 90.97383880615234
Policy Loss: -0.305184930562973
Old Approx KL: 0.0002671650

Training episode 1700
Episodic Return: [ -850.7756  -1224.055     869.32294   869.01587]
Smoothed Returns for agent_0 (Training): -839.6887817382812
Smoothed Returns for agent_1 (Training): -1164.061279296875
Smoothed Returns for agent_2 (Frozen): 849.2918090820312
Smoothed Returns for agent_3 (Frozen): 805.330322265625
Episode Length: 199

Value Loss: 10.582571983337402
Policy Loss: -2.896498680114746
Old Approx KL: 0.0010858264286071062
Approx KL: 8.344650836988876e-07
Clip Fraction: 0.0013020833333333333
Explained Variance: -0.006035804748535156

-------------------------------------------

Training episode 1710
Episodic Return: [ -852.483   -1304.524     868.26917   871.566  ]
Smoothed Returns for agent_0 (Training): -147.32418823242188
Smoothed Returns for agent_1 (Training): -552.9990234375
Smoothed Returns for agent_2 (Frozen): 20.17669105529785
Smoothed Returns for agent_3 (Frozen): 369.05926513671875
Episode Length: 199

Value Loss: 7.820875644683838
Policy Loss: 3.64103555679

Training episode 1840
Episodic Return: [ 7711.117   -980.1463 -3305.5596 -5679.7236]
Smoothed Returns for agent_0 (Training): 7062.2470703125
Smoothed Returns for agent_1 (Training): -1044.681884765625
Smoothed Returns for agent_2 (Frozen): -4079.658203125
Smoothed Returns for agent_3 (Frozen): -4333.5087890625
Episode Length: 199

Value Loss: 21.56920051574707
Policy Loss: -6.666763782501221
Old Approx KL: -0.012866701930761337
Approx KL: 0.0015240652719512582
Clip Fraction: 0.06789434577027957
Explained Variance: 0.010694503784179688

-------------------------------------------

Training episode 1850
Episodic Return: [ 7774.3433 -1062.1012 -5742.2627 -2952.9028]
Smoothed Returns for agent_0 (Training): 7663.97119140625
Smoothed Returns for agent_1 (Training): -1017.9275512695312
Smoothed Returns for agent_2 (Frozen): -4823.66748046875
Smoothed Returns for agent_3 (Frozen): -4185.419921875
Episode Length: 199

Value Loss: 32.30327224731445
Policy Loss: -7.234496116638184
Old Approx KL

Training episode 1980
Episodic Return: [ -847.8052 -1310.8661   860.4817   872.9284]
Smoothed Returns for agent_0 (Training): 676.1123046875
Smoothed Returns for agent_1 (Training): -1249.09130859375
Smoothed Returns for agent_2 (Frozen): -12.507159233093262
Smoothed Returns for agent_3 (Frozen): -68.11869812011719
Episode Length: 199

Value Loss: 13.227852821350098
Policy Loss: -4.740039348602295
Old Approx KL: -0.009916987270116806
Approx KL: 0.0009011882357299328
Clip Fraction: 0.051153274873892464
Explained Variance: -0.03548276424407959

-------------------------------------------

Training episode 1990
Episodic Return: [ -853.1446  -1323.2157    876.98206   860.20685]
Smoothed Returns for agent_0 (Training): -430.9779357910156
Smoothed Returns for agent_1 (Training): -1301.114013671875
Smoothed Returns for agent_2 (Frozen): 632.63720703125
Smoothed Returns for agent_3 (Frozen): 625.8532104492188
Episode Length: 199

Value Loss: 5.825280666351318
Policy Loss: 2.0688157081604004
Ol