In [1]:
from pettingzoo.mpe import simple_spread_v2

env = simple_spread_v2.parallel_env(N=3, local_ratio=0.5, max_cycles=25, continuous_actions=False)

import pettingzoo
pettingzoo.__version__
import sys
import torch  
import gym
from tqdm import tqdm
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
# env.reset()

# while env.agents:
#     actions = {agent: env.action_space(agent).sample() for agent in env.agents}  # this is where you would insert your policy
#     observations, rewards, terminations, truncations, infos = env.step(actions)

{'agent_0': array([ 0.31500483,  0.59926844, -1.0832875 , -0.06565291,  0.28186268,
        0.11397307,  0.16470452, -0.17338154,  0.99939805,  0.0890506 ,
        0.01904923,  0.2580465 ,  0.91106004, -0.7305967 ,  0.        ,
        0.        ,  0.        ,  0.        ], dtype=float32), 'agent_1': array([-0.7975387 , -0.51615536, -1.0642382 ,  0.19239359,  0.26281345,
       -0.14407343,  0.14565529, -0.43142805,  0.9803488 , -0.1689959 ,
       -0.01904923, -0.2580465 ,  0.8920108 , -0.9886432 ,  0.        ,
        0.        ,  0.        ,  0.        ], dtype=float32), 'agent_2': array([ 0.26891196, -0.30733103, -0.17222741, -0.7962496 , -0.62919736,
        0.8445698 , -0.74635553,  0.5572152 ,  0.088338  ,  0.8196473 ,
       -0.91106004,  0.7305967 , -0.8920108 ,  0.9886432 ,  0.        ,
        0.        ,  0.        ,  0.        ], dtype=float32)}


In [2]:
"""
This is a single agent actor-critic implementation.
It is specifically designed to work with the four rooms environment, but it should work with any gym environment.
"""

import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd


def obs_to_tensor(obs: np.array) -> torch.Tensor:
    # initialize an empty np array
    obs_tensor = np.array([])
    for _, value in obs.items():
        obs_tensor = np.concatenate((obs_tensor, value))
    return torch.Tensor(obs_tensor)



In [3]:
env.observation_space.shape

AttributeError: 'function' object has no attribute 'shape'

In [4]:
class ActorCriticCentralizedMultiAgent(nn.Module):
    """
    A simple, but flexible, implementation of the actor-critic algorithm.
    """
    def __init__(self, num_inputs, num_actions, actions_per_agent, hidden_size, learning_rate=3e-4):
        super(ActorCriticCentralizedMultiAgent, self).__init__()

        self.num_actions = num_actions
        self.actions_per_agent = actions_per_agent
        # estimate the value function
        self.critic_linear1 = nn.Linear(num_inputs, hidden_size)
        self.critic_linear2 = nn.Linear(hidden_size, 1)

        # estimate the policy distribution
        self.actor_linear1 = nn.Linear(num_inputs, hidden_size)
        self.actor_linear2 = nn.Linear(hidden_size, num_actions)
    
    def forward(self, state):
        """
        Do inference to calculate the action probabilities and the state value.
        """
        value = F.relu(self.critic_linear1(state))
        value = self.critic_linear2(value)
        
        policy_dist = F.relu(self.actor_linear1(state))
        # softmax effectively generates a probability for each of our output options
        policy_dist = self.actor_linear2(policy_dist)
        # reshape the policy dist for each actor
        policy_dist = policy_dist.view(-1, self.actions_per_agent, self.num_actions)

        policy_dist = F.softmax(policy_dist, 1)

        return value, policy_dist

In [18]:
class A2CCentralizedMultiAgent:
    def __init__(self, env, obs_size, hidden_size, output_size, actions_per_agent, learning_rate, num_episodes, num_steps, gamma) -> None:
        self.env = env
        self.obs_size = obs_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.num_episodes = num_episodes
        self.num_steps = num_steps
        self.gamma = gamma

        self.model = ActorCriticCentralizedMultiAgent(self.obs_size, self.output_size, actions_per_agent, self.hidden_size, self.learning_rate)
    
    def _get_actions(self, actor_act_pred, num_agents):
        # Sample an action according the probs the network just output.
        start = 0
        actions = {}
        actions = {agent: env.action_space(agent).sample() for agent in env.agents} 
        for agent in env.agents:
            action_space_size = env.action_space(agent).n
            action = np.random.choice(action_space_size, p=np.squeeze(actor_act_pred[start: start + action_space_size]))
            start += action_space_size
            actions[agent] = env.action_space(agent)[action]
        return actions

    def _obs_to_tensor(self, obs: np.array) -> torch.Tensor:
        # initialize an empty np array
        obs_tensor = np.array([])
        for _, value in obs.items():
            obs_tensor = np.concatenate((obs_tensor, value))
        return torch.Tensor(obs_tensor)

    def train(self):
        # Use Adam optimizer for the actor-critic because it should converge faster than SGD and generalization may not be super important
        ac_optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        # all episode length
        all_lengths = []
        # average episode length
        average_lengths = []
        all_entropies = []
        all_losses = []
        
        entropy_term = 0

        for episode in range(self.num_episodes):
            log_probs = []
            critic_td_errors = []
            rewards = []

            state = self.env.reset()
            # enable n step actor critic. 
            for steps in range(self.num_steps):
                state_tensor = self._obs_to_tensor(state)
                critic_td_error, actor_act_pred = self.model.forward(state_tensor.float())
                # drop the tensor dimension and computational graph info
                critic_td_error = critic_td_error.detach().numpy()
                detached_act_pred = actor_act_pred.detach().numpy()
                actions = self._get_actions(detached_act_pred)

                # Calculate the log probability of the actions we've taken
                log_prob = torch.log(actor_act_pred.squeeze(0)[actions])

                # Calculate the entropy/ uncertainty of the policy term. This is used to encourage exploration
                entropy = -np.sum(np.mean(detached_act_pred) * np.log(detached_act_pred))
                new_state, rewards, done, _, _ = self.env.step(actions)

                rewards.append(np.mean(rewards))
                critic_td_errors.append(critic_td_error)
                log_probs.append(log_prob)
                entropy_term += entropy
                state = new_state
                
                if done or steps == self.num_steps - 1:
                    state_tensor = self._obs_to_tensor(state)
                    Qval, _ = self.model.forward(state_tensor.float())
                    Qval = Qval.detach().numpy()
                    all_lengths.append(steps)
                    # use numpy to get the standard deviation of all lengths

                    
                    average_lengths.append(np.mean(all_lengths[-10:]))
                    if episode % 30 == 0:
                        # Where total length is the number of steps taken in the episode and average length is average steps in all episodes seen 
                        sys.stdout.write("episode: {}, total length: {}, average length of prev 10: {} \n".format(episode, steps, average_lengths[-1]))
                    break
            
            # compute Q values
            # These are the rewards plus discounted state values to calculate the advantage
            Qvals = np.zeros_like(critic_td_errors)
            for t in reversed(range(len(rewards))):
                Qval = rewards[t] + self.gamma * Qval
                Qvals[t] = Qval
    
            # update actor critic
            critic_td_errors = torch.FloatTensor(np.array(critic_td_errors))
            Qvals = torch.FloatTensor(Qvals)
            log_probs = torch.stack(log_probs)
            
            advantage = Qvals - critic_td_errors
            actor_loss = (-log_probs * advantage).mean()
            critic_loss = 0.5 * advantage.pow(2).mean()
            ac_loss = actor_loss + critic_loss + 0.001 * entropy_term

            ac_optimizer.zero_grad()
            ac_loss.backward()
            ac_optimizer.step()
            all_entropies.append(entropy_term)
            all_losses.append(ac_loss.detach().numpy())
        return all_lengths, average_lengths, all_entropies, all_losses

In [19]:
hidden_size = 64
learning_rate = 3e-4

# Constants
gamma = 0.99
max_episodes = 5000
num_steps = 10_000

obs_dict = env.reset()
obs_size = obs_to_tensor(obs_dict).shape[0]
output_size = 0
actions_per_agent = []

for i in range(len(env.agents)):
    output_size += env.action_space(env.agents[i]).n
    actions_per_agent.append(env.action_space(list(obs_dict.keys())[i]).n)

agent = A2CCentralizedMultiAgent(env, obs_size, hidden_size, output_size, actions_per_agent, learning_rate, num_episodes=max_episodes, num_steps=num_steps, gamma=gamma)

all_lengths, average_lengths, all_entropies, all_losses = agent.train()


TypeError: view(): argument 'size' must be tuple of SymInts, but found element of type list at pos 2