In [14]:
from pettingzoo.mpe import simple_spread_v2

env = simple_spread_v2.parallel_env(N=3, local_ratio=0.5, max_cycles=25, continuous_actions=False)

import pettingzoo
pettingzoo.__version__

'1.22.1'

In [23]:
env.reset()

while env.agents:
    actions = {agent: env.action_space(agent).sample() for agent in env.agents}  # this is where you would insert your policy
    observations, rewards, terminations, truncations, infos = env.step(actions)

{'agent_0': array([ 0.31500483,  0.59926844, -1.0832875 , -0.06565291,  0.28186268,
        0.11397307,  0.16470452, -0.17338154,  0.99939805,  0.0890506 ,
        0.01904923,  0.2580465 ,  0.91106004, -0.7305967 ,  0.        ,
        0.        ,  0.        ,  0.        ], dtype=float32), 'agent_1': array([-0.7975387 , -0.51615536, -1.0642382 ,  0.19239359,  0.26281345,
       -0.14407343,  0.14565529, -0.43142805,  0.9803488 , -0.1689959 ,
       -0.01904923, -0.2580465 ,  0.8920108 , -0.9886432 ,  0.        ,
        0.        ,  0.        ,  0.        ], dtype=float32), 'agent_2': array([ 0.26891196, -0.30733103, -0.17222741, -0.7962496 , -0.62919736,
        0.8445698 , -0.74635553,  0.5572152 ,  0.088338  ,  0.8196473 ,
       -0.91106004,  0.7305967 , -0.8920108 ,  0.9886432 ,  0.        ,
        0.        ,  0.        ,  0.        ], dtype=float32)}


In [34]:
"""
This is a single agent actor-critic implementation.
It is specifically designed to work with the four rooms environment, but it should work with any gym environment.
"""

import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd


def obs_to_tensor(obs: np.array) -> torch.Tensor:
    # initialize an empty np array
    obs_tensor = np.array([])
    for _, value in obs.items():
        obs_tensor = np.concatenate((obs_tensor, value))
    return torch.Tensor(obs_tensor)



In [35]:
env.observation_space.shape

AttributeError: 'function' object has no attribute 'shape'

In [43]:
class ActorCriticSingleAgent(nn.Module):
    """
    A simple, but flexible, implementation of the actor-critic algorithm.
    """
    def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
        super(ActorCriticSingleAgent, self).__init__()

        self.num_actions = num_actions
        # estimate the value function
        self.critic_linear1 = nn.Linear(num_inputs, hidden_size)
        self.critic_linear2 = nn.Linear(hidden_size, 1)

        # estimate the policy distribution
        self.actor_linear1 = nn.Linear(num_inputs, hidden_size)
        self.actor_linear2 = nn.Linear(hidden_size, num_actions)
    
    def forward(self, state):
        """
        Do inference to calculate the action probabilities and the state value.
        """
        value = F.relu(self.critic_linear1(state))
        value = self.critic_linear2(value)
        
        policy_dist = F.relu(self.actor_linear1(state))
        # softmax effectively generates a probability for each of our output options
        policy_dist = F.softmax(self.actor_linear2(policy_dist))

        return value, policy_dist

In [None]:
"""
This doesn't quite use entropy correctly and it doesn't bootstrap correctly.
BUT. it is a start.
"""
def a2c(env, hidden_size, learning_rate, num_episodes, num_steps, gamma=0.99, num_classes=11):
    num_inputs = env.observation_space.shape[0] * num_classes
    num_outputs = env.action_space.n
    
    actor_critic = ActorCriticSingleAgent(num_inputs, num_outputs, hidden_size)
    # Use Adam optimizer for the actor-critic because it should converge faster than SGD and generalization may not be super important
    ac_optimizer = optim.Adam(actor_critic.parameters(), lr=learning_rate)

    # all episode length
    all_lengths = []
    # average episode length
    average_lengths = []
    # all episode rewards
    all_rewards = []
    
    entropy_term = 0

    for episode in range(num_episodes):
        log_probs = []
        values = []
        rewards = []

        state = env.reset()
        for steps in range(num_steps):
            # state_tensor = torch.flatten(F.one_hot(torch.tensor(state, dtype=torch.int64), num_classes=num_classes))
            value, policy_dist = actor_critic.forward(state_tensor.float())
            # drop the tensor dimension and computational graph info
            value = value.detach().numpy()
            dist = policy_dist.detach().numpy() 

            # Sample an action according the probs the network just output.
            action = np.random.choice(num_outputs, p=np.squeeze(dist))
            # Calculate the log probability of the action we've taken
            log_prob = torch.log(policy_dist.squeeze(0)[action])
            # 
            entropy = -np.sum(np.mean(dist) * np.log(dist))
            new_state, reward, done, _, _ = env.step(action)

            rewards.append(reward)
            values.append(value)
            log_probs.append(log_prob)
            entropy_term += entropy
            state = new_state
            
            if done or steps == num_steps-1:
                # state_tensor = torch.flatten(F.one_hot(torch.tensor(new_state, dtype=torch.int64), num_classes=num_classes))
                Qval, _ = actor_critic.forward(state_tensor.float())
                Qval = Qval.detach().numpy()
                all_rewards.append(np.sum(rewards))
                all_lengths.append(steps)
                average_lengths.append(np.mean(all_lengths[-10:]))
                if episode % 10 == 0:
                    # Where total length is the number of steps taken in the episode and average length is average steps in all episodes seen 
                    sys.stdout.write("episode: {}, reward: {}, total length: {}, average length: {} \n".format(episode, np.sum(rewards), steps, average_lengths[-1]))
                break
        
        # compute Q values
        # These are the rewards plus discounted state values to calculate the advantage
        Qvals = np.zeros_like(values)
        for t in reversed(range(len(rewards))):
            Qval = rewards[t] + gamma * Qval
            Qvals[t] = Qval
  
        #update actor critic
        values = torch.FloatTensor(values)
        Qvals = torch.FloatTensor(Qvals)
        log_probs = torch.stack(log_probs)
        
        advantage = Qvals - values
        actor_loss = (-log_probs * advantage).mean()
        critic_loss = 0.5 * advantage.pow(2).mean()
        ac_loss = actor_loss + critic_loss + 0.001 * entropy_term

        ac_optimizer.zero_grad()
        ac_loss.backward()
        ac_optimizer.step()

    return all_rewards, all_lengths, average_lengths

In [49]:
class A2C:
    def __init__(self, env, obs_size, hidden_size, output_size, obs_to_tensor_fn, learning_rate, num_episodes, num_steps, gamma) -> None:
        self.env = env
        self.obs_size = obs_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.obs_to_tensor_fn = obs_to_tensor_fn
        self.learning_rate = learning_rate
        self.num_episodes = num_episodes
        self.num_steps = num_steps
        self.gamma = gamma

        self.model = ActorCriticSingleAgent(self.obs_size, self.output_size, self.hidden_size, self.learning_rate)
    
    def _get_action_tensors(value, distance):
        

    def train(self):
        # Use Adam optimizer for the actor-critic because it should converge faster than SGD and generalization may not be super important
        ac_optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        # all episode length
        all_lengths = []
        # average episode length
        average_lengths = []
        # all episode rewards
        all_rewards = []
        
        entropy_term = 0

        for episode in range(self.num_episodes):
            log_probs = []
            values = []
            rewards = []

            state = env.reset()
            for steps in range(self.num_steps):
                state_tensor = self.obs_to_tensor_fn(state)
                # state_tensor = torch.flatten(F.one_hot(torch.tensor(state, dtype=torch.int64), num_classes=num_classes))
                value, policy_dist = self.model.forward(state_tensor.float())
                # drop the tensor dimension and computational graph info
                value = value.detach().numpy()
                dist = policy_dist.detach().numpy() 

                # TODO divide the action space for each agent up into sections.
                # select each action accordingly.
                # TODO use the parallel API in a separate function to apply these actions
                actions = {agent: env.action_space(agent).sample() for agent in env.agents}  # this is where you would insert your policy
                observations, rewards, terminations, truncations, infos = env.step(actions) 
                
                # Sample an action according the probs the network just output.
                action = np.random.choice(self.output_size, p=np.squeeze(dist))

                # Calculate the log probability of the action we've taken
                log_prob = torch.log(policy_dist.squeeze(0)[action])
                # 
                entropy = -np.sum(np.mean(dist) * np.log(dist))
                new_state, reward, done, _, _ = env.step(action)

                rewards.append(reward)
                values.append(value)
                log_probs.append(log_prob)
                entropy_term += entropy
                state = new_state
                
                if done or steps == self.num_steps - 1:
                    # state_tensor = torch.flatten(F.one_hot(torch.tensor(new_state, dtype=torch.int64), num_classes=num_classes))
                    Qval, _ = self.model.forward(state_tensor.float())
                    Qval = Qval.detach().numpy()
                    all_rewards.append(np.sum(rewards))
                    all_lengths.append(steps)
                    average_lengths.append(np.mean(all_lengths[-10:]))
                    if episode % 10 == 0:
                        # Where total length is the number of steps taken in the episode and average length is average steps in all episodes seen 
                        sys.stdout.write("episode: {}, reward: {}, total length: {}, average length: {} \n".format(episode, np.sum(rewards), steps, average_lengths[-1]))
                    break
            
            # compute Q values
            # These are the rewards plus discounted state values to calculate the advantage
            Qvals = np.zeros_like(values)
            for t in reversed(range(len(rewards))):
                Qval = rewards[t] + self.gamma * Qval
                Qvals[t] = Qval
    
            #update actor critic
            values = torch.FloatTensor(values)
            Qvals = torch.FloatTensor(Qvals)
            log_probs = torch.stack(log_probs)
            
            advantage = Qvals - values
            actor_loss = (-log_probs * advantage).mean()
            critic_loss = 0.5 * advantage.pow(2).mean()
            ac_loss = actor_loss + critic_loss + 0.001 * entropy_term

            ac_optimizer.zero_grad()
            ac_loss.backward()
            ac_optimizer.step()

In [50]:
hidden_size = 64
learning_rate = 3e-4

# Constants
gamma = 0.99
max_episodes = 5000
num_steps = 10_000

obs_dict = env.reset()
obs_size = obs_to_tensor(obs_dict).shape[0]
output_size = env.action_space(list(obs_dict.keys())[0]).n * env.num_agents
ac_class = A2C(env, obs_size, hidden_size, output_size, obs_to_tensor, learning_rate, num_episodes=max_episodes, num_steps=num_steps, gamma=gamma)
all_rewards, all_lengths, average_lengths = ac_class.train()


  policy_dist = F.softmax(self.actor_linear2(policy_dist))


TypeError: 'int' object is not subscriptable