In [39]:
from pettingzoo.mpe import simple_spread_v2

env = simple_spread_v2.parallel_env(N=3, local_ratio=0.5, max_cycles=500, continuous_actions=False)

import pettingzoo
pettingzoo.__version__
import sys
import torch  
import gym
# from tqdm import tqdm
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd

In [40]:
class A2CTD0:
    def __init__(self, env, obs_size, hidden_size, output_size, learning_rate, num_episodes, num_steps, gamma) -> None:
        self.env = env
        self.obs_size = obs_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.num_episodes = num_episodes
        self.num_steps = num_steps
        self.gamma = gamma

        # self.model = ActorCriticSingleAgent(self.obs_size * self.env.observation_space.shape[0] + 1, self.output_size, self.hidden_size, self.learning_rate)
    
    def _get_action(self, actor_act_pred):
        # Sample an action according the probs the network just output.
        action = np.random.choice(self.output_size, p=np.squeeze(actor_act_pred))
        return action

    def _obs_to_tensor(self, obs):
        state_tensor = torch.flatten(F.one_hot(torch.tensor(obs, dtype=torch.int64), num_classes=self.obs_size))
        # add a bias bit with a value of 1 in front of the one hot vector
        state_tensor = torch.cat((torch.tensor([1.0]), state_tensor))
        return state_tensor

    def _show_episode_results(self, episode, steps, state, all_losses, average_lengths, frequency):
        if episode % frequency == 0:
            # make an array of the value of each state currently
            # make an empty array for the values of each state of shape 10x10
            state_values = np.zeros((10, 10))
            for i in range(1, 11):
                for j in range(1, 11):
                    state_tensor = self._obs_to_tensor([i, j])
                    value, _ = self.model.forward(state_tensor.float())
                    state_values[i-1][j-1] = value.detach().numpy()
            # show the value of each state with matplot lib
            # plt.imshow(state_values)
            # plt.show()
                # sys.stdout.write("\nState: " + str(i) + " Value: " + str(state_values))
            sys.stdout.write("\nLoss: " + str(all_losses[-1]) 
                # + "\nCurrent State: " + str(state)
                )
            # Where total length is the number of steps taken in the episode and average length is average steps in all episodes seen 
            sys.stdout.write("\nepisode: {}, total length: {}, average length of prev 10: {} \n".format(episode, steps, average_lengths[-1]))

    def train(self):
        # Use Adam optimizer for the actor-critic because it should converge faster than SGD and generalization may not be super important
        ac_optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        # all episode length
        all_lengths = []
        episode_rewards = []
        # average episode length
        average_lengths = []
        all_entropies = []
        all_losses = []

        for episode in range(self.num_episodes):
            rewards = []
            done = False
            i = 1

            state = self.env.reset()
            steps = 0
            # enable n step actor critic. 
            while not done:
                state_tensor = self._obs_to_tensor(state)
                critic_td_error, actor_act_pred = self.model.forward(state_tensor.float())
                # drop the tensor dimension and computational graph info
                detached_act_pred = actor_act_pred.detach().numpy()
                action = self._get_action(detached_act_pred)

                # Calculate the log probability of the action we've taken
                log_prob = torch.distributions.Categorical(actor_act_pred).log_prob(torch.tensor(action))

                # Calculate the entropy/ uncertainty of the policy term. This is used to encourage exploration
                entropy = -np.sum(np.mean(detached_act_pred) * np.log(detached_act_pred))
                new_state, reward, done, _, _ = self.env.step(action)
                steps += 1

                state = new_state
                
                state_tensor = self._obs_to_tensor(state)
                if not done:
                  q_val_tensor, _ = self.model.forward(state_tensor.float())
                else:
                    q_val_tensor = torch.tensor(0)
                    all_lengths.append(steps)
                    average_lengths.append(np.mean(all_lengths[-10:]))
                td_error = reward + self.gamma * q_val_tensor - critic_td_error
        
                # update actor critic
                actor_loss = -log_prob * (reward + self.gamma * q_val_tensor.item() - critic_td_error.item())
                actor_loss *= i
                critic_loss = 0.5 * td_error ** 2
                critic_loss *= i

                ac_loss = actor_loss + critic_loss + 0.001 * entropy

                ac_optimizer.zero_grad()
                ac_loss.backward()
                ac_optimizer.step()
                # all_entropies.append(entropy)
                all_losses.append(ac_loss.detach().numpy())
                i *= self.gamma
            if done:
              episode_rewards.append(np.sum(rewards))
            self._show_episode_results(episode, steps, state, all_losses, average_lengths)
            
        return all_lengths, average_lengths, all_entropies, all_losses, episode_rewards

In [41]:
"""
This is a single agent actor-critic implementation.
It is specifically designed to work with the four rooms environment, but it should work with any gym environment.
"""

import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd


def obs_to_tensor(obs: np.array) -> torch.Tensor:
    # initialize an empty np array
    obs_tensor = np.array([])
    for _, value in obs.items():
        obs_tensor = np.concatenate((obs_tensor, value))
    return torch.Tensor(obs_tensor)



In [42]:
env.observation_space.shape

AttributeError: 'function' object has no attribute 'shape'

In [43]:
class ActorCriticCentralizedMultiAgent(nn.Module):
    """
    A simple, but flexible, implementation of the actor-critic algorithm.
    """
    def __init__(self, num_inputs, num_actions, actions_per_agent, hidden_size, learning_rate=3e-4):
        super(ActorCriticCentralizedMultiAgent, self).__init__()

        self.num_actions = num_actions
        self.actions_per_agent = actions_per_agent
        # estimate the value function
        self.critic_linear1 = nn.Linear(num_inputs, hidden_size)
        self.critic_linear2 = nn.Linear(hidden_size, len(self.actions_per_agent))

        # estimate the policy distribution
        self.actor_linear1 = nn.Linear(num_inputs, hidden_size)
        self.actor_linear2 = nn.Linear(hidden_size, num_actions)
    
    def forward(self, state):
        """
        Do inference to calculate the action probabilities and the state value.
        """
        value = F.relu(self.critic_linear1(state))
        value = self.critic_linear2(value)
        
        policy_dist = F.relu(self.actor_linear1(state))
        # softmax effectively generates a probability for each of our output options
        policy_dist = self.actor_linear2(policy_dist)
        # reshape the policy dist for each actor
        policy_dist = policy_dist.view((len(self.actions_per_agent), self.actions_per_agent[0]))

        policy_dist = F.softmax(policy_dist, 1)

        return value, policy_dist

In [55]:
class A2CTDNCentralizedMultiAgent(A2CTD0):
    def __init__(self, env, obs_size, hidden_size, output_size, actions_per_agent, learning_rate, num_episodes, num_steps, gamma) -> None:
        super().__init__(env, obs_size, hidden_size, output_size, learning_rate, num_episodes, num_steps, gamma)
        self.actions_per_agent = actions_per_agent

        self.model = ActorCriticCentralizedMultiAgent(self.obs_size + 1, self.output_size, actions_per_agent, self.hidden_size, self.learning_rate)
    
    def _get_actions(self, actor_act_pred, num_agents):
        # Sample an action according the probs the network just output.
        idx = 0
        actions = {}
        for agent in env.agents:
            action_space_size = env.action_space(agent).n
            action = np.random.choice(action_space_size, p=np.squeeze(actor_act_pred[idx]))
            actions[agent] = action
            idx += 1
        return actions

    def _show_episode_results(self, episode, steps, state, all_losses, average_lengths, rewards, frequency):
        if episode % frequency == 0:
            # make an array of the value of each state currently
            # make an empty array for the values of each state of shape 10x10
            # state_values = np.zeros((10, 10))
            # for i in range(1, 11):
            #     for j in range(1, 11):
            #         state_tensor = self._obs_to_tensor([i, j])
            #         value, _ = self.model.forward(state_tensor.float())
            #         state_values[i-1][j-1] = value.detach().numpy()
            # # show the value of each state with matplot lib
            # # plt.imshow(state_values)
            # # plt.show()
            #     # sys.stdout.write("\nState: " + str(i) + " Value: " + str(state_values))
            sys.stdout.write("\nLoss: " + str(all_losses[-1]) + "\nRewards: " + str(rewards[-1]) + "\n" + "Average last 20 Rewards" + str(np.mean(rewards[-20:])))
                # + "\nCurrent State: " + str(state))
            # Where total length is the number of steps taken in the episode and average length is average steps in all episodes seen 
            sys.stdout.write("\nepisode: {}, ")
            #total length: {}, average length of prev 10: {} \n".format(episode, steps, average_lengths[-1]))

    def _obs_to_tensor(self, obs: np.array) -> torch.Tensor:
        # initialize an empty np array
        obs_tensor = np.array([1.0])
        for _, value in obs.items():
            obs_tensor = np.concatenate((obs_tensor, value))
        return torch.Tensor(obs_tensor)

    def _combine_rewards(self, rewards):
        """
        Square the rewards, if they are negative initially add a negative sign to the squared value.
        Then sum the squares
        """
        return sum([r ** 2 if r > 0 else -(r ** 2) for r in rewards.values()])

    def _get_log_probs(self, actor_act_pred, actions):
        start = 0
        log_probs = []
        for agent in env.agents:
            action_space_size = env.action_space(agent).n
            log_probs.append(torch.distributions.Categorical(actor_act_pred[start: start + action_space_size]).log_prob(torch.tensor(actions[agent])))
            start += action_space_size
        return torch.concatenate(log_probs)

    def train(self):
        # Use Adam optimizer for the actor-critic because it should converge faster than SGD and generalization may not be super important
        ac_optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        # all episode length
        all_lengths = []
        episode_rewards = []
        # average episode length
        average_lengths = []
        all_entropies = []
        all_losses = []

        for episode in range(self.num_episodes):
            single_episode_rewards = []
            done = False
            i = 1
            G = 0
            state = self.env.reset()
            steps = 0
            step_rewards = []
            states = [state]
            dones = []
            actions = []

            # enable n step actor critic. 
            while True:
                state_tensor = self._obs_to_tensor(state)
                _, actor_act_pred = self.model.forward(state_tensor.float())
                # drop the tensor dimension and computational graph info
                detached_act_pred = actor_act_pred.detach().numpy()
                action = self._get_actions(detached_act_pred, len(self.env.agents))

                actions.append(action)

                # Calculate the entropy/ uncertainty of the policy term. This is used to encourage exploration
                # entropy = -np.sum(np.mean(detached_act_pred) * np.log(detached_act_pred))
                # entropy_term += entropy
                new_state, rewards, done, _, _ = self.env.step(action)
                # done is a dict of agents. We only care if all agents are done
                done = all(done.values()) or len(self.env.agents) < 2
                if done:
                    break
                steps += 1

                state = new_state
                step_rewards.append(self._combine_rewards(rewards))
                dones.append(done)
                states.append(new_state)

                if len(step_rewards) >= self.num_steps:
                    if len(step_rewards) > self.num_steps:
                        step_rewards.pop(0)
                        states.pop(0)
                        dones.pop(0)
                        actions.pop(0)

                    # calculate G
                    G = 0
                    for idx, r in enumerate(step_rewards):
                        # account for the dones vector too
                        G += r * (self.gamma ** idx) * (1 - dones[idx])
                    single_episode_rewards.append(G)

                    critic_val, actor_act_pred = self.model.forward(self._obs_to_tensor(states[0]).float())
                    if len(env.agents) < 2:
                        print('where agent')
                    log_probs = self._get_log_probs(actor_act_pred, actions[0])
                    next_state_tensor = self._obs_to_tensor(states[-1])
                    if not dones[0]:
                        next_critic_value_tensor, _ = self.model.forward(next_state_tensor.float())
                    else:
                        next_critic_value_tensor = torch.zeros(3)
                    # calculate the n step td error
                    # td error = G + gamma ** self.num_steps * V(s_{t+n}) - V(s_t)
                    td_error = G + self.gamma ** self.num_steps * next_critic_value_tensor.T - critic_val.T


                    # update actor critic
                    actor_loss = -log_probs * (G + self.gamma * next_critic_value_tensor.T.detach() - critic_val.T.detach())
                    actor_loss *= i
                    critic_loss = 0.5 * td_error ** 2

                    ac_loss = actor_loss + critic_loss# + 0.001 * entropy

                    ac_optimizer.zero_grad()
                    ac_loss.backward(torch.ones_like(ac_loss))
                    ac_optimizer.step()
                    # all_entropies.append(entropy)
                    all_losses.append(ac_loss.detach().numpy())
                    i *= self.gamma


            all_lengths.append(steps)
            average_lengths.append(np.mean(all_lengths[-10:]))
            episode_rewards.append(np.sum(single_episode_rewards))
            self._show_episode_results(episode, steps, states, all_losses, average_lengths, episode_rewards, 20)
            
        return all_lengths, average_lengths, all_entropies, all_losses, episode_rewards

In [56]:
hidden_size = 64
learning_rate = 3e-4

# Constants
gamma = 0.99
max_episodes = 5000
num_steps = 5

obs_dict = env.reset()
obs_size = obs_to_tensor(obs_dict).shape[0]
output_size = 0
actions_per_agent = []

for i in range(len(env.agents)):
    output_size += env.action_space(env.agents[i]).n
    actions_per_agent.append(env.action_space(list(obs_dict.keys())[i]).n)

agent = A2CTDNCentralizedMultiAgent(env, obs_size, hidden_size, output_size, actions_per_agent, learning_rate, num_episodes=max_episodes, num_steps=num_steps, gamma=gamma)

all_lengths, average_lengths, all_entropies, all_losses, episode_rewards = agent.train()



Loss: [7.5072326e+08 7.5074707e+08 7.5067213e+08]
Rewards: -3582140.91823447
Average last 20 Rewards-3582140.91823447
episode: {}, 
Loss: [4630851.5 4578144.5 4675868.5]
Rewards: -16277.923836848233
Average last 20 Rewards-22311073.074612103
episode: {}, 
Loss: [1.7031249e+10 1.7039343e+10 1.7026545e+10]
Rewards: -25644970.351527277
Average last 20 Rewards-26604994.800037567
episode: {}, 
Loss: [92793912. 92341032. 93078040.]
Rewards: -129081.4231264782
Average last 20 Rewards-4491607.552188614
episode: {}, 
Loss: [65637148. 65384148. 65786792.]
Rewards: -165522.9019661791
Average last 20 Rewards-3101385.235277614
episode: {}, 
Loss: [20367354. 20274122. 20413658.]
Rewards: -2362148.546373754
Average last 20 Rewards-3128460.223196623
episode: {}, 
Loss: [2.5506493e+09 2.5520645e+09 2.5499843e+09]
Rewards: -17621238.742187425
Average last 20 Rewards-22935844.20390937
episode: {}, 
Loss: [32084056. 32183230. 31983960.]
Rewards: -6746882.193696046
Average last 20 Rewards-5589141.83900211