In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.distributions import MultivariateNormal
import numpy as np

%load_ext autoreload
%autoreload 2

import pandas as pd
import matplotlib.pyplot as plt
import json
from typing import Tuple, Optional, Dict
import torch
import gymnasium as gym

from env import MazeEnv

In [2]:
class MemoryBuffer:
    '''Simple buffer to collect experiences and clear after each update.'''
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.dones = []
        self.state_values = []
    
    def clear_buffer(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.dones[:]
        del self.state_values[:]
    
    def get_ordered_trajectories(self, n_agents=None):
        ordered_actions = torch.FloatTensor()
        ordered_states = torch.FloatTensor()
        ordered_logprobs = torch.FloatTensor()
        ordered_rewards = []
        ordered_dones = []
        
        actions = torch.stack(self.actions)
        states = torch.stack(self.states)
        logprobs = torch.stack(self.logprobs)

        self.ordered_actions = torch.FloatTensor()
        for index in range(actions.shape[1]):
            if n_agents !=None and n_agents == index+1:
                break
            ordered_states = torch.cat((ordered_states, states[:, index]), 0)
            ordered_actions = torch.cat((ordered_actions, actions[:, index]), 0)
            ordered_logprobs = torch.cat((ordered_logprobs, logprobs[:, index]), 0)
            ordered_rewards.extend(np.asarray(self.rewards)[:, index])
            ordered_dones.extend(np.asarray(self.dones)[:, index])

        return ordered_states, ordered_actions, ordered_logprobs, ordered_rewards, ordered_dones

In [None]:
class ActorCritic(nn.Module):
    def __init__(self, state_size, action_size, action_std=0.5, hidden_size=32, low_policy_weights_init=True):
        super().__init__()

        self.actor_fc1 = nn.Linear(state_size, 2*hidden_size)
        self.actor_fc2 = nn.Linear(2*hidden_size, 2*hidden_size)
        self.actor_fc3 = nn.Linear(2*hidden_size, hidden_size)

        self.actor_mu = nn.Linear(hidden_size, action_size)
        self.actor_sigma = nn.Linear(hidden_size, action_size)
        
        
        self.critic_fc1 = nn.Linear(state_size, 2*hidden_size)
        self.critic_fc2 = nn.Linear(2*hidden_size, 2*hidden_size)
        self.critic_fc3 = nn.Linear(2*hidden_size, hidden_size)

        self.critic_value = nn.Linear(hidden_size, 1)

        self.distribution = torch.distributions.Normal

        self.action_var = torch.full((action_size,), action_std*action_std)
        
        # Boosts training performance in the beginning
        if low_policy_weights_init:
            with torch.no_grad():
                self.actor_mu.weight.mul_(0.01)

    def forward(self, state):
        x = torch.tanh(self.actor_fc1(state))
        x = torch.tanh(self.actor_fc2(x))
        x = torch.tanh(self.actor_fc3(x))
        mu = torch.tanh(self.actor_mu(x))
        sigma = F.softplus(self.actor_sigma(x))

        v = torch.tanh(self.critic_fc1(state))
        v = torch.tanh(self.critic_fc2(v))
        v = torch.tanh(self.critic_fc3(v))
        state_value = self.critic_value(v)

        return mu, sigma, state_value 

    def act(self, state):
        '''Choose action according to the policy.'''
        action_mu, action_sigma, state_value = self.forward(state)

        action_var = self.action_var.expand_as(action_mu)
        cov_mat = torch.diag_embed(action_var)
        dist = MultivariateNormal(action_mu, cov_mat)
        action = dist.sample()
        log_prob = dist.log_prob(action)

        return action.detach(), log_prob.detach()
    
    def evaluateStd(self, state, action):
        '''Evaluate action using learned std value for distribution.'''
        action_mu, action_sigma, state_value = self.forward(state)
        m = self.distribution(action_mu.squeeze(), action_sigma.squeeze())
        log_prob = m.log_prob(action)

        return log_prob, state_value

    def evaluate(self, state, action):
        '''Evaluate action for a given state.'''   
        action_mean, _, state_value = self.forward(state)
        
        action_var = self.action_var.expand_as(action_mean)
        cov_mat = torch.diag_embed(action_var)
        
        dist = MultivariateNormal(action_mean, cov_mat)
        
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        
        return action_logprobs, torch.squeeze(state_value), dist_entropy

In [3]:
def create_mlp(input_shape: Tuple[int], n_actions: int, hidden_sizes: list = [128, 128]):
    """
    Simple Multi-Layer Perceptron network
    
    param input_shape: Shape of input tensor
    param n_actions: Number of actions to output
    param hidden_sizes: List of sizes of hidden layers
    """
    net_layers = []
    
    net_layers.append(torch.nn.Linear(input_shape[0],hidden_sizes[0]))
    net_layers.append(torch.nn.ReLU())
    for i in range(1,len(hidden_sizes)):
        net_layers.append(torch.nn.Linear(hidden_sizes[i-1],hidden_sizes[i]))
        net_layers.append(torch.nn.ReLU())
    net_layers.append(torch.nn.Linear(hidden_sizes[-1],n_actions))
    #net_layers.append(torch.nn.Softmax(-1))

    
    return torch.nn.Sequential(*net_layers)

In [4]:
class DiscreteActor(torch.nn.Module):

    def __init__(self, actor_net: torch.nn.Module):
        super().__init__()

        self.actor_net = actor_net

    def forward(self, states) -> Tuple[torch.distributions.Categorical, torch.Tensor]:
        '''

        :param states: state of environment
        :return: Probabilities of actions and chosen action
        '''
        logits = self.actor_net(states)
        pi = torch.distributions.Categorical(logits=logits)
        action = pi.sample()
        return pi, action

In [5]:
class ActorCritic(object):
    """
    Actor Critic Agent used during trajectory collection. It returns a
    distribution and an action given an observation.

    """

    def __init__(self, actor_net: torch.nn.Module, critic_net: torch.nn.Module):
        self.actor_net = actor_net
        self.critic_net = critic_net

    @torch.no_grad()
    def __call__(self, state: torch.Tensor) -> Tuple[
        torch.distributions.Categorical, torch.Tensor, torch.Tensor, torch.Tensor]:
        '''
        :param state:
        :return: Categorical distribution, action, log probability of action, critic value of state
        '''
        
        # TODO: Calculate the probabilities and the action chosen by the actor as well as the value returned by the critic
        pi, _ = self.actor_net(state)

        action = pi.sample()
        log_p = pi.log_prob(action)
        value = self.critic_net(state)
        return pi, action, log_p, value

In [6]:
class Actor(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=32, low_policy_weights_init=True):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_size, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, action_size)
        )
    
    def forward(self, x):
        return self.fc(x)
    

class Critic(nn.Module):
    def __init__(self, state_size, hidden_size=32):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_size, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, 1)
        )
    
    def forward(self, x):
        return self.fc(x)

In [70]:
import torch.distributions as dist

class MAPPOAgent:
    def __init__(self, state_size, action_size, n_agents, lr=3e-4):
        self.n_agents = n_agents
        self.actor = Actor(state_size, action_size)
        self.critic = Critic(state_size)
        self.optim_actor = torch.optim.Adam(self.actor.parameters(), lr=lr)
        self.optim_critic = torch.optim.Adam(self.critic.parameters(), lr=lr)

    def select_actions(self, states):
        """ Sélectionne les actions pour tous les agents en parallèle """
        states = torch.tensor(states, dtype=torch.float32)  # (n_agents, obs_dim)
        probs = self.actor(states)  # (n_agents, act_dim)
        distribution = dist.Categorical(logits=probs)
        actions = distribution.sample()  # (n_agents,)
        log_probs = distribution.log_prob(actions)  # (n_agents,)
        return actions.detach().numpy(), log_probs.detach().numpy()  # Retourne toutes les actions et log_probs
    
    def compute_loss(self, states, actions, log_probs_old, rewards, dones, gamma=0.99, clip_eps=0.2):
        states = torch.tensor(states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.long)
        log_probs_old = torch.tensor(log_probs_old, dtype=torch.float32)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        
        # Compute advantage
        values = self.critic(states).squeeze()
        dones = torch.tensor(dones, dtype=torch.float32)
        returns = rewards + gamma * values * (1 - dones)
        advantage = returns - values.detach()

        # Compute new log_probs
        probs = self.actor(states)
        dist_new = dist.Categorical(logits=probs)
        log_probs_new = dist_new.log_prob(actions)

        # PPO Clip loss
        ratio = torch.exp(log_probs_new - log_probs_old)
        clipped_ratio = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps)
        actor_loss = -torch.min(ratio * advantage, clipped_ratio * advantage).mean()

        # Critic loss (MSE loss)
        critic_loss = F.mse_loss(values, returns)

        return actor_loss, critic_loss


In [71]:
class ReplayBuffer:
    def __init__(self):
        self.memory = []

    def store(self, trajectory):
        self.memory.append(trajectory)

    def get_data(self):
        return self.memory


In [72]:
def simulation_config(config_path: str, new_agent: bool = True):
    """
    Configure the environment and optionally an agent using a JSON configuration file.

    Args:
        config_path (str): Path to the configuration JSON file.
        new_agent (bool): Whether to initialize the agent. Defaults to True.

    Returns:
        Tuple[MazeEnv, Optional[MyAgent], Dict]: Configured environment, agent (if new), and the configuration dictionary.
    """
    
    # Read config
    with open(config_path, 'r') as config_file:
        config = json.load(config_file)

    # Env configuration
    env = MazeEnv(
        size=config.get('grid_size'),                               # Grid size
        walls_proportion=config.get('walls_proportion'),            # Walls proportion in the grid
        num_dynamic_obstacles=config.get('num_dynamic_obstacles'),  # Number of dynamic obstacles
        num_agents=config.get('num_agents'),                        # Number of agents
        communication_range=config.get('communication_range'),      # Maximum distance for agent communications
        max_lidar_dist_main=config.get('max_lidar_dist_main'),      # Maximum distance for main LIDAR scan
        max_lidar_dist_second=config.get('max_lidar_dist_second'),  # Maximum distance for secondary LIDAR scan
        max_episode_steps=config.get('max_episode_steps'),          # Number of steps before episode termination
        render_mode=config.get('render_mode', None),
        seed=config.get('seed', None)                               # Seed for reproducibility
    )

    # Agent configuration
    agents = MAPPOAgent(state_size=env.single_agent_state_size,action_size=env.action_space.n,n_agents=2) if new_agent else None

    return env, agents, config

In [82]:
env, agent, config = simulation_config('config.json',new_agent=True)
n_agents = env.num_agents
buffer = ReplayBuffer()

for episode in range(1000):
    states, info = env.reset()  # (n_agents, obs_dim)
    episode_rewards = []

    for step in range(200):
        actions, log_probs = agent.select_actions(states)  # Récupérer toutes les actions
        actions = actions.tolist()
        log_probs = log_probs.tolist()
        next_states, rewards, dones, _ ,info= env.step(actions)  # Exécuter toutes les actions
        #convert to list
        # print(dones)
        # terminal = list(dones)
        
        # Stocker les expériences pour chaque agent
        for i in range(n_agents):
            buffer.store((states[i], actions[i], log_probs[i], rewards[i], next_states[i], dones))

        states = next_states
        episode_rewards.append(sum(rewards))

        if dones: break  # Fin de l'épisode si tous les agents sont terminés
    
    # Train the agent after collecting data
    data = buffer.get_data()
    states, actions, log_probs_old, rewards, next_states, dones = zip(*data)
    dones = list(dones)
    actor_loss, critic_loss = agent.compute_loss(states, actions, log_probs_old, rewards, dones)
    
    # Update Actor
    agent.optim_actor.zero_grad()
    actor_loss.backward(retain_graph=True)
    agent.optim_actor.step()

    # Update Critic
    agent.optim_critic.zero_grad()
    critic_loss.backward()
    agent.optim_critic.step()

    print(f"Episode {episode}, Reward: {sum(episode_rewards)},Evacuated: {len(info['evacuated_agents'])},Deactivated: {len(info['deactivated_agents'])}, Actor Loss: {actor_loss}, Critic Loss: {critic_loss}")

Episode 0, Reward: -469.33000000000004,Evacuated: 0,Deactivated: 2, Actor Loss: 3.7302918434143066, Critic Loss: 57.83147048950195
Episode 1, Reward: -39.92,Evacuated: 0,Deactivated: 2, Actor Loss: 3.7067160606384277, Critic Loss: 63.89392852783203
Episode 2, Reward: -618.94,Evacuated: 0,Deactivated: 2, Actor Loss: 3.446608066558838, Critic Loss: 61.27943420410156
Episode 3, Reward: -39.980000000000004,Evacuated: 0,Deactivated: 2, Actor Loss: 3.512723684310913, Critic Loss: 63.3642692565918
Episode 4, Reward: -309.36999999999995,Evacuated: 0,Deactivated: 2, Actor Loss: 3.324774742126465, Critic Loss: 63.43502426147461
Episode 5, Reward: -129.63,Evacuated: 0,Deactivated: 2, Actor Loss: 3.191117763519287, Critic Loss: 65.18900299072266
Episode 6, Reward: -319.3,Evacuated: 0,Deactivated: 2, Actor Loss: 2.9912710189819336, Critic Loss: 63.31642532348633
Episode 7, Reward: -519.16,Evacuated: 0,Deactivated: 2, Actor Loss: 3.0463156700134277, Critic Loss: 62.45793914794922
Episode 8, Reward: 

In [None]:
def evaluate(configs_paths: list, trained_agent, num_episodes: int = 10) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Evaluate a trained agent on multiple configurations, calculate metrics, and visualize results.

    Args:
        config_path (list): List of paths to the configuration JSON files.
        trained_agent (MyAgent): A pre-trained agent to evaluate.
        num_episodes (int): Number of episodes to run for evaluation per configuration. Defaults to 10.

    Returns:
        pd.DataFrame: A DataFrame containing evaluation metrics for each episode and configuration.
    """

    # Evaluation results
    all_results = pd.DataFrame()

    for config_path in configs_paths:
        print(f"\n--- Evaluating Configuration: {config_path} ---")

        # Environment configuration
        env, _, config = simulation_config(config_path, new_agent=False)

        # Metrics to follow the performance
        metrics = []
        total_reward = 0
        episode_count = 0
        
        # Initial reset of the environment
        states, info = env.reset()
        
   
        # Run evaluation for the specified number of episodes
        try:
            while episode_count < num_episodes:
                # Determine agents actions
                actions, log_probs = trained_agent.select_actions(states)  # Récupérer toutes les actions
                actions = actions.tolist()
                
                next_states, rewards, dones, truncated ,info= env.step(actions)  # Exécuter toutes les actions
                
                total_reward += sum(rewards)
                # Display of the step information
                print(f"\rEpisode {episode_count + 1}/{num_episodes}, Step {info['current_step']}, "
                    f"Reward: {total_reward:.2f}, "
                    f"Evacuated: {len(info['evacuated_agents'])}, "
                    f"Deactivated: {len(info['deactivated_agents'])}", end='')
                states = next_states
                # Pause
                #time.sleep(1)

                # If the episode is terminated
                if dones or truncated:
                    print("\r")
                    # Save metrics
                    metrics.append({
                        "config_path": config_path,
                        "episode": episode_count + 1,
                        "steps": info['current_step'],
                        "reward": total_reward,
                        "evacuated": len(info['evacuated_agents']),
                        "deactivated": len(info['deactivated_agents'])
                    })

                    episode_count += 1
                    total_reward = 0

                    if episode_count < num_episodes:
                        states, info = env.reset()
        
        except KeyboardInterrupt:
            print("\nSimulation interrupted by the user")
        
        finally:
            env.close()

        # Convert the current configuration's metrics to a DataFrame
        config_results = pd.DataFrame(metrics)
        all_results = pd.concat([all_results, config_results], ignore_index=True)
    
    env.close()

    all_results.to_csv('all_results.csv', index=False)

    return all_results

In [99]:
# Load the trained agent
all_results = evaluate(configs_paths=['config.json'], trained_agent=agent)


--- Evaluating Configuration: config.json ---


UnboundLocalError: local variable 'states' referenced before assignment

In [None]:
class PPO():

    def __init__(
            self,
            gamma: float = 0.99,
            lam: float = 0.95,
            lr_actor: float = 1e-3,
            lr_critic: float = 1e-3,
            max_episode_len: float = 1000,
            batch_size: int = 512,
            steps_per_epoch: int = 2048,
            clip_ratio: float = 0.2,
    ) -> None:

        super().__init__()

        # Hyperparameters
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.steps_per_epoch = steps_per_epoch
        self.batch_size = batch_size
        self.gamma = gamma  # only needed for trajectory
        self.lam = lam  # only needed for trajectory
        self.max_episode_len = max_episode_len
        self.clip_ratio = clip_ratio  # epsilon
        self.automatic_optimization = False
        self.save_hyperparameters()

        #self.env = gym.make(env, render_mode='rgb_array')
        self.critic = create_mlp(self.env.observation_space.shape, 1)
        actor_mlp = create_mlp(self.env.observation_space.shape, self.env.action_space.n)
        self.actor = DiscreteActor(actor_mlp)

        self.agent = ActorCritic(self.actor, self.critic)

        self.batch_states = []
        self.batch_actions = []
        self.batch_adv = []
        self.batch_d_rewards = []
        self.batch_logp = []

        self.ep_rewards = []
        self.ep_values = []
        self.epoch_rewards = []

        self.episode_step = 0
        self.avg_ep_reward = 0
        self.avg_ep_len = 0
        self.avg_reward = 0

        self.state = torch.FloatTensor(self.env.reset()[0])

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Passes in a state x through the network and returns the policy and a sampled action
        Args:
            x: environment state
        Returns:
            Tuple of policy and action
        """
        action_prob, action = self.actor(x)
        value = self.critic(x)

        return action_prob, action, value

    def actor_loss(self, state, action, logp_old, advantage) -> torch.Tensor:
        """
        Calculate the actor loss.

        Args:
            state: current state of environment
            action: selected action
            logp_old: old log-probability
            advantage: advantage of action
        Returns:
            loss
        """

        # TODO: Implement the PPO Actor Loss
        pi,_ = self.actor.forward(state)
        logpi = pi.log_prob(action)

        quotient = torch.exp(logpi)/torch.exp(logp_old)
        new_adv = torch.clamp(quotient,1-self.clip_ratio,1+self.clip_ratio)*advantage
        f = lambda x: x if x < 1+self.clip_ratio else 1+self.clip_ratio
        with torch.no_grad():
            quotient.data.apply_(f)

        loss_actor = -(quotient*advantage).mean()
       
        
        return loss_actor

    def critic_loss(self, state: torch.Tensor, d_reward: torch.Tensor) -> torch.Tensor:
        """
        Calculate the critic loss.

        Args:
            state: current state of environment
            d_reward: discounted reward
        Returns:
            loss
        """
        # TODO: Implemente the PPO Critic Loss
        value = self.agent(state=state)[-1]
        loss_critic = ((value - d_reward)**2).mean()
        
        return loss_critic

    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx):
        """
        Carries out a single update to actor and critic network from a batch of replay buffer.

        Args:
            batch: batch of replay buffer/trajectory data
            batch_idx: used for logging
            optimizer_idx: idx that controls optimizing actor or critic network
        Returns:
            loss
        """
        optims = self.optimizers()
        optim = optims[0] if batch_idx % 2 == 0 else optims[1]
        optim.zero_grad()

        state, action, old_logp, d_reward, advantage = batch

        # normalize advantages
        advantage = (advantage - advantage.mean()) / advantage.std()

        self.log("avg_ep_len", self.trainer.datamodule.avg_ep_len, prog_bar=True, on_step=False, on_epoch=True)
        self.log("avg_ep_reward", self.trainer.datamodule.avg_ep_reward, prog_bar=True, on_step=False, on_epoch=True)
        self.log("avg_reward", self.trainer.datamodule.avg_reward, prog_bar=True, on_step=False, on_epoch=True)

        if batch_idx % 2 == 0:
            loss_actor = self.actor_loss(state, action, old_logp, advantage)
            self.log('loss_actor', loss_actor, on_step=False, on_epoch=True, prog_bar=True, logger=True)
            self.manual_backward(loss_actor)
            optim.step()

            return loss_actor

        elif batch_idx % 2 == 0:
            loss_critic = self.critic_loss(state, d_reward)
            self.log('loss_critic', loss_critic, on_step=False, on_epoch=True, prog_bar=False, logger=True)
            self.manual_backward(loss_critic)
            optim.step()

            return loss_critic

    def viz_agent(self):
        '''
        Visualize actions of the trained agent in environment in a loop, you should use the built in env.render() functionality
        :return:
        '''
        
        imgs = []
        fig = plt.figure()
        state = torch.FloatTensor(self.env.reset()[0])
        img = self.env.render()
        imgs.append(img)
        
         # TODO: implement full evaluation loop of environment and use env.render() to get images
        state_imgs = []
        for _ in range(self.max_episode_len):
            _, action, _, _ = self.agent(state) 
            new_state,_,terminated,*_ = self.env.step(action.cpu().numpy()) 
            state = torch.FloatTensor(new_state)
            if terminated:
                break
            state_imgs.append(self.env.render())
        imgs += state_imgs
                 
        im = plt.imshow(imgs[0])
        print('Episode length', len(imgs))
        def animate(i):
            im.set_array((imgs[i]))
            return [im]
        
        anim = FuncAnimation(fig, animate, frames=len(imgs), interval=20)
        return anim


    def configure_optimizers(self) -> List[Optimizer]:
        """ Initialize Adam optimizer"""
        optimizer_actor = optim.Adam(self.actor.parameters(), lr=self.lr_actor)
        optimizer_critic = optim.Adam(self.critic.parameters(), lr=self.lr_critic)

        return optimizer_actor, optimizer_critic