In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/ECM/IAM/Hackathon/Solution/

/content/drive/MyDrive/ECM/IAM/Hackathon/Solution


In [3]:
%ls

actor_best.pth   config.json      env.py         process_state.py  simulate.py
actor.pth        critic_best.pth  [0m[01;34meval_configs[0m/  [01;34m__pycache__[0m/
all_results.csv  critic.pth       ppo_sma.ipynb  reward.py


In [4]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.distributions import MultivariateNormal
import numpy as np

%load_ext autoreload
%autoreload 2

import pandas as pd
import matplotlib.pyplot as plt
import json
from typing import Tuple, Optional, Dict
import torch
import gymnasium as gym


from env import MazeEnv

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [6]:
class Actor(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=32, low_policy_weights_init=True):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_size, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, action_size)
        )

    def forward(self, x):
        return self.fc(x)


class Critic(nn.Module):
    def __init__(self, state_size, hidden_size=32):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_size, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.fc(x)

In [32]:
import torch.distributions as dist
from process_state import StateNormalizer

class MAPPOAgent:
    def __init__(self, state_size, action_size, n_agents, lr=1e-4, grid_size = 30, max_lidar_range = 8):
        self.n_agents = n_agents
        self.actor = Actor(state_size, action_size).to(device)
        self.critic = Critic(state_size).to(device)
        self.optim_actor = torch.optim.Adam(self.actor.parameters(), lr=lr)
        self.optim_critic = torch.optim.Adam(self.critic.parameters(), lr=lr)
        self.normalizer = StateNormalizer(grid_size, max_lidar_range)

    def select_actions(self, states):
        """ Sélectionne les actions pour tous les agents en parallèle """
        states = np.array([self.normalizer.normalize_agent_state(s) for s in states])
        states = torch.tensor(states, dtype=torch.float32, device=device)  # (n_agents, obs_dim)
        probs = self.actor(states)  # (n_agents, act_dim)
        distribution = dist.Categorical(logits=probs)
        actions = distribution.sample()  # (n_agents,)
        log_probs = distribution.log_prob(actions)  # (n_agents,)
        return actions.cpu().detach().numpy(), log_probs.cpu().detach().numpy()  # Retourne toutes les actions et log_probs

    def compute_loss(self, states, actions, log_probs_old, rewards, dones, gamma=0.99, clip_eps=0.2):
        states = np.array([self.normalizer.normalize_agent_state(s) for s in states])
        states = torch.tensor(states, dtype=torch.float32, device=device)
        actions = torch.tensor(actions, dtype=torch.long, device=device)
        log_probs_old = torch.tensor(log_probs_old, dtype=torch.float32, device=device)
        rewards = torch.tensor(rewards, dtype=torch.float32, device=device)

        # Compute advantage
        values = self.critic(states).squeeze()
        dones = torch.tensor(dones, dtype=torch.float32,device=device)
        returns = rewards + gamma * values * (1 - dones)
        advantage = returns - values.detach()
        #normalize advantage
        #advantage = (advantage - advantage.mean()) / (advantage.std()+1e-5)
        # Compute new log_probs
        probs = self.actor(states)
        dist_new = dist.Categorical(logits=probs)
        log_probs_new = dist_new.log_prob(actions)

        # PPO Clip loss
        ratio = torch.exp(log_probs_new - log_probs_old)
        clipped_ratio = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps)

        # Calcul de l'entropie pour encourager l'exploration
        entropy = dist_new.entropy().mean()  # Mesure d'incertitude de la politique
        entropy_bonus = 0.01 * entropy  # Poids ajustable de l'entropie

        actor_loss = -torch.min(ratio * advantage, clipped_ratio * advantage).mean() - entropy_bonus

        # Critic loss (MSE loss)
        noise = torch.randn_like(returns) * 1 # Petit bruit aléatoire
        returns_noisy = returns + noise
        critic_loss = F.mse_loss(values, returns_noisy)

        return actor_loss, critic_loss


In [28]:
from collections import deque
class ReplayBuffer:
    def __init__(self):
        self.memory = deque(maxlen=100000)

    def store(self, trajectory):
        self.memory.append(trajectory)

    def get_data(self):
        return self.memory


In [34]:
def simulation_config(config_path: str, new_agent: bool = True):
    """
    Configure the environment and optionally an agent using a JSON configuration file.

    Args:
        config_path (str): Path to the configuration JSON file.
        new_agent (bool): Whether to initialize the agent. Defaults to True.

    Returns:
        Tuple[MazeEnv, Optional[MyAgent], Dict]: Configured environment, agent (if new), and the configuration dictionary.
    """

    # Read config
    with open(config_path, 'r') as config_file:
        config = json.load(config_file)

    # Env configuration
    env = MazeEnv(
        size=config.get('grid_size'),                               # Grid size
        walls_proportion=config.get('walls_proportion'),            # Walls proportion in the grid
        num_dynamic_obstacles=config.get('num_dynamic_obstacles'),  # Number of dynamic obstacles
        num_agents=config.get('num_agents'),                        # Number of agents
        communication_range=config.get('communication_range'),      # Maximum distance for agent communications
        max_lidar_dist_main=config.get('max_lidar_dist_main'),      # Maximum distance for main LIDAR scan
        max_lidar_dist_second=config.get('max_lidar_dist_second'),  # Maximum distance for secondary LIDAR scan
        max_episode_steps=config.get('max_episode_steps'),          # Number of steps before episode termination
        render_mode=None,
        seed=config.get('seed', None)                               # Seed for reproducibility
    )
    num_agents = env.num_agents
    # Agent configuration
    agents = MAPPOAgent(state_size=98,action_size=env.action_space.n,n_agents=num_agents, lr=1e-5) if new_agent else None

    return env, agents, config

In [35]:
env, agent, config = simulation_config('config.json',new_agent=True)
#load agent network
agent.actor.load_state_dict(torch.load('actor_best.pth'))
agent.critic.load_state_dict(torch.load('critic_best.pth'))
n_agents = env.num_agents
buffer = ReplayBuffer()

for episode in range(1000):
    states, info = env.reset()  # (n_agents, obs_dim)
    episode_rewards = []

    for step in range(500):
        actions, log_probs = agent.select_actions(states)  # Récupérer toutes les actions
        actions = actions.tolist()
        log_probs = log_probs.tolist()
        next_states, rewards, dones, _ ,info= env.step(actions)  # Exécuter toutes les actions
        #convert to list
        # print(dones)
        # terminal = list(dones)

        # Stocker les expériences pour chaque agent
        for i in range(n_agents):
            buffer.store((states[i], actions[i], log_probs[i], rewards[i], next_states[i], dones))

        states = next_states
        episode_rewards.append(sum(rewards))

        if dones: break  # Fin de l'épisode si tous les agents sont terminés

    # Train the agent after collecting data
    data = buffer.get_data()
    states, actions, log_probs_old, rewards, next_states, dones = zip(*data)
    dones = list(dones)
    actor_loss, critic_loss = agent.compute_loss(states, actions, log_probs_old, rewards, dones)

    # Update Actor
    agent.optim_actor.zero_grad()
    actor_loss.backward(retain_graph=True)
    agent.optim_actor.step()

    # Update Critic
    agent.optim_critic.zero_grad()
    critic_loss.backward()
    agent.optim_critic.step()

    print(f"Episode {episode}, Reward: {sum(episode_rewards):.2f}, Evacuated: {len(info['evacuated_agents'])}, Deactivated: {len(info['deactivated_agents'])}, Actor Loss: {actor_loss:.2f}, Critic Loss: {critic_loss:.2f}")


Episode 0, Reward: 1973.42, Evacuated: 3, Deactivated: 1, Actor Loss: -1.24, Critic Loss: 153.82
Episode 1, Reward: 1138.13, Evacuated: 1, Deactivated: 3, Actor Loss: -1.13, Critic Loss: 136.66
Episode 2, Reward: 976.31, Evacuated: 0, Deactivated: 4, Actor Loss: -0.85, Critic Loss: 121.67
Episode 3, Reward: 859.87, Evacuated: 1, Deactivated: 3, Actor Loss: -0.99, Critic Loss: 125.19
Episode 4, Reward: 1319.36, Evacuated: 1, Deactivated: 3, Actor Loss: -0.79, Critic Loss: 125.37
Episode 5, Reward: 1288.30, Evacuated: 1, Deactivated: 3, Actor Loss: -0.76, Critic Loss: 125.39
Episode 6, Reward: 1812.53, Evacuated: 2, Deactivated: 2, Actor Loss: -0.74, Critic Loss: 125.25
Episode 7, Reward: 1767.10, Evacuated: 0, Deactivated: 4, Actor Loss: -0.48, Critic Loss: 131.59
Episode 8, Reward: 2058.53, Evacuated: 3, Deactivated: 1, Actor Loss: -0.62, Critic Loss: 134.30
Episode 9, Reward: 717.73, Evacuated: 1, Deactivated: 3, Actor Loss: -0.74, Critic Loss: 133.85
Episode 10, Reward: 1368.30, Evac

KeyboardInterrupt: 

In [36]:
def evaluate(configs_paths: list, trained_agent, num_episodes: int = 10) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Evaluate a trained agent on multiple configurations, calculate metrics, and visualize results.

    Args:
        config_path (list): List of paths to the configuration JSON files.
        trained_agent (MyAgent): A pre-trained agent to evaluate.
        num_episodes (int): Number of episodes to run for evaluation per configuration. Defaults to 10.

    Returns:
        pd.DataFrame: A DataFrame containing evaluation metrics for each episode and configuration.
    """

    # Evaluation results
    all_results = pd.DataFrame()

    for config_path in configs_paths:
        print(f"\n--- Evaluating Configuration: {config_path} ---")

        # Environment configuration
        env, _, config = simulation_config(config_path, new_agent=False)

        # Metrics to follow the performance
        metrics = []
        total_reward = 0
        episode_count = 0

        # Initial reset of the environment
        states, info = env.reset()

        evacuated_total = 0
        # Run evaluation for the specified number of episodes
        try:
            while episode_count < num_episodes:
                # Determine agents actions
                actions, log_probs = trained_agent.select_actions(states)  # Récupérer toutes les actions
                actions = actions.tolist()

                next_states, rewards, dones, truncated ,info= env.step(actions)  # Exécuter toutes les actions

                total_reward += sum(rewards)
                # Display of the step information
                print(f"\rEpisode {episode_count + 1}/{num_episodes}, Step {info['current_step']}, "
                    f"Reward: {total_reward:.2f}, "
                    f"Evacuated: {len(info['evacuated_agents'])}, "
                    f"Deactivated: {len(info['deactivated_agents'])}", end='')
                states = next_states
                # Pause
                #time.sleep(1)

                # If the episode is terminated
                if dones or truncated:
                    evacuated_total += len(info['evacuated_agents'])
                    # Display of the episode information
                    print("\r")
                    # Save metrics
                    metrics.append({
                        "config_path": config_path,
                        "episode": episode_count + 1,
                        "steps": info['current_step'],
                        "reward": total_reward,
                        "evacuated": len(info['evacuated_agents']),
                        "deactivated": len(info['deactivated_agents'])
                    })
                    episode_count += 1
                    total_reward = 0

                    if episode_count < num_episodes:
                        states, info = env.reset()

        except KeyboardInterrupt:
            print("\nSimulation interrupted by the user")

        finally:
            env.close()

        # Convert the current configuration's metrics to a DataFrame
        config_results = pd.DataFrame(metrics)
        all_results = pd.concat([all_results, config_results], ignore_index=True)

    env.close()

    all_results.to_csv('all_results.csv', index=False)

    return all_results, evacuated_total

In [37]:
import os
# iterate over eval folder
evacuated_total = 0
for file in os.listdir('eval_configs/'):
    _, agents, config = simulation_config(f'eval_configs/{file}', new_agent=True)
    #turn off warning
    import warnings
    warnings.filterwarnings("ignore")
    agents.actor.load_state_dict(torch.load('actor_best.pth'))
    agents.critic.load_state_dict(torch.load('critic_best.pth'))
    eval_configs = [f'eval_configs/{file}']
    all_results, evacuated = evaluate(configs_paths=eval_configs, trained_agent=agent)
    evacuated_total += evacuated
print("Total evacuated : ",evacuated_total,"/",400)


--- Evaluating Configuration: eval_configs/config_2.json ---
Episode 1/10, Step 42, Reward: 914.98, Evacuated: 3, Deactivated: 1
Episode 2/10, Step 92, Reward: 502.14, Evacuated: 0, Deactivated: 4
Episode 3/10, Step 28, Reward: 363.03, Evacuated: 1, Deactivated: 3
Episode 4/10, Step 63, Reward: 257.10, Evacuated: 0, Deactivated: 4
Episode 5/10, Step 99, Reward: 417.16, Evacuated: 0, Deactivated: 4
Episode 6/10, Step 22, Reward: 600.00, Evacuated: 2, Deactivated: 2
Episode 7/10, Step 28, Reward: 481.05, Evacuated: 1, Deactivated: 3
Episode 8/10, Step 91, Reward: 956.09, Evacuated: 2, Deactivated: 2
Episode 9/10, Step 126, Reward: 1092.10, Evacuated: 2, Deactivated: 2
Episode 10/10, Step 66, Reward: 649.09, Evacuated: 1, Deactivated: 3

--- Evaluating Configuration: eval_configs/config_1.json ---
Episode 1/10, Step 842, Reward: 3481.06, Evacuated: 4, Deactivated: 0
Episode 2/10, Step 944, Reward: 2159.04, Evacuated: 1, Deactivated: 3
Episode 3/10, Step 134, Reward: 997.13, Evacuated: 2,

In [None]:
#save networks
torch.save(agent.actor.state_dict(), 'actor_best.pth')
torch.save(agent.critic.state_dict(), 'critic_best.pth')

In [None]:
class PPO():

    def __init__(
            self,
            gamma: float = 0.99,
            lam: float = 0.95,
            lr_actor: float = 1e-3,
            lr_critic: float = 1e-3,
            max_episode_len: float = 1000,
            batch_size: int = 512,
            steps_per_epoch: int = 2048,
            clip_ratio: float = 0.2,
    ) -> None:

        super().__init__()

        # Hyperparameters
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.steps_per_epoch = steps_per_epoch
        self.batch_size = batch_size
        self.gamma = gamma  # only needed for trajectory
        self.lam = lam  # only needed for trajectory
        self.max_episode_len = max_episode_len
        self.clip_ratio = clip_ratio  # epsilon
        self.automatic_optimization = False
        self.save_hyperparameters()

        #self.env = gym.make(env, render_mode='rgb_array')
        self.critic = create_mlp(self.env.observation_space.shape, 1)
        actor_mlp = create_mlp(self.env.observation_space.shape, self.env.action_space.n)
        self.actor = DiscreteActor(actor_mlp)

        self.agent = ActorCritic(self.actor, self.critic)

        self.batch_states = []
        self.batch_actions = []
        self.batch_adv = []
        self.batch_d_rewards = []
        self.batch_logp = []

        self.ep_rewards = []
        self.ep_values = []
        self.epoch_rewards = []

        self.episode_step = 0
        self.avg_ep_reward = 0
        self.avg_ep_len = 0
        self.avg_reward = 0

        self.state = torch.FloatTensor(self.env.reset()[0])

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Passes in a state x through the network and returns the policy and a sampled action
        Args:
            x: environment state
        Returns:
            Tuple of policy and action
        """
        action_prob, action = self.actor(x)
        value = self.critic(x)

        return action_prob, action, value

    def actor_loss(self, state, action, logp_old, advantage) -> torch.Tensor:
        """
        Calculate the actor loss.

        Args:
            state: current state of environment
            action: selected action
            logp_old: old log-probability
            advantage: advantage of action
        Returns:
            loss
        """

        # TODO: Implement the PPO Actor Loss
        pi,_ = self.actor.forward(state)
        logpi = pi.log_prob(action)

        quotient = torch.exp(logpi)/torch.exp(logp_old)
        new_adv = torch.clamp(quotient,1-self.clip_ratio,1+self.clip_ratio)*advantage
        f = lambda x: x if x < 1+self.clip_ratio else 1+self.clip_ratio
        with torch.no_grad():
            quotient.data.apply_(f)

        loss_actor = -(quotient*advantage).mean()


        return loss_actor

    def critic_loss(self, state: torch.Tensor, d_reward: torch.Tensor) -> torch.Tensor:
        """
        Calculate the critic loss.

        Args:
            state: current state of environment
            d_reward: discounted reward
        Returns:
            loss
        """
        # TODO: Implemente the PPO Critic Loss
        value = self.agent(state=state)[-1]
        loss_critic = ((value - d_reward)**2).mean()

        return loss_critic

    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx):
        """
        Carries out a single update to actor and critic network from a batch of replay buffer.

        Args:
            batch: batch of replay buffer/trajectory data
            batch_idx: used for logging
            optimizer_idx: idx that controls optimizing actor or critic network
        Returns:
            loss
        """
        optims = self.optimizers()
        optim = optims[0] if batch_idx % 2 == 0 else optims[1]
        optim.zero_grad()

        state, action, old_logp, d_reward, advantage = batch

        # normalize advantages
        advantage = (advantage - advantage.mean()) / advantage.std()

        self.log("avg_ep_len", self.trainer.datamodule.avg_ep_len, prog_bar=True, on_step=False, on_epoch=True)
        self.log("avg_ep_reward", self.trainer.datamodule.avg_ep_reward, prog_bar=True, on_step=False, on_epoch=True)
        self.log("avg_reward", self.trainer.datamodule.avg_reward, prog_bar=True, on_step=False, on_epoch=True)

        if batch_idx % 2 == 0:
            loss_actor = self.actor_loss(state, action, old_logp, advantage)
            self.log('loss_actor', loss_actor, on_step=False, on_epoch=True, prog_bar=True, logger=True)
            self.manual_backward(loss_actor)
            optim.step()

            return loss_actor

        elif batch_idx % 2 == 0:
            loss_critic = self.critic_loss(state, d_reward)
            self.log('loss_critic', loss_critic, on_step=False, on_epoch=True, prog_bar=False, logger=True)
            self.manual_backward(loss_critic)
            optim.step()

            return loss_critic

    def viz_agent(self):
        '''
        Visualize actions of the trained agent in environment in a loop, you should use the built in env.render() functionality
        :return:
        '''

        imgs = []
        fig = plt.figure()
        state = torch.FloatTensor(self.env.reset()[0])
        img = self.env.render()
        imgs.append(img)

         # TODO: implement full evaluation loop of environment and use env.render() to get images
        state_imgs = []
        for _ in range(self.max_episode_len):
            _, action, _, _ = self.agent(state)
            new_state,_,terminated,*_ = self.env.step(action.cpu().numpy())
            state = torch.FloatTensor(new_state)
            if terminated:
                break
            state_imgs.append(self.env.render())
        imgs += state_imgs

        im = plt.imshow(imgs[0])
        print('Episode length', len(imgs))
        def animate(i):
            im.set_array((imgs[i]))
            return [im]

        anim = FuncAnimation(fig, animate, frames=len(imgs), interval=20)
        return anim


    def configure_optimizers(self) -> List[Optimizer]:
        """ Initialize Adam optimizer"""
        optimizer_actor = optim.Adam(self.actor.parameters(), lr=self.lr_actor)
        optimizer_critic = optim.Adam(self.critic.parameters(), lr=self.lr_critic)

        return optimizer_actor, optimizer_critic