In [1]:
!pip install 'pettingzoo[mpe]' pygame
!pip install imageio imageio[ffmpeg]


Collecting imageio-ffmpeg (from imageio[ffmpeg])
  Downloading imageio_ffmpeg-0.6.0-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Downloading imageio_ffmpeg-0.6.0-py3-none-manylinux2014_x86_64.whl (29.5 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m29.5/29.5 MB[0m [31m1.7 MB/s[0m  [33m0:00:17[0mm0:00:01[0m00:01[0m
[?25hInstalling collected packages: imageio-ffmpeg
Successfully installed imageio-ffmpeg-0.6.0


In [2]:
import MARL_env as generic_mpe
import numpy as np

class MAPFBuilder:
    def __init__(self, num_total_agents, obs_radius=0.5):
        self.num_total = num_total_agents
        self.agents_config = {}
        self.obstacles_config = [] # La liste des obstacles est ici
        
        # Settings
        self.world_settings = {'damping': 0.25, 'obs_radius': obs_radius} 
        self.palette = [[0.9, 0.1, 0.1], [0.1, 0.9, 0.1], [0.1, 0.1, 0.9], [0.9, 0.9, 0.1], [0.1, 0.8, 0.8]]

    def set_world_params(self, damping=0.25, obs_radius=None):
        self.world_settings['damping'] = damping
        if obs_radius is not None: self.world_settings['obs_radius'] = obs_radius

    # --- GESTION DES OBSTACLES ---
    
    def add_obstacle(self, x, y, size=0.1):
        """Ajoute un obstacle √† la liste."""
        self.obstacles_config.append({'pos': [x, y], 'size': size})
        print(f"Obstacle ajout√© en [{x}, {y}]. Total: {len(self.obstacles_config)}")

    def remove_last_obstacle(self):
        """Supprime le dernier obstacle ajout√©."""
        if len(self.obstacles_config) > 0:
            removed = self.obstacles_config.pop()
            print(f"Obstacle supprim√© : {removed['pos']}")
        else:
            print("Aucun obstacle √† supprimer.")

    def reset_obstacles(self):
        """Supprime TOUS les obstacles."""
        self.obstacles_config = []
        print("Tous les obstacles ont √©t√© supprim√©s.")

    # -----------------------------

    def set_agent(self, agent_id, start_pos, goal_pos, size=0.1, color=None):
        if color is None: color = self.palette[agent_id % len(self.palette)]
        self.agents_config[agent_id] = {
            'id': agent_id, 'start': start_pos, 'goal': goal_pos, 
            'size': size, 'color': color
        }

    def get_env(self, mode='test', agent_id_train=None, render_mode=None):
        agents_to_spawn = []
        if mode == 'train':
            if agent_id_train in self.agents_config: agents_to_spawn.append(self.agents_config[agent_id_train])
        elif mode == 'test':
            for i in range(self.num_total): 
                if i in self.agents_config: agents_to_spawn.append(self.agents_config[i])
        
        full_config = {
            'world': {
                'total_agents_possible': self.num_total, 
                'damping': self.world_settings['damping'],
                'obs_radius': self.world_settings['obs_radius']
            }, 
            'obstacles': self.obstacles_config, 
            'agents': agents_to_spawn
        }
        return generic_mpe.make_env(full_config, render_mode=render_mode)

pygame 2.6.1 (SDL 2.28.4, Python 3.10.16)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
import time

# --- INITIALISATION (AGENT MYOPE) ---
# L'agent voit √† 0.3, mais le but est √† 0.6
builder = MAPFBuilder(num_total_agents=1, obs_radius=0.3) 

# Pas d'obstacle pour ce test (pour prouver que c'est le rayon qui bloque)
# Agent √† -0.3, Goal √† +0.3 -> Distance totale = 0.6
GOAL_POS = np.array([0.3, 0.0])
builder.set_agent(0, [-0.3, 0.0], GOAL_POS, size=0.1)

env = builder.get_env(mode='test', render_mode='human')
obs, _ = env.reset()

scenario = env.unwrapped.scenario
world = env.unwrapped.world

# ... (Le d√©but du code reste identique) ...

print(f"--- TEST RADIUS (Rayon Vision: {builder.world_settings['obs_radius']}) ---")

for i in range(60):
    # Action nulle
    actions = {'agent_0': np.array([1.0, 0.0, 0.0, 0.0, 0.0])}
    env.step(actions)
    env.render()
    
    pos_agent = world.agents[0].state.p_pos
    dist = np.linalg.norm(pos_agent - GOAL_POS)
    
    # VERIFICATION
    # La fonction utilise scenario.obs_radius en interne
    is_visible = scenario.check_visibility(pos_agent, GOAL_POS, world)
    
    print(f"Dist r√©elle: {dist:.2f} | Visible ? {is_visible}")
    
    if i == 20:
        print("\n--- ‚ö° UPGRADE: ON AUGMENTE LA VISION √Ä 1.0 ! ---")
        # CORRECTION ICI : On change juste l'attribut du sc√©nario
        scenario.obs_radius = 1.0 
        # La ligne 'scenario.sensor.radius = 1.0' est supprim√©e car inutile

    time.sleep(0.05)

env.close()

--- TEST RADIUS (Rayon Vision: 0.3) ---
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False
Dist r√©elle: 0.60 | Visible ? False

--- ‚ö° UPGRADE: ON AUGMENTE LA VISION √Ä 1.0 ! ---
Dist r√©elle: 0.60 | Visible ? True
Dist r√©elle: 0.60 | Visible ? True
Dist r√©elle: 0.60 | Visible ? True
Dist r√©elle: 0.60 | V

In [4]:
import time
import numpy as np

# --- SETUP ---
builder = MAPFBuilder(num_total_agents=1, obs_radius=1.0)

# 1. OBSTACLE (Mur) au centre (0.0, 0.0) taille 0.2
# Le bord gauche du mur est √† x = -0.2
builder.add_obstacle(0.0, 0.0, size=0.2) 

# 2. AGENT (Rouge) √† gauche (-0.5, 0.0) taille 0.1
# Le bord droit de l'agent est √† x = -0.4
# Il a donc 0.2 unit√©s de distance libre avant le contact.
GOAL_POS = [0.8, 0.0] # Goal derri√®re le mur
builder.set_agent(0, [-0.5, 0.0], GOAL_POS, size=0.1, color=[0.9, 0.1, 0.1])

env = builder.get_env(mode='test', render_mode='human')
obs, _ = env.reset()

print("--- üí• CRASH TEST EN COURS ---")
print("L'agent (Rouge) va foncer dans le Mur (Gris).")
print("Physique attendue : Il doit se bloquer vers x = -0.3")
print("Reward attendu : -50.0")

for i in range(50):
    # ACTION : FONCER √Ä DROITE [NoOp, Left, Right, Down, Up]
    # On met une force max (1.0) vers la droite
    actions = {'agent_0': np.array([0.0, 0.0, 1.0, 0.0, 0.0])}
    
    obs, rewards, term, trunc, info = env.step(actions)
    env.render()
    
    # R√©cup√©ration des infos
    pos_x = env.unwrapped.world.agents[0].state.p_pos[0]
    rew = rewards['agent_0']
    
    # Formatage console pour bien voir
    status = ""
    if rew == -50.0:
        status = "üî¥ COLLISION (-50) !"
    elif rew == 100.0:
        status = "üü¢ WIN (+100) !"
    else:
        status = f"üîµ Approche ({rew})"
        
    print(f"Step {i:02d} | Pos X: {pos_x:.3f} | Reward: {rew} | {status}")
    
    time.sleep(0.05)

env.close()

Obstacle ajout√© en [0.0, 0.0]. Total: 1
--- üí• CRASH TEST EN COURS ---
L'agent (Rouge) va foncer dans le Mur (Gris).
Physique attendue : Il doit se bloquer vers x = -0.3
Reward attendu : -50.0
Step 00 | Pos X: -0.500 | Reward: -1.0 | üîµ Approche (-1.0)
Step 01 | Pos X: -0.470 | Reward: -1.0 | üîµ Approche (-1.0)
Step 02 | Pos X: -0.417 | Reward: -1.0 | üîµ Approche (-1.0)
Step 03 | Pos X: -0.348 | Reward: -1.0 | üîµ Approche (-1.0)
Step 04 | Pos X: -0.300 | Reward: -51.0 | üîµ Approche (-51.0)
Step 05 | Pos X: -0.300 | Reward: -51.0 | üîµ Approche (-51.0)
Step 06 | Pos X: -0.300 | Reward: -51.0 | üîµ Approche (-51.0)
Step 07 | Pos X: -0.300 | Reward: -51.0 | üîµ Approche (-51.0)
Step 08 | Pos X: -0.300 | Reward: -51.0 | üîµ Approche (-51.0)
Step 09 | Pos X: -0.300 | Reward: -51.0 | üîµ Approche (-51.0)
Step 10 | Pos X: -0.300 | Reward: -51.0 | üîµ Approche (-51.0)
Step 11 | Pos X: -0.300 | Reward: -51.0 | üîµ Approche (-51.0)
Step 12 | Pos X: -0.300 | Reward: -51.0 | ü

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

# --- 1. LE CERVEAU (Un peu plus gros pour compenser l'aveuglement) ---
class QNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(QNetwork, self).__init__()
        # On passe √† 128 neurones pour qu'il m√©morise mieux la carte
        self.fc1 = nn.Linear(state_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

In [6]:
class DeepDynaQAgent:
    def __init__(self, state_dim, action_dim, lr=0.0005, gamma=0.99, epsilon=0.3, epsilon_decay=0.99, epsilon_min=0.3, n_planning=5):
        # ... (Tout pareil qu'avant) ...
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.n_planning = n_planning
        self.batch_size = 64
        self.lr = lr
        self.memory = deque(maxlen=50000)

        self.q_network = QNetwork(state_dim, action_dim)
        self.target_network = QNetwork(state_dim, action_dim)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.target_network.eval()
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.criterion = nn.MSELoss()

    # ... (select_action, remember inchang√©s) ...
    def select_action(self, state, eval=False):
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        if not eval and np.random.rand() < self.epsilon:
            return np.random.randint(self.action_dim)
        with torch.no_grad():
            q_values = self.q_network(state_tensor)
            return torch.argmax(q_values).item()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.memory) < self.batch_size: return 0.0
        total_loss = 0
        
        for _ in range(self.n_planning):
            batch = random.sample(self.memory, self.batch_size)
            states, actions, rewards, next_states, dones = zip(*batch)

            states = torch.FloatTensor(np.array(states))
            actions = torch.LongTensor(actions).unsqueeze(1)
            rewards = torch.FloatTensor(rewards).unsqueeze(1)
            next_states = torch.FloatTensor(np.array(next_states))
            dones = torch.FloatTensor(np.array(dones, dtype=np.float32)).unsqueeze(1)

            with torch.no_grad():
                next_q_values = self.target_network(next_states)
                max_next_q = next_q_values.max(1)[0].unsqueeze(1)
                target_q = rewards + (self.gamma * max_next_q * (1 - dones))

            current_q_values = self.q_network(states)
            current_q = current_q_values.gather(1, actions)

            loss = self.criterion(current_q, target_q)
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), 1.0)
            self.optimizer.step()
            total_loss += loss.item()

        # --- MODIFICATION MAJEURE ---
        # J'ai SUPPRIM√â la d√©croissance d'epsilon ici
        # Elle se fera manuellement √† la fin de l'√©pisode
            
        return total_loss / self.n_planning

    # NOUVELLE FONCTION
    def decay_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    # ... (update_target_network, save, load inchang√©s) ...
    def update_target_network(self):
        self.target_network.load_state_dict(self.q_network.state_dict())
    def action_to_vector(self, action_idx):
        vec = np.zeros(5); vec[action_idx] = 1.0; return vec
    def save(self, filename): torch.save(self.q_network.state_dict(), filename)
    def load(self, filename): self.q_network.load_state_dict(torch.load(filename))

In [8]:
import MARL_env as generic_mpe
import numpy as np

# --- CONFIG ---
builder = MAPFBuilder(1, obs_radius=1.0) 
start_pos = [-0.8, -0.8]; goal_pos = [0.8, 0.8]
builder.set_agent(0, start_pos, goal_pos, size=0.1, color=[0.9, 0.1, 0.1])
builder.reset_obstacles()
env = builder.get_env(mode='train', agent_id_train=0)

scenario = env.unwrapped.scenario
world = env.unwrapped.world
raw_agent = world.agents[0] 

# --- AGENT ---
# epsilon_decay=0.99 par √âPISODE -> Exploration lente et longue
agent = DeepDynaQAgent(state_dim=4, action_dim=5, lr=0.0005, epsilon=0.3, epsilon_decay=0.999, n_planning=5)

print("--- TRAINING (LONG EXPLORATION) ---")

num_episodes = 1000 # Tu peux monter √† 1000 pour √™tre s√ªr
max_steps = 200

for episode in range(num_episodes):
    obs, _ = env.reset()
    state = obs['agent_0']
    
    total_reward = 0
    
    for step in range(max_steps):
        action_idx = agent.select_action(state)
        action_vec = agent.action_to_vector(action_idx)
        
        next_obs, rewards, terms, truncs, _ = env.step({'agent_0': action_vec})
        
        if 'agent_0' not in next_obs: break # S√©curit√©

        next_state = next_obs['agent_0']
        reward = rewards['agent_0']
        
        is_success = scenario.is_done(raw_agent, world)
        done = is_success or (step == max_steps - 1)
        
        agent.remember(state, action_idx, reward, next_state, done)
        loss = agent.train()
        
        state = next_state
        total_reward += reward
        
        if is_success:
            break
            
    # --- C'EST ICI QU'ON UPDATE EPSILON ---
    # On le fait une seule fois par √©pisode
    agent.decay_epsilon()
    
    agent.update_target_network()
    
    status = "üèÜ VICTOIRE" if is_success else "‚è≥ TEMPS"
    print(f"Ep {episode+1:03d} | R: {total_reward:5.1f} | Eps: {agent.epsilon:.2f} | Loss: {loss:.4f} | {status}")

agent.save("model_agent_solo.pth")
env.close()

Tous les obstacles ont √©t√© supprim√©s.
--- TRAINING (LONG EXPLORATION) ---
Ep 001 | R: -200.0 | Eps: 0.30 | Loss: 0.0003 | ‚è≥ TEMPS
Ep 002 | R: -200.0 | Eps: 0.30 | Loss: 0.0001 | ‚è≥ TEMPS
Ep 003 | R: -200.0 | Eps: 0.30 | Loss: 0.0010 | ‚è≥ TEMPS
Ep 004 | R: -200.0 | Eps: 0.30 | Loss: 0.0211 | ‚è≥ TEMPS
Ep 005 | R: -200.0 | Eps: 0.30 | Loss: 0.0183 | ‚è≥ TEMPS
Ep 006 | R: -200.0 | Eps: 0.30 | Loss: 0.1155 | ‚è≥ TEMPS
Ep 007 | R: -200.0 | Eps: 0.30 | Loss: 0.1384 | ‚è≥ TEMPS
Ep 008 | R: -200.0 | Eps: 0.30 | Loss: 0.1302 | ‚è≥ TEMPS
Ep 009 | R: -200.0 | Eps: 0.30 | Loss: 0.6685 | ‚è≥ TEMPS
Ep 010 | R: -200.0 | Eps: 0.30 | Loss: 0.4200 | ‚è≥ TEMPS
Ep 011 | R: -200.0 | Eps: 0.30 | Loss: 0.0066 | ‚è≥ TEMPS
Ep 012 | R: -200.0 | Eps: 0.30 | Loss: 0.6849 | ‚è≥ TEMPS
Ep 013 | R: -200.0 | Eps: 0.30 | Loss: 0.0103 | ‚è≥ TEMPS
Ep 014 | R: -200.0 | Eps: 0.30 | Loss: 0.3719 | ‚è≥ TEMPS
Ep 015 | R: -200.0 | Eps: 0.30 | Loss: 0.8369 | ‚è≥ TEMPS
Ep 016 | R: -200.0 | Eps: 0.30 | Loss: 0.4554 | ‚è≥ T

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
from collections import deque

# --- DEFINITION DES RESEAUX ---

class CriticNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(CriticNetwork, self).__init__()
        # Q1 architecture
        self.fc1 = nn.Linear(state_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, action_dim) # Sortie: Q-value pour CHAQUE action

        # Q2 architecture (Double Q-Learning pour la stabilit√©)
        self.fc4 = nn.Linear(state_dim, 256)
        self.fc5 = nn.Linear(256, 256)
        self.fc6 = nn.Linear(256, action_dim)

    def forward(self, state):
        x1 = F.relu(self.fc1(state))
        x1 = F.relu(self.fc2(x1))
        q1 = self.fc3(x1)

        x2 = F.relu(self.fc4(state))
        x2 = F.relu(self.fc5(x2))
        q2 = self.fc6(x2)
        return q1, q2

class ActorNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, action_dim) 

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        # Pour du discret : on sort des probabilit√©s (Softmax)
        probs = F.softmax(x, dim=-1)
        return probs

# --- AGENT SAC DISCRET ---
class SACAgent:
    def __init__(self, state_dim, action_dim, lr=0.0003, gamma=0.99, tau=0.005, batch_size=64, n_updates=1):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_updates = n_updates
        self.memory = deque(maxlen=50000)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Critic
        self.critic = CriticNetwork(state_dim, action_dim).to(self.device)
        self.critic_target = CriticNetwork(state_dim, action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)

        # Actor
        self.actor = ActorNetwork(state_dim, action_dim).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)

        # --- FIX 1: Target Entropy plus relax (0.6 au lieu de 0.98) ---
        # Cela √©vite que l'agent cherche d√©sesp√©r√©ment √† √™tre trop al√©atoire
        self.target_entropy = -np.log(1.0 / action_dim) * 0.6 
        
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=lr)
        self.alpha = self.log_alpha.exp()

    def select_action(self, state, eval=False):
        state_t = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            probs = self.actor(state_t)
        
        if eval:
            action = torch.argmax(probs, dim=1).item()
        else:
            dist = torch.distributions.Categorical(probs)
            action = dist.sample().item()
        return action

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.memory) < self.batch_size: return 0.0
        
        total_loss = 0
        
        for _ in range(self.n_updates):
            batch = random.sample(self.memory, self.batch_size)
            states, actions, rewards, next_states, dones = zip(*batch)

            states = torch.FloatTensor(np.array(states)).to(self.device)
            actions = torch.LongTensor(actions).unsqueeze(1).to(self.device)
            rewards = torch.FloatTensor(rewards).unsqueeze(1).to(self.device)
            next_states = torch.FloatTensor(np.array(next_states)).to(self.device)
            dones = torch.FloatTensor(np.array(dones, dtype=np.float32)).unsqueeze(1).to(self.device)

            # --------------------------
            # 1. Update CRITIC
            # --------------------------
            with torch.no_grad():
                next_probs = self.actor(next_states)
                next_log_probs = torch.log(next_probs + 1e-8)
                q1_target, q2_target = self.critic_target(next_states)
                min_q_target = torch.min(q1_target, q2_target)
                
                # Terme d'entropie pond√©r√© par Alpha
                target_v = (next_probs * (min_q_target - self.alpha * next_log_probs)).sum(dim=1, keepdim=True)
                target_q = rewards + (1 - dones) * self.gamma * target_v

            q1, q2 = self.critic(states)
            q1 = q1.gather(1, actions)
            q2 = q2.gather(1, actions)
            
            critic_loss = F.mse_loss(q1, target_q) + F.mse_loss(q2, target_q)
            
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            # --- FIX 2: Gradient Clipping (Vital pour la stabilit√©) ---
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1.0)
            self.critic_optimizer.step()

            # --------------------------
            # 2. Update ACTOR
            # --------------------------
            probs = self.actor(states)
            log_probs = torch.log(probs + 1e-8)
            with torch.no_grad():
                q1_pi, q2_pi = self.critic(states)
                min_q_pi = torch.min(q1_pi, q2_pi)
            
            actor_loss = (probs * (self.alpha * log_probs - min_q_pi)).sum(dim=1).mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            # --- FIX 2b: Clipping sur l'Actor aussi ---
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1.0)
            self.actor_optimizer.step()

            # --------------------------
            # 3. Update ALPHA
            # --------------------------
            probs_detached = probs.detach()
            log_probs_detached = log_probs.detach()
            current_entropy = - (probs_detached * log_probs_detached).sum(dim=1).mean()
            
            alpha_loss = (self.log_alpha * (current_entropy - self.target_entropy).detach()).mean()
            
            self.alpha_optimizer.zero_grad()
            alpha_loss.backward()
            # --- FIX 2c: Clipping sur Alpha ---
            torch.nn.utils.clip_grad_norm_([self.log_alpha], 1.0)
            self.alpha_optimizer.step()
            
            # --- FIX 3: Hard Clamp sur Alpha (Emp√™che l'explosion math√©matique) ---
            # On force log_alpha √† rester entre -5 (tr√®s petit) et 2 (environ 7.3)
            with torch.no_grad():
                self.log_alpha.clamp_(min=-5.0, max=2.0)

            self.alpha = self.log_alpha.exp()
            total_loss += critic_loss.item() + actor_loss.item()

            self.soft_update(self.critic, self.critic_target)

        return total_loss / self.n_updates
    
    # ... (Le reste: soft_update, save, load, action_to_vector inchang√©s) ...
    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
    
    def action_to_vector(self, action_idx):
        vec = np.zeros(self.action_dim); vec[action_idx] = 1.0; return vec
    def save(self, filename):
        torch.save({'actor': self.actor.state_dict(), 'critic': self.critic.state_dict(), 'log_alpha': self.log_alpha}, filename)
    def load(self, filename):
        c = torch.load(filename)
        self.actor.load_state_dict(c['actor']); self.critic.load_state_dict(c['critic']); self.log_alpha = c['log_alpha']; self.alpha = self.log_alpha.exp()

In [10]:
import MARL_env as generic_mpe
import numpy as np
# Assure-toi d'importer ta classe SACAgent ici ou de la copier-coller avant
# from my_sac_file import SACAgent 

# --- CONFIG ---
builder = MAPFBuilder(1, obs_radius=1.0) 
start_pos = [-0.8, -0.8]; goal_pos = [0.8, 0.8]
builder.set_agent(0, start_pos, goal_pos, size=0.1, color=[0.9, 0.1, 0.1])
builder.reset_obstacles()
env = builder.get_env(mode='train', agent_id_train=0)

scenario = env.unwrapped.scenario
world = env.unwrapped.world
raw_agent = world.agents[0] 

# --- AGENT SAC ---
# Plus d'epsilon ici. 
# lr=0.0003 est standard pour SAC (souvent plus bas que DQN).
# n_updates=1 signifie 1 descente de gradient par pas de temps.
agent = SACAgent(state_dim=4, action_dim=5, lr=0.0003, n_updates=1)

print("--- TRAINING (SAC - Maximum Entropy) ---")

num_episodes = 1000 
max_steps = 200

for episode in range(num_episodes):
    obs, _ = env.reset()
    state = obs['agent_0']
    
    total_reward = 0
    avg_loss = 0
    steps_count = 0
    
    for step in range(max_steps):
        # SAC choisit l'action de mani√®re stochastique (bas√©e sur les probas)
        action_idx = agent.select_action(state, eval=False)
        action_vec = agent.action_to_vector(action_idx)
        
        next_obs, rewards, terms, truncs, _ = env.step({'agent_0': action_vec})
        
        if 'agent_0' not in next_obs: break 

        next_state = next_obs['agent_0']
        reward = rewards['agent_0']
        
        # V√©rification victoire
        is_success = scenario.is_done(raw_agent, world)
        
        # Le flag 'done' pour le buffer : True si victoire OU fin du temps
        done = is_success or (step == max_steps - 1)
        
        # Stockage
        agent.remember(state, action_idx, reward, next_state, done)
        
        # Entra√Ænement (Update Actor, Critic & Alpha + Soft Updates Targets)
        loss = agent.train()
        
        state = next_state
        total_reward += reward
        avg_loss += loss
        steps_count += 1
        
        if is_success:
            break
            
    # --- FIN DE L'EPISODE ---
    
    # Note : Pas de decay_epsilon() ni de update_target_network() manuels.
    # Tout est g√©r√© dans agent.train() pour le SAC.

    # On r√©cup√®re la valeur actuelle de alpha (pour voir si l'exploration diminue)
    # .item() car c'est souvent un tenseur dans SAC
    current_alpha = agent.alpha.item() if hasattr(agent.alpha, 'item') else agent.alpha
    avg_loss = avg_loss / steps_count if steps_count > 0 else 0

    status = "üèÜ VICTOIRE" if is_success else "‚è≥ TEMPS"
    
    print(f"Ep {episode+1:03d} | R: {total_reward:6.1f} | Alpha: {current_alpha:.4f} | Loss: {avg_loss:.4f} | {status}")

# Sauvegarde
agent.save("model_agent_sac.pth")
env.close()

Tous les obstacles ont √©t√© supprim√©s.
--- TRAINING (SAC - Maximum Entropy) ---
Ep 001 | R: -200.0 | Alpha: 0.9597 | Loss: -1.5458 | ‚è≥ TEMPS
Ep 002 | R: -200.0 | Alpha: 0.9038 | Loss: -2.4854 | ‚è≥ TEMPS
Ep 003 | R: -200.0 | Alpha: 0.8512 | Loss: -2.6518 | ‚è≥ TEMPS
Ep 004 | R: -200.0 | Alpha: 0.8017 | Loss: -2.7821 | ‚è≥ TEMPS
Ep 005 | R: -200.0 | Alpha: 0.7551 | Loss: -2.8364 | ‚è≥ TEMPS
Ep 006 | R: -200.0 | Alpha: 0.7112 | Loss: -2.8535 | ‚è≥ TEMPS
Ep 007 | R: -200.0 | Alpha: 0.6700 | Loss: -2.7884 | ‚è≥ TEMPS
Ep 008 | R: -200.0 | Alpha: 0.6311 | Loss: -2.6988 | ‚è≥ TEMPS
Ep 009 | R: -200.0 | Alpha: 0.5945 | Loss: -2.5415 | ‚è≥ TEMPS
Ep 010 | R: -200.0 | Alpha: 0.5600 | Loss: -2.3583 | ‚è≥ TEMPS
Ep 011 | R: -200.0 | Alpha: 0.5274 | Loss: -2.1221 | ‚è≥ TEMPS
Ep 012 | R: -200.0 | Alpha: 0.4967 | Loss: -1.8269 | ‚è≥ TEMPS
Ep 013 | R: -200.0 | Alpha: 0.4678 | Loss: -1.5012 | ‚è≥ TEMPS
Ep 014 | R: -200.0 | Alpha: 0.4406 | Loss: -1.1421 | ‚è≥ TEMPS
Ep 015 | R: -200.0 | Alpha: 0.4149 |

In [13]:
class ValueNetwork(nn.Module):
    def __init__(self, state_dim):
        super(ValueNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1) # Sortie : 1 scalaire (V(s))

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        v = self.fc3(x)
        return v

In [14]:
class ActorCriticAgent:
    def __init__(self, state_dim, action_dim, lr=0.0005, gamma=0.99):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # On utilise une m√©moire simple (liste) car on clear apr√®s chaque update
        self.memory = [] 

        # Actor : D√©cide de l'action (Probabilit√©s)
        self.actor = ActorNetwork(state_dim, action_dim).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)

        # Critic : √âvalue l'√©tat (V-Value)
        self.critic = ValueNetwork(state_dim).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)

    def select_action(self, state, eval=False):
        state_t = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        probs = self.actor(state_t)
        
        if eval:
            action = torch.argmax(probs, dim=1).item()
        else:
            dist = torch.distributions.Categorical(probs)
            action = dist.sample().item()
        return action

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def train(self):
        # A2C apprend g√©n√©ralement √† la fin d'un √©pisode ou apr√®s N steps
        # Ici on suppose qu'on appelle train() r√©guli√®rement.
        # Pour A2C, on a besoin d'un batch de trajectoire.
        if len(self.memory) < 5: return 0.0 # Attendre un peu de donn√©es

        states, actions, rewards, next_states, dones = zip(*self.memory)

        states = torch.FloatTensor(np.array(states)).to(self.device)
        actions = torch.LongTensor(actions).unsqueeze(1).to(self.device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(self.device)
        next_states = torch.FloatTensor(np.array(next_states)).to(self.device)
        dones = torch.FloatTensor(np.array(dones, dtype=np.float32)).unsqueeze(1).to(self.device)

        # 1. Calcul des TD Targets (Bootstrapping)
        with torch.no_grad():
            next_values = self.critic(next_states)
            target_values = rewards + (1 - dones) * self.gamma * next_values

        # 2. Update Critic (MSE Loss)
        values = self.critic(states)
        advantage = (target_values - values).detach() # L'avantage guide l'Actor
        
        critic_loss = F.mse_loss(values, target_values)
        
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # 3. Update Actor (Policy Gradient)
        probs = self.actor(states)
        dist = torch.distributions.Categorical(probs)
        log_probs = dist.log_prob(actions.squeeze())
        
        # On veut maximiser (log_pi * advantage) -> minimiser -(log_pi * advantage)
        actor_loss = -(log_probs * advantage.squeeze()).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # IMPORTANT : On-Policy = On vide la m√©moire !
        self.memory = []
        
        return actor_loss.item() + critic_loss.item()

    def action_to_vector(self, action_idx):
        vec = np.zeros(self.action_dim); vec[action_idx] = 1.0; return vec
    def save(self, filename):
        torch.save({'actor': self.actor.state_dict(), 'critic': self.critic.state_dict()}, filename)
    def load(self, filename):
        c = torch.load(filename); self.actor.load_state_dict(c['actor']); self.critic.load_state_dict(c['critic'])

In [15]:
import MARL_env as generic_mpe
import numpy as np
# from my_agents import ActorCriticAgent <-- Importe ta classe A2C ici

# --- CONFIG ---
builder = MAPFBuilder(1, obs_radius=1.0) 
start_pos = [-0.8, -0.8]; goal_pos = [0.8, 0.8]
builder.set_agent(0, start_pos, goal_pos, size=0.1, color=[0.9, 0.9, 0.1]) # Jaune pour A2C
builder.reset_obstacles()
env = builder.get_env(mode='train', agent_id_train=0)

scenario = env.unwrapped.scenario
world = env.unwrapped.world
raw_agent = world.agents[0] 

# --- AGENT A2C ---
agent = ActorCriticAgent(state_dim=4, action_dim=5, lr=0.0005)

print("--- TRAINING (A2C - Standard Actor-Critic) ---")

num_episodes = 1000 
max_steps = 200

for episode in range(num_episodes):
    obs, _ = env.reset()
    state = obs['agent_0']
    
    total_reward = 0
    epoch_loss = 0
    n_updates = 0
    
    for step in range(max_steps):
        action_idx = agent.select_action(state, eval=False)
        action_vec = agent.action_to_vector(action_idx)
        
        next_obs, rewards, terms, truncs, _ = env.step({'agent_0': action_vec})
        
        if 'agent_0' not in next_obs: break 

        next_state = next_obs['agent_0']
        reward = rewards['agent_0']
        
        is_success = scenario.is_done(raw_agent, world)
        done = is_success or (step == max_steps - 1)
        
        agent.remember(state, action_idx, reward, next_state, done)
        
        # A2C apprend plus souvent que PPO (ex: tous les 5 pas)
        loss = agent.train()
        
        if loss != 0:
            epoch_loss += loss
            n_updates += 1
        
        state = next_state
        total_reward += reward
        
        if is_success:
            break
            
    avg_loss = epoch_loss / n_updates if n_updates > 0 else 0.0
    status = "üèÜ VICTOIRE" if is_success else "‚è≥ TEMPS"
    
    print(f"Ep {episode+1:03d} | R: {total_reward:6.1f} | Loss: {avg_loss:.4f} | {status}")

agent.save("model_agent_a2c.pth")
env.close()

Tous les obstacles ont √©t√© supprim√©s.
--- TRAINING (A2C - Standard Actor-Critic) ---
Ep 001 | R: -200.0 | Loss: -0.5060 | ‚è≥ TEMPS
Ep 002 | R: -200.0 | Loss: 1.1642 | ‚è≥ TEMPS
Ep 003 | R: -200.0 | Loss: 7.4385 | ‚è≥ TEMPS
Ep 004 | R: -200.0 | Loss: 15.9854 | ‚è≥ TEMPS
Ep 005 | R: -200.0 | Loss: 20.7769 | ‚è≥ TEMPS
Ep 006 | R: -200.0 | Loss: 22.5173 | ‚è≥ TEMPS
Ep 007 | R: -200.0 | Loss: 22.1452 | ‚è≥ TEMPS
Ep 008 | R: -200.0 | Loss: 23.0699 | ‚è≥ TEMPS
Ep 009 | R: -200.0 | Loss: 23.6448 | ‚è≥ TEMPS
Ep 010 | R: -200.0 | Loss: 23.1647 | ‚è≥ TEMPS
Ep 011 | R: -200.0 | Loss: 23.3140 | ‚è≥ TEMPS
Ep 012 | R: -200.0 | Loss: 23.1934 | ‚è≥ TEMPS
Ep 013 | R: -200.0 | Loss: 23.2747 | ‚è≥ TEMPS
Ep 014 | R: -200.0 | Loss: 23.5170 | ‚è≥ TEMPS
Ep 015 | R: -200.0 | Loss: 23.5648 | ‚è≥ TEMPS
Ep 016 | R: -200.0 | Loss: 23.7773 | ‚è≥ TEMPS
Ep 017 | R: -200.0 | Loss: 23.3304 | ‚è≥ TEMPS
Ep 018 | R: -200.0 | Loss: 23.3214 | ‚è≥ TEMPS
Ep 019 | R: -200.0 | Loss: 23.3800 | ‚è≥ TEMPS
Ep 020 | R: -200.0 | 

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

# --- RESEAUX NEURONAUX (Inchang√©s) ---
class ActorNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, action_dim)

    def forward(self, state):
        x = torch.tanh(self.fc1(state))
        x = torch.tanh(self.fc2(x))
        x = self.fc3(x)
        return F.softmax(x, dim=-1)

class ValueNetwork(nn.Module):
    def __init__(self, state_dim):
        super(ValueNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, state):
        x = torch.tanh(self.fc1(state))
        x = torch.tanh(self.fc2(x))
        return self.fc3(x)

# --- AGENT PPO VANILLA (NO GAE) ---
class PPOAgent:
    def __init__(self, state_dim, action_dim, lr=0.0003, gamma=0.99, K_epochs=10, eps_clip=0.2, entropy_coef=0.05):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.entropy_coef = entropy_coef
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.actor = ActorNetwork(state_dim, action_dim).to(self.device)
        self.critic = ValueNetwork(state_dim).to(self.device)
        
        self.optimizer = optim.Adam([
            {'params': self.actor.parameters(), 'lr': lr},
            {'params': self.critic.parameters(), 'lr': lr}
        ])
        
        self.mse_loss = nn.MSELoss()
        self.memory = []

    def select_action(self, state, eval=False):
        state_t = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            probs = self.actor(state_t)
        
        if eval:
            action = torch.argmax(probs, dim=1).item()
            return action, 0.0
        else:
            dist = torch.distributions.Categorical(probs)
            action = dist.sample().item()
            return action, dist.log_prob(torch.tensor(action).to(self.device)).item()

    def remember(self, state, action, log_prob, reward, done):
        self.memory.append((state, action, log_prob, reward, done))

    def train(self):
        if len(self.memory) < 100: return 0.0

        # Extraction
        states, actions, old_log_probs, rewards, dones = zip(*self.memory)
        
        states = torch.FloatTensor(np.array(states)).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        old_log_probs = torch.FloatTensor(old_log_probs).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        dones = torch.FloatTensor(np.array(dones, dtype=np.float32)).to(self.device)

        # --- 1. CALCUL DES RETURNS (Monte Carlo) ---
        # Au lieu de GAE, on calcule simplement la somme discount√©e
        returns = []
        discounted_sum = 0
        for reward, is_done in zip(reversed(rewards), reversed(dones)):
            if is_done: discounted_sum = 0
            discounted_sum = reward + (self.gamma * discounted_sum)
            returns.insert(0, discounted_sum)
        
        returns = torch.FloatTensor(returns).to(self.device)
        # Normalisation des returns (Vital pour la stabilit√©)
        returns = (returns - returns.mean()) / (returns.std() + 1e-7)

        # --- 2. CALCUL DES AVANTAGES ---
        with torch.no_grad():
            values = self.critic(states).squeeze()
            # Advantage = Return - Value
            advantages = returns - values
            # Normalisation des avantages
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-7)

        # --- 3. PPO UPDATE ---
        total_loss = 0
        for _ in range(self.K_epochs):
            # Nouvelles probas
            probs = self.actor(states)
            dist = torch.distributions.Categorical(probs)
            log_probs = dist.log_prob(actions)
            dist_entropy = dist.entropy().mean()
            
            # Nouvelles valeurs
            new_state_values = self.critic(states).squeeze()
            
            # Ratio
            ratios = torch.exp(log_probs - old_log_probs)

            # Loss Actor (Clipped)
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            loss_actor = -torch.min(surr1, surr2).mean()
            
            # Loss Critic (MSE entre Return r√©el et V(s))
            loss_critic = self.mse_loss(new_state_values, returns)

            # Loss Totale
            loss = loss_actor + 0.5 * loss_critic - self.entropy_coef * dist_entropy
            
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
            self.optimizer.step()
            
            total_loss += loss.item()

        self.memory = []
        return total_loss / self.K_epochs
    
    def action_to_vector(self, action_idx):
        vec = np.zeros(self.action_dim); vec[action_idx] = 1.0; return vec
    def save(self, filename): torch.save(self.actor.state_dict(), filename)
    def load(self, filename): self.actor.load_state_dict(torch.load(filename))

In [23]:
import MARL_env as generic_mpe
import numpy as np

# --- CONFIG ---
builder = MAPFBuilder(1, obs_radius=1.0) 
start_pos = [-0.8, -0.8]; goal_pos = [0.8, 0.8]
builder.set_agent(0, start_pos, goal_pos, size=0.1, color=[0.1, 0.8, 0.8]) # Cyan pour PPO-Vanilla
builder.reset_obstacles()
env = builder.get_env(mode='train', agent_id_train=0)

scenario = env.unwrapped.scenario
world = env.unwrapped.world
raw_agent = world.agents[0] 

# --- AGENT ---
# entropy_coef=0.05 : Force l'exploration car on n'a pas GAE pour aider
agent = PPOAgent(state_dim=4, action_dim=5, lr=0.0003, entropy_coef=0.05)

print("--- TRAINING PPO (Vanilla - No GAE) ---")

num_episodes = 5000 
max_steps = 200
update_timestep = 2048 # Gros batch n√©cessaire sans GAE
timestep_counter = 0

for episode in range(num_episodes):
    obs, _ = env.reset()
    state = obs['agent_0']
    
    total_reward = 0
    
    for step in range(max_steps):
        # Action + Log Prob
        action_idx, log_prob = agent.select_action(state, eval=False)
        action_vec = agent.action_to_vector(action_idx)
        
        next_obs, rewards, terms, truncs, _ = env.step({'agent_0': action_vec})
        
        if 'agent_0' not in next_obs: break 

        next_state = next_obs['agent_0']
        reward = rewards['agent_0']
        
        is_success = scenario.is_done(raw_agent, world)
        done = is_success or (step == max_steps - 1)
        
        # Note: Pas besoin de next_state pour remember sans GAE
        agent.remember(state, action_idx, log_prob, reward, done)
        
        timestep_counter += 1
        
        # Update
        if timestep_counter % update_timestep == 0:
            print(f"  ‚ö° UPDATE PPO (Step {timestep_counter})")
            agent.train()

        state = next_state
        total_reward += reward
        
        if is_success:
            break
            
    status = "üèÜ VICTOIRE" if is_success else "..."
    
    if episode % 20 == 0 or is_success:
        print(f"Ep {episode+1:04d} | R: {total_reward:6.1f} | Steps: {step} | {status}")

agent.save("model_agent_ppo.pth")
env.close()

Tous les obstacles ont √©t√© supprim√©s.
--- TRAINING PPO (Vanilla - No GAE) ---
Ep 0001 | R: -200.0 | Steps: 199 | ...
  ‚ö° UPDATE PPO (Step 2048)
  ‚ö° UPDATE PPO (Step 4096)
Ep 0021 | R: -200.0 | Steps: 199 | ...
  ‚ö° UPDATE PPO (Step 6144)
  ‚ö° UPDATE PPO (Step 8192)
Ep 0041 | R: -200.0 | Steps: 199 | ...
Ep 0050 | R:  -79.0 | Steps: 179 | üèÜ VICTOIRE
  ‚ö° UPDATE PPO (Step 10240)
Ep 0061 | R: -200.0 | Steps: 199 | ...
  ‚ö° UPDATE PPO (Step 12288)
  ‚ö° UPDATE PPO (Step 14336)
Ep 0079 | R:  -75.0 | Steps: 175 | üèÜ VICTOIRE
Ep 0081 | R: -200.0 | Steps: 199 | ...
  ‚ö° UPDATE PPO (Step 16384)
  ‚ö° UPDATE PPO (Step 18432)
Ep 0101 | R: -200.0 | Steps: 199 | ...
  ‚ö° UPDATE PPO (Step 20480)
  ‚ö° UPDATE PPO (Step 22528)
Ep 0121 | R: -200.0 | Steps: 199 | ...
  ‚ö° UPDATE PPO (Step 24576)
  ‚ö° UPDATE PPO (Step 26624)
Ep 0141 | R: -200.0 | Steps: 199 | ...
  ‚ö° UPDATE PPO (Step 28672)
  ‚ö° UPDATE PPO (Step 30720)
Ep 0161 | R: -200.0 | Steps: 199 | ...
  ‚ö° UPDATE PPO (Step 3

In [11]:
import time
import imageio
import numpy as np
import MARL_env as generic_mpe

print("\n--- ENREGISTREMENT VIDEO ---")

# --- CONFIG ---
builder = MAPFBuilder(num_total_agents=1, obs_radius=1.0) 
start_pos = [-0.8, -0.8]; goal_pos = [0.8, 0.8]
builder.set_agent(0, start_pos, goal_pos, size=0.1, color=[0.9, 0.1, 0.1])
builder.reset_obstacles()

# --- INITIALISATION ---
# IMPORTANT : On passe en 'rgb_array' pour r√©cup√©rer les pixels au lieu d'ouvrir une fen√™tre
env = builder.get_env(mode='test', render_mode='rgb_array')

obs, _ = env.reset()
state = obs['agent_0']

# Chargement de l'agent
agent = DeepDynaQAgent(state_dim=4, action_dim=5, lr=0.0005, epsilon=1.0, n_planning=10)
try:
    agent.load("model_agent_solo.pth")
    print("Mod√®le charg√©.")
except:
    print("Attention : Mod√®le non trouv√©, comportement al√©atoire.")

# Liste pour stocker les frames
frames = []

print("G√©n√©ration des frames en cours...")

for step in range(100):
    # Action
    action_idx = agent.select_action(state, eval=True)
    action_vec = agent.action_to_vector(action_idx)
    
    obs, _, done_dict, _, _ = env.step({'agent_0': action_vec})
    
    # --- CAPTURE ---
    frame = env.render() # R√©cup√®re l'array numpy (Height, Width, 3)
    frames.append(frame)
    
    state = obs['agent_0']
    
    if done_dict['agent_0']:
        print(f"But atteint au step {step} !")
        # On continue d'enregistrer quelques frames pour voir la fin
        for _ in range(10): frames.append(env.render()) 
        break

env.close()

# --- SAUVEGARDE ---
if len(frames) > 0:
    print(f"Sauvegarde de {len(frames)} frames...")
    
    # Option 1 : Sauvegarder en GIF (Universel, pas besoin de codec)
    imageio.mimsave('resultat_DQN.gif', frames, fps=10, loop=0)
    print("‚úÖ Vid√©o sauvegard√©e sous 'resultat_agent.gif'")
    
    # Option 2 : Sauvegarder en MP4 (Meilleure qualit√©, n√©cessite ffmpeg install√© via pip)
    # imageio.mimsave('resultat_agent.mp4', frames, fps=30)
    # print("‚úÖ Vid√©o sauvegard√©e sous 'resultat_agent.mp4'")
else:
    print("‚ùå Aucune frame captur√©e.")


--- ENREGISTREMENT VIDEO ---
Tous les obstacles ont √©t√© supprim√©s.
Mod√®le charg√©.
G√©n√©ration des frames en cours...
But atteint au step 30 !
Sauvegarde de 41 frames...
‚úÖ Vid√©o sauvegard√©e sous 'resultat_agent.gif'


In [12]:
import imageio
import numpy as np
import MARL_env as generic_mpe
# Assure-toi d'avoir import√© ta classe SACAgent ici
# from my_sac_file import SACAgent 

print("\n--- ENREGISTREMENT VIDEO SAC ---")

# --- CONFIG ENV ---
builder = MAPFBuilder(num_total_agents=1, obs_radius=1.0) 
start_pos = [-0.8, -0.8]; goal_pos = [0.8, 0.8]
builder.set_agent(0, start_pos, goal_pos, size=0.1, color=[0.1, 0.9, 0.1]) # Vert pour SAC
builder.reset_obstacles()

# IMPORTANT : render_mode='rgb_array' pour capturer les pixels
env = builder.get_env(mode='test', render_mode='rgb_array')

obs, _ = env.reset()
state = obs['agent_0']

# --- CHARGEMENT AGENT SAC ---
agent = SACAgent(state_dim=4, action_dim=5, lr=0.0003, n_updates=1)

try:
    agent.load("model_agent_sac.pth")
    print("Mod√®le SAC charg√© avec succ√®s.")
except FileNotFoundError:
    print("Erreur : 'model_agent_sac.pth' introuvable.")
    exit()

# Liste pour stocker les images
frames = []

print("G√©n√©ration de la vid√©o...")

# --- BOUCLE D'ENREGISTREMENT ---
for step in range(200):
    # eval=True pour prendre l'action optimale (d√©terministe)
    action_idx = agent.select_action(state, eval=True)
    action_vec = agent.action_to_vector(action_idx)
    
    obs, _, terms, truncs, _ = env.step({'agent_0': action_vec})
    
    # Capture de l'image
    frame = env.render()
    frames.append(frame)
    
    if 'agent_0' in obs:
        state = obs['agent_0']
    
    # Check fin
    if terms['agent_0'] or truncs['agent_0']:
        status = "But atteint ! üèÜ" if terms['agent_0'] else "Temps √©coul√©."
        print(f"Fin au step {step}: {status}")
        
        # On ajoute quelques frames statiques √† la fin pour ne pas couper brutalement la vid√©o
        final_frame = env.render()
        for _ in range(15): 
            frames.append(final_frame)
        break

env.close()

# --- SAUVEGARDE ---
if len(frames) > 0:
    print(f"Sauvegarde de {len(frames)} frames dans 'resultat_sac.gif'...")
    imageio.mimsave('resultat_sac.gif', frames, fps=15, loop=0)
    print("‚úÖ Vid√©o sauvegard√©e !")
else:
    print("‚ùå Erreur : Aucune frame captur√©e.")


--- ENREGISTREMENT VIDEO SAC ---
Tous les obstacles ont √©t√© supprim√©s.
Mod√®le SAC charg√© avec succ√®s.
G√©n√©ration de la vid√©o...
Fin au step 39: But atteint ! üèÜ
Sauvegarde de 55 frames dans 'resultat_sac.gif'...
‚úÖ Vid√©o sauvegard√©e !


In [27]:
import time
import imageio
import numpy as np
import MARL_env as generic_mpe
# from my_agents import PPOAgent 

print("\n--- ENREGISTREMENT VIDEO PPO (CORRIG√â) ---")

# --- CONFIG ---
builder = MAPFBuilder(num_total_agents=1, obs_radius=1.0) 
start_pos = [-0.8, -0.8]; goal_pos = [0.8, 0.8]
builder.set_agent(0, start_pos, goal_pos, size=0.1, color=[0.1, 0.1, 0.9])
builder.reset_obstacles()

# --- INITIALISATION ---
env = builder.get_env(mode='test', render_mode='rgb_array')

obs, _ = env.reset()
state = obs['agent_0']

agent = PPOAgent(state_dim=4, action_dim=5)

try:
    agent.load("model_agent_ppo.pth") # Ou "model_agent_ppo.pth" selon ton fichier
    print("‚úÖ Mod√®le PPO charg√©.")
except:
    print("‚ùå Attention : Mod√®le introuvable !")

frames = []
print("G√©n√©ration des frames...")

for step in range(200):
    # --- CORRECTION : On unpack le tuple (action, _) ---
    action_idx, _ = agent.select_action(state, eval=True)
    
    action_vec = agent.action_to_vector(action_idx)
    
    obs, _, terms, truncs, _ = env.step({'agent_0': action_vec})
    
    frame = env.render()
    frames.append(frame)
    
    if 'agent_0' in obs:
        state = obs['agent_0']
    
    if terms['agent_0'] or truncs['agent_0']:
        status = "But atteint ! üèÜ" if terms['agent_0'] else "Temps √©coul√©."
        print(f"Fin au step {step} : {status}")
        for _ in range(15): frames.append(env.render()) 
        break

env.close()

if len(frames) > 0:
    print(f"Sauvegarde de {len(frames)} frames...")
    imageio.mimsave('resultat_PPO.gif', frames, fps=15, loop=0)
    print("‚úÖ Vid√©o sauvegard√©e sous 'resultat_PPO.gif'")
else:
    print("‚ùå Aucune frame captur√©e.")


--- ENREGISTREMENT VIDEO PPO (CORRIG√â) ---
Tous les obstacles ont √©t√© supprim√©s.
‚úÖ Mod√®le PPO charg√©.
G√©n√©ration des frames...
Fin au step 30 : But atteint ! üèÜ
Sauvegarde de 46 frames...
‚úÖ Vid√©o sauvegard√©e sous 'resultat_PPO.gif'


In [19]:
import time
import imageio
import numpy as np
import MARL_env as generic_mpe
# from my_agents import ActorCriticAgent # Assure-toi d'importer ta classe A2C

print("\n--- ENREGISTREMENT VIDEO A2C ---")

# --- CONFIG ---
builder = MAPFBuilder(num_total_agents=1, obs_radius=1.0) 
start_pos = [-0.8, -0.8]; goal_pos = [0.8, 0.8]
# Couleur JAUNE pour A2C
builder.set_agent(0, start_pos, goal_pos, size=0.1, color=[0.9, 0.9, 0.1])
builder.reset_obstacles()

# --- INITIALISATION ---
env = builder.get_env(mode='test', render_mode='rgb_array')

obs, _ = env.reset()
state = obs['agent_0']

# Chargement de l'agent A2C
agent = ActorCriticAgent(state_dim=4, action_dim=5)

try:
    agent.load("model_agent_a2c.pth")
    print("‚úÖ Mod√®le A2C charg√©.")
except:
    print("‚ùå Attention : 'model_agent_a2c.pth' introuvable !")

frames = []
print("G√©n√©ration des frames...")

for step in range(200):
    action_idx = agent.select_action(state, eval=True)
    action_vec = agent.action_to_vector(action_idx)
    
    obs, _, terms, truncs, _ = env.step({'agent_0': action_vec})
    
    frame = env.render()
    frames.append(frame)
    
    if 'agent_0' in obs:
        state = obs['agent_0']
    
    if terms['agent_0'] or truncs['agent_0']:
        status = "But atteint ! üèÜ" if terms['agent_0'] else "Temps √©coul√©."
        print(f"Fin au step {step} : {status}")
        for _ in range(15): frames.append(env.render()) 
        break

env.close()

# --- SAUVEGARDE ---
if len(frames) > 0:
    print(f"Sauvegarde de {len(frames)} frames...")
    imageio.mimsave('resultat_A2C.gif', frames, fps=15, loop=0)
    print("‚úÖ Vid√©o sauvegard√©e sous 'resultat_A2C.gif'")
else:
    print("‚ùå Aucune frame captur√©e.")


--- ENREGISTREMENT VIDEO A2C ---
Tous les obstacles ont √©t√© supprim√©s.
‚úÖ Mod√®le A2C charg√©.
G√©n√©ration des frames...
Sauvegarde de 200 frames...
‚úÖ Vid√©o sauvegard√©e sous 'resultat_A2C.gif'
