In [37]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import argparse
import time
from typing import Tuple, Optional, Dict
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random

from process_state import StateNormalizer
from env import MazeEnv
from agent_basic import MyAgents

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
import os
os.getcwd()
checkpoint_dir = 'checkpoints_qlearn_CTDE'
os.makedirs(checkpoint_dir, exist_ok=True)

In [39]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [41]:
# Define normalizer
normalizer = StateNormalizer(grid_size=100, max_lidar_range=10, max_agents=1)

# Example state (including 1 deactivated agent)
state_list = [
    45, 60, 2.0, 0, 80, 20,  # Agent pos, orientation, status, goal pos
    5.0, 1, 10.0, 0, 2.0, 3  # LIDAR readings (main, right, left)
    
    
]

# Normalize the state
normalized_state = normalizer.normalize_agent_state(state_list)

print("Normalized State Shape:", normalized_state.shape)
print("Normalized State:", normalized_state)


Normalized State Shape: (52,)
Normalized State: [0.45 0.6  0.   0.   1.   0.   1.   0.   0.   0.8  0.2  0.5  0.   1.
 0.   0.   1.   1.   0.   0.   0.   0.2  0.   0.   0.   1.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]


In [6]:
import random
import numpy as np
from collections import deque

class MultiAgentReplayBuffer:
    def __init__(self, capacity, num_agents):
        self.capacity = capacity
        self.buffer = deque(maxlen=capacity)
        self.num_agents = num_agents

    def push(self, states, actions, rewards, next_states, dones):
        """ Store transition for all agents """
        self.buffer.append((states, actions, rewards, next_states, dones))

    def sample(self, batch_size):
        """ Sample a batch of experiences """
        batch = random.sample(self.buffer, batch_size)
        
        states, actions, rewards, next_states, dones = zip(*batch)

        # Convert to NumPy arrays
        states = np.array(states)
        actions = np.array(actions)
        rewards = np.array(rewards)
        next_states = np.array(next_states)
        dones = np.array(dones)
        
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim

class DQN(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)  # Q-values for each action


In [60]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

class MultiAgentDQN:
    def __init__(self, num_agents, state_dim, action_dim, grid_size, max_lidar_range, device, lr=0.0001, gamma=0.99):
        self.num_agents = num_agents
        self.gamma = gamma
        self.normalizer = StateNormalizer(grid_size, max_lidar_range)
        self.device = device
        self.batch_size = 1024
        self.epsilon = 1.0
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        
        self.state_dim = state_dim
        self.action_dim = action_dim
        
        # Initialize agents and target networks
        self.agents = [DQN(state_dim, action_dim).to(self.device) for _ in range(num_agents)]
        self.target_agents = [DQN(state_dim, action_dim).to(self.device)  for _ in range(num_agents)]

        # Sync target networks
        for i in range(num_agents):
            self.target_agents[i].load_state_dict(self.agents[i].state_dict())

        self.optimizers = [optim.Adam(agent.parameters(), lr=lr) for agent in self.agents]

        self.replay_buffer = MultiAgentReplayBuffer(capacity=10000, num_agents=num_agents)
        self.loss_fn = nn.MSELoss()

    def select_actions(self, raw_states):
        """ Normalize states and select actions using ε-greedy policy """
        normalized_states = [self.normalizer.normalize_agent_state(s) for s in raw_states]
        actions = []
        for i in range(self.num_agents):
            if np.random.rand() < self.epsilon:
                actions.append(np.random.randint(0, self.action_dim))  # Random action (exploration)
            else:
                state_tensor = torch.FloatTensor(normalized_states[i]).to(self.device).unsqueeze(0)
                q_values = self.agents[i](state_tensor)
                actions.append(torch.argmax(q_values).item())  # Best action (exploitation)
        return actions

    def update_agents(self):
        """ Train each agent using centralized buffer """
        if len(self.replay_buffer) < self.batch_size:
            return  

        states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)

        for i in range(self.num_agents):
            state_batch = torch.FloatTensor(np.array([self.normalizer.normalize_agent_state(s[i]) for s in states])).to(self.device)
            next_state_batch = torch.FloatTensor(np.array([self.normalizer.normalize_agent_state(ns[i]) for ns in next_states])).to(self.device)
            action_batch = torch.LongTensor(actions[:, i]).unsqueeze(1).to(self.device)
            reward_batch = torch.FloatTensor(rewards[:, i]).to(self.device)
            done_batch = torch.FloatTensor(dones[:, i]).to(self.device)

            with torch.no_grad():
                next_q_values = self.target_agents[i](next_state_batch).max(1)[0]
                target_q_values = reward_batch + (self.gamma * next_q_values * (1 - done_batch))

            current_q_values = self.agents[i](state_batch).gather(1, action_batch).squeeze()
            loss = self.loss_fn(current_q_values, target_q_values)

            self.optimizers[i].zero_grad()
            loss.backward()
            self.optimizers[i].step()
    
    def update_target_networks(self):
        """ Update target networks using Polyak averaging """
        for i in range(self.num_agents):
            for target_param, param in zip(self.target_agents[i].parameters(), self.agents[i].parameters()):
                target_param.data.copy_(0.995 * target_param.data + 0.005 * param.data)

In [46]:
def simulation_config(config_path: str, new_agent: bool = True) -> Tuple[MazeEnv, Optional[MyAgents], Dict]:
    """
    Configure the environment and optionally an agent using a JSON configuration file.

    Args:
        config_path (str): Path to the configuration JSON file.
        new_agent (bool): Whether to initialize the agent. Defaults to True.

    Returns:
        Tuple[MazeEnv, Optional[MyAgent], Dict]: Configured environment, agent (if new), and the configuration dictionary.
    """
    
    # Read config
    with open(config_path, 'r') as config_file:
        config = json.load(config_file)

    # Env configuration
    env = MazeEnv(
        size= 15,                               # Grid size
        walls_proportion=0.3,            # Walls proportion in the grid
        num_dynamic_obstacles=1,  # Number of dynamic obstacles
        num_agents=1,                        # Number of agents
        communication_range=config.get('communication_range'),      # Maximum distance for agent communications
        max_lidar_dist_main=config.get('max_lidar_dist_main'),      # Maximum distance for main LIDAR scan
        max_lidar_dist_second=config.get('max_lidar_dist_second'),  # Maximum distance for secondary LIDAR scan
        max_episode_steps=config.get('max_episode_steps'),          # Number of steps before episode termination
        render_mode=config.get('render_mode', None),
        seed=config.get('seed', None)                               # Seed for reproducibility
    )

    # Agent configuration
    agents = MultiAgentDQN(num_agents=1,state_dim=52,action_dim=env.action_space.n,grid_size=env.grid_size, max_lidar_range=env.max_lidar_dist_main, device=device) if new_agent else None

    return env, agents, config

In [65]:
# Environment and agent configuration
env, agents, config = simulation_config('config.json')
max_episodes = 20000
num_agents = 1

# Metrics to follow the performance
all_rewards = []
total_reward = 0
episode_count = 0
epsilon_history = []
loss_history = np.zeros((num_agents, max_episodes)) 
# Initial reset of the environment
state, info = env.reset()
time.sleep(1)
try:
    while episode_count < max_episodes:
        states, info = env.reset()
        done = False
        while not done:
            # Determine agents actions
            actions = agents.select_actions(states)

            # Execution of a simulation step
            next_state, rewards, terminated, truncated, info = env.step(actions)
            total_reward += np.sum(rewards)
            
            agents.replay_buffer.push(states, actions, rewards, next_state, [terminated] * num_agents)
            agents.update_agents()
            
            done = terminated or truncated
        
        #update espilon
        """ if episode_count > 1: """
        agents.epsilon = max(agents.epsilon_min, agents.epsilon * 0.995)
            
        # Update the target network
        if episode_count % 20 == 1:
            agents.update_target_networks()
        
        if episode_count % 100 == 1:
            for i in range(num_agents):
                torch.save(agents.agents[i].state_dict(), f"{checkpoint_dir}/reward_trick_agent_{i}_episode_{episode_count}.pth")
                torch.save(agents.target_agents[i].state_dict(), f"{checkpoint_dir}/reward_trick_target_agent_{i}_episode_{episode_count}.pth")
        # Display of the step information
        print(f"\rEpisode {episode_count + 1}, Step {info['current_step']}, "
                f"Reward: {total_reward:.2f}, "
                f"Evacuated: {len(info['evacuated_agents'])}, "
                f"Deactivated: {len(info['deactivated_agents'])}", end='')
        
        # Pause
        #time.sleep(1)
        # If the episode is terminated
        if terminated or truncated:
            print("\r")
            episode_count += 1
            all_rewards.append(total_reward)
            total_reward = 0
            
            if episode_count < max_episodes:
                state, info = env.reset()

except KeyboardInterrupt:
    print("\nSimulation interrupted by the user")

finally:
    env.close()


Episode 1, Step 5, Reward: -129.63, Evacuated: 0, Deactivated: 1
Episode 2, Step 4, Reward: -119.63, Evacuated: 0, Deactivated: 1
Episode 3, Step 11, Reward: -158.43, Evacuated: 0, Deactivated: 1
Episode 4, Step 10, Reward: -179.63, Evacuated: 0, Deactivated: 1
Episode 5, Step 2, Reward: -99.63, Evacuated: 0, Deactivated: 1
Episode 6, Step 28, Reward: -276.37, Evacuated: 0, Deactivated: 1
Episode 7, Step 23, Reward: -257.63, Evacuated: 0, Deactivated: 1
Episode 8, Step 24, Reward: -309.26, Evacuated: 0, Deactivated: 1
Episode 9, Step 68, Reward: -624.17, Evacuated: 0, Deactivated: 1
Episode 10, Step 77, Reward: -683.29, Evacuated: 0, Deactivated: 1
Episode 11, Step 10, Reward: -179.63, Evacuated: 0, Deactivated: 1
Episode 12, Step 5, Reward: -129.63, Evacuated: 0, Deactivated: 1
Episode 13, Step 109, Reward: -919.45, Evacuated: 0, Deactivated: 1
Episode 14, Step 11, Reward: -189.63, Evacuated: 0, Deactivated: 1
Episode 15, Step 21, Reward: -279.25, Evacuated: 0, Deactivated: 1
Episode 

In [62]:
def evaluate(configs_paths: list, trained_agent: MyAgents, num_episodes: int = 10) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Evaluate a trained agent on multiple configurations, calculate metrics, and visualize results.

    Args:
        config_path (list): List of paths to the configuration JSON files.
        trained_agent (MyAgent): A pre-trained agent to evaluate.
        num_episodes (int): Number of episodes to run for evaluation per configuration. Defaults to 10.

    Returns:
        pd.DataFrame: A DataFrame containing evaluation metrics for each episode and configuration.
    """

    # Evaluation results
    all_results = pd.DataFrame()

    for config_path in configs_paths:
        print(f"\n--- Evaluating Configuration: {config_path} ---")

        # Environment configuration
        env, _, config = simulation_config(config_path, new_agent=False)

        # Metrics to follow the performance
        metrics = []
        total_reward = 0
        episode_count = 0
        
        # Initial reset of the environment
        state, info = env.reset()
        time.sleep(1) 
   
        # Run evaluation for the specified number of episodes
        try:
            while episode_count < num_episodes:
                # Determine agents actions
                actions = trained_agent.select_actions(state)
                #print(actions)
                # Execution of a simulation step
                state, rewards, terminated, truncated, info = env.step(actions)
                #print(rewards)
                total_reward += np.sum(rewards)

                # Display of the step information
                print(f"\rEpisode {episode_count + 1}/{num_episodes}, Step {info['current_step']}, "
                    f"Reward: {total_reward:.2f}, "
                    f"Evacuated: {len(info['evacuated_agents'])}, "
                    f"Deactivated: {len(info['deactivated_agents'])}", end='')
            
                # Pause
                

                # If the episode is terminated
                if terminated or truncated:
                    print("\r")
                    # Save metrics
                    metrics.append({
                        "config_path": config_path,
                        "episode": episode_count + 1,
                        "steps": info['current_step'],
                        "reward": total_reward,
                        "evacuated": len(info['evacuated_agents']),
                        "deactivated": len(info['deactivated_agents'])
                    })

                    episode_count += 1
                    total_reward = 0

                    if episode_count < num_episodes:
                        state, info = env.reset()
        
        except KeyboardInterrupt:
            print("\nSimulation interrupted by the user")
        
        finally:
            env.close()

        # Convert the current configuration's metrics to a DataFrame
        config_results = pd.DataFrame(metrics)
        all_results = pd.concat([all_results, config_results], ignore_index=True)
    
    env.close()

    all_results.to_csv('all_results.csv', index=False)

    return all_results

In [34]:
env, _, config = simulation_config('config.json')

In [66]:
# Load the trained agent
agents = MultiAgentDQN(num_agents=1,state_dim=52,action_dim=env.action_space.n,grid_size=env.grid_size, max_lidar_range=env.max_lidar_dist_main, device=device)
max_episodes = 20000
num_agents = 1

for i in range(num_agents):
    agents.agents[i].load_state_dict(torch.load(f"{checkpoint_dir}/reward_trick_agent_{i}_episode_901.pth"))
    agents.target_agents[i].load_state_dict(torch.load(f"{checkpoint_dir}/reward_trick_target_agent_{i}_episode_901.pth"))
agents.epsilon = 0
all_results = evaluate(configs_paths=['config.json'], trained_agent=agents)

  agents.agents[i].load_state_dict(torch.load(f"{checkpoint_dir}/reward_trick_agent_{i}_episode_901.pth"))
  agents.target_agents[i].load_state_dict(torch.load(f"{checkpoint_dir}/reward_trick_target_agent_{i}_episode_901.pth"))



--- Evaluating Configuration: config.json ---
Episode 1/10, Step 1000, Reward: -9968.85, Evacuated: 0, Deactivated: 0
Episode 2/10, Step 44, Reward: -319.48, Evacuated: 0, Deactivated: 1
Episode 3/10, Step 1000, Reward: -10000.00, Evacuated: 0, Deactivated: 0
Episode 4/10, Step 627, Reward: -6360.00, Evacuated: 0, Deactivated: 1
Episode 5/10, Step 518, Reward: -2594.44, Evacuated: 0, Deactivated: 1
Episode 6/10, Step 845, Reward: -8529.63, Evacuated: 0, Deactivated: 1
Episode 7/10, Step 169, Reward: -1738.43, Evacuated: 0, Deactivated: 1
Episode 8/10, Step 1000, Reward: -4814.81, Evacuated: 0, Deactivated: 0
Episode 9/10, Step 115, Reward: -1240.00, Evacuated: 0, Deactivated: 1
Episode 10/10, Step 1000, Reward: -10000.00, Evacuated: 0, Deactivated: 0
