In [1]:
import numpy as np
import random
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
from env import MazeEnv

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

In [4]:
# Select the compute device once for the whole notebook: CUDA when PyTorch
# reports it as usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [None]:
class UAVEnv:
    """Grid world in which every drone must reach a shared goal cell.

    Drones start on the column ``x == 0`` at ``(0, 0), (0, 1), ...`` and move
    one cell per step. An episode is solved only when *all* drones sit on the
    goal cell at the same time.

    Args:
        grid_size: ``(width, height)`` of the grid.
        num_drones: Number of drones controlled at each step.
        device: Torch device on which observation tensors are created. When
            omitted, CUDA is used if available and the CPU otherwise.
    """

    def __init__(self, grid_size=(10, 10), num_drones=3, device=None):
        self.grid_size = grid_size
        self.num_drones = num_drones
        self.action_space = ["up", "down", "left", "right", "stay"]
        # The goal is the far corner of the grid. It is derived from grid_size
        # (instead of a fixed (9, 9)) so that smaller or larger grids still
        # have a reachable goal; the default 10x10 grid keeps (9, 9).
        self.goal = (grid_size[0] - 1, grid_size[1] - 1)
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device
        self.reset()

    def _observation(self):
        """Return all drone coordinates as one flat float32 tensor (x0, y0, x1, y1, ...)."""
        return torch.tensor(self.drones, dtype=torch.float32, device=self.device).flatten()

    def reset(self):
        """Put every drone back on its start cell and return the observation."""
        self.drones = [(0, i) for i in range(self.num_drones)]
        return self._observation()

    def step(self, actions):
        """Apply one action per drone.

        Args:
            actions: Sequence indexed by drone; each entry is one of the
                strings in ``self.action_space``. A move that would leave the
                grid, ``"stay"``, and any unrecognised value all leave the
                drone where it is.

        Returns:
            ``(observation, reward, done)`` where ``reward`` is ``100`` when
            every drone is on the goal and ``-1`` otherwise, and ``done`` is
            ``True`` exactly when the reward is ``100``.
        """
        new_positions = []
        for i, (x, y) in enumerate(self.drones):
            action = actions[i]
            next_pos = (x, y)
            if action == "up" and y < self.grid_size[1] - 1:
                next_pos = (x, y + 1)
            elif action == "down" and y > 0:
                next_pos = (x, y - 1)
            elif action == "left" and x > 0:
                next_pos = (x - 1, y)
            elif action == "right" and x < self.grid_size[0] - 1:
                next_pos = (x + 1, y)
            new_positions.append(next_pos)

        self.drones = new_positions
        done = all(drone == self.goal for drone in self.drones)
        reward = 100 if done else -1
        return self._observation(), reward, done


In [5]:
class DQN(nn.Module):
    """Fully connected Q-network: two hidden ReLU layers of width 128 and a linear head.

    Args:
        input_dim: Number of features in one input vector.
        output_dim: Number of values produced per input vector.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_width = 128
        # Layer attribute names are kept as fc1/fc2/fc3 so state_dict keys stay stable.
        self.fc1 = nn.Linear(input_dim, hidden_width)
        self.fc2 = nn.Linear(hidden_width, hidden_width)
        self.fc3 = nn.Linear(hidden_width, output_dim)

    def forward(self, x):
        """Map a batch of inputs to raw (un-activated) output values."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.fc3(hidden)

# --- Environment and network sizes (one shared Q-network applied to each agent's own state) ---
env = MazeEnv()
state_dim = 42 # length of the vector fed to the network for ONE agent. Original note: "10 per drone + 2 for the goal" — NOTE(review): that does not obviously add up to 42; confirm against MazeEnv's observation layout.
#action_dim = len(env.action_space)
action_dim = env.action_space.n

# Online network (trained) and target network (used for bootstrap targets);
# the target starts as an exact copy of the online weights.
q_network = DQN(state_dim, action_dim).to(device)
target_network = DQN(state_dim, action_dim).to(device)
target_network.load_state_dict(q_network.state_dict())

optimizer = optim.Adam(q_network.parameters(), lr=0.0005)  # Lower LR for stability
loss_fn = nn.MSELoss()

# Bounded experience replay: once 10000 transitions are stored, the oldest are dropped.
replay_buffer = deque(maxlen=10000)

# --- Training hyper-parameters ---
epsilon = 1.0           # initial probability of taking random actions
epsilon_decay = 0.997   # multiplicative decay factor for epsilon
epsilon_min = 0.05      # floor below which epsilon is not decayed
gamma = 0.99            # discount factor used in the TD target
batch_size = 64         # number of transitions sampled per gradient step
num_episodes = 500  # Reduced for faster training

# def get_action(state, epsilon):
#     q_values=[]
#     if np.random.rand() < epsilon:
#         return [random.choice(env.action_space) for _ in range(env.num_drones)]
    
#     with torch.no_grad():
#         state_tensor = state.unsqueeze(0)  # Add batch dimension
#         for k in range(env.num_agents) :
#             q_values.append(q_network(state_tensor))
        
#         return [env.action_space[q_values[k].argmax().item()] for k in range(env.num_drones)]
    

def get_action(state, epsilon):
    """Choose one discrete action index per agent with an epsilon-greedy rule.

    Args:
        state: Indexable by agent; ``state[k]`` is the observation passed to
            the Q-network for agent ``k``.
        epsilon: Probability of ignoring the network and sampling every
            agent's action uniformly at random.

    Returns:
        A list of ``env.num_agents`` integer action indices.
    """
    agent_count = env.num_agents

    # Exploration: a single draw decides for the whole team.
    if np.random.rand() < epsilon:
        return [random.randrange(env.action_space.n) for _ in range(agent_count)]

    def _greedy_index(agent_idx):
        # Add a batch dimension so the network sees a (1, features) input.
        observation = torch.tensor(state[agent_idx], dtype=torch.float32).unsqueeze(0).to(device)
        return q_network(observation).argmax().item()

    # Exploitation: each agent takes the action with the highest predicted value.
    with torch.no_grad():
        return [_greedy_index(agent_idx) for agent_idx in range(agent_count)]

In [8]:
# Training loop: one shared Q-network is trained on every agent's own
# state/action, all agents sharing the summed team reward.
for episode in range(num_episodes):
    state, _ = env.reset()
    episode_over = False
    total_reward = 0

    while not episode_over:
        actions = get_action(state, epsilon)
        next_state, rewards, done, truncated, info = env.step(actions)

        # Stop on termination OR truncation. Ignoring `truncated` leaves the
        # loop spinning forever when the environment cuts an episode short
        # without ever reporting `done`.
        # NOTE(review): assumes `done` is a scalar flag and `truncated` is a
        # scalar or array-like of flags — confirm against MazeEnv.step.
        episode_over = bool(done) or bool(np.any(truncated))

        reward = sum(rewards)  # team reward: sum of the per-agent rewards

        # Store the full multi-agent transition. Only `done` (true termination)
        # is stored, so a truncated transition still bootstraps from its next state.
        replay_buffer.append((state, actions, reward, next_state, done))

        if len(replay_buffer) >= batch_size:
            batch = random.sample(replay_buffer, batch_size)
            states_batch, actions_batch, rewards_batch, next_states_batch, dones_batch = zip(*batch)

            rewards_tensor = torch.tensor(rewards_batch, dtype=torch.float32, device=device)
            dones_tensor = torch.tensor(dones_batch, dtype=torch.float32, device=device)

            total_loss = 0

            for agent_idx in range(env.num_agents):
                # State of this agent in every sampled transition.
                states_tensor = torch.stack([
                    torch.tensor(s[agent_idx], dtype=torch.float32) for s in states_batch
                ]).to(device)

                next_states_tensor = torch.stack([
                    torch.tensor(ns[agent_idx], dtype=torch.float32) for ns in next_states_batch
                ]).to(device)

                # Action this agent took in every sampled transition, shaped (batch, 1) for gather.
                actions_tensor = torch.tensor([
                    joint_action[agent_idx] for joint_action in actions_batch
                ], dtype=torch.long, device=device).unsqueeze(1)

                q_values = q_network(states_tensor)
                selected_q_values = q_values.gather(1, actions_tensor).squeeze(1)

                # The TD target is a constant w.r.t. the online network, so it is
                # built without tracking gradients.
                with torch.no_grad():
                    next_q_values = target_network(next_states_tensor)
                    target_q_values = rewards_tensor + gamma * torch.max(next_q_values, dim=1)[0] * (1 - dones_tensor)

                total_loss += loss_fn(selected_q_values, target_q_values)

            total_loss /= env.num_agents  # mean loss over the agents

            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

        state = next_state
        total_reward += reward

    # Decay exploration once per episode, never below the floor.
    epsilon = max(epsilon * epsilon_decay, epsilon_min)

    # Refresh the target network from the online network every 10 episodes.
    if episode % 10 == 0:
        target_network.load_state_dict(q_network.state_dict())

    if episode % 50 == 0:
        print(f"Episode {episode}, Total Reward: {total_reward}")

print("Training Completed.")


Pygame window closed


KeyboardInterrupt: 

### État global donné au DQN

In [None]:
class DQN(nn.Module):
    """Multi-layer perceptron used as the Q-network (two 128-unit ReLU layers, linear output).

    This repeats the ``DQN`` class declared earlier in the notebook with the
    same layers and forward pass, so running this cell rebinds the name to an
    equivalent class.

    Args:
        input_dim: Number of features in one input vector.
        output_dim: Number of values produced per input vector.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_width = 128
        # Attribute names fc1/fc2/fc3 are kept so saved state_dict keys still match.
        self.fc1 = nn.Linear(input_dim, hidden_width)
        self.fc2 = nn.Linear(hidden_width, hidden_width)
        self.fc3 = nn.Linear(hidden_width, output_dim)

    def forward(self, x):
        """Return the raw output values for a batch of inputs."""
        activations = x
        for layer in (self.fc1, self.fc2):
            activations = torch.relu(layer(activations))
        return self.fc3(activations)

# --- Environment and network sizes for the "global state" variant: one network
# sees a 42 * 4 input and emits 4 * n outputs. ---
env = MazeEnv()
state_dim = 42 * 4 # NOTE(review): presumably four 42-feature agent states concatenated; the original note ("10 per drone + 2 for the goal") was carried over from the per-agent cell and does not describe this value — confirm.
#action_dim = len(env.action_space)
action_dim = 4 * env.action_space.n # NOTE(review): presumably one group of n action values per agent, for 4 agents; the 4 is hard-coded rather than read from env.num_agents — confirm.

# Online network (trained) and target network; the target starts as an exact
# copy of the online weights.
q_network = DQN(state_dim, action_dim).to(device)
target_network = DQN(state_dim, action_dim).to(device)
target_network.load_state_dict(q_network.state_dict())

optimizer = optim.Adam(q_network.parameters(), lr=0.0005)  # Lower LR for stability
loss_fn = nn.MSELoss()

# Bounded experience replay: once 10000 transitions are stored, the oldest are dropped.
replay_buffer = deque(maxlen=10000)

# --- Training hyper-parameters (same values as the per-agent experiment above) ---
epsilon = 1.0           # initial probability of taking random actions
epsilon_decay = 0.997   # multiplicative decay factor for epsilon
epsilon_min = 0.05      # lower bound for epsilon
gamma = 0.99            # discount factor
batch_size = 64         # replay mini-batch size
num_episodes = 500  # Reduced for faster training

# def get_action(state, epsilon):
#     q_values=[]
#     if np.random.rand() < epsilon:
#         return [random.choice(env.action_space) for _ in range(env.num_drones)]
    
#     with torch.no_grad():
#         state_tensor = state.unsqueeze(0)  # Add batch dimension
#         for k in range(env.num_agents) :
#             q_values.append(q_network(state_tensor))
        
#         return [env.action_space[q_values[k].argmax().item()] for k in range(env.num_drones)]
    

def get_action(state, epsilon):
    """Choose one action index per agent from a single joint-state Q-network.

    In this section the network is built with an input of all agents' states
    concatenated and an output of ``num_agents * n_actions`` values. Feeding it
    one agent's state at a time (as the per-agent version did) does not match
    that input size, and a flat argmax over the whole output can return an
    index outside the action space. Here the joint state is passed through
    the network once and the best action is picked per agent.

    Args:
        state: Indexable by agent; ``state[k]`` is agent ``k``'s observation.
        epsilon: Probability of sampling every agent's action uniformly at
            random instead of using the network.

    Returns:
        A list of ``env.num_agents`` integer action indices, each in
        ``range(env.action_space.n)``.
    """
    num_agents = env.num_agents
    num_actions = env.action_space.n

    # Exploration: a single draw decides for the whole team.
    if np.random.rand() < epsilon:
        return [random.randrange(num_actions) for _ in range(num_agents)]

    with torch.no_grad():
        # Concatenate the per-agent observations in agent order into one
        # (1, total_features) input.
        joint_state = torch.cat([
            torch.tensor(state[k], dtype=torch.float32).flatten()
            for k in range(num_agents)
        ]).unsqueeze(0).to(device)

        # NOTE(review): assumes the output is laid out agent-major, i.e. agent
        # k's action values occupy [k * n_actions, (k + 1) * n_actions); the
        # training code must index the output the same way. reshape() raises
        # if the network's output size is not num_agents * n_actions.
        q_values = q_network(joint_state).reshape(num_agents, num_actions)
        return q_values.argmax(dim=1).tolist()