In [3]:
import pygame
import random
import math

# Grid and cell configuration
GRID_SIZE = 10  # number of cells along each side of the square grid
CELL_SIZE = 50  # pixels per grid cell
WINDOW_SIZE = GRID_SIZE * CELL_SIZE  # side length of the square window, in pixels
NUM_OBSTACLES = 10  # how many obstacle cells Gridworld.generate_obstacles places

# Action names understood by Gridworld.step; main() samples from this list.
ACTIONS = ["up", "down", "left", "right"]

class Gridworld:
    """Two-agent grid environment with fixed, randomly placed obstacles.

    Cells are addressed as ``(x, y)`` with ``(0, 0)`` in the top-left corner;
    ``y`` grows downwards ("up" decreases ``y``), matching the pixel layout
    used by :meth:`render`.
    """

    def __init__(self):
        self.grid_size = GRID_SIZE
        self.cell_size = CELL_SIZE
        # Agent 0 targets the bottom-right corner, agent 1 the bottom-left.
        self.goals = [(GRID_SIZE - 1, GRID_SIZE - 1), (0, GRID_SIZE - 1)]
        self.obstacles = self.generate_obstacles()
        # Both agents start at top-left (0,0)
        self.agents = [(0, 0), (0, 0)]
        # Added to the step reward whenever the agents are closer than 2 cells.
        self.proximity_penalty = -10

    def generate_obstacles(self, num_obstacles=None):
        """Return a list of distinct random obstacle cells.

        The start cell ``(0, 0)`` and every goal cell are never chosen.

        Args:
            num_obstacles: Number of obstacles to place. Defaults to the
                module-level ``NUM_OBSTACLES``.

        Returns:
            A list of ``(x, y)`` tuples (order is unspecified).

        Raises:
            ValueError: If more obstacles are requested than there are free
                cells; without this check the sampling loop would never end.
        """
        if num_obstacles is None:
            num_obstacles = NUM_OBSTACLES
        size = self.grid_size
        reserved = {(0, 0)} | {tuple(goal) for goal in self.goals}
        # Only reserved cells that actually lie on the grid reduce capacity.
        reserved_on_grid = sum(
            1 for (cx, cy) in reserved if 0 <= cx < size and 0 <= cy < size
        )
        free_cells = size * size - reserved_on_grid
        if num_obstacles > free_cells:
            raise ValueError(
                f"cannot place {num_obstacles} obstacles: "
                f"only {free_cells} free cells available"
            )

        obstacles = set()
        while len(obstacles) < num_obstacles:
            cell = (random.randint(0, size - 1), random.randint(0, size - 1))
            if cell not in reserved:
                obstacles.add(cell)
        return list(obstacles)

    def reset(self):
        """Put both agents back on the start cell; obstacles are unchanged."""
        self.agents = [(0, 0), (0, 0)]

    def step(self, actions):
        """Advance both agents by one move.

        Args:
            actions: One entry per agent, each "up", "down", "left" or
                "right". Any other value leaves that agent in place.

        Returns:
            ``(positions, reward)`` where ``positions`` is the updated list of
            agent cells and ``reward`` is a single shared scalar:
            ``proximity_penalty`` if the agents are closer than 2 cells
            (Euclidean), plus 10 for every agent standing on its own goal.
        """
        new_positions = []
        for idx, (x, y) in enumerate(self.agents):
            action = actions[idx]
            dx, dy = 0, 0
            if action == "up":
                dy = -1
            elif action == "down":
                dy = 1
            elif action == "left":
                dx = -1
            elif action == "right":
                dx = 1
            target = (x + dx, y + dy)
            in_bounds = (0 <= target[0] < self.grid_size
                         and 0 <= target[1] < self.grid_size)
            # Moves off the grid or into an obstacle are cancelled.
            if not in_bounds or target in self.obstacles:
                target = (x, y)
            new_positions.append(target)
        self.agents = new_positions

        reward = 0
        first, second = self.agents[0], self.agents[1]
        if math.hypot(first[0] - second[0], first[1] - second[1]) < 2:
            reward += self.proximity_penalty
        for idx, pos in enumerate(self.agents):
            if pos == self.goals[idx]:
                reward += 10  # arbitrary reward for goal achievement
        return self.agents, reward

    def render(self, screen):
        """Draw grid, obstacles, goals and agents on ``screen`` and flip it."""
        cell = self.cell_size
        # Derive the drawing extent from this instance rather than the global
        # WINDOW_SIZE so the picture always matches grid_size/cell_size.
        extent = self.grid_size * cell

        screen.fill((255, 255, 255))

        # Grid lines
        for offset in range(0, extent, cell):
            pygame.draw.line(screen, (200, 200, 200), (offset, 0), (offset, extent))
            pygame.draw.line(screen, (200, 200, 200), (0, offset), (extent, offset))

        # Obstacles: black squares
        for ox, oy in self.obstacles:
            pygame.draw.rect(screen, (0, 0, 0),
                             pygame.Rect(ox * cell, oy * cell, cell, cell))

        # Goals: green squares
        for gx, gy in self.goals:
            pygame.draw.rect(screen, (0, 255, 0),
                             pygame.Rect(gx * cell, gy * cell, cell, cell))

        # Agents: circles (agent 0 in red, agent 1 in blue)
        colors = [(255, 0, 0), (0, 0, 255)]
        for idx, (ax, ay) in enumerate(self.agents):
            center = (ax * cell + cell // 2, ay * cell + cell // 2)
            pygame.draw.circle(screen, colors[idx], center, cell // 3)

        pygame.display.flip()

def main():
    """Run a random-action demo of the gridworld in a pygame window.

    Each frame both agents take a uniformly random action, the resulting
    positions and reward are printed, and the grid is redrawn. The loop ends
    when the window is closed.
    """
    pygame.init()
    try:
        screen = pygame.display.set_mode((WINDOW_SIZE, WINDOW_SIZE))
        pygame.display.set_caption("Gridworld RL Framework")

        gridworld = Gridworld()
        clock = pygame.time.Clock()
        running = True

        while running:
            # Event handling: quit if the window is closed
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    running = False

            # For demonstration, we take random actions for both agents
            actions = [random.choice(ACTIONS) for _ in range(2)]
            agents, reward = gridworld.step(actions)
            print(f"Agent positions: {agents}, Reward: {reward}")

            gridworld.render(screen)
            clock.tick(15)  # cap the demo at 15 frames (steps) per second
    finally:
        # Always release the pygame window, even if the loop raised.
        pygame.quit()

# Start the random-action demo only when this cell/file is run as a script.
if __name__ == '__main__':
    main()


Agent positions: [(1, 0), (1, 0)], Reward: -10
Agent positions: [(1, 0), (2, 0)], Reward: -10
Agent positions: [(1, 1), (2, 0)], Reward: -10
Agent positions: [(0, 1), (2, 0)], Reward: 0
Agent positions: [(1, 1), (3, 0)], Reward: 0
Agent positions: [(1, 0), (2, 0)], Reward: -10
Agent positions: [(0, 0), (1, 0)], Reward: -10
Agent positions: [(0, 0), (1, 1)], Reward: -10
Agent positions: [(0, 0), (1, 1)], Reward: -10
Agent positions: [(0, 1), (1, 1)], Reward: -10
Agent positions: [(0, 1), (1, 1)], Reward: -10
Agent positions: [(1, 1), (1, 0)], Reward: -10
Agent positions: [(1, 2), (0, 0)], Reward: 0
Agent positions: [(0, 2), (0, 0)], Reward: 0
Agent positions: [(0, 2), (0, 1)], Reward: -10
Agent positions: [(0, 1), (0, 0)], Reward: -10
Agent positions: [(0, 2), (0, 1)], Reward: -10
Agent positions: [(0, 2), (1, 1)], Reward: -10


In [5]:
import random
import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pygame

# Constants for gridworld
GRID_SIZE = 10  # number of cells along each side of the square grid
CELL_SIZE = 50  # pixels per cell
WINDOW_SIZE = GRID_SIZE * CELL_SIZE  # side length of the square window, in pixels
NUM_OBSTACLES = 10  # how many obstacle cells Gridworld.generate_obstacles places

# ----------------------------
# Environment (Gridworld)
# ----------------------------
class Gridworld:
    """Two-agent grid environment with fixed, randomly placed obstacles.

    Cells are addressed as ``(x, y)`` with ``(0, 0)`` in the top-left corner;
    ``y`` grows downwards (action 0, "up", decreases ``y``), matching the
    pixel layout used by :meth:`render`.
    """

    def __init__(self):
        self.grid_size = GRID_SIZE
        self.cell_size = CELL_SIZE
        # Agent 0 targets the bottom-right corner, agent 1 the bottom-left.
        self.goals = [(GRID_SIZE - 1, GRID_SIZE - 1), (0, GRID_SIZE - 1)]
        self.obstacles = self.generate_obstacles()
        # Both agents start at top-left (0,0)
        self.agents = [(0, 0), (0, 0)]
        # Added to the step reward whenever the agents are closer than 2 cells.
        self.proximity_penalty = -10

    def generate_obstacles(self, num_obstacles=None):
        """Return a list of distinct random obstacle cells.

        The start cell ``(0, 0)`` and every goal cell are never chosen.

        Args:
            num_obstacles: Number of obstacles to place. Defaults to the
                module-level ``NUM_OBSTACLES``.

        Returns:
            A list of ``(x, y)`` tuples (order is unspecified).

        Raises:
            ValueError: If more obstacles are requested than there are free
                cells; without this check the sampling loop would never end.
        """
        if num_obstacles is None:
            num_obstacles = NUM_OBSTACLES
        size = self.grid_size
        reserved = {(0, 0)} | {tuple(goal) for goal in self.goals}
        # Only reserved cells that actually lie on the grid reduce capacity.
        reserved_on_grid = sum(
            1 for (cx, cy) in reserved if 0 <= cx < size and 0 <= cy < size
        )
        free_cells = size * size - reserved_on_grid
        if num_obstacles > free_cells:
            raise ValueError(
                f"cannot place {num_obstacles} obstacles: "
                f"only {free_cells} free cells available"
            )

        obstacles = set()
        while len(obstacles) < num_obstacles:
            cell = (random.randint(0, size - 1), random.randint(0, size - 1))
            if cell not in reserved:
                obstacles.add(cell)
        return list(obstacles)

    def reset(self):
        """Move both agents back to (0, 0) and return their states.

        Obstacles are left untouched.
        """
        self.agents = [(0, 0), (0, 0)]
        return self.get_states()

    def get_states(self):
        """Return one ``[agent_x, agent_y, goal_x, goal_y]`` list per agent."""
        return [
            [pos[0], pos[1], self.goals[idx][0], self.goals[idx][1]]
            for idx, pos in enumerate(self.agents)
        ]

    def step(self, actions):
        """Advance both agents by one move.

        Args:
            actions: One entry per agent, encoded as
                0: up, 1: down, 2: left, 3: right.
                Any other value leaves that agent in place.

        Returns:
            ``(next_states, reward, done)``:
              - next_states: per-agent states as produced by get_states(),
              - reward: one shared scalar; ``proximity_penalty`` if the agents
                are closer than 2 cells (Euclidean), plus 10 for every agent
                standing on its own goal,
              - done: True only when both agents are on their goals.
        """
        new_positions = []
        for idx, (x, y) in enumerate(self.agents):
            action = actions[idx]
            dx, dy = 0, 0
            if action == 0:  # up
                dy = -1
            elif action == 1:  # down
                dy = 1
            elif action == 2:  # left
                dx = -1
            elif action == 3:  # right
                dx = 1
            target = (x + dx, y + dy)
            in_bounds = (0 <= target[0] < self.grid_size
                         and 0 <= target[1] < self.grid_size)
            # Moves off the grid or into an obstacle are cancelled.
            if not in_bounds or target in self.obstacles:
                target = (x, y)
            new_positions.append(target)
        self.agents = new_positions

        reward = 0
        first, second = self.agents[0], self.agents[1]
        if math.hypot(first[0] - second[0], first[1] - second[1]) < 2:
            reward += self.proximity_penalty
        for idx, pos in enumerate(self.agents):
            if pos == self.goals[idx]:
                reward += 10  # arbitrary reward for goal achievement

        # Episode ends only once both agents sit on their own goals.
        done = (self.agents[0] == self.goals[0]
                and self.agents[1] == self.goals[1])

        return self.get_states(), reward, done

    def render(self, screen):
        """Draw grid, obstacles, goals and agents on ``screen`` and flip it."""
        cell = self.cell_size
        # Derive the drawing extent from this instance rather than the global
        # WINDOW_SIZE so the picture always matches grid_size/cell_size.
        extent = self.grid_size * cell

        screen.fill((255, 255, 255))
        # Grid lines
        for offset in range(0, extent, cell):
            pygame.draw.line(screen, (200, 200, 200), (offset, 0), (offset, extent))
            pygame.draw.line(screen, (200, 200, 200), (0, offset), (extent, offset))
        # Obstacles (black)
        for ox, oy in self.obstacles:
            pygame.draw.rect(screen, (0, 0, 0),
                             pygame.Rect(ox * cell, oy * cell, cell, cell))
        # Goals (green)
        for gx, gy in self.goals:
            pygame.draw.rect(screen, (0, 255, 0),
                             pygame.Rect(gx * cell, gy * cell, cell, cell))
        # Agents (agent 0 in red, agent 1 in blue)
        colors = [(255, 0, 0), (0, 0, 255)]
        for idx, (ax, ay) in enumerate(self.agents):
            center = (ax * cell + cell // 2, ay * cell + cell // 2)
            pygame.draw.circle(screen, colors[idx], center, cell // 3)
        pygame.display.flip()

# ----------------------------
# Replay Buffer for DQN
# ----------------------------
class ReplayBuffer:
    """Fixed-capacity store of transitions that overwrites the oldest entry.

    ``buffer`` grows until it holds ``capacity`` items; after that
    ``position`` wraps around and new transitions replace old ones in place.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Store one ``(state, action, reward, next_state, done)`` tuple."""
        transition = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            # Still filling up: the write slot is the next free index.
            self.buffer.append(transition)
        else:
            self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn without replacement."""
        batch = random.sample(self.buffer, batch_size)
        return batch

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

# ----------------------------
# DQN Network and Agent
# ----------------------------
class DQN(nn.Module):
    """Fully connected Q-network: two 32-unit ReLU layers and a linear head.

    Maps an input of size ``input_dim`` to ``output_dim`` values.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden = 32
        layers = [
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        q_values = self.net(x)
        return q_values

class DQNAgent:
    """DQN learner with a policy network, a target network and a replay buffer.

    Args:
        input_dim: Size of the state vector fed to the networks.
        output_dim: Number of discrete actions (size of the network output).
        lr: Learning rate for the Adam optimizer.
        gamma: Discount factor used in the bootstrap target.
        buffer_capacity: Maximum number of transitions kept for replay.
        batch_size: Number of transitions sampled per update.
    """

    def __init__(self, input_dim, output_dim, lr=1e-3, gamma=0.99,
                 buffer_capacity=10000, batch_size=32):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Remembered so random exploration covers exactly the network's actions.
        self.action_dim = output_dim
        self.policy_net = DQN(input_dim, output_dim).to(self.device)
        self.target_net = DQN(input_dim, output_dim).to(self.device)
        # The target network starts as a copy and is only refreshed through
        # update_target(); it is never trained directly.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.gamma = gamma
        self.replay_buffer = ReplayBuffer(buffer_capacity)
        self.batch_size = batch_size

    def select_action(self, state, epsilon):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index in
        ``range(output_dim)`` is returned; otherwise the index of the largest
        policy-network output for ``state``.
        """
        if random.random() < epsilon:
            return random.randrange(self.action_dim)
        state_tensor = torch.tensor(
            state, dtype=torch.float32, device=self.device
        ).unsqueeze(0)
        with torch.no_grad():
            q_values = self.policy_net(state_tensor)
        return q_values.argmax().item()

    def update(self):
        """Run one gradient step on a sampled minibatch.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return
        transitions = self.replay_buffer.sample(self.batch_size)
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(*transitions)

        batch_state = torch.tensor(batch_state, dtype=torch.float32, device=self.device)
        batch_action = torch.tensor(
            batch_action, dtype=torch.int64, device=self.device
        ).unsqueeze(1)
        batch_reward = torch.tensor(
            batch_reward, dtype=torch.float32, device=self.device
        ).unsqueeze(1)
        batch_next_state = torch.tensor(
            batch_next_state, dtype=torch.float32, device=self.device
        )
        batch_done = torch.tensor(
            batch_done, dtype=torch.float32, device=self.device
        ).unsqueeze(1)

        current_q = self.policy_net(batch_state).gather(1, batch_action)
        # The bootstrap target is a constant for this step: computing it
        # without autograd avoids building a graph through target_net and
        # accumulating gradients on parameters that are never optimized.
        with torch.no_grad():
            next_q = self.target_net(batch_next_state).max(1)[0].unsqueeze(1)
            target_q = batch_reward + self.gamma * next_q * (1 - batch_done)

        loss = nn.MSELoss()(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

# ----------------------------
# Training Loop
# ----------------------------
def train():
    """Train two independent DQN agents on one shared Gridworld.

    Runs 500 episodes of at most 50 steps. Both agents act epsilon-greedily,
    store the same shared reward in their own replay buffers, and learn after
    every step. Epsilon decays per episode and the target networks are
    refreshed every 10 episodes. A summary line is printed per episode.
    """
    # Hyperparameters
    num_episodes, max_steps = 500, 50
    epsilon, epsilon_min, epsilon_decay = 1.0, 0.1, 0.995
    target_update_freq = 10

    env = Gridworld()

    # State per agent is [agent_x, agent_y, goal_x, goal_y]; actions are
    # up, down, left, right.
    state_dim = 4
    action_dim = 4
    agents = [DQNAgent(state_dim, action_dim) for _ in range(2)]

    for episode in range(num_episodes):
        states = env.reset()
        total_reward = 0

        for _ in range(max_steps):
            # Every agent chooses from its own policy network.
            actions = [
                learner.select_action(states[i], epsilon)
                for i, learner in enumerate(agents)
            ]
            next_states, reward, done = env.step(actions)
            total_reward += reward

            # The single environment reward is credited to both agents.
            for i, learner in enumerate(agents):
                learner.replay_buffer.push(
                    states[i], actions[i], reward, next_states[i], done
                )

            states = next_states

            for learner in agents:
                learner.update()

            if done:
                break

        # Periodically sync each target network with its policy network.
        if episode % target_update_freq == 0:
            for learner in agents:
                learner.update_target()

        # Anneal exploration, never dropping below epsilon_min.
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        print(f"Episode {episode}, Total Reward: {total_reward:.2f}, Epsilon: {epsilon:.3f}")

        # Optional visualization of the state at the end of the episode:
        # pygame.init()
        # screen = pygame.display.set_mode((WINDOW_SIZE, WINDOW_SIZE))
        # env.render(screen)
        # pygame.time.wait(200)
        # pygame.quit()

# Start training only when this cell/file is run as a script.
if __name__ == '__main__':
    train()


Episode 0, Total Reward: -150.00, Epsilon: 0.995
Episode 1, Total Reward: -200.00, Epsilon: 0.990
Episode 2, Total Reward: -140.00, Epsilon: 0.985
Episode 3, Total Reward: -160.00, Epsilon: 0.980
Episode 4, Total Reward: -20.00, Epsilon: 0.975
Episode 5, Total Reward: -260.00, Epsilon: 0.970
Episode 6, Total Reward: -160.00, Epsilon: 0.966
Episode 7, Total Reward: -140.00, Epsilon: 0.961
Episode 8, Total Reward: -20.00, Epsilon: 0.956
Episode 9, Total Reward: -190.00, Epsilon: 0.951
Episode 10, Total Reward: -100.00, Epsilon: 0.946
Episode 11, Total Reward: -270.00, Epsilon: 0.942
Episode 12, Total Reward: -170.00, Epsilon: 0.937
Episode 13, Total Reward: -130.00, Epsilon: 0.932
Episode 14, Total Reward: -30.00, Epsilon: 0.928
Episode 15, Total Reward: -90.00, Epsilon: 0.923
Episode 16, Total Reward: -100.00, Epsilon: 0.918
Episode 17, Total Reward: -120.00, Epsilon: 0.914
Episode 18, Total Reward: -200.00, Epsilon: 0.909
Episode 19, Total Reward: -70.00, Epsilon: 0.905
Episode 20, Tot


KeyboardInterrupt



In [1]:
import random
import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pygame

# Constants for gridworld and visualization
GRID_SIZE = 10  # number of cells along each side of the square grid
CELL_SIZE = 50  # pixels per cell
WINDOW_SIZE = GRID_SIZE * CELL_SIZE  # side length of the square window, in pixels
NUM_OBSTACLES = 10  # how many obstacle cells Gridworld.generate_obstacles places

# ----------------------------
# Environment (Gridworld)
# ----------------------------
class Gridworld:
    """Two-agent grid environment with fixed, randomly placed obstacles.

    Cells are addressed as ``(x, y)`` with ``(0, 0)`` in the top-left corner;
    ``y`` grows downwards (action 0, "up", decreases ``y``), matching the
    pixel layout used by :meth:`render`.
    """

    def __init__(self):
        self.grid_size = GRID_SIZE
        self.cell_size = CELL_SIZE
        # Agent 0 targets the bottom-right corner, agent 1 the bottom-left.
        self.goals = [(GRID_SIZE - 1, GRID_SIZE - 1), (0, GRID_SIZE - 1)]
        self.obstacles = self.generate_obstacles()
        # Both agents start at top-left (0,0)
        self.agents = [(0, 0), (0, 0)]
        # Penalty if agents are too close (Euclidean distance < 2)
        self.proximity_penalty = -10

    def generate_obstacles(self, num_obstacles=None):
        """Return a list of distinct random obstacle cells.

        The start cell ``(0, 0)`` and every goal cell are never chosen.

        Args:
            num_obstacles: Number of obstacles to place. Defaults to the
                module-level ``NUM_OBSTACLES``.

        Returns:
            A list of ``(x, y)`` tuples (order is unspecified).

        Raises:
            ValueError: If more obstacles are requested than there are free
                cells; without this check the sampling loop would never end.
        """
        if num_obstacles is None:
            num_obstacles = NUM_OBSTACLES
        size = self.grid_size
        reserved = {(0, 0)} | {tuple(goal) for goal in self.goals}
        # Only reserved cells that actually lie on the grid reduce capacity.
        reserved_on_grid = sum(
            1 for (cx, cy) in reserved if 0 <= cx < size and 0 <= cy < size
        )
        free_cells = size * size - reserved_on_grid
        if num_obstacles > free_cells:
            raise ValueError(
                f"cannot place {num_obstacles} obstacles: "
                f"only {free_cells} free cells available"
            )

        obstacles = set()
        while len(obstacles) < num_obstacles:
            cell = (random.randint(0, size - 1), random.randint(0, size - 1))
            if cell not in reserved:
                obstacles.add(cell)
        return list(obstacles)

    def reset(self):
        """Move both agents back to (0, 0) and return their states.

        Obstacles are left untouched.
        """
        self.agents = [(0, 0), (0, 0)]
        return self.get_states()

    def get_states(self):
        """Return one ``[agent_x, agent_y, goal_x, goal_y]`` list per agent."""
        return [
            [pos[0], pos[1], self.goals[idx][0], self.goals[idx][1]]
            for idx, pos in enumerate(self.agents)
        ]

    def step(self, actions):
        """Advance both agents by one move.

        Args:
            actions: One entry per agent, encoded as
                0: up, 1: down, 2: left, 3: right.
                Any other value leaves that agent in place.

        Returns:
            ``(next_states, reward, done)``:
              - next_states: per-agent states as produced by get_states(),
              - reward: one shared scalar; ``proximity_penalty`` if the agents
                are closer than 2 cells (Euclidean), plus 10 for every agent
                standing on its own goal,
              - done: True only when both agents are on their goals.
        """
        new_positions = []
        for idx, (x, y) in enumerate(self.agents):
            action = actions[idx]
            dx, dy = 0, 0
            if action == 0:  # up
                dy = -1
            elif action == 1:  # down
                dy = 1
            elif action == 2:  # left
                dx = -1
            elif action == 3:  # right
                dx = 1
            target = (x + dx, y + dy)
            in_bounds = (0 <= target[0] < self.grid_size
                         and 0 <= target[1] < self.grid_size)
            # Moves off the grid or into an obstacle are cancelled.
            if not in_bounds or target in self.obstacles:
                target = (x, y)
            new_positions.append(target)
        self.agents = new_positions

        reward = 0
        first, second = self.agents[0], self.agents[1]
        if math.hypot(first[0] - second[0], first[1] - second[1]) < 2:
            reward += self.proximity_penalty
        for idx, pos in enumerate(self.agents):
            if pos == self.goals[idx]:
                reward += 10

        # Episode ends only once both agents sit on their own goals.
        done = (self.agents[0] == self.goals[0]
                and self.agents[1] == self.goals[1])

        return self.get_states(), reward, done

    def render(self, screen):
        """Draw grid, obstacles, goals and agents on ``screen`` and flip it."""
        cell = self.cell_size
        # Derive the drawing extent from this instance rather than the global
        # WINDOW_SIZE so the picture always matches grid_size/cell_size.
        extent = self.grid_size * cell

        screen.fill((255, 255, 255))
        # Grid lines
        for offset in range(0, extent, cell):
            pygame.draw.line(screen, (200, 200, 200), (offset, 0), (offset, extent))
            pygame.draw.line(screen, (200, 200, 200), (0, offset), (extent, offset))
        # Obstacles as black rectangles
        for ox, oy in self.obstacles:
            pygame.draw.rect(screen, (0, 0, 0),
                             pygame.Rect(ox * cell, oy * cell, cell, cell))
        # Goals as green rectangles
        for gx, gy in self.goals:
            pygame.draw.rect(screen, (0, 255, 0),
                             pygame.Rect(gx * cell, gy * cell, cell, cell))
        # Agents as circles (agent 0 in red, agent 1 in blue)
        colors = [(255, 0, 0), (0, 0, 255)]
        for idx, (ax, ay) in enumerate(self.agents):
            center = (ax * cell + cell // 2, ay * cell + cell // 2)
            pygame.draw.circle(screen, colors[idx], center, cell // 3)
        pygame.display.flip()

# ----------------------------
# Replay Buffer for DQN
# ----------------------------
class ReplayBuffer:
    """Ring buffer of experience tuples with a fixed maximum size.

    Once ``capacity`` entries are stored, each new entry replaces the one at
    ``position``, which advances cyclically.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Record a single ``(state, action, reward, next_state, done)`` entry."""
        entry = (state, action, reward, next_state, done)
        is_full = len(self.buffer) >= self.capacity
        if is_full:
            # Overwrite in place at the current write index.
            self.buffer[self.position] = entry
        else:
            self.buffer.append(entry)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` stored entries at random, without replacement."""
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        """Return how many entries are currently held."""
        stored = len(self.buffer)
        return stored

# ----------------------------
# DQN Network and Agent
# ----------------------------
class DQN(nn.Module):
    """Small multilayer perceptron used as the Q-function approximator.

    Architecture: Linear(input_dim, 32) -> ReLU -> Linear(32, 32) -> ReLU
    -> Linear(32, output_dim).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        widths = (input_dim, 32, 32, output_dim)
        modules = []
        last = len(widths) - 2
        for i in range(len(widths) - 1):
            modules.append(nn.Linear(widths[i], widths[i + 1]))
            # Every linear layer except the output head is followed by ReLU.
            if i != last:
                modules.append(nn.ReLU())
        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Apply the network to ``x``."""
        return self.net(x)

class DQNAgent:
    """DQN learner with a policy network, a target network and a replay buffer.

    Args:
        input_dim: Size of the state vector fed to the networks.
        output_dim: Number of discrete actions (size of the network output).
        lr: Learning rate for the Adam optimizer.
        gamma: Discount factor used in the bootstrap target.
        buffer_capacity: Maximum number of transitions kept for replay.
        batch_size: Number of transitions sampled per update.
    """

    def __init__(self, input_dim, output_dim, lr=1e-3, gamma=0.99,
                 buffer_capacity=10000, batch_size=32):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Remembered so random exploration covers exactly the network's actions.
        self.action_dim = output_dim
        self.policy_net = DQN(input_dim, output_dim).to(self.device)
        self.target_net = DQN(input_dim, output_dim).to(self.device)
        # The target network starts as a copy and is only refreshed through
        # update_target(); it is never trained directly.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.gamma = gamma
        self.replay_buffer = ReplayBuffer(buffer_capacity)
        self.batch_size = batch_size

    def select_action(self, state, epsilon):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index in
        ``range(output_dim)`` is returned; otherwise the index of the largest
        policy-network output for ``state``.
        """
        if random.random() < epsilon:
            return random.randrange(self.action_dim)
        state_tensor = torch.tensor(
            state, dtype=torch.float32, device=self.device
        ).unsqueeze(0)
        with torch.no_grad():
            q_values = self.policy_net(state_tensor)
        return q_values.argmax().item()

    def update(self):
        """Run one gradient step on a sampled minibatch.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return
        transitions = self.replay_buffer.sample(self.batch_size)
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(*transitions)

        batch_state = torch.tensor(batch_state, dtype=torch.float32, device=self.device)
        batch_action = torch.tensor(
            batch_action, dtype=torch.int64, device=self.device
        ).unsqueeze(1)
        batch_reward = torch.tensor(
            batch_reward, dtype=torch.float32, device=self.device
        ).unsqueeze(1)
        batch_next_state = torch.tensor(
            batch_next_state, dtype=torch.float32, device=self.device
        )
        batch_done = torch.tensor(
            batch_done, dtype=torch.float32, device=self.device
        ).unsqueeze(1)

        current_q = self.policy_net(batch_state).gather(1, batch_action)
        # The bootstrap target is a constant for this step: computing it
        # without autograd avoids building a graph through target_net and
        # accumulating gradients on parameters that are never optimized.
        with torch.no_grad():
            next_q = self.target_net(batch_next_state).max(1)[0].unsqueeze(1)
            target_q = batch_reward + self.gamma * next_q * (1 - batch_done)

        loss = nn.MSELoss()(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

# ----------------------------
# Training Loop with Milestone Rendering
# ----------------------------
def train():
    """Train two independent DQN agents, rendering milestone episodes.

    Runs 500 episodes of at most 50 steps. Both agents act epsilon-greedily,
    store the shared reward in their own replay buffers, and learn after every
    step. Episodes at 25%, 50%, 75% and 100% of training are shown in a pygame
    window; closing that window stops training immediately. Epsilon decays per
    episode and the target networks are refreshed every 10 episodes.
    """
    # Hyperparameters
    num_episodes, max_steps = 500, 50
    epsilon, epsilon_min, epsilon_decay = 1.0, 0.1, 0.995
    target_update_freq = 10
    # 1-based episode numbers that get visualized.
    milestones = {int(num_episodes * frac) for frac in (0.25, 0.5, 0.75)} | {num_episodes}

    env = Gridworld()

    # State per agent is [agent_x, agent_y, goal_x, goal_y]; actions are
    # up, down, left, right.
    state_dim = 4
    action_dim = 4
    agents = [DQNAgent(state_dim, action_dim) for _ in range(2)]

    for episode in range(num_episodes):
        episode_number = episode + 1  # loop index is 0-based
        states = env.reset()
        total_reward = 0

        do_render = episode_number in milestones
        if do_render:
            pygame.init()
            screen = pygame.display.set_mode((WINDOW_SIZE, WINDOW_SIZE))
            pygame.display.set_caption(f"Training Episode {episode + 1}")

        for _ in range(max_steps):
            # Every agent chooses from its own policy network.
            actions = [
                learner.select_action(states[i], epsilon)
                for i, learner in enumerate(agents)
            ]
            next_states, reward, done = env.step(actions)
            total_reward += reward

            # The single environment reward is credited to both agents.
            for i, learner in enumerate(agents):
                learner.replay_buffer.push(
                    states[i], actions[i], reward, next_states[i], done
                )

            states = next_states

            for learner in agents:
                learner.update()

            if do_render:
                env.render(screen)
                # A closed window aborts the whole training run.
                for event in pygame.event.get():
                    if event.type == pygame.QUIT:
                        pygame.quit()
                        return
                pygame.time.wait(200)

            if done:
                break

        if do_render:
            # Hold the final frame of the milestone episode for a second.
            pygame.time.wait(1000)
            pygame.quit()

        # Periodically sync each target network with its policy network.
        if episode_number % target_update_freq == 0:
            for learner in agents:
                learner.update_target()

        # Anneal exploration, never dropping below epsilon_min.
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        print(f"Episode {episode + 1}, Total Reward: {total_reward:.2f}, Epsilon: {epsilon:.3f}")

# Start training only when this cell/file is run as a script.
if __name__ == '__main__':
    train()


pygame 2.5.2 (SDL 2.28.3, Python 3.11.8)
Hello from the pygame community. https://www.pygame.org/contribute.html
Episode 1, Total Reward: -100.00, Epsilon: 0.995
Episode 2, Total Reward: -110.00, Epsilon: 0.990
Episode 3, Total Reward: -70.00, Epsilon: 0.985
Episode 4, Total Reward: -80.00, Epsilon: 0.980
Episode 5, Total Reward: -220.00, Epsilon: 0.975
Episode 6, Total Reward: -90.00, Epsilon: 0.970
Episode 7, Total Reward: -30.00, Epsilon: 0.966
Episode 8, Total Reward: -70.00, Epsilon: 0.961
Episode 9, Total Reward: -150.00, Epsilon: 0.956
Episode 10, Total Reward: -100.00, Epsilon: 0.951
Episode 11, Total Reward: -140.00, Epsilon: 0.946
Episode 12, Total Reward: -150.00, Epsilon: 0.942
Episode 13, Total Reward: -150.00, Epsilon: 0.937
Episode 14, Total Reward: -120.00, Epsilon: 0.932
Episode 15, Total Reward: -150.00, Epsilon: 0.928
Episode 16, Total Reward: -280.00, Epsilon: 0.923
Episode 17, Total Reward: -130.00, Epsilon: 0.918
Episode 18, Total Reward: -40.00, Epsilon: 0.914
Ep

KeyboardInterrupt: 

In [2]:
import random
import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pygame

# Constants for gridworld and visualization
GRID_SIZE = 10  # number of cells along each side of the square grid
CELL_SIZE = 50   # pixels per cell
WINDOW_SIZE = GRID_SIZE * CELL_SIZE  # side length of the square window, in pixels
NUM_OBSTACLES = 10  # how many obstacle cells Gridworld.generate_obstacles places

# ----------------------------
# Environment (Gridworld)
# ----------------------------
class Gridworld:
    """Two-agent grid environment with fixed, randomly placed obstacles.

    Both agents start at (0, 0). Agent 0 is sent to the bottom-right corner
    and agent 1 to the bottom-left corner. ``step`` returns a single scalar
    reward that covers both agents.
    """

    def __init__(self):
        self.grid_size = GRID_SIZE
        self.cell_size = CELL_SIZE
        # Sensing range in cells; stored here but not read by any method of this class.
        self.vision_radius = 5
        # One goal per agent, indexed like self.agents.
        last = GRID_SIZE - 1
        self.goals = [(last, last), (0, last)]
        self.obstacles = self.generate_obstacles()
        self.agents = [(0, 0)] * 2
        # Reward terms used by step().
        self.movement_penalty = -1       # charged per agent on every step
        self.closeness_penalty = -10     # charged when agents are less than 2 cells apart
        self.collision_penalty = -50     # charged when an agent tries to enter an obstacle

    def generate_obstacles(self):
        """Return a list of NUM_OBSTACLES distinct cells, none on the start or a goal."""
        reserved = {(0, 0), *self.goals}
        upper = self.grid_size - 1
        cells = set()
        while len(cells) < NUM_OBSTACLES:
            # x is drawn before y so the random stream is consumed in a fixed order.
            candidate = (random.randint(0, upper), random.randint(0, upper))
            if candidate not in reserved:
                cells.add(candidate)
        return list(cells)

    def reset(self):
        """Put both agents back on (0, 0) and return their states; obstacles are kept."""
        self.agents = [(0, 0)] * 2
        return self.get_states()

    def get_states(self):
        """Return one ``[agent_x, agent_y, goal_x, goal_y]`` list per agent."""
        return [
            [x, y, gx, gy]
            for (x, y), (gx, gy) in zip(self.agents, self.goals)
        ]

    @staticmethod
    def _action_delta(action):
        """Map an action code to (dx, dy); unknown codes mean "stay"."""
        if action == 0:  # up
            return 0, -1
        if action == 1:  # down
            return 0, 1
        if action == 2:  # left
            return -1, 0
        if action == 3:  # right
            return 1, 0
        return 0, 0

    def step(self, actions):
        """
        Advance the environment by one joint move.

        ``actions`` holds one code per agent (0: up, 1: down, 2: left, 3: right).

        Returns a ``(next_states, reward, done)`` tuple where
          - next_states is the per-agent state list from ``get_states``,
          - reward is a scalar summing:
                * the movement penalty for every agent,
                * a collision penalty for each attempted move into an obstacle,
                * a closeness penalty when the agents' Manhattan distance is < 2,
                * +10 for each agent standing on its goal,
          - done is True only when both agents are on their goals.
        """
        # The movement penalty is charged whether or not the move succeeds.
        reward = 0
        reward += self.movement_penalty * len(self.agents)

        size = self.grid_size
        moved = []
        for agent_idx, (x, y) in enumerate(self.agents):
            dx, dy = self._action_delta(actions[agent_idx])
            target = (x + dx, y + dy)
            # Leaving the grid is not allowed: the agent keeps its cell.
            if not (0 <= target[0] < size and 0 <= target[1] < size):
                target = (x, y)
            # Bumping into an obstacle costs a penalty and also cancels the move.
            if target in self.obstacles:
                reward += self.collision_penalty
                target = (x, y)
            moved.append(target)
        self.agents = moved

        first, second = self.agents[0], self.agents[1]
        separation = abs(first[0] - second[0]) + abs(first[1] - second[1])
        if separation < 2:
            reward += self.closeness_penalty

        at_goal = [pos == self.goals[i] for i, pos in enumerate(self.agents)]
        for reached in at_goal:
            if reached:
                reward += 10  # goal reward

        done = at_goal[0] and at_goal[1]
        return self.get_states(), reward, done

    def render(self, screen):
        """Draw grid lines, obstacles, goals and agents on ``screen`` and flip the display."""
        size = self.cell_size
        line_color = (200, 200, 200)

        def cell_rect(cell):
            # Pixel rectangle covering one grid cell.
            return pygame.Rect(cell[0]*size, cell[1]*size, size, size)

        screen.fill((255, 255, 255))
        # Vertical lines first, then horizontal ones.
        for px in range(0, WINDOW_SIZE, size):
            pygame.draw.line(screen, line_color, (px, 0), (px, WINDOW_SIZE))
        for py in range(0, WINDOW_SIZE, size):
            pygame.draw.line(screen, line_color, (0, py), (WINDOW_SIZE, py))
        # Obstacles are black, goals are green.
        for cell in self.obstacles:
            pygame.draw.rect(screen, (0, 0, 0), cell_rect(cell))
        for cell in self.goals:
            pygame.draw.rect(screen, (0, 255, 0), cell_rect(cell))
        # Agent 0 is the red circle, agent 1 the blue one.
        palette = [(255, 0, 0), (0, 0, 255)]
        half = size//2
        for idx, (col, row) in enumerate(self.agents):
            middle = (col*size + half, row*size + half)
            pygame.draw.circle(screen, palette[idx], middle, size//3)
        pygame.display.flip()

# ----------------------------
# Replay Buffer for DQN
# ----------------------------
class ReplayBuffer:
    """Fixed-capacity transition store that overwrites its oldest entry when full."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # index the next pushed transition is written to

    def push(self, state, action, reward, next_state, done):
        """Store one transition tuple, recycling slots once capacity is reached."""
        transition = (state, action, reward, next_state, done)
        still_growing = len(self.buffer) < self.capacity
        if still_growing:
            # Reserve a slot; it is filled by the assignment below.
            self.buffer.append(None)
        write_at = self.position
        self.buffer[write_at] = transition
        self.position = (write_at + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

# ----------------------------
# DQN Network and Agent
# ----------------------------
class DQN(nn.Module):
    """Fully connected Q-network: input -> 32 -> 32 -> one output per action."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden = 32
        # Layers are created in forward order and wrapped as ``self.net``.
        stack = [
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, output_dim),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        q_values = self.net(x)
        return q_values

class DQNAgent:
    """DQN learner holding a policy network, a target network and a replay buffer.

    Args:
        input_dim: Length of the state vector fed to both networks.
        output_dim: Number of discrete actions (one network output per action).
        lr: Learning rate of the Adam optimizer on the policy network.
        gamma: Discount factor applied to the bootstrapped next-state value.
        buffer_capacity: Maximum number of transitions kept for replay.
        batch_size: Number of transitions sampled by each ``update`` call.
    """

    def __init__(self, input_dim, output_dim, lr=1e-3, gamma=0.99,
                 buffer_capacity=10000, batch_size=32):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Remembered so random exploration covers exactly the network's action set.
        self.output_dim = output_dim
        self.policy_net = DQN(input_dim, output_dim).to(self.device)
        self.target_net = DQN(input_dim, output_dim).to(self.device)
        # The target network starts as a copy of the policy network and is only
        # ever changed through update_target().
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.gamma = gamma
        self.replay_buffer = ReplayBuffer(buffer_capacity)
        self.batch_size = batch_size

    def select_action(self, state, epsilon):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index in
        ``range(output_dim)`` is returned; otherwise the index of the largest
        policy-network output for ``state``.
        """
        if random.random() < epsilon:
            return random.randrange(self.output_dim)
        state_tensor = torch.tensor(
            state, dtype=torch.float32, device=self.device
        ).unsqueeze(0)
        with torch.no_grad():
            q_values = self.policy_net(state_tensor)
        return q_values.argmax().item()

    def update(self):
        """Run one gradient step on a random replay batch.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return
        transitions = self.replay_buffer.sample(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*transitions)

        states = torch.tensor(states, dtype=torch.float32, device=self.device)
        actions = torch.tensor(actions, dtype=torch.int64, device=self.device).unsqueeze(1)
        rewards = torch.tensor(rewards, dtype=torch.float32, device=self.device).unsqueeze(1)
        next_states = torch.tensor(next_states, dtype=torch.float32, device=self.device)
        dones = torch.tensor(dones, dtype=torch.float32, device=self.device).unsqueeze(1)

        # Q-value of the action that was actually taken in each sampled state.
        current_q = self.policy_net(states).gather(1, actions)
        # The target is a constant for this step: build it without autograd so
        # backward() does not accumulate gradients on the target network.
        with torch.no_grad():
            next_q = self.target_net(next_states).max(1)[0].unsqueeze(1)
            # (1 - done) removes the bootstrap term for terminal transitions.
            target_q = rewards + self.gamma * next_q * (1 - dones)

        loss = nn.MSELoss()(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

# ----------------------------
# Training Loop with Milestone Rendering
# ----------------------------
def train(num_episodes=500, max_steps=50, epsilon=1.0, epsilon_min=0.1,
          epsilon_decay=0.995, target_update_freq=10):
    """Train two independent DQN agents on a freshly generated Gridworld.

    Episodes at 25%, 50%, 75% and 100% of ``num_episodes`` are rendered in a
    pygame window; closing that window stops training.

    Args:
        num_episodes: Number of training episodes.
        max_steps: Step limit per episode.
        epsilon: Initial exploration rate.
        epsilon_min: Lower bound for the exploration rate.
        epsilon_decay: Factor applied to epsilon after every episode.
        target_update_freq: Target networks are synced every this many episodes.
    """
    # Define milestone episodes (25%, 50%, 75%, and 100% of training)
    milestones = {int(num_episodes * 0.25), int(num_episodes * 0.5),
                  int(num_episodes * 0.75), num_episodes}

    env = Gridworld()

    # Each agent's state has 4 elements: [agent_x, agent_y, goal_x, goal_y]
    state_dim = 4
    action_dim = 4  # up, down, left, right
    agents = [DQNAgent(state_dim, action_dim), DQNAgent(state_dim, action_dim)]

    for episode in range(num_episodes):
        states = env.reset()
        total_reward = 0

        # Milestones are 1-based episode numbers; the loop index is 0-based.
        do_render = (episode + 1) in milestones
        try:
            if do_render:
                pygame.init()
                screen = pygame.display.set_mode((WINDOW_SIZE, WINDOW_SIZE))
                pygame.display.set_caption(f"Training Episode {episode + 1}")

            for _ in range(max_steps):
                # Each agent selects an action using its own DQN policy
                actions = [
                    agent.select_action(state, epsilon)
                    for agent, state in zip(agents, states)
                ]
                next_states, reward, done = env.step(actions)
                total_reward += reward

                # Every agent stores its own view of the transition together
                # with the shared scalar reward.
                for i, agent in enumerate(agents):
                    agent.replay_buffer.push(states[i], actions[i], reward, next_states[i], done)

                states = next_states

                for agent in agents:
                    agent.update()

                if do_render:
                    env.render(screen)
                    # Closing the window aborts training; the finally clause
                    # below shuts pygame down.
                    for event in pygame.event.get():
                        if event.type == pygame.QUIT:
                            return
                    pygame.time.wait(200)

                if done:
                    break

            if do_render:
                # Pause to allow viewing the final state of the milestone episode
                pygame.time.wait(1000)
        finally:
            # Also runs on exceptions and KeyboardInterrupt, so an interrupted
            # milestone episode does not leave the window open.
            if do_render:
                pygame.quit()

        # Update target networks periodically
        if (episode + 1) % target_update_freq == 0:
            for agent in agents:
                agent.update_target()

        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        print(f"Episode {episode + 1}, Total Reward: {total_reward:.2f}, Epsilon: {epsilon:.3f}")

# Start training only when this code is executed directly rather than imported.
if __name__ == '__main__':
    train()


Episode 1, Total Reward: -800.00, Epsilon: 0.995
Episode 2, Total Reward: -420.00, Epsilon: 0.990
Episode 3, Total Reward: -770.00, Epsilon: 0.985
Episode 4, Total Reward: -550.00, Epsilon: 0.980
Episode 5, Total Reward: -690.00, Epsilon: 0.975
Episode 6, Total Reward: -630.00, Epsilon: 0.970
Episode 7, Total Reward: -690.00, Epsilon: 0.966
Episode 8, Total Reward: -240.00, Epsilon: 0.961
Episode 9, Total Reward: -390.00, Epsilon: 0.956
Episode 10, Total Reward: -620.00, Epsilon: 0.951
Episode 11, Total Reward: -800.00, Epsilon: 0.946
Episode 12, Total Reward: -340.00, Epsilon: 0.942
Episode 13, Total Reward: -660.00, Epsilon: 0.937
Episode 14, Total Reward: -210.00, Epsilon: 0.932
Episode 15, Total Reward: -580.00, Epsilon: 0.928
Episode 16, Total Reward: -530.00, Epsilon: 0.923
Episode 17, Total Reward: -610.00, Epsilon: 0.918
Episode 18, Total Reward: -770.00, Epsilon: 0.914
Episode 19, Total Reward: -380.00, Epsilon: 0.909
Episode 20, Total Reward: -320.00, Epsilon: 0.905
Episode 2

In [6]:
import random
import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pygame

# Constants for gridworld and visualization
GRID_SIZE = 10   # cells per side of the square grid
CELL_SIZE = 50   # pixels per cell
WINDOW_SIZE = GRID_SIZE * CELL_SIZE  # edge length of the render window, in pixels
NUM_OBSTACLES = 10  # number of distinct obstacle cells generated for an environment

# ----------------------------
# Environment (Gridworld)
# ----------------------------
class Gridworld:
    """Two-agent grid environment with obstacle sensing and shaped rewards.

    Both agents start at (0, 0). Agent 0 is sent to the bottom-right corner
    and agent 1 to the bottom-left corner. Each agent's state is its position,
    its goal and a flattened square window of obstacle flags around it.
    ``step`` returns a single scalar reward that covers both agents.
    """

    def __init__(self):
        self.grid_size = GRID_SIZE
        self.cell_size = CELL_SIZE
        # Half-width, in cells, of the square obstacle window built by get_obstacle_view.
        self.vision_radius = 5
        # One goal per agent, indexed like self.agents.
        last = GRID_SIZE - 1
        self.goals = [(last, last), (0, last)]
        self.obstacles = self.generate_obstacles()
        self.agents = [(0, 0)] * 2
        # Reward terms used by step().
        self.movement_penalty = -1       # charged per agent on every step
        self.closeness_penalty = -10     # charged when agents are less than 2 cells apart
        self.collision_penalty = -50     # charged when an agent tries to enter an obstacle
        self.goal_reward = 50            # granted for each agent standing on its goal
        self.shaping_factor = 0.5        # reward per cell of Manhattan progress toward the goal

    def generate_obstacles(self):
        """Return a list of NUM_OBSTACLES distinct cells, none on the start or a goal."""
        reserved = {(0, 0), *self.goals}
        upper = self.grid_size - 1
        cells = set()
        while len(cells) < NUM_OBSTACLES:
            # x is drawn before y so the random stream is consumed in a fixed order.
            candidate = (random.randint(0, upper), random.randint(0, upper))
            if candidate not in reserved:
                cells.add(candidate)
        return list(cells)

    def get_obstacle_view(self, agent_position):
        """Return row-major obstacle flags for the window centred on ``agent_position``.

        The window spans ``vision_radius`` cells in every direction. A flag is
        1.0 for an obstacle cell and 0.0 otherwise; cells outside the grid are
        also reported as 0.0.
        """
        ax, ay = agent_position
        size = self.grid_size
        offsets = range(-self.vision_radius, self.vision_radius + 1)

        def flag(cx, cy):
            on_grid = 0 <= cx < size and 0 <= cy < size
            return 1.0 if on_grid and (cx, cy) in self.obstacles else 0.0

        # Outer loop walks rows (dy), inner loop walks columns (dx).
        return [flag(ax + dx, ay + dy) for dy in offsets for dx in offsets]

    def reset(self):
        """Put both agents back on (0, 0) and return their states; obstacles are kept."""
        self.agents = [(0, 0)] * 2
        return self.get_states()

    def get_states(self):
        """Return, per agent, ``[x, y, goal_x, goal_y]`` followed by its obstacle view."""
        return [
            [pos[0], pos[1], goal[0], goal[1]] + self.get_obstacle_view(pos)
            for pos, goal in zip(self.agents, self.goals)
        ]

    @staticmethod
    def _action_delta(action):
        """Map an action code to (dx, dy); unknown codes mean "stay"."""
        if action == 0:  # up
            return 0, -1
        if action == 1:  # down
            return 0, 1
        if action == 2:  # left
            return -1, 0
        if action == 3:  # right
            return 1, 0
        return 0, 0

    @staticmethod
    def _manhattan(a, b):
        """Manhattan distance between two (x, y) cells."""
        return abs(a[0] - b[0]) + abs(a[1] - b[1])

    def step(self, actions):
        """
        Advance the environment by one joint move.

        ``actions`` holds one code per agent (0: up, 1: down, 2: left, 3: right).

        Returns a ``(next_states, reward, done)`` tuple where
          - next_states is the per-agent state list from ``get_states``,
          - reward is a scalar summing:
                * the movement penalty for every agent,
                * a collision penalty for each attempted move into an obstacle,
                * a shaping term proportional to each agent's progress toward its goal,
                * a closeness penalty when the agents' Manhattan distance is < 2,
                * the goal reward for each agent standing on its goal,
          - done is True only when both agents are on their goals.
        """
        # The movement penalty is charged whether or not the move succeeds.
        reward = 0
        reward += self.movement_penalty * len(self.agents)

        # Distances before moving, needed for the shaping term afterwards.
        before = [
            self._manhattan(pos, self.goals[i])
            for i, pos in enumerate(self.agents)
        ]

        size = self.grid_size
        moved = []
        for agent_idx, (x, y) in enumerate(self.agents):
            dx, dy = self._action_delta(actions[agent_idx])
            target = (x + dx, y + dy)
            # Leaving the grid is not allowed: the agent keeps its cell.
            if not (0 <= target[0] < size and 0 <= target[1] < size):
                target = (x, y)
            # Bumping into an obstacle costs a penalty and also cancels the move.
            if target in self.obstacles:
                reward += self.collision_penalty
                target = (x, y)
            moved.append(target)
        self.agents = moved

        # Positive when an agent got closer to its goal, negative when it moved away.
        for i, pos in enumerate(self.agents):
            after = self._manhattan(pos, self.goals[i])
            reward += self.shaping_factor * (before[i] - after)

        if self._manhattan(self.agents[0], self.agents[1]) < 2:
            reward += self.closeness_penalty

        at_goal = [pos == self.goals[i] for i, pos in enumerate(self.agents)]
        for reached in at_goal:
            if reached:
                reward += self.goal_reward

        done = at_goal[0] and at_goal[1]
        return self.get_states(), reward, done

    def render(self, screen):
        """Draw grid lines, obstacles, goals and agents on ``screen`` and flip the display."""
        size = self.cell_size
        line_color = (200, 200, 200)

        def cell_rect(cell):
            # Pixel rectangle covering one grid cell.
            return pygame.Rect(cell[0]*size, cell[1]*size, size, size)

        screen.fill((255, 255, 255))
        # Vertical lines first, then horizontal ones.
        for px in range(0, WINDOW_SIZE, size):
            pygame.draw.line(screen, line_color, (px, 0), (px, WINDOW_SIZE))
        for py in range(0, WINDOW_SIZE, size):
            pygame.draw.line(screen, line_color, (0, py), (WINDOW_SIZE, py))
        # Obstacles are black, goals are green.
        for cell in self.obstacles:
            pygame.draw.rect(screen, (0, 0, 0), cell_rect(cell))
        for cell in self.goals:
            pygame.draw.rect(screen, (0, 255, 0), cell_rect(cell))
        # Agent 0 is the red circle, agent 1 the blue one.
        palette = [(255, 0, 0), (0, 0, 255)]
        half = size//2
        for idx, (col, row) in enumerate(self.agents):
            middle = (col*size + half, row*size + half)
            pygame.draw.circle(screen, palette[idx], middle, size//3)
        pygame.display.flip()

# ----------------------------
# Replay Buffer for DQN
# ----------------------------
class ReplayBuffer:
    """Fixed-capacity transition store that overwrites its oldest entry when full."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # index the next pushed transition is written to

    def push(self, state, action, reward, next_state, done):
        """Store one transition tuple, recycling slots once capacity is reached."""
        transition = (state, action, reward, next_state, done)
        still_growing = len(self.buffer) < self.capacity
        if still_growing:
            # Reserve a slot; it is filled by the assignment below.
            self.buffer.append(None)
        write_at = self.position
        self.buffer[write_at] = transition
        self.position = (write_at + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

# ----------------------------
# DQN Network and Agent
# ----------------------------
class DQN(nn.Module):
    """Fully connected Q-network: input -> 32 -> 32 -> one output per action."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden = 32
        # Layers are created in forward order and wrapped as ``self.net``.
        stack = [
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, output_dim),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        q_values = self.net(x)
        return q_values

class DQNAgent:
    """DQN learner holding a policy network, a target network and a replay buffer.

    Args:
        input_dim: Length of the state vector fed to both networks.
        output_dim: Number of discrete actions (one network output per action).
        lr: Learning rate of the Adam optimizer on the policy network.
        gamma: Discount factor applied to the bootstrapped next-state value.
        buffer_capacity: Maximum number of transitions kept for replay.
        batch_size: Number of transitions sampled by each ``update`` call.
    """

    def __init__(self, input_dim, output_dim, lr=1e-3, gamma=0.99,
                 buffer_capacity=10000, batch_size=32):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Remembered so random exploration covers exactly the network's action set.
        self.output_dim = output_dim
        self.policy_net = DQN(input_dim, output_dim).to(self.device)
        self.target_net = DQN(input_dim, output_dim).to(self.device)
        # The target network starts as a copy of the policy network and is only
        # ever changed through update_target().
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.gamma = gamma
        self.replay_buffer = ReplayBuffer(buffer_capacity)
        self.batch_size = batch_size

    def select_action(self, state, epsilon):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index in
        ``range(output_dim)`` is returned; otherwise the index of the largest
        policy-network output for ``state``.
        """
        if random.random() < epsilon:
            return random.randrange(self.output_dim)
        state_tensor = torch.tensor(
            state, dtype=torch.float32, device=self.device
        ).unsqueeze(0)
        with torch.no_grad():
            q_values = self.policy_net(state_tensor)
        return q_values.argmax().item()

    def update(self):
        """Run one gradient step on a random replay batch.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return
        transitions = self.replay_buffer.sample(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*transitions)

        states = torch.tensor(states, dtype=torch.float32, device=self.device)
        actions = torch.tensor(actions, dtype=torch.int64, device=self.device).unsqueeze(1)
        rewards = torch.tensor(rewards, dtype=torch.float32, device=self.device).unsqueeze(1)
        next_states = torch.tensor(next_states, dtype=torch.float32, device=self.device)
        dones = torch.tensor(dones, dtype=torch.float32, device=self.device).unsqueeze(1)

        # Q-value of the action that was actually taken in each sampled state.
        current_q = self.policy_net(states).gather(1, actions)
        # The target is a constant for this step: build it without autograd so
        # backward() does not accumulate gradients on the target network.
        with torch.no_grad():
            next_q = self.target_net(next_states).max(1)[0].unsqueeze(1)
            # (1 - done) removes the bootstrap term for terminal transitions.
            target_q = rewards + self.gamma * next_q * (1 - dones)

        loss = nn.MSELoss()(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

# ----------------------------
# Training Loop with Milestone Rendering
# ----------------------------
def train(num_episodes=1000, max_steps=200, epsilon=1.0, epsilon_min=0.05,
          epsilon_decay=0.9967, target_update_freq=10):
    """Train two independent DQN agents on a freshly generated Gridworld.

    Episodes at 25%, 50%, 75% and 100% of ``num_episodes`` are rendered in a
    pygame window; closing that window stops training.

    Args:
        num_episodes: Number of training episodes.
        max_steps: Step limit per episode.
        epsilon: Initial exploration rate.
        epsilon_min: Lower bound for the exploration rate.
        epsilon_decay: Factor applied to epsilon after every episode.
        target_update_freq: Target networks are synced every this many episodes.
    """
    # Define milestone episodes (25%, 50%, 75%, and 100% of training)
    milestones = {int(num_episodes * 0.25), int(num_episodes * 0.5),
                  int(num_episodes * 0.75), num_episodes}

    env = Gridworld()
    # Each agent's state dimension: 4 + (2*vision_radius+1)^2.
    state_dim = 4 + (2 * env.vision_radius + 1) ** 2
    action_dim = 4  # up, down, left, right
    agents = [DQNAgent(state_dim, action_dim), DQNAgent(state_dim, action_dim)]

    for episode in range(num_episodes):
        states = env.reset()
        total_reward = 0

        # Milestones are 1-based episode numbers; the loop index is 0-based.
        do_render = (episode + 1) in milestones
        try:
            if do_render:
                pygame.init()
                screen = pygame.display.set_mode((WINDOW_SIZE, WINDOW_SIZE))
                pygame.display.set_caption(f"Training Episode {episode + 1}")

            for _ in range(max_steps):
                # Each agent selects an action using its own DQN policy
                actions = [
                    agent.select_action(state, epsilon)
                    for agent, state in zip(agents, states)
                ]
                next_states, reward, done = env.step(actions)
                total_reward += reward

                # Every agent stores its own view of the transition together
                # with the shared scalar reward.
                for i, agent in enumerate(agents):
                    agent.replay_buffer.push(states[i], actions[i], reward, next_states[i], done)

                states = next_states

                for agent in agents:
                    agent.update()

                if do_render:
                    env.render(screen)
                    # Closing the window aborts training; the finally clause
                    # below shuts pygame down.
                    for event in pygame.event.get():
                        if event.type == pygame.QUIT:
                            return
                    pygame.time.wait(200)

                if done:
                    break

            if do_render:
                # Pause to allow viewing the final state of the milestone episode
                pygame.time.wait(1000)
        finally:
            # Also runs on exceptions and KeyboardInterrupt, so an interrupted
            # milestone episode does not leave the window open.
            if do_render:
                pygame.quit()

        # Update target networks periodically
        if (episode + 1) % target_update_freq == 0:
            for agent in agents:
                agent.update_target()

        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        print(f"Episode {episode + 1}, Total Reward: {total_reward:.2f}, Epsilon: {epsilon:.3f}")

# Start training only when this code is executed directly rather than imported.
if __name__ == '__main__':
    train()


Episode 1, Total Reward: -2706.50, Epsilon: 1.000
Episode 2, Total Reward: -2118.00, Epsilon: 0.999
Episode 3, Total Reward: -1733.00, Epsilon: 0.999
Episode 4, Total Reward: -1460.00, Epsilon: 0.998
Episode 5, Total Reward: -1259.00, Epsilon: 0.998
Episode 6, Total Reward: -1526.50, Epsilon: 0.997
Episode 7, Total Reward: -1250.00, Epsilon: 0.997
Episode 8, Total Reward: -2258.50, Epsilon: 0.996
Episode 9, Total Reward: -1134.00, Epsilon: 0.996
Episode 10, Total Reward: -1380.00, Epsilon: 0.995
Episode 11, Total Reward: -1535.00, Epsilon: 0.995
Episode 12, Total Reward: -2342.00, Epsilon: 0.994
Episode 13, Total Reward: -2122.00, Epsilon: 0.994
Episode 14, Total Reward: -1156.00, Epsilon: 0.993
Episode 15, Total Reward: -2175.00, Epsilon: 0.993
Episode 16, Total Reward: -1365.00, Epsilon: 0.992
Episode 17, Total Reward: -3869.50, Epsilon: 0.992
Episode 18, Total Reward: -1144.50, Epsilon: 0.991
Episode 19, Total Reward: -1268.50, Epsilon: 0.991
Episode 20, Total Reward: -492.00, Epsil