### Imports

In [69]:
from gymnasium import spaces
from gym.spaces import Discrete, Box
import numpy as np
import pygame
from copy import copy
import numpy as np
from gymnasium.spaces import Discrete, MultiDiscrete
from pettingzoo import ParallelEnv

### Custom Environment

In [66]:
class MultiTrainEnv(ParallelEnv):
    metadata = {'render.modes': ['human']}

    def __init__(self, visualize=False):

        self.states = [] 
        self.states.append([0.0, 100.0, 0.0, 150.0])  
        self.states.append([0.0, 150.0, 0.0, 200.0])
        self.states.append([0.0, 200.0, 0.0, -1.0]) # -1 means no train in front
        
        # Actions: accelerate, decelerate, keep speed
        self.action_space = Discrete(3)
        # Observation space includes [speed, position, front_train_speed, front_train_position]
        self.observation_space = Box(
            low=np.array([0.0, 0.0, 0.0, 0.0]),
            high=np.array([1.0, 900.0, 1.0, 900.0])
        )

        self.target = np.array([900, 150])
        self.visualize = visualize
        if visualize:
            self.init_pygame()
    
    def reset(self, seed=None, options=None):
        self.states = []
        self.states.append([0.0, 100.0, 0.0, 150.0])  
        self.states.append([0.0, 150.0, 0.0, 200.0])
        self.states.append([0.0, 200.0, 0.0, -1.0])
        return self.states

    def init_pygame(self):
        pygame.init()
        self.screen = pygame.display.set_mode((1000, 300))
        self.clock = pygame.time.Clock()

    def step(self, actions, dones):
        rewards = []
        infos = []

        for idx, action in enumerate(actions):
            reward = 0  # Default reward
            if not dones[idx]:  # Process only if the agent is not done
                speed, position, front_train_speed, front_train_position = self.states[idx]

                # Action: 0 = accelerate, 1 = decelerate, 2 = maintain speed
                if action == 0:  # Accelerate
                    speed = min(1.0, speed + 0.005)
                elif action == 1:  # Decelerate
                    speed = max(0.0, speed - 0.005)

                # Update speed and position
                self.states[idx][0] = speed
                self.states[idx][1] += speed
                
                # Update previous trains' knowledge on the train in front 
                if idx > 0:  # Only update if there is a previous train
                    self.states[idx - 1][2] = speed
                    self.states[idx - 1][3] += speed

                # Reward logic
                reward = -0.01  # Small penalty for time
                if speed > 0.9:
                    reward += 0.1  # Reward for high speed

                if (front_train_position - position) < 30:
                    reward -= 100  # Collision penalty

                if (900 - position) < 5:  # Reached destination
                    reward += 100
                    self.states[idx][2] = -1  # No front train speed
                    self.states[idx][3] = -1 
                    self.states[idx][0] = 0.0  # Stop at destination
                    dones[idx] = True  # Mark this agent as done

            rewards.append(reward if not dones[idx] else 0)
            infos.append({})  

        return self.states, rewards, dones, infos


    def render(self, mode='human'):
        # Clear the screen
        self.screen.fill((0, 0, 0))

        # Draw the target
        pygame.draw.circle(self.screen, (255, 0, 0), (int(self.target[0]), int(self.target[1])), 10)

        # Draw the trains
        for train in self.states:
            train_position = (int(train[1]), 150)  # Assuming vertical position is fixed at 150
            pygame.draw.circle(self.screen, (0, 255, 0), train_position, 10)
    

        # Update the display
        pygame.display.flip()

        # Cap the frame rate
        self.clock.tick(60)

    def close(self):
        if self.is_pygame_initialized:
            print("Closing Pygame...")
            pygame.quit()
            self.is_pygame_initialized = False

### Model DeepQN

In [67]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

class DeepQNetwork(nn.Module):
    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        super(DeepQNetwork, self).__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        actions = self.fc3(x)
        return actions
    
class Agent():
    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
                 max_mem_size=100000, eps_end=0.01, eps_dec=5e-4):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = [i for i in range(n_actions)]
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0 # memory counter

        self.Q_eval = DeepQNetwork(self.lr, input_dims=input_dims, fc1_dims=256, fc2_dims=256, n_actions=n_actions)

        
        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    
    # explore or exploit
    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation]).to(self.Q_eval.device)
            actions = self.Q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def learn(self):
        # learn as soon as batch size full of memory
        if self.mem_cntr < self.batch_size: # if memory counter smaller than batch size
            return
        self.Q_eval.optimizer.zero_grad()
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        state_batch = T.tensor(self.state_memory[batch]).to(self.Q_eval.device)
        new_state_batch = T.tensor(self.new_state_memory[batch]).to(self.Q_eval.device)
        reward_batch = T.tensor(self.reward_memory[batch]).to(self.Q_eval.device)
        terminal_batch = T.tensor(self.terminal_memory[batch]).to(self.Q_eval.device)

        action_batch = self.action_memory[batch]
        q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch]
        q_next = self.Q_eval.forward(new_state_batch)
        q_next[terminal_batch] = 0.0

        q_target = reward_batch + self.gamma * T.max(q_next, dim=1)[0]

        loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
        loss.backward()
        self.Q_eval.optimizer.step()

        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min \
            else self.eps_min    

### Training Loop

In [68]:
# Initialize the environment
env = MultiTrainEnv(True)

# Initialize the agent (same model for all agents)
agent = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=3, eps_end=0.01, input_dims=[4], lr=0.003)

# Variables to track performance
scores, eps_history = [], []
n_games = 2  # Number of training episodes

# Loop over episodes
for i in range(n_games):
    score = 0
    done = [False] * len(env.states)  # Done flag for each agent
    observations = env.reset()  # Reset environment and get initial states for all agents (dim: num_agents x 4)

    # Game loop for each episode
    while not all(done):  # Continue until all agents are done
        actions = []  # Store actions for all agents
        for agent_id in range(len(env.states)):
            if not done[agent_id]:
                # Each agent chooses its action based on its observation (state)
                action = agent.choose_action(observations[agent_id])
                actions.append(action)
            else:
                actions.append(None)

        # Perform actions for all agents in the environment
        new_observations, rewards, done, info = env.step(actions, done)
        
        # Update score for each agent (sum the rewards)
        for agent_id in range(len(env.states)):
            if not done[agent_id]:
                agent.store_transition(observations[agent_id], actions[agent_id], rewards[agent_id],
                                    new_observations[agent_id], done[agent_id])
                agent.learn()
                score += rewards[agent_id]

        # Update the observations for the next step
        observations = new_observations

        env.render()

    # Store the score for this episode
    scores.append(score)
    eps_history.append(agent.epsilon)

    # Calculate and print average score over the last 100 games
    avg_score = np.mean(scores[-100:])
    print(f'Episode {i} | Score: {score:.2f} | Avg Score: {avg_score:.2f} | Epsilon: {agent.epsilon:.2f}')


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool)


Episode 0 | Score: -384259.71 | Avg Score: -384259.71 | Epsilon: 0.01
Episode 1 | Score: -236267.84 | Avg Score: -310263.78 | Epsilon: 0.01
