<a href="https://colab.research.google.com/github/albertocj1/Taxi-v3-Reinforcement-Learning/blob/main/Taxi_v3_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
!pip install gymnasium torch numpy matplotlib imageio[ffmpeg] --quiet

In [67]:
import random
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import matplotlib.pyplot as plt
import imageio
from IPython.display import Video


# Select GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)



Using device: cpu


In [68]:
import gymnasium as gym

# Create the Taxi-v3 environment
env = gym.make("Taxi-v3")

# Print the observation space
print("Observation Space:", env.observation_space)

# Print the action space
print("Action Space:", env.action_space)

# Close the environment
env.close()

Observation Space: Discrete(500)
Action Space: Discrete(6)


In [69]:
class DQN(nn.Module):
    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 80)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(80, 80)
        self.output_layer = nn.Linear(80, n_actions)

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        return self.output_layer(x)

# Get the number of observations and actions from the environment
n_observations = env.observation_space.n
n_actions = env.action_space.n

# Instantiate the DQN model
policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())

<All keys matched successfully>

In [70]:
from collections import deque
import random
import torch
import numpy as np


class ReplayBuffer:
    def __init__(self, capacity=5000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            torch.FloatTensor(states),
            torch.LongTensor(actions),
            torch.FloatTensor(rewards),
            torch.FloatTensor(next_states),
            torch.FloatTensor(dones),
        )

    def __len__(self):
        return len(self.buffer)

In [71]:
def train_dqn(episodes,
              gamma,
              lr,
              batch_size,
              epsilon_decay,
              min_epsilon):

    env = gym.make("Taxi-v3")
    n_observations = env.observation_space.n
    n_actions = env.action_space.n

    policy_net = DQN(n_observations, n_actions).to(device)
    target_net = DQN(n_observations, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)
    criterion = nn.MSELoss()

    memory = ReplayBuffer(5000)
    epsilon = 1.0
    rewards_history = []

    for ep in range(episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            # ε-greedy policy
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                # Convert state to one-hot encoding for network input
                state_one_hot = torch.zeros(n_observations, dtype=torch.float32).to(device)
                state_one_hot[state] = 1.0
                state_tensor = state_one_hot.unsqueeze(0)

                with torch.no_grad():
                    action = torch.argmax(policy_net(state_tensor)).item()

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            memory.push(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

            # Train network
            if len(memory) >= batch_size:
                states, actions, rewards, next_states, dones = memory.sample(batch_size)

                # 🔹 Move tensors to GPU/CPU device and ensure correct shape
                # Convert states and next_states to one-hot encoding for batch processing
                states_one_hot = torch.zeros(batch_size, n_observations, dtype=torch.float32).to(device)
                states_one_hot[torch.arange(batch_size), states.long()] = 1.0

                next_states_one_hot = torch.zeros(batch_size, n_observations, dtype=torch.float32).to(device)
                next_states_one_hot[torch.arange(batch_size), next_states.long()] = 1.0

                actions = actions.to(device)
                rewards = rewards.to(device)
                dones = dones.to(device)


                q_values = policy_net(states_one_hot).gather(1, actions.unsqueeze(1)).squeeze()
                next_q_values = target_net(next_states_one_hot).max(1)[0]
                expected_q_values = rewards + gamma * next_q_values * (1 - dones)

                loss = criterion(q_values, expected_q_values.detach())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # Epsilon decay
        epsilon = max(min_epsilon, epsilon * epsilon_decay)
        rewards_history.append(total_reward)

        # Update target network
        if ep % 10 == 0: # Update target network every 10 episodes
             target_net.load_state_dict(policy_net.state_dict())


        print(f"Episode {ep+1}/{episodes}, Total Reward: {total_reward}, Epsilon: {epsilon:.2f}")

    env.close()
    return rewards_history, policy_net

In [72]:
# Example hyperparameters
episodes = 2000
gamma = 0.99
lr = 0.001
batch_size = 64
epsilon_decay = 0.995
min_epsilon = 0.01

# Train the DQN agent
rewards, trained_policy_net = train_dqn(episodes, gamma, lr, batch_size, epsilon_decay, min_epsilon)

Episode 1/2000, Total Reward: -713, Epsilon: 0.99
Episode 2/2000, Total Reward: -767, Epsilon: 0.99
Episode 3/2000, Total Reward: -830, Epsilon: 0.99
Episode 4/2000, Total Reward: -731, Epsilon: 0.98
Episode 5/2000, Total Reward: -722, Epsilon: 0.98
Episode 6/2000, Total Reward: -657, Epsilon: 0.97
Episode 7/2000, Total Reward: -749, Epsilon: 0.97
Episode 8/2000, Total Reward: -794, Epsilon: 0.96
Episode 9/2000, Total Reward: -695, Epsilon: 0.96
Episode 10/2000, Total Reward: -803, Epsilon: 0.95
Episode 11/2000, Total Reward: -722, Epsilon: 0.95
Episode 12/2000, Total Reward: -749, Epsilon: 0.94
Episode 13/2000, Total Reward: -722, Epsilon: 0.94
Episode 14/2000, Total Reward: -848, Epsilon: 0.93
Episode 15/2000, Total Reward: -794, Epsilon: 0.93
Episode 16/2000, Total Reward: -713, Epsilon: 0.92
Episode 17/2000, Total Reward: -758, Epsilon: 0.92
Episode 18/2000, Total Reward: -668, Epsilon: 0.91
Episode 19/2000, Total Reward: -740, Epsilon: 0.91
Episode 20/2000, Total Reward: -767, Eps

In [79]:
# Create the Taxi-v3 environment with render_mode="rgb_array"
env = gym.make("Taxi-v3", render_mode="rgb_array")

# Define the video path and create an imageio writer
video_path = "taxi_dqn.mp4"
writer = imageio.get_writer(video_path, fps=15)

# Run multiple episodes and record the video
episodes_to_record = 20  # Record 10 episodes

for ep in range(episodes_to_record):
    state, _ = env.reset()
    done = False
    while not done:
        # Convert state to one-hot encoding for network input
        state_one_hot = torch.zeros(env.observation_space.n, dtype=torch.float32).to(device)
        state_one_hot[state] = 1.0
        state_tensor = state_one_hot.unsqueeze(0)

        with torch.no_grad():
            action = torch.argmax(trained_policy_net(state_tensor)).item()

        next_state, _, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        state = next_state

        # Render the environment and append the frame to the video writer
        frame = env.render()
        writer.append_data(frame)

# Close the video writer and the environment
writer.close()
env.close()

print(f"Video saved to {video_path}")

# Display the video
Video(video_path)



Video saved to taxi_dqn.mp4


In [80]:
Video("taxi_dqn.mp4", embed=True)

In [78]:
def evaluate_agent(env, policy_net, episodes=20):
    total_rewards = []
    for _ in range(episodes):
        state, _ = env.reset()
        done = False
        ep_reward = 0
        while not done:
            # Convert state to one-hot encoding for network input
            state_one_hot = torch.zeros(env.observation_space.n, dtype=torch.float32).to(device)
            state_one_hot[state] = 1.0
            state_tensor = state_one_hot.unsqueeze(0)

            with torch.no_grad():
                action = torch.argmax(policy_net(state_tensor)).item()

            state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            ep_reward += reward
        total_rewards.append(ep_reward)
    return np.mean(total_rewards)

# Create the Taxi-v3 environment for evaluation
eval_env = gym.make("Taxi-v3")

# Evaluate the trained agent
avg_reward = evaluate_agent(eval_env, trained_policy_net, episodes=20)
print(f"Average reward over 20 evaluation episodes: {avg_reward:.2f}")

# Close the environment
eval_env.close()

Average reward over 20 evaluation episodes: -2.45
