# Atari Skiing

## Part 2: Q-Learning for Atari Game Skiing

In [None]:
import torch
import torch.nn as nn


# DQN Model
class DQN(nn.Module):
    def __init__(self, action_size):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 512)
        self.fc2 = nn.Linear(512, action_size)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

    # Save a model
    def save_model(self):
        torch.save(self.state_dict(), "./models/" + self.filename + ".pth")

    # Loads a model
    def load_model(self):
        self.load_state_dict(torch.load("./models/" + self.filename + ".pth"))

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque

Action = int
State = int


class DeepQLearningAgent:
    def __init__(
        self,
        learning_rate: float,
        epsilon: float,
        gamma: float,
        n_actions: int,
        batch_size: int = 32,
        memory_size: int = 100000,
    ):
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.gamma = gamma
        self.n_actions = n_actions
        self.batch_size = batch_size
        self.memory = deque(maxlen=memory_size)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.policy_net = DQN(n_actions).to(self.device)
        self.target_net = DQN(n_actions).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
        self.loss_fn = nn.MSELoss()

        print("======== DeepQLearningAgent ========")
        print("Learning rate:", self.learning_rate)
        print("Epsilon:", self.epsilon)
        print("Gamma:", self.gamma)
        print("Batch size:", self.batch_size)
        print("Max memory size:", memory_size)
        print("Number of actions:", self.n_actions, end="\n\n")
        print("Current device used:", self.device)
        print("Optimizer:", "Adam()")
        print("Loss:", self.loss_fn)
        print("====================================", end="\n\n")

    def get_action(self, state: State) -> Action:
        # Exploration
        if random.random() < self.epsilon:
            return random.randint(0, self.n_actions - 1)
        # Exploitation
        else:
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
                q_values = self.policy_net(state)

                # Retrieve the action that maximize the approximation of Q*(s, a, theta)
                return q_values.argmax().item()

    def update(self, state, action, reward, next_state, done):
        """Train the model"""
        reward = max(min(reward, 1.0), -1.0)
        self.memory.append((state, action, reward, next_state, done))

        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.FloatTensor(np.array(states)).to(self.device)
        actions = torch.LongTensor(np.array(actions)).to(self.device)
        rewards = torch.FloatTensor(np.array(rewards)).to(self.device)
        next_states = torch.FloatTensor(np.array(next_states)).to(self.device)
        dones = torch.FloatTensor(np.array(dones)).to(self.device)

        # Set the label y
        next_q_values = self.target_net(next_states).max(1)[0].detach()
        # -> The goal is to delete the second part of equation in cases done=True
        target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

        # The so called prediction
        current_q_values = self.policy_net(states).gather(1, actions.unsqueeze(1))

        loss = self.loss_fn(current_q_values, target_q_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self, save_max=False, max_reward=0, epoch=0):
        """Update target network"""
        self.target_net.load_state_dict(self.policy_net.state_dict())
        torch.save(self.policy_net.state_dict(), f"models/model_{epoch}_state_dict.pt")
        if save_max:
            torch.save(
                self.policy_net.state_dict(),
                f"models/max_model_load_state_dict_{max_reward}.pt",
            )

In [21]:
# exercise.py
import gymnasium as gym
import numpy as np
from gymnasium.wrappers import AtariPreprocessing
import ale_py
from tqdm import tqdm
import matplotlib.pyplot as plt


def preprocess_observation(obs):
    return np.array(obs).astype(np.float32) / 255.0


def main():
    gym.register_envs(ale_py)
    env = gym.make("ALE/Skiing-v5", obs_type="grayscale", frameskip=4)
    env = AtariPreprocessing(env, frame_skip=1)
    env = gym.wrappers.FrameStackObservation(env, 4)

    n_actions = env.action_space.n
    print("Available actions:", env.unwrapped.get_action_meanings(), end="\n\n")

    agent = DeepQLearningAgent(
        learning_rate=0.0001,
        epsilon=1.0,
        gamma=0.99,
        n_actions=n_actions,
        batch_size=32,
        memory_size=100000,
    )

    n_episodes = 500000
    target_update_frequency = 50000
    epsilon_decay = 0.9995
    epsilon_min = 0.05
    one_epoch = 50000

    mean_reward = []
    mini_batch = 0
    max_reward = float("-inf")
    min_reward = float("inf")

    epoch_values = [0]
    std_values = [0]
    max_rewards = [0]
    min_rewards = [0]

    best_reward = 0
    pbar = tqdm(range(1, n_episodes), desc="Starting")
    for episode in pbar:
        obs, _ = env.reset()
        state = preprocess_observation(obs)
        done = False
        total_reward = 0.0
        current_frame = mini_batch

        while not done:
            action = 1 if current_frame == mini_batch else agent.get_action(state)

            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated or info["lives"] != 5

            total_reward -= 1 if done else -max(-1, min(1, reward))

            next_state = preprocess_observation(obs)
            agent.update(state, action, reward, next_state, done)
            state = next_state
            mini_batch += 1

            if done:
                mean_reward.append(total_reward)

                # Update progression description
                avg_reward = round(np.mean(mean_reward).item(), 2)
                max_reward = round(max(max_reward, total_reward), 2)
                min_reward = round(min(min_reward, total_reward), 2)
                std_reward = round(np.std(mean_reward).item(), 2)

                pbar.set_description(
                    desc=f"Episode: {episode} (mean: {avg_reward}, std: {std_reward}, min: {min_reward}, max: {max_reward}) |"
                )

            if mini_batch % one_epoch == 0:
                current_epoch = len(epoch_values)
                avg_reward = round(np.mean(mean_reward).item(), 2)
                max_reward = round(max(max_reward, total_reward), 2)
                min_reward = round(min(min_reward, total_reward), 2)
                std_reward = round(np.std(mean_reward).item(), 2)

                print(f"\n===== Epoch {current_epoch} stats =====")
                print(f"Min reward: {min_reward}")
                print(f"Max reward: {max_reward}")
                print(f"Mean reward: {avg_reward}")
                print(f"Std reward: {std_reward}\n")
                print("Current agent epsilon:", round(agent.epsilon, 2))
                mean_reward.clear()

                epoch_values.append(avg_reward)
                std_values.append(std_reward)
                min_rewards.append(min_reward)
                max_rewards.append(max_reward)

                max_reward = float("-inf")
                min_reward = float("inf")

            # Update target network and plot stats
            if mini_batch % target_update_frequency == 0:
                current_epoch = len(epoch_values)
                save_max_network = False
                if epoch_values[-1] > best_reward:
                    best_reward = epoch_values[-1]
                    save_max_network = True
                    print(f"New max_reward net with r={max_reward}")
                agent.update_target_network(
                    save_max=save_max_network,
                    max_reward=max_reward,
                    epoch=current_epoch,
                )

                plt.figure(figsize=(15, 10))
                epochs = np.arange(len(epoch_values))
                eeepoch_values = np.array(epoch_values)
                std_dev = np.array(std_values)
                mins = np.array(min_rewards)
                maxs = np.array(max_rewards)

                plt.plot(
                    epochs,
                    eeepoch_values,
                    label="average reward",
                    color="blue",
                    linewidth=2,
                )
                plt.plot(
                    epochs,
                    mins,
                    label="minimum reward",
                    color="red",
                    linestyle="--",
                    marker="o",
                    markersize=2,
                )
                plt.plot(
                    epochs,
                    maxs,
                    label="maximum reward",
                    color="green",
                    linestyle="--",
                    marker="o",
                    markersize=2,
                )
                plt.fill_between(
                    epochs,
                    epoch_values - std_dev,
                    epoch_values + std_dev,
                    color="blue",
                    alpha=0.15,
                    label="std dev",
                )
                plt.axhline(
                    y=31.8, color="purple", linestyle="--", label="Human performance"
                )

                plt.title("Average reward with std dev and extreme values on breakout")
                plt.xlabel("Training epochs")
                plt.ylabel("Average reward per episode")
                plt.grid(visible=True, linestyle="--", linewidth=0.5)
                plt.legend()
                plt.savefig(
                    f"figures/reward_breakout_epoch_{len(epoch_values) - 1}.png"
                )
                plt.close()

        # Decay epsilon
        agent.epsilon = max(epsilon_min, agent.epsilon * epsilon_decay)

    env.close()


main()

Available actions: ['NOOP', 'RIGHT', 'LEFT']

Learning rate: 0.0001
Epsilon: 1.0
Gamma: 0.99
Batch size: 32
Max memory size: 100000
Number of actions: 3

Current device used: cuda
Optimizer: Adam()
Loss: MSELoss()



Episode: 6129 (mean: -1.0, std: 0.0, min: -1.0, max: -1.0) |:   1%|          | 6129/499999 [02:18<3:06:14, 44.19it/s]


KeyboardInterrupt: 