### Step1. Install AI Gym, the instructions can be found at OpenAI-Lunar-LanderLinks to an external site.

In [1]:
!sudo apt-get update
!sudo apt-get install python3.10-venv
!python3 -m venv env
!source env/bin/activate
!sudo apt-get install swig libpython3.10-dev
!pip install box2d-py
!pip install gym[box2d]==0.25.2
! sudo apt-get install xvfb
! pip install pyvirtualdisplay

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [2,383 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2,118 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-security/uni

### Step2. Import the environment

In [7]:
import copy
import os
import random

import numpy as np
import torch
from gym import make
from torch import nn
from torch.optim import Adam
from tqdm.notebook import tqdm

SEED = 21
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
GAMMA = 0.99
TAU = 1e-3
INITIAL_STEPS = 1024
TRANSITIONS = 500_000
STEPS_PER_UPDATE = 4
STEPS_PER_TARGET_UPDATE = STEPS_PER_UPDATE * 1000
BATCH_SIZE = 512
LEARNING_RATE = 5e-4
HIDDEN_DIMENSION = 64
ENVIRONMENT_NAME = "LunarLander-v2"

def set_seed(seed: int = 42) -> None:
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

def evaluate_policy(agent, episodes=5, verbose=False):
    env = make(ENVIRONMENT_NAME)
    returns = []
    if verbose:
        pbar = tqdm(total=episodes)
    for _ in range(episodes):
        done = False
        state = env.reset()
        total_reward = 0.0

        while not done:
            state, reward, done, *_ = env.step(agent.act(state))
            total_reward += reward
        returns.append(total_reward)

        if verbose:
            pbar.update(1)

    return returns

class ExperienceBuffer:
    def __init__(self, capacity=10_000, device=DEVICE):
        self.capacity = capacity
        self.num_stored = 0
        self.next_index = 0
        self.device = device

        self.states = None
        self.actions = None
        self.next_states = None
        self.rewards = None
        self.dones = None

    def is_sampleable(self, replay_size):
        return replay_size <= self.num_stored

    def add(self, state, action, next_state, reward, done):
        state = torch.tensor(state)
        next_state = torch.tensor(next_state)

        if self.states is None:
            state_shape = [self.capacity] + list(state.shape)
            action_shape = [self.capacity]
            next_state_shape = [self.capacity] + list(next_state.shape)
            reward_shape = [self.capacity]
            done_shape = [self.capacity]
            self.states = torch.empty(state_shape, dtype=torch.float32, device=self.device)
            self.actions = torch.empty(action_shape, dtype=torch.long, device=self.device)
            self.next_states = torch.empty(next_state_shape, dtype=torch.float32, device=self.device)
            self.rewards = torch.empty(reward_shape, dtype=torch.float32, device=self.device)
            self.dones = torch.empty(done_shape, dtype=torch.long, device=self.device)

        self.states[self.next_index] = state
        self.actions[self.next_index] = action
        self.next_states[self.next_index] = next_state
        self.rewards[self.next_index] = reward
        self.dones[self.next_index] = done

        self.next_index = (self.next_index + 1) % self.capacity
        self.num_stored = min(self.capacity, self.num_stored + 1)

    def sample_batch(self, replay_size=BATCH_SIZE):
        indexes = torch.randperm(self.num_stored)[:replay_size]
        return (
            self.states[indexes],
            self.actions[indexes].view(-1, 1),
            self.next_states[indexes],
            self.rewards[indexes].view(-1, 1),
            self.dones[indexes].view(-1, 1),
        )

# Deep Q-Network Model
class DeepQNetworkModel(torch.nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=HIDDEN_DIMENSION):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.activation = torch.nn.ReLU()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, state):
        h = self.activation(self.fc1(state))
        h = self.activation(self.fc2(h))
        out = self.fc3(h)
        return out

# DQN Agent
class DQNAgent:
    def __init__(self, state_dim, action_dim, hidden_dim=64):
        self.steps = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim
        self.buffer = ExperienceBuffer(10**5)
        self.local_model = DeepQNetworkModel(state_dim, action_dim, hidden_dim).to(DEVICE)
        self.target_model = DeepQNetworkModel(state_dim, action_dim, hidden_dim).to(DEVICE)
        self.target_model.eval()
        self.optimizer = Adam(self.local_model.parameters())
        self.criterion = nn.MSELoss()

    def consume_transition(self, transition):
        self.buffer.add(*transition)

    def sample_batch(self):
        return self.buffer.sample_batch()

    def train_step(self, batch):
        states, actions, next_states, rewards, dones = batch

        q_pred = self.local_model(states).gather(1, actions)
        with torch.no_grad():
            q_next = self.target_model(next_states).max(1)[0].unsqueeze(1)
        q_target = rewards + GAMMA * q_next * (1 - dones)

        self.optimizer.zero_grad()
        loss = self.criterion(q_pred, q_target)
        loss.backward()
        self.optimizer.step()

        self.soft_update_target_network()

    def soft_update_target_network(self):
        for target_param, local_param in zip(
            self.target_model.parameters(), self.local_model.parameters()
        ):
            target_param.data.copy_(
                TAU * local_param.data + (1.0 - TAU) * target_param.data
            )

    def update_target_network(self):
        self.target_model = copy.deepcopy(self.local_model)

    def act(self, state, target=False):
        state = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)

        self.local_model.eval()
        with torch.no_grad():
            action = np.argmax(self.local_model(state).cpu().numpy())
        self.local_model.train()

        return action

    def update(self, transition):
        self.consume_transition(transition)
        if self.steps % STEPS_PER_UPDATE == 0:
            batch = self.sample_batch()
            self.train_step(batch)
        if self.steps % STEPS_PER_TARGET_UPDATE == 0:
            self.update_target_network()
        self.steps += 1

    def save(self):
        torch.save(self.local_model.state_dict(), "agent.pth")



### Step3. Train a model

In [8]:

import torch

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Training
set_seed(SEED)
env = make(ENVIRONMENT_NAME)
dqn_agent = DQNAgent(state_dim=env.observation_space.shape[0], action_dim=env.action_space.n, hidden_dim=HIDDEN_DIMENSION)
dqn_agent.local_model.to(device)
dqn_agent.target_model.to(device)
eps = 0.1
state = env.reset()

for _ in range(INITIAL_STEPS):
    action = env.action_space.sample()

    next_state, reward, done, *_ = env.step(action)
    dqn_agent.consume_transition((state, action, next_state, reward, done))

    state = next_state if not done else env.reset()

best_avg_rewards = -np.inf
for i in range(TRANSITIONS):
    # Epsilon-greedy policy
    if random.random() < eps:
        action = env.action_space.sample()
    else:
        action = dqn_agent.act(state)

    next_state, reward, done, *_ = env.step(action)
    dqn_agent.update((state, action, next_state, reward, done))

    state = next_state if not done else env.reset()

    if (i + 1) % (TRANSITIONS // 100) == 0:
        rewards = evaluate_policy(dqn_agent, 5)
        avg_reward = np.mean(rewards)
        print(f"Step: {i + 1}/{TRANSITIONS}, Best reward mean: {best_avg_rewards:.2f}, Reward mean: {avg_reward:.2f}, Reward std: {np.std(rewards):.2f}")
        if avg_reward > best_avg_rewards:
            best_avg_rewards = avg_reward
            dqn_agent.save()


Step: 5000/500000, Best reward mean: -inf, Reward mean: -50.43, Reward std: 51.08
Step: 10000/500000, Best reward mean: -50.43, Reward mean: -6.55, Reward std: 168.32
Step: 15000/500000, Best reward mean: -6.55, Reward mean: -124.98, Reward std: 72.14
Step: 20000/500000, Best reward mean: -6.55, Reward mean: -254.98, Reward std: 238.03
Step: 25000/500000, Best reward mean: -6.55, Reward mean: -50.80, Reward std: 53.79
Step: 30000/500000, Best reward mean: -6.55, Reward mean: -104.78, Reward std: 47.42
Step: 35000/500000, Best reward mean: -6.55, Reward mean: -150.18, Reward std: 10.88
Step: 40000/500000, Best reward mean: -6.55, Reward mean: -99.47, Reward std: 66.82
Step: 45000/500000, Best reward mean: -6.55, Reward mean: -107.89, Reward std: 25.95
Step: 50000/500000, Best reward mean: -6.55, Reward mean: -96.68, Reward std: 17.98
Step: 55000/500000, Best reward mean: -6.55, Reward mean: -94.34, Reward std: 32.51
Step: 60000/500000, Best reward mean: -6.55, Reward mean: -64.84, Rewar

In [9]:
# Evaluation
class TrainedAgent:
    def __init__(self, weights="agent.pth"):
        self.model = DeepQNetworkModel(8, 4, HIDDEN_DIMENSION)
        weights = torch.load(weights, map_location=DEVICE)
        self.model.load_state_dict(weights)
        self.model.to(DEVICE)
        self.model.eval()

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)
        with torch.no_grad():
            action = np.argmax(self.model(state).cpu().numpy())
        return action

trained_agent = TrainedAgent("agent.pth")
rewards = evaluate_policy(trained_agent, 5, True)
print("Average reward on 5 episodes:", np.mean(rewards))

  0%|          | 0/5 [00:00<?, ?it/s]

Average reward on 5 episodes: 245.03502156354642


### Step5. Play an episode of the problem using your agent.

In [12]:
# Rendering and displaying video
import glob
import io
import base64
from gym.wrappers.monitoring import video_recorder
from IPython import display
from IPython.display import FileLink

def show_video(env_name, video_dir="."):
    mp4list = glob.glob(f'{video_dir}/*.mp4')
    if len(mp4list) > 0:
        mp4 = f'{video_dir}/{env_name}.mp4'
        video = io.open(mp4, 'rb').read()
        encoded = base64.b64encode(video)
        display.display(display.HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
        display.display(FileLink(mp4, result_html_prefix="Click here to download: "))
    else:
        print("Could not find video")

def render_video_of_model(agent, env_name):
    env = make(env_name)
    vid = video_recorder.VideoRecorder(env, path=f"{env_name}.mp4")
    state = env.reset()
    done = False
    while not done:
        frame = env.render(mode='rgb_array')
        vid.capture_frame()

        action = agent.act(state)

        state, reward, done, _ = env.step(action)
    env.close()

render_video_of_model(trained_agent, ENVIRONMENT_NAME)
show_video(ENVIRONMENT_NAME)

  deprecation(
  deprecation(
  logger.deprecation(
  logger.deprecation(
  self.pid = _posixsubprocess.fork_exec(
  if not isinstance(terminated, (bool, np.bool8)):
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


### Step4. Discuss the parameters used to produce your training results

The parameters used for training the Deep Q-Network (DQN) agent are as follows:
```
SEED = 21
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
GAMMA = 0.99
TAU = 1e-3
INITIAL_STEPS = 1024
TRANSITIONS = 500_000
STEPS_PER_UPDATE = 4
STEPS_PER_TARGET_UPDATE = STEPS_PER_UPDATE * 1000
BATCH_SIZE = 512
LEARNING_RATE = 5e-4
HIDDEN_DIMENSION = 64
ENVIRONMENT_NAME = "LunarLander-v2"
```
SEED: The random seed used for reproducibility. We have used a seed of 21 to repreduce the same steps for training.

GAMMA: The discount factor for future rewards. We have used 0.99 as the discount factor.

TAU: The parameter for soft update of target network weights. Value of 1e-3 is used here.

INITIAL_STEPS: The number of initial steps taken by the agent to populate the experience buffer. The model takes 1024 steps to populate the experience buffer.

TRANSITIONS: The total number of transitions (steps) taken during training. We take 500000 steps to train.

STEPS_PER_UPDATE: The number of steps between each update of the local network. We use 4 steps per update.

STEPS_PER_TARGET_UPDATE: The number of steps between each update of the target network.

BATCH_SIZE: The size of the mini-batch used for training. We take 512 as a batch size to speed up the training.

LEARNING_RATE: The learning rate used by the Adam optimizer. We use 5e-4 as value.

HIDDEN_DIMENSION: The dimension of the hidden layers in the neural network. We use a dimension of 64.

### Step 6. Discuss the results in terms of success rate

The training results show improvement in terms of the average reward mean over time, with occasional fluctuations. The best reward mean achieved during training was around 277.91.

Average reward on 5 episodes that were used for evaluation is: 245.03502156354642

From the video, we can see that the lander lands properly on the target.

