Environment setup and imports:

In [None]:
import random
import collections
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the Python, NumPy and PyTorch global RNGs so that exploration draws,
# replay sampling and network initialisation are repeatable across runs.
# NOTE: the environment keeps its own RNG, which is not seeded here.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Hyperparameters ------------------------------------------------------
ENV_NAME       = "LunarLander-v3"  # Gymnasium environment id
BUFFER_SIZE    = 100_000           # replay buffer capacity (transitions)
BATCH_SIZE     = 64                # transitions sampled per gradient step
GAMMA          = 0.99              # discount factor used in the TD target
LR             = 1e-3              # Adam learning rate
EPS_START      = 1.0               # initial epsilon for epsilon-greedy exploration
EPS_END        = 0.01              # lower bound on epsilon
EPS_DECAY      = 0.995             # multiplicative epsilon decay, applied once per episode
TARGET_UPDATE  = 10                # episodes between target-network syncs
NUM_EPISODES   = 1000              # number of training episodes
MAX_STEPS      = 1000              # cap on environment steps per training episode

NN architecture:

- Inputs: 8-dimensional state
- Hidden: two layers of 128 units with ReLU
- Outputs: 4 Q-values (one per discrete action)

In [None]:
class QNetwork(nn.Module):
    """Fully connected network mapping an observation to one Q-value per action.

    Args:
        obs_dim: Size of the input observation vector.
        act_dim: Number of output values (one per discrete action).
        hidden_dim: Width of each of the two hidden layers (default 128).
    """

    def __init__(self, obs_dim, act_dim, hidden_dim=128):
        super().__init__()
        # Layers are created in input -> output order and wrapped in a single
        # Sequential stored as `self.net`.
        layers = [
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, act_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for input `x` (last dim must be `obs_dim`)."""
        action_values = self.net(x)
        return action_values

In [None]:
!pip install swig
!pip install "gymnasium[box2d]"



Training loop (a reward above 200 is considered a solution):

In [None]:
# One environment step as stored in replay memory.
Transition = collections.namedtuple(
    'Transition',
    ('state', 'action', 'reward', 'next_state', 'done'),
)


class ReplayBuffer:
    """Fixed-capacity FIFO store of `Transition` tuples with uniform sampling."""

    def __init__(self, capacity):
        # A bounded deque drops the oldest entry once `capacity` is reached.
        self.buffer = collections.deque(maxlen=capacity)

    def push(self, *args):
        """Append one transition; `args` are the `Transition` fields in order."""
        transition = Transition(*args)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions and return them column-wise.

        The result is a single `Transition` whose fields are tuples of length
        `batch_size` (all states, all actions, ...).
        """
        picked = random.sample(self.buffer, batch_size)
        columns = zip(*picked)
        return Transition(*columns)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

# --- Environment, networks, optimizer and replay memory -------------------
env = gym.make(ENV_NAME)
obs_dim  = env.observation_space.shape[0]
act_dim  = env.action_space.n

policy_net = QNetwork(obs_dim, act_dim).to(device)
# The target network starts as a copy of the policy network and is only
# refreshed every TARGET_UPDATE episodes; it is never trained directly.
target_net = QNetwork(obs_dim, act_dim).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=LR)
buffer    = ReplayBuffer(BUFFER_SIZE)

eps = EPS_START
episode_rewards = []

# --- DQN training ----------------------------------------------------------
for ep in range(1, NUM_EPISODES+1):
    state, _ = env.reset()
    total_reward = 0

    for t in range(MAX_STEPS):
        # Epsilon-greedy action selection.
        if random.random() < eps:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                qs = policy_net(torch.FloatTensor(state).unsqueeze(0).to(device))
                action = qs.argmax(dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        # Store `terminated` (not `done`) in the transition's `done` field:
        # that field masks the bootstrap term in the TD target, and a
        # time-limit truncation is not a true terminal state, so the value of
        # `next_state` must still be bootstrapped in that case.
        buffer.push(state, action, reward, next_state, terminated)
        state = next_state
        total_reward += reward

        # Learn once the buffer holds at least one full batch.
        if len(buffer) >= BATCH_SIZE:
            batch = buffer.sample(BATCH_SIZE)
            # Stack each field into one NumPy array before building the tensor;
            # constructing a tensor from a tuple of ndarrays is very slow.
            s      = torch.as_tensor(np.asarray(batch.state, dtype=np.float32), device=device)
            a      = torch.as_tensor(np.asarray(batch.action, dtype=np.int64), device=device).unsqueeze(1)
            r      = torch.as_tensor(np.asarray(batch.reward, dtype=np.float32), device=device).unsqueeze(1)
            s_next = torch.as_tensor(np.asarray(batch.next_state, dtype=np.float32), device=device)
            d      = torch.as_tensor(np.asarray(batch.done, dtype=np.float32), device=device).unsqueeze(1)

            # Q(s, a) for the actions that were actually taken.
            q_values      = policy_net(s).gather(1, a)
            with torch.no_grad():
                # TD target: r + gamma * max_a' Q_target(s', a'), with the
                # bootstrap term zeroed for terminal transitions.
                next_q_values = target_net(s_next).max(1, keepdim=True)[0]
                target_q = r + GAMMA * next_q_values * (1 - d)

            loss = F.mse_loss(q_values, target_q)

            optimizer.zero_grad()
            loss.backward()
            # Clip the global gradient norm before the optimizer step.
            nn.utils.clip_grad_norm_(policy_net.parameters(), max_norm=1.0)
            optimizer.step()

        # The episode ends on either termination or truncation.
        if done:
            break

    episode_rewards.append(total_reward)
    # Decay epsilon once per episode, never going below EPS_END.
    eps = max(EPS_END, EPS_DECAY * eps)

    # Periodically sync the target network with the policy network.
    if ep % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

    # Report the mean return of the last 10 episodes.
    if ep % 10 == 0:
        avg_r = np.mean(episode_rewards[-10:])
        print(f"Ep {ep:03d} | Avg10R: {avg_r:.1f} | ε: {eps:.2f}")

Ep 010 | Avg10R: -158.2 | ε: 0.95
Ep 020 | Avg10R: -173.6 | ε: 0.90
Ep 030 | Avg10R: -122.1 | ε: 0.86
Ep 040 | Avg10R: -124.6 | ε: 0.82
Ep 050 | Avg10R: -110.1 | ε: 0.78
Ep 060 | Avg10R: -112.6 | ε: 0.74
Ep 070 | Avg10R: -94.4 | ε: 0.70
Ep 080 | Avg10R: -80.9 | ε: 0.67
Ep 090 | Avg10R: -134.4 | ε: 0.64
Ep 100 | Avg10R: -98.7 | ε: 0.61
Ep 110 | Avg10R: -54.4 | ε: 0.58
Ep 120 | Avg10R: -49.2 | ε: 0.55
Ep 130 | Avg10R: -34.5 | ε: 0.52
Ep 140 | Avg10R: -26.2 | ε: 0.50
Ep 150 | Avg10R: -32.0 | ε: 0.47
Ep 160 | Avg10R: -23.8 | ε: 0.45
Ep 170 | Avg10R: -1.8 | ε: 0.43
Ep 180 | Avg10R: 27.8 | ε: 0.41
Ep 190 | Avg10R: -14.2 | ε: 0.39
Ep 200 | Avg10R: 23.2 | ε: 0.37
Ep 210 | Avg10R: -6.7 | ε: 0.35
Ep 220 | Avg10R: -45.9 | ε: 0.33
Ep 230 | Avg10R: 54.7 | ε: 0.32
Ep 240 | Avg10R: -0.8 | ε: 0.30
Ep 250 | Avg10R: 4.2 | ε: 0.29
Ep 260 | Avg10R: 33.7 | ε: 0.27
Ep 270 | Avg10R: 48.8 | ε: 0.26
Ep 280 | Avg10R: 44.2 | ε: 0.25
Ep 290 | Avg10R: 90.5 | ε: 0.23
Ep 300 | Avg10R: 63.5 | ε: 0.22
Ep 310 | Avg10R:

Testing and evaluation:
- Metrics:
 - Average return over test episodes
 - Std deviation to gauge stability

In [None]:
# Greedy evaluation: always take the argmax action (no exploration) and
# record the undiscounted return of each test episode.
policy_net.eval()

NUM_TEST = 20
test_rewards = []
for _ in range(NUM_TEST):
    state, _ = env.reset()
    episode_return = 0
    done = False
    while not done:
        obs = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action = policy_net(obs).argmax(dim=1).item()
        state, reward, terminated, truncated, _ = env.step(action)
        episode_return += reward
        # An episode is over on either termination or truncation.
        done = terminated or truncated
    test_rewards.append(episode_return)

# Mean and standard deviation of the returns across the test episodes.
print(f"Test over {NUM_TEST} episodes: "
      f"mean={np.mean(test_rewards):.1f}, "
      f"std={np.std(test_rewards):.1f}")

Test over 20 episodes: mean=267.4, std=34.4


In [None]:
import imageio
import gymnasium as gym
import torch
import numpy as np

# Roll out one greedy episode in a separate environment that renders RGB
# frames, then save the collected frames as an animated GIF.
test_env = gym.make(ENV_NAME, render_mode="rgb_array")
policy_net.eval()

frames = []
try:
    state, _ = test_env.reset(seed=42)
    done = False
    while not done:
        with torch.no_grad():
            q_vals = policy_net(torch.FloatTensor(state).unsqueeze(0).to(device))
            action = q_vals.argmax(dim=1).item()
        state, reward, terminated, truncated, _ = test_env.step(action)
        done = terminated or truncated
        # Capture the frame rendered after each step.
        frame = test_env.render()
        frames.append(frame)
finally:
    # Always release the environment, even if the rollout raises.
    test_env.close()

gif_path = "lunarlander_test.gif"
imageio.mimsave(gif_path, frames, fps=30)

print(f"Saved test run GIF → {gif_path}")

Saved test run GIF → lunarlander_test.gif
