Dependencies

In [None]:
import gymnasium as gym
import ale_py
import imageio.v2 as imageio

Training Code for Breakout

In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import numpy as np
import cv2
import random
import copy
import os
import imageio
from collections import deque
import ale_py

# Single Breakout environment shared by the training loop and both evaluation
# rollouts. The "NoFrameskip" variant is used and frame skipping is done
# manually by step_with_skip() further down.
env = gym.make("BreakoutNoFrameskip-v4")

# Action index sent twice after every reset and once after every lost life.
# Presumably ALE's FIRE (serves the ball) — confirm with
# env.unwrapped.get_action_meanings().
FIRE_ACTION = 1

class Q_Network(nn.Module):
    """Convolutional Q-value estimator over a stack of four frames.

    Three ReLU-activated convolutions feed a two-layer fully connected head.
    The head's input width is fixed at ``64 * 7 * 7``, which is what the
    convolutions produce for 84x84 inputs.

    Args:
        output_size: Number of Q-values (one per action) emitted per sample.
    """

    def __init__(self, output_size):
        super().__init__()

        # Layers are created in the same order as they are applied so that
        # parameter initialisation consumes the RNG in a stable sequence.
        conv_layers = [
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        head_layers = [
            nn.Linear(64 * 7 * 7, 512),
            nn.ReLU(),
            nn.Linear(512, output_size),
        ]

        self.cnn = nn.Sequential(*conv_layers)
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return Q-values of shape ``(batch, output_size)``.

        ``x`` is cast to float and divided by 255 before the convolutions, so
        callers can pass raw 0-255 pixel tensors of shape ``(batch, 4, H, W)``.
        """
        scaled = x.float() / 255.0
        features = self.cnn(scaled)
        # Collapse channel and spatial dimensions, keeping the batch dimension.
        return self.fc(features.flatten(start_dim=1))

class ReplayBuffer:
    """Fixed-capacity FIFO store of ``(state, action, reward, next_state, done)``.

    Transitions live in a plain list used as a ring buffer: once ``capacity``
    entries are stored, each new transition overwrites the oldest one. A list is
    used instead of ``collections.deque`` because uniform sampling indexes into
    the middle of the container, which is O(1) for a list but O(n) for a deque.

    Args:
        capacity: Maximum number of stored transitions. ``None`` means
            unbounded and ``0`` means nothing is ever stored (both mirror
            ``deque(maxlen=...)``).
        device: Torch device on which sampled batches are created.

    Raises:
        ValueError: If ``capacity`` is negative.
    """

    def __init__(self, capacity, device):
        if capacity is not None and capacity < 0:
            raise ValueError("capacity must be non-negative")
        self.capacity = capacity
        self.buffer = []
        # Slot that the next overwrite will hit once the buffer is full; it
        # always points at the oldest stored transition.
        self._write_idx = 0
        self.device = device

    def add(self, state, action, reward, next_state, done):
        """Store one transition, evicting the oldest if the buffer is full."""
        transition = (state, action, reward, next_state, done)
        if self.capacity is None or len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        elif self.capacity > 0:
            self.buffer[self._write_idx] = transition
            self._write_idx = (self._write_idx + 1) % self.capacity
        # capacity == 0: the transition is dropped, like deque(maxlen=0).

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors
            on ``self.device``. States keep the dtype of the stored arrays,
            rewards and dones are float32, and actions use torch's default
            dtype for the stored Python values.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            torch.tensor(np.stack(states), device=self.device),
            torch.tensor(actions, device=self.device),
            torch.tensor(rewards, dtype=torch.float32, device=self.device),
            torch.tensor(np.stack(next_states), device=self.device),
            torch.tensor(dones, dtype=torch.float32, device=self.device),
        )

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

    def len(self):
        """Same as ``len(buffer)``; kept for existing callers."""
        return len(self.buffer)

def preprocess(obs, prev_obs):
    """Turn two RGB frames into one 84x84 uint8 grayscale frame.

    Both frames are converted to grayscale and combined with a pixel-wise
    maximum. Rows 34-193 are then kept (presumably the playfield without the
    score bar — confirm against the rendered frame) and the crop is resized to
    84x84 with area interpolation.
    """
    current_gray, previous_gray = (
        cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) for frame in (obs, prev_obs)
    )
    cropped = np.maximum(current_gray, previous_gray)[34:194, :]
    resized = cv2.resize(cropped, (84, 84), interpolation=cv2.INTER_AREA)
    return resized.astype(np.uint8)

def step_with_skip(env, action, skip=4):
    """Repeat ``action`` for up to ``skip`` environment steps.

    Rewards are converted to float and summed. Repetition stops as soon as a
    step reports ``terminated`` or ``truncated``.

    Returns:
        ``(obs, total_reward, done, info)`` where ``obs`` and ``info`` come from
        the last step executed and ``done`` is True if that step terminated or
        truncated. With ``skip=0`` no step is taken and the result is
        ``(None, 0.0, False, {})``.
    """
    frame, details = None, {}
    reward_sum = 0.0
    finished = False

    for _ in range(skip):
        frame, gained, terminated, truncated, details = env.step(action)
        reward_sum += float(gained)
        if terminated or truncated:
            finished = True
            break

    return frame, reward_sum, finished, details

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training budget; global_steps is incremented once per iteration of the inner
# training loop and training stops once it reaches this value.
TOTAL_STEPS = 2_500_000
# Maximum number of transitions kept in the replay buffer.
BUFFER_SIZE = 300_000
# Transitions drawn from the replay buffer for each gradient update.
batch_size = 32
# Discount factor applied to the target network's next-state value.
GAMMA = 0.99
# Learning rate passed to Adam.
LR = 1e-4

# Checkpoints are always written to the same path, so each save replaces the
# previous one.
os.makedirs("models", exist_ok=True)
MODEL_PATH = "models/qnet_latest.pt"

# Number of global steps between copies of q_net's weights into target_q_net.
TARGET_UPDATE_FREQ = 10_000
# Epsilon-greedy schedule: epsilon stays at EPSILON_START until
# EPSILON_DECAY_START steps have elapsed, then falls linearly to EPSILON_MIN
# over the following EPSILON_DECAY_STEPS steps.
EPSILON_START = 0.8
EPSILON_MIN = 0.1
EPSILON_DECAY_START = 2_000
EPSILON_DECAY_STEPS = 1_750_000

# Number of preprocessed frames stacked into one state; Q_Network's first
# convolution is hard-coded to 4 input channels, so this must stay at 4.
FRAME_STACK = 4
# Upper bound on inner-loop iterations per training episode.
MAX_EPISODE_STEPS = 10_000

# A greedy rollout is recorded (and a checkpoint written) every
# SAVE_VIDEO_FREQ training episodes.
SAVE_VIDEO_FREQ = 300
VIDEO_DIR = "videos"
os.makedirs(VIDEO_DIR, exist_ok=True)

# Online network, sized to the environment's discrete action count.
q_net = Q_Network(env.action_space.n).to(DEVICE)
# Target network starts as an exact copy and is only ever updated by
# load_state_dict in the training loop, never by the optimizer.
target_q_net = copy.deepcopy(q_net).to(DEVICE)
target_q_net.eval()

optimizer = torch.optim.Adam(q_net.parameters(), lr=LR)
# Smooth L1 (Huber-style) loss between predicted and target Q-values.
loss_fn = nn.SmoothL1Loss()

buffer = ReplayBuffer(BUFFER_SIZE, DEVICE)

# Mutable training state, updated in place by the training loop.
epsilon = EPSILON_START
global_steps = 0
episode_num = 1

# Agent steps to collect before the first gradient update.
LEARNING_STARTS = 2_000
# Cap on agent steps in each recorded evaluation rollout.
EVAL_MAX_STEPS = 2_000


def _reset_and_launch(env):
    """Reset ``env`` and send FIRE_ACTION twice.

    Returns ``(obs, prev_obs, info)``: the frame after the second FIRE step, the
    frame after the first FIRE step, and the info dict from the second step.
    """
    env.reset()
    prev_obs, _, _, _, _ = env.step(FIRE_ACTION)
    obs, _, _, _, info = env.step(FIRE_ACTION)
    return obs, prev_obs, info


def _initial_frame_stack(obs, prev_obs):
    """Return a deque holding FRAME_STACK copies of the first preprocessed frame."""
    first_frame = preprocess(obs, prev_obs)
    return deque([first_frame] * FRAME_STACK, maxlen=FRAME_STACK)


def _step_and_relaunch(env, action, obs, prev_lives):
    """Apply ``action`` with frame skipping and re-fire after a lost life.

    If the info dict reports fewer lives than ``prev_lives`` and the episode is
    not over, one extra frame-skipped FIRE_ACTION step is taken and its reward
    is added to the returned reward.

    Returns ``(obs, prev_obs, reward, done, info, prev_lives)`` where
    ``prev_obs`` is the frame that preceded the last frame-skipped step and
    ``prev_lives`` is refreshed from ``info`` when available.
    """
    prev_obs = obs
    obs, reward, done, info = step_with_skip(env, action)

    lost_life = (
        not done
        and "lives" in info
        and prev_lives is not None
        and info["lives"] < prev_lives
    )
    if lost_life:
        prev_obs = obs
        obs, fire_reward, done, info = step_with_skip(env, FIRE_ACTION)
        reward += fire_reward

    if "lives" in info:
        prev_lives = info["lives"]
    return obs, prev_obs, reward, done, info, prev_lives


def _greedy_action(state):
    """Return the index of the largest Q-value that q_net predicts for ``state``."""
    with torch.no_grad():
        q_vals = q_net(torch.tensor(state, device=DEVICE).unsqueeze(0))
        return torch.argmax(q_vals).item()


def _record_greedy_episode(env, max_steps):
    """Play one purely greedy episode and return ``(frames, total_reward)``.

    ``frames`` holds the raw observation seen before each agent step.
    """
    obs, prev_obs, info = _reset_and_launch(env)
    prev_lives = info.get("lives", None)
    eval_stack = _initial_frame_stack(obs, prev_obs)
    eval_state = np.stack(eval_stack, axis=0)

    frames = []
    eval_reward = 0.0
    for _ in range(max_steps):
        frames.append(obs)
        action = _greedy_action(eval_state)
        obs, prev_obs, reward, done, info, prev_lives = _step_and_relaunch(
            env, action, obs, prev_lives
        )
        eval_reward += reward

        eval_stack.append(preprocess(obs, prev_obs))
        eval_state = np.stack(eval_stack, axis=0)
        if done:
            break
    return frames, eval_reward


def _save_checkpoint(completed_episode):
    """Write the current model, optimizer and schedule state to MODEL_PATH."""
    torch.save(
        {
            "episode": completed_episode,
            "global_steps": global_steps,
            "model_state_dict": q_net.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "epsilon": epsilon,
        },
        MODEL_PATH
    )


while global_steps < TOTAL_STEPS:

    obs, prev_obs, info = _reset_and_launch(env)
    prev_lives = info.get("lives", None)

    frame_stack = _initial_frame_stack(obs, prev_obs)
    state = np.stack(frame_stack, axis=0)

    episode_reward = 0.0

    for _ in range(MAX_EPISODE_STEPS):
        global_steps += 1

        # Exploration only samples actions 0, 2 and 3 (FIRE_ACTION is left out);
        # the greedy branch can still select any of the network's outputs.
        if random.random() < epsilon:
            action = random.choice([0, 2, 3])
        else:
            action = _greedy_action(state)

        obs, prev_obs, reward, done, info, prev_lives = _step_and_relaunch(
            env, action, obs, prev_lives
        )
        episode_reward += reward

        frame_stack.append(preprocess(obs, prev_obs))
        next_state = np.stack(frame_stack, axis=0)

        # `done` is True for termination or truncation (see step_with_skip), so
        # both stop bootstrapping in the target below.
        buffer.add(state, action, reward, next_state, done)
        state = next_state

        if buffer.len() >= batch_size and global_steps > LEARNING_STARTS:
            states, actions, rewards, next_states, dones = buffer.sample(batch_size)

            # One-step TD target computed with the frozen target network.
            with torch.no_grad():
                next_q = target_q_net(next_states).max(1)[0]
                targets = rewards + GAMMA * next_q * (1 - dones)

            q = q_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

            loss = loss_fn(q, targets)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(q_net.parameters(), 10.0)
            optimizer.step()

        if global_steps % TARGET_UPDATE_FREQ == 0:
            target_q_net.load_state_dict(q_net.state_dict())

        # Linear epsilon decay from EPSILON_START to EPSILON_MIN.
        if global_steps > EPSILON_DECAY_START:
            decay_ratio = min(1.0, (global_steps - EPSILON_DECAY_START) / EPSILON_DECAY_STEPS)
            epsilon = max(EPSILON_MIN, EPSILON_START - decay_ratio * (EPSILON_START - EPSILON_MIN))

        if done:
            break

    if episode_num % SAVE_VIDEO_FREQ == 0:
        # Checkpoint first so a failure while recording or encoding the video
        # cannot lose the weights.
        _save_checkpoint(episode_num)

        q_net.eval()
        frames, eval_reward = _record_greedy_episode(env, EVAL_MAX_STEPS)
        q_net.train()

        path = f"{VIDEO_DIR}/episode_{episode_num}_steps_{global_steps}.mp4"
        imageio.mimsave(path, frames, fps=30)
        print("VIDEO SAVED:", path, "reward:", eval_reward)

    print(
        f"Episode {episode_num} | Reward {episode_reward} | Buffer length {buffer.len()} | Epsilon {epsilon:.3f} | Steps {global_steps}"
    )

    episode_num += 1

# Persist the weights reached at the end of training; without this the file on
# disk would stop at the last multiple of SAVE_VIDEO_FREQ episodes.
_save_checkpoint(episode_num - 1)


Episode 1 | Reward 1.0 | Buffer length 167 | Epsilon 0.800 | Steps 167
Episode 2 | Reward 2.0 | Buffer length 364 | Epsilon 0.800 | Steps 364
Episode 3 | Reward 1.0 | Buffer length 531 | Epsilon 0.800 | Steps 531
Episode 4 | Reward 2.0 | Buffer length 746 | Epsilon 0.800 | Steps 746
Episode 5 | Reward 1.0 | Buffer length 915 | Epsilon 0.800 | Steps 915
Episode 6 | Reward 3.0 | Buffer length 1163 | Epsilon 0.800 | Steps 1163
Episode 7 | Reward 0.0 | Buffer length 1283 | Epsilon 0.800 | Steps 1283
Episode 8 | Reward 2.0 | Buffer length 1479 | Epsilon 0.800 | Steps 1479
Episode 9 | Reward 1.0 | Buffer length 1646 | Epsilon 0.800 | Steps 1646
Episode 10 | Reward 0.0 | Buffer length 1765 | Epsilon 0.800 | Steps 1765
Episode 11 | Reward 2.0 | Buffer length 1958 | Epsilon 0.800 | Steps 1958
Episode 12 | Reward 2.0 | Buffer length 2155 | Epsilon 0.800 | Steps 2155
Episode 13 | Reward 0.0 | Buffer length 2274 | Epsilon 0.800 | Steps 2274
Episode 14 | Reward 0.0 | Buffer length 2394 | Epsilon 0.



VIDEO SAVED: videos/episode_300_steps_45956.mp4 reward: 0.0
Episode 300 | Reward 0.0 | Buffer length 45956 | Epsilon 0.782 | Steps 45956
Episode 301 | Reward 0.0 | Buffer length 46076 | Epsilon 0.782 | Steps 46076
Episode 302 | Reward 0.0 | Buffer length 46196 | Epsilon 0.782 | Steps 46196
Episode 303 | Reward 0.0 | Buffer length 46316 | Epsilon 0.782 | Steps 46316
Episode 304 | Reward 0.0 | Buffer length 46436 | Epsilon 0.782 | Steps 46436
Episode 305 | Reward 1.0 | Buffer length 46585 | Epsilon 0.782 | Steps 46585
Episode 306 | Reward 0.0 | Buffer length 46705 | Epsilon 0.782 | Steps 46705
Episode 307 | Reward 1.0 | Buffer length 46853 | Epsilon 0.782 | Steps 46853
Episode 308 | Reward 1.0 | Buffer length 47001 | Epsilon 0.782 | Steps 47001
Episode 309 | Reward 0.0 | Buffer length 47121 | Epsilon 0.782 | Steps 47121
Episode 310 | Reward 0.0 | Buffer length 47241 | Epsilon 0.782 | Steps 47241
Episode 311 | Reward 0.0 | Buffer length 47361 | Epsilon 0.782 | Steps 47361
Episode 312 | Re



VIDEO SAVED: videos/episode_600_steps_91742.mp4 reward: 0.0
Episode 600 | Reward 1.0 | Buffer length 91742 | Epsilon 0.764 | Steps 91742
Episode 601 | Reward 0.0 | Buffer length 91861 | Epsilon 0.764 | Steps 91861
Episode 602 | Reward 1.0 | Buffer length 92009 | Epsilon 0.764 | Steps 92009
Episode 603 | Reward 2.0 | Buffer length 92206 | Epsilon 0.764 | Steps 92206
Episode 604 | Reward 0.0 | Buffer length 92326 | Epsilon 0.764 | Steps 92326
Episode 605 | Reward 0.0 | Buffer length 92446 | Epsilon 0.764 | Steps 92446
Episode 606 | Reward 0.0 | Buffer length 92565 | Epsilon 0.764 | Steps 92565
Episode 607 | Reward 0.0 | Buffer length 92683 | Epsilon 0.764 | Steps 92683
Episode 608 | Reward 2.0 | Buffer length 92878 | Epsilon 0.764 | Steps 92878
Episode 609 | Reward 0.0 | Buffer length 92996 | Epsilon 0.764 | Steps 92996
Episode 610 | Reward 1.0 | Buffer length 93144 | Epsilon 0.764 | Steps 93144
Episode 611 | Reward 3.0 | Buffer length 93407 | Epsilon 0.763 | Steps 93407
Episode 612 | Re



VIDEO SAVED: videos/episode_900_steps_138143.mp4 reward: 1.0
Episode 900 | Reward 1.0 | Buffer length 138143 | Epsilon 0.746 | Steps 138143
Episode 901 | Reward 2.0 | Buffer length 138338 | Epsilon 0.745 | Steps 138338
Episode 902 | Reward 2.0 | Buffer length 138535 | Epsilon 0.745 | Steps 138535
Episode 903 | Reward 0.0 | Buffer length 138654 | Epsilon 0.745 | Steps 138654
Episode 904 | Reward 1.0 | Buffer length 138823 | Epsilon 0.745 | Steps 138823
Episode 905 | Reward 1.0 | Buffer length 138971 | Epsilon 0.745 | Steps 138971
Episode 906 | Reward 1.0 | Buffer length 139118 | Epsilon 0.745 | Steps 139118
Episode 907 | Reward 1.0 | Buffer length 139266 | Epsilon 0.745 | Steps 139266
Episode 908 | Reward 1.0 | Buffer length 139433 | Epsilon 0.745 | Steps 139433
Episode 909 | Reward 0.0 | Buffer length 139552 | Epsilon 0.745 | Steps 139552
Episode 910 | Reward 0.0 | Buffer length 139672 | Epsilon 0.745 | Steps 139672
Episode 911 | Reward 0.0 | Buffer length 139791 | Epsilon 0.745 | Step



VIDEO SAVED: videos/episode_1200_steps_188524.mp4 reward: 3.0
Episode 1200 | Reward 1.0 | Buffer length 188524 | Epsilon 0.725 | Steps 188524
Episode 1201 | Reward 4.0 | Buffer length 188817 | Epsilon 0.725 | Steps 188817
Episode 1202 | Reward 3.0 | Buffer length 189044 | Epsilon 0.725 | Steps 189044
Episode 1203 | Reward 4.0 | Buffer length 189295 | Epsilon 0.725 | Steps 189295
Episode 1204 | Reward 3.0 | Buffer length 189540 | Epsilon 0.725 | Steps 189540
Episode 1205 | Reward 1.0 | Buffer length 189706 | Epsilon 0.725 | Steps 189706
Episode 1206 | Reward 2.0 | Buffer length 189923 | Epsilon 0.725 | Steps 189923
Episode 1207 | Reward 0.0 | Buffer length 190042 | Epsilon 0.725 | Steps 190042
Episode 1208 | Reward 2.0 | Buffer length 190240 | Epsilon 0.725 | Steps 190240
Episode 1209 | Reward 1.0 | Buffer length 190388 | Epsilon 0.725 | Steps 190388
Episode 1210 | Reward 1.0 | Buffer length 190556 | Epsilon 0.725 | Steps 190556
Episode 1211 | Reward 2.0 | Buffer length 190733 | Epsilon



VIDEO SAVED: videos/episode_1500_steps_256536.mp4 reward: 32.0
Episode 1500 | Reward 4.0 | Buffer length 256536 | Epsilon 0.698 | Steps 256536
Episode 1501 | Reward 2.0 | Buffer length 256717 | Epsilon 0.698 | Steps 256717
Episode 1502 | Reward 2.0 | Buffer length 256912 | Epsilon 0.698 | Steps 256912
Episode 1503 | Reward 6.0 | Buffer length 257242 | Epsilon 0.698 | Steps 257242
Episode 1504 | Reward 1.0 | Buffer length 257390 | Epsilon 0.698 | Steps 257390
Episode 1505 | Reward 4.0 | Buffer length 257647 | Epsilon 0.698 | Steps 257647
Episode 1506 | Reward 4.0 | Buffer length 257921 | Epsilon 0.698 | Steps 257921
Episode 1507 | Reward 4.0 | Buffer length 258174 | Epsilon 0.698 | Steps 258174
Episode 1508 | Reward 0.0 | Buffer length 258294 | Epsilon 0.697 | Steps 258294
Episode 1509 | Reward 3.0 | Buffer length 258505 | Epsilon 0.697 | Steps 258505
Episode 1510 | Reward 1.0 | Buffer length 258653 | Epsilon 0.697 | Steps 258653
Episode 1511 | Reward 3.0 | Buffer length 258879 | Epsilo



VIDEO SAVED: videos/episode_1800_steps_334526.mp4 reward: 11.0
Episode 1800 | Reward 1.0 | Buffer length 300000 | Epsilon 0.667 | Steps 334526
Episode 1801 | Reward 2.0 | Buffer length 300000 | Epsilon 0.667 | Steps 334723
Episode 1802 | Reward 4.0 | Buffer length 300000 | Epsilon 0.667 | Steps 334999
Episode 1803 | Reward 4.0 | Buffer length 300000 | Epsilon 0.667 | Steps 335272
Episode 1804 | Reward 1.0 | Buffer length 300000 | Epsilon 0.667 | Steps 335420
Episode 1805 | Reward 2.0 | Buffer length 300000 | Epsilon 0.667 | Steps 335618
Episode 1806 | Reward 2.0 | Buffer length 300000 | Epsilon 0.666 | Steps 335838
Episode 1807 | Reward 6.0 | Buffer length 300000 | Epsilon 0.666 | Steps 336152
Episode 1808 | Reward 4.0 | Buffer length 300000 | Epsilon 0.666 | Steps 336427
Episode 1809 | Reward 6.0 | Buffer length 300000 | Epsilon 0.666 | Steps 336818
Episode 1810 | Reward 2.0 | Buffer length 300000 | Epsilon 0.666 | Steps 337016
Episode 1811 | Reward 4.0 | Buffer length 300000 | Epsilo



VIDEO SAVED: videos/episode_2100_steps_415777.mp4 reward: 34.0
Episode 2100 | Reward 3.0 | Buffer length 300000 | Epsilon 0.634 | Steps 415777
Episode 2101 | Reward 4.0 | Buffer length 300000 | Epsilon 0.634 | Steps 416028
Episode 2102 | Reward 5.0 | Buffer length 300000 | Epsilon 0.634 | Steps 416319
Episode 2103 | Reward 3.0 | Buffer length 300000 | Epsilon 0.634 | Steps 416543
Episode 2104 | Reward 0.0 | Buffer length 300000 | Epsilon 0.634 | Steps 416663
Episode 2105 | Reward 6.0 | Buffer length 300000 | Epsilon 0.634 | Steps 417049
Episode 2106 | Reward 8.0 | Buffer length 300000 | Epsilon 0.634 | Steps 417433
Episode 2107 | Reward 6.0 | Buffer length 300000 | Epsilon 0.634 | Steps 417817
Episode 2108 | Reward 0.0 | Buffer length 300000 | Epsilon 0.634 | Steps 417937
Episode 2109 | Reward 9.0 | Buffer length 300000 | Epsilon 0.633 | Steps 418410
Episode 2110 | Reward 0.0 | Buffer length 300000 | Epsilon 0.633 | Steps 418529
Episode 2111 | Reward 2.0 | Buffer length 300000 | Epsilo



VIDEO SAVED: videos/episode_2400_steps_482076.mp4 reward: 4.0
Episode 2400 | Reward 2.0 | Buffer length 300000 | Epsilon 0.608 | Steps 482076
Episode 2401 | Reward 0.0 | Buffer length 300000 | Epsilon 0.608 | Steps 482196
Episode 2402 | Reward 4.0 | Buffer length 300000 | Epsilon 0.608 | Steps 482453
Episode 2403 | Reward 4.0 | Buffer length 300000 | Epsilon 0.608 | Steps 482737
Episode 2404 | Reward 0.0 | Buffer length 300000 | Epsilon 0.608 | Steps 482857
Episode 2405 | Reward 5.0 | Buffer length 300000 | Epsilon 0.608 | Steps 483153
Episode 2406 | Reward 6.0 | Buffer length 300000 | Epsilon 0.607 | Steps 483485
Episode 2407 | Reward 0.0 | Buffer length 300000 | Epsilon 0.607 | Steps 483603
Episode 2408 | Reward 0.0 | Buffer length 300000 | Epsilon 0.607 | Steps 483722
Episode 2409 | Reward 1.0 | Buffer length 300000 | Epsilon 0.607 | Steps 483871
Episode 2410 | Reward 0.0 | Buffer length 300000 | Epsilon 0.607 | Steps 483991
Episode 2411 | Reward 13.0 | Buffer length 300000 | Epsilo



VIDEO SAVED: videos/episode_2700_steps_552398.mp4 reward: 15.0
Episode 2700 | Reward 4.0 | Buffer length 300000 | Epsilon 0.580 | Steps 552398
Episode 2701 | Reward 5.0 | Buffer length 300000 | Epsilon 0.580 | Steps 552711
Episode 2702 | Reward 7.0 | Buffer length 300000 | Epsilon 0.580 | Steps 553072
Episode 2703 | Reward 8.0 | Buffer length 300000 | Epsilon 0.579 | Steps 553498
Episode 2704 | Reward 4.0 | Buffer length 300000 | Epsilon 0.579 | Steps 553739
Episode 2705 | Reward 4.0 | Buffer length 300000 | Epsilon 0.579 | Steps 553994
Episode 2706 | Reward 3.0 | Buffer length 300000 | Epsilon 0.579 | Steps 554215
Episode 2707 | Reward 8.0 | Buffer length 300000 | Epsilon 0.579 | Steps 554641
Episode 2708 | Reward 3.0 | Buffer length 300000 | Epsilon 0.579 | Steps 554854
Episode 2709 | Reward 0.0 | Buffer length 300000 | Epsilon 0.579 | Steps 554974
Episode 2710 | Reward 8.0 | Buffer length 300000 | Epsilon 0.579 | Steps 555374
Episode 2711 | Reward 4.0 | Buffer length 300000 | Epsilo



VIDEO SAVED: videos/episode_3000_steps_651152.mp4 reward: 18.0
Episode 3000 | Reward 4.0 | Buffer length 300000 | Epsilon 0.540 | Steps 651152
Episode 3001 | Reward 11.0 | Buffer length 300000 | Epsilon 0.540 | Steps 651655
Episode 3002 | Reward 8.0 | Buffer length 300000 | Epsilon 0.540 | Steps 652068
Episode 3003 | Reward 13.0 | Buffer length 300000 | Epsilon 0.540 | Steps 652431
Episode 3004 | Reward 6.0 | Buffer length 300000 | Epsilon 0.540 | Steps 652763
Episode 3005 | Reward 9.0 | Buffer length 300000 | Epsilon 0.539 | Steps 653271
Episode 3006 | Reward 7.0 | Buffer length 300000 | Epsilon 0.539 | Steps 653675
Episode 3007 | Reward 10.0 | Buffer length 300000 | Epsilon 0.539 | Steps 654180
Episode 3008 | Reward 8.0 | Buffer length 300000 | Epsilon 0.539 | Steps 654598
Episode 3009 | Reward 5.0 | Buffer length 300000 | Epsilon 0.539 | Steps 654903
Episode 3010 | Reward 4.0 | Buffer length 300000 | Epsilon 0.539 | Steps 655160
Episode 3011 | Reward 5.0 | Buffer length 300000 | Eps



VIDEO SAVED: videos/episode_3300_steps_762618.mp4 reward: 13.0
Episode 3300 | Reward 6.0 | Buffer length 300000 | Epsilon 0.496 | Steps 762618
Episode 3301 | Reward 6.0 | Buffer length 300000 | Epsilon 0.496 | Steps 762944
Episode 3302 | Reward 11.0 | Buffer length 300000 | Epsilon 0.495 | Steps 763331
Episode 3303 | Reward 10.0 | Buffer length 300000 | Epsilon 0.495 | Steps 763844
Episode 3304 | Reward 7.0 | Buffer length 300000 | Epsilon 0.495 | Steps 764216
Episode 3305 | Reward 4.0 | Buffer length 300000 | Epsilon 0.495 | Steps 764469
Episode 3306 | Reward 12.0 | Buffer length 300000 | Epsilon 0.495 | Steps 765027
Episode 3307 | Reward 6.0 | Buffer length 300000 | Epsilon 0.495 | Steps 765376
Episode 3308 | Reward 5.0 | Buffer length 300000 | Epsilon 0.495 | Steps 765682
Episode 3309 | Reward 2.0 | Buffer length 300000 | Epsilon 0.494 | Steps 765878
Episode 3310 | Reward 2.0 | Buffer length 300000 | Epsilon 0.494 | Steps 766074
Episode 3311 | Reward 5.0 | Buffer length 300000 | Eps



VIDEO SAVED: videos/episode_3600_steps_867943.mp4 reward: 22.0
Episode 3600 | Reward 10.0 | Buffer length 300000 | Epsilon 0.454 | Steps 867943
Episode 3601 | Reward 9.0 | Buffer length 300000 | Epsilon 0.453 | Steps 868264
Episode 3602 | Reward 12.0 | Buffer length 300000 | Epsilon 0.453 | Steps 868716
Episode 3603 | Reward 8.0 | Buffer length 300000 | Epsilon 0.453 | Steps 869094
Episode 3604 | Reward 3.0 | Buffer length 300000 | Epsilon 0.453 | Steps 869337
Episode 3605 | Reward 10.0 | Buffer length 300000 | Epsilon 0.453 | Steps 869844
Episode 3606 | Reward 8.0 | Buffer length 300000 | Epsilon 0.453 | Steps 870297
Episode 3607 | Reward 5.0 | Buffer length 300000 | Epsilon 0.453 | Steps 870601
Episode 3608 | Reward 9.0 | Buffer length 300000 | Epsilon 0.452 | Steps 871069
Episode 3609 | Reward 13.0 | Buffer length 300000 | Epsilon 0.452 | Steps 871524
Episode 3610 | Reward 4.0 | Buffer length 300000 | Epsilon 0.452 | Steps 871779
Episode 3611 | Reward 11.0 | Buffer length 300000 | E



VIDEO SAVED: videos/episode_3900_steps_984233.mp4 reward: 18.0
Episode 3900 | Reward 11.0 | Buffer length 300000 | Epsilon 0.407 | Steps 984233
Episode 3901 | Reward 22.0 | Buffer length 300000 | Epsilon 0.407 | Steps 984890
Episode 3902 | Reward 13.0 | Buffer length 300000 | Epsilon 0.407 | Steps 985333
Episode 3903 | Reward 10.0 | Buffer length 300000 | Epsilon 0.406 | Steps 985796
Episode 3904 | Reward 14.0 | Buffer length 300000 | Epsilon 0.406 | Steps 986294
Episode 3905 | Reward 9.0 | Buffer length 300000 | Epsilon 0.406 | Steps 986771
Episode 3906 | Reward 13.0 | Buffer length 300000 | Epsilon 0.406 | Steps 987254
Episode 3907 | Reward 7.0 | Buffer length 300000 | Epsilon 0.406 | Steps 987623
Episode 3908 | Reward 10.0 | Buffer length 300000 | Epsilon 0.406 | Steps 988062
Episode 3909 | Reward 11.0 | Buffer length 300000 | Epsilon 0.405 | Steps 988585
Episode 3910 | Reward 4.0 | Buffer length 300000 | Epsilon 0.405 | Steps 988843
Episode 3911 | Reward 9.0 | Buffer length 300000 



VIDEO SAVED: videos/episode_4200_steps_1114291.mp4 reward: 17.0
Episode 4200 | Reward 19.0 | Buffer length 300000 | Epsilon 0.355 | Steps 1114291
Episode 4201 | Reward 8.0 | Buffer length 300000 | Epsilon 0.355 | Steps 1114703
Episode 4202 | Reward 9.0 | Buffer length 300000 | Epsilon 0.355 | Steps 1115116
Episode 4203 | Reward 12.0 | Buffer length 300000 | Epsilon 0.355 | Steps 1115571
Episode 4204 | Reward 12.0 | Buffer length 300000 | Epsilon 0.354 | Steps 1116128
Episode 4205 | Reward 19.0 | Buffer length 300000 | Epsilon 0.354 | Steps 1116571
Episode 4206 | Reward 7.0 | Buffer length 300000 | Epsilon 0.354 | Steps 1116939
Episode 4207 | Reward 17.0 | Buffer length 300000 | Epsilon 0.354 | Steps 1117580
Episode 4208 | Reward 10.0 | Buffer length 300000 | Epsilon 0.354 | Steps 1118064
Episode 4209 | Reward 5.0 | Buffer length 300000 | Epsilon 0.353 | Steps 1118345
Episode 4210 | Reward 9.0 | Buffer length 300000 | Epsilon 0.353 | Steps 1118779
Episode 4211 | Reward 13.0 | Buffer len



VIDEO SAVED: videos/episode_4500_steps_1242784.mp4 reward: 29.0
Episode 4500 | Reward 22.0 | Buffer length 300000 | Epsilon 0.304 | Steps 1242784
Episode 4501 | Reward 10.0 | Buffer length 300000 | Epsilon 0.303 | Steps 1243302
Episode 4502 | Reward 9.0 | Buffer length 300000 | Epsilon 0.303 | Steps 1243723
Episode 4503 | Reward 24.0 | Buffer length 300000 | Epsilon 0.303 | Steps 1244510
Episode 4504 | Reward 7.0 | Buffer length 300000 | Epsilon 0.303 | Steps 1244874
Episode 4505 | Reward 15.0 | Buffer length 300000 | Epsilon 0.303 | Steps 1245425
Episode 4506 | Reward 11.0 | Buffer length 300000 | Epsilon 0.302 | Steps 1245918
Episode 4507 | Reward 14.0 | Buffer length 300000 | Epsilon 0.302 | Steps 1246477
Episode 4508 | Reward 18.0 | Buffer length 300000 | Epsilon 0.302 | Steps 1246977
Episode 4509 | Reward 44.0 | Buffer length 300000 | Epsilon 0.302 | Steps 1247600
Episode 4510 | Reward 19.0 | Buffer length 300000 | Epsilon 0.302 | Steps 1248119
Episode 4511 | Reward 18.0 | Buffer 



VIDEO SAVED: videos/episode_4800_steps_1380572.mp4 reward: 35.0
Episode 4800 | Reward 12.0 | Buffer length 300000 | Epsilon 0.249 | Steps 1380572
Episode 4801 | Reward 10.0 | Buffer length 300000 | Epsilon 0.248 | Steps 1381022
Episode 4802 | Reward 31.0 | Buffer length 300000 | Epsilon 0.248 | Steps 1381531
Episode 4803 | Reward 10.0 | Buffer length 300000 | Epsilon 0.248 | Steps 1382048
Episode 4804 | Reward 20.0 | Buffer length 300000 | Epsilon 0.248 | Steps 1382708
Episode 4805 | Reward 20.0 | Buffer length 300000 | Epsilon 0.248 | Steps 1383154
Episode 4806 | Reward 21.0 | Buffer length 300000 | Epsilon 0.247 | Steps 1383784
Episode 4807 | Reward 21.0 | Buffer length 300000 | Epsilon 0.247 | Steps 1384402
Episode 4808 | Reward 17.0 | Buffer length 300000 | Epsilon 0.247 | Steps 1384879
Episode 4809 | Reward 24.0 | Buffer length 300000 | Epsilon 0.247 | Steps 1385358
Episode 4810 | Reward 15.0 | Buffer length 300000 | Epsilon 0.246 | Steps 1385795
Episode 4811 | Reward 17.0 | Buffe



VIDEO SAVED: videos/episode_5100_steps_1504754.mp4 reward: 26.0
Episode 5100 | Reward 21.0 | Buffer length 300000 | Epsilon 0.199 | Steps 1504754
Episode 5101 | Reward 10.0 | Buffer length 300000 | Epsilon 0.199 | Steps 1505109
Episode 5102 | Reward 20.0 | Buffer length 300000 | Epsilon 0.199 | Steps 1505608
Episode 5103 | Reward 7.0 | Buffer length 300000 | Epsilon 0.198 | Steps 1505977
Episode 5104 | Reward 9.0 | Buffer length 300000 | Epsilon 0.198 | Steps 1506363
Episode 5105 | Reward 7.0 | Buffer length 300000 | Epsilon 0.198 | Steps 1506707
Episode 5106 | Reward 14.0 | Buffer length 300000 | Epsilon 0.198 | Steps 1507086
Episode 5107 | Reward 13.0 | Buffer length 300000 | Epsilon 0.198 | Steps 1507492
Episode 5108 | Reward 9.0 | Buffer length 300000 | Epsilon 0.198 | Steps 1507816
Episode 5109 | Reward 12.0 | Buffer length 300000 | Epsilon 0.198 | Steps 1508247
Episode 5110 | Reward 20.0 | Buffer length 300000 | Epsilon 0.197 | Steps 1508722
Episode 5111 | Reward 13.0 | Buffer le



VIDEO SAVED: videos/episode_5400_steps_1631755.mp4 reward: 49.0
Episode 5400 | Reward 3.0 | Buffer length 300000 | Epsilon 0.148 | Steps 1631755
Episode 5401 | Reward 9.0 | Buffer length 300000 | Epsilon 0.148 | Steps 1632079
Episode 5402 | Reward 4.0 | Buffer length 300000 | Epsilon 0.148 | Steps 1632339
Episode 5403 | Reward 28.0 | Buffer length 300000 | Epsilon 0.148 | Steps 1632760
Episode 5404 | Reward 5.0 | Buffer length 300000 | Epsilon 0.148 | Steps 1633023
Episode 5405 | Reward 21.0 | Buffer length 300000 | Epsilon 0.147 | Steps 1633516
Episode 5406 | Reward 15.0 | Buffer length 300000 | Epsilon 0.147 | Steps 1633908
Episode 5407 | Reward 9.0 | Buffer length 300000 | Epsilon 0.147 | Steps 1634215
Episode 5408 | Reward 21.0 | Buffer length 300000 | Epsilon 0.147 | Steps 1634819
Episode 5409 | Reward 14.0 | Buffer length 300000 | Epsilon 0.147 | Steps 1635182
Episode 5410 | Reward 20.0 | Buffer length 300000 | Epsilon 0.147 | Steps 1635640
Episode 5411 | Reward 8.0 | Buffer leng



VIDEO SAVED: videos/episode_5700_steps_1772683.mp4 reward: 41.0
Episode 5700 | Reward 39.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1772683
Episode 5701 | Reward 15.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1773057
Episode 5702 | Reward 24.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1773641
Episode 5703 | Reward 10.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1773999
Episode 5704 | Reward 5.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1774270
Episode 5705 | Reward 29.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1774713
Episode 5706 | Reward 38.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1775236
Episode 5707 | Reward 16.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1775668
Episode 5708 | Reward 38.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1776132
Episode 5709 | Reward 42.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1776748
Episode 5710 | Reward 11.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1777122
Episode 5711 | Reward 28.0 | Buffer



VIDEO SAVED: videos/episode_6000_steps_1904222.mp4 reward: 5.0
Episode 6000 | Reward 4.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1904222
Episode 6001 | Reward 14.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1904563
Episode 6002 | Reward 26.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1905046
Episode 6003 | Reward 24.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1905427
Episode 6004 | Reward 8.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1905802
Episode 6005 | Reward 31.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1906186
Episode 6006 | Reward 14.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1906528
Episode 6007 | Reward 4.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1906767
Episode 6008 | Reward 9.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1907089
Episode 6009 | Reward 23.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1907533
Episode 6010 | Reward 8.0 | Buffer length 300000 | Epsilon 0.100 | Steps 1907927
Episode 6011 | Reward 45.0 | Buffer leng



VIDEO SAVED: videos/episode_6300_steps_2027602.mp4 reward: 37.0
Episode 6300 | Reward 231.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2027602
Episode 6301 | Reward 17.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2027963
Episode 6302 | Reward 35.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2028468
Episode 6303 | Reward 24.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2028940
Episode 6304 | Reward 35.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2029361
Episode 6305 | Reward 29.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2029779
Episode 6306 | Reward 29.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2030197
Episode 6307 | Reward 4.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2030453
Episode 6308 | Reward 16.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2030879
Episode 6309 | Reward 36.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2031314
Episode 6310 | Reward 34.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2031779
Episode 6311 | Reward 38.0 | Buffe



VIDEO SAVED: videos/episode_6600_steps_2149024.mp4 reward: 22.0
Episode 6600 | Reward 45.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2149024
Episode 6601 | Reward 39.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2149537
Episode 6602 | Reward 9.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2149860
Episode 6603 | Reward 35.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2150389
Episode 6604 | Reward 15.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2150843
Episode 6605 | Reward 15.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2151249
Episode 6606 | Reward 10.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2151603
Episode 6607 | Reward 6.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2151921
Episode 6608 | Reward 5.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2152189
Episode 6609 | Reward 21.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2152560
Episode 6610 | Reward 31.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2152959
Episode 6611 | Reward 5.0 | Buffer le



VIDEO SAVED: videos/episode_6900_steps_2264931.mp4 reward: 239.0
Episode 6900 | Reward 16.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2264931
Episode 6901 | Reward 13.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2265339
Episode 6902 | Reward 18.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2265818
Episode 6903 | Reward 5.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2266108
Episode 6904 | Reward 22.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2266527
Episode 6905 | Reward 18.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2266939
Episode 6906 | Reward 9.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2267263
Episode 6907 | Reward 10.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2267616
Episode 6908 | Reward 27.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2268110
Episode 6909 | Reward 5.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2268396
Episode 6910 | Reward 28.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2268800
Episode 6911 | Reward 157.0 | Buffer



VIDEO SAVED: videos/episode_7200_steps_2383664.mp4 reward: 9.0
Episode 7200 | Reward 12.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2383664
Episode 7201 | Reward 29.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2384070
Episode 7202 | Reward 6.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2384404
Episode 7203 | Reward 4.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2384638
Episode 7204 | Reward 35.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2385085
Episode 7205 | Reward 9.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2385388
Episode 7206 | Reward 10.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2385759
Episode 7207 | Reward 26.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2386189
Episode 7208 | Reward 7.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2386434
Episode 7209 | Reward 41.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2386887
Episode 7210 | Reward 18.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2387282
Episode 7211 | Reward 12.0 | Buffer len



VIDEO SAVED: videos/episode_7500_steps_2483442.mp4 reward: 14.0
Episode 7500 | Reward 7.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2483442
Episode 7501 | Reward 12.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2483721
Episode 7502 | Reward 3.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2483928
Episode 7503 | Reward 9.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2484253
Episode 7504 | Reward 14.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2484610
Episode 7505 | Reward 12.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2484889
Episode 7506 | Reward 2.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2485066
Episode 7507 | Reward 228.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2485620
Episode 7508 | Reward 5.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2485889
Episode 7509 | Reward 8.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2486161
Episode 7510 | Reward 13.0 | Buffer length 300000 | Epsilon 0.100 | Steps 2486510
Episode 7511 | Reward 303.0 | Buffer le

In [None]:
import os, imageio, torch, numpy as np
from collections import deque

# Greedy evaluation rollouts: play EVAL_EPISODES episodes with the current
# q_net, record every raw observation, and save one video per episode.
EVAL_EPISODES = 2
EVAL_DIR = "videos"
os.makedirs(EVAL_DIR, exist_ok=True)

# Switch the network to evaluation mode for the rollouts below.
q_net.eval()

# NOTE(review): actions are picked with a pure argmax (no exploration). The
# recorded run of this cell printed the same reward and step count for both
# episodes, so extra episodes may add no information. Confirm whether random
# no-op starts or a small evaluation epsilon are wanted here.
for ep in range(1, EVAL_EPISODES + 1):

    frames = []          # raw observations written out as the episode video
    total_reward = 0.0
    steps_taken = 0      # explicit counter; stays valid even if the loop body never runs

    obs, info = env.reset()

    # Step FIRE_ACTION twice. The two resulting observations become the
    # (prev_obs, obs) pair handed to preprocess for the first state.
    # Presumably this also launches the ball; verify against the training loop.
    obs, _, _, _, info = env.step(FIRE_ACTION)
    prev_obs = obs
    obs, _, _, _, info = env.step(FIRE_ACTION)

    prev_lives = info.get("lives", None)

    # Seed the frame stack by repeating the first processed frame.
    obs_proc = preprocess(obs, prev_obs)
    frame_stack = deque([obs_proc] * FRAME_STACK, maxlen=FRAME_STACK)
    state = np.stack(frame_stack, axis=0)

    for _step in range(MAX_EPISODE_STEPS):
        frames.append(obs)

        # Greedy action: index of the largest Q-value for the current state.
        with torch.no_grad():
            q_vals = q_net(torch.tensor(state, device=DEVICE).unsqueeze(0))
            action = torch.argmax(q_vals).item()

        prev_obs = obs
        obs, reward, done, info = step_with_skip(env, action)
        total_reward += reward
        steps_taken += 1

        # A drop in the "lives" counter without the episode ending means a
        # life was lost; issue FIRE_ACTION again before continuing.
        lost_life = (
            (not done)
            and ("lives" in info)
            and (prev_lives is not None)
            and (info["lives"] < prev_lives)
        )
        if lost_life:
            prev_obs = obs
            obs, reward_fire, done, info = step_with_skip(env, FIRE_ACTION)
            total_reward += reward_fire

        if "lives" in info:
            prev_lives = info["lives"]

        obs_proc = preprocess(obs, prev_obs)
        frame_stack.append(obs_proc)
        state = np.stack(frame_stack, axis=0)

        if done:
            break

    # Frames are recorded at the top of each iteration, so the observation
    # produced by the last step is still missing here. Adding it keeps the
    # final frame in the video and guarantees `frames` is never empty.
    frames.append(obs)

    video_path = f"{EVAL_DIR}/eval_{ep}.mp4"
    imageio.mimsave(video_path, frames, fps=30)

    print(f"EVAL {ep}: reward = {total_reward}, steps = {steps_taken}")




EVAL 1: reward = 14.0, steps = 386




EVAL 2: reward = 14.0, steps = 386


In [None]:
# Package the recorded videos for download.
# Shell command (IPython `!`): recursively add the `videos` directory to videos.zip.
!zip -r videos.zip videos
# Colab-only helper module; this import is unavailable outside Google Colab.
from google.colab import files
# Pass the archive to Colab's download helper.
files.download("videos.zip")

  adding: videos/ (stored 0%)
  adding: videos/episode_2100_steps_415777.mp4 (deflated 23%)
  adding: videos/eval_1.mp4 (deflated 23%)
  adding: videos/episode_1200_steps_188524.mp4 (deflated 28%)
  adding: videos/episode_3000_steps_651152.mp4 (deflated 24%)
  adding: videos/episode_3900_steps_984233.mp4 (deflated 24%)
  adding: videos/episode_1800_steps_334526.mp4 (deflated 24%)
  adding: videos/episode_600_steps_91742.mp4 (deflated 45%)
  adding: videos/episode_7500_steps_2483442.mp4 (deflated 22%)
  adding: videos/episode_7200_steps_2383664.mp4 (deflated 24%)
  adding: videos/episode_5700_steps_1772683.mp4 (deflated 22%)
  adding: videos/episode_4200_steps_1114291.mp4 (deflated 24%)
  adding: videos/episode_6300_steps_2027602.mp4 (deflated 27%)
  adding: videos/episode_4500_steps_1242784.mp4 (deflated 23%)
  adding: videos/episode_6000_steps_1904222.mp4 (deflated 28%)
  adding: videos/episode_900_steps_138143.mp4 (deflated 43%)
  adding: videos/episode_1500_steps_256536.mp4 (deflate

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Package the saved model checkpoints for download.
# Shell command (IPython `!`): recursively add the `models` directory to models.zip.
!zip -r models.zip models
# Colab-only helper module; this import is unavailable outside Google Colab.
from google.colab import files
# Pass the archive to Colab's download helper.
files.download("models.zip")

  adding: models/ (stored 0%)
  adding: models/qnet_latest.pt (deflated 20%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>