In [1]:
!sudo update-alternatives --config python3
!python3 -m pip install --upgrade pip
!python3 -m pip install "gymnasium[atari]" ale-py autorom
!which python3
!python3 --version
!wget https://bootstrap.pypa.io/get-pip.py
!python3 get-pip.py
!python3 -m pip --version
!python3 -m pip install "gymnasium[atari]==0.29.1"
!python3 -m pip install "ale-py==0.8.1"
!python3 -m pip install autorom
!python3 -m AutoROM --accept-license
!python3 -m pip install "gymnasium[atari]" ale-py AutoROM AutoROM.accept-rom-license

!python3 -m AutoROM.accept_rom_license --accept-license
import gymnasium as gym
import ale_py

gym.register_envs(ale_py)
print("Atari envs registered!")

gym.pprint_registry()

There are 2 choices for the alternative python3 (providing /usr/bin/python3).

  Selection    Path                 Priority   Status
------------------------------------------------------------
* 0            /usr/bin/python3.12   2         auto mode
  1            /usr/bin/python3.10   1         manual mode
  2            /usr/bin/python3.12   2         manual mode

Press <enter> to keep the current choice[*], or type selection number: 1
update-alternatives: using /usr/bin/python3.10 to provide /usr/bin/python3 (python3) in manual mode
/usr/bin/python3: No module named pip
/usr/bin/python3: No module named pip
/usr/bin/python3
Python 3.10.12
--2025-12-03 00:41:18--  https://bootstrap.pypa.io/get-pip.py
Resolving bootstrap.pypa.io (bootstrap.pypa.io)... 151.101.0.175, 151.101.64.175, 151.101.128.175, ...
Connecting to bootstrap.pypa.io (bootstrap.pypa.io)|151.101.0.175|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2182415 (2.1M) [text/x-python]
Saving to: ‘g

In [2]:
!pip install "gymnasium[atari]==0.29.1"
!pip install "ale-py==0.8.1"
!pip install autorom



In [None]:
import os
import random
import numpy as np
from collections import deque

import gymnasium as gym
from gymnasium.wrappers import AtariPreprocessing, FrameStackObservation

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

#  Utils

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

#  PPO CNN Policy for Atari

class PPOAtariPolicy(nn.Module):
    def __init__(self, action_dim: int):
        super().__init__()

        # Input: (B, 4, 84, 84)
        self.conv = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )

        self.fc = nn.Sequential(
            nn.Linear(3136, 512), nn.ReLU(),
        )

        self.pi = nn.Linear(512, action_dim)
        self.v  = nn.Linear(512, 1)

    def forward(self, x):
        x = self.conv(x)
        x = x.reshape(x.size(0), -1)
        x = self.fc(x)
        logits = self.pi(x)
        value  = self.v(x).squeeze(-1)
        return logits, value

    def get_action_and_value(self, x, action=None):
        logits, value = self.forward(x)
        dist = Categorical(logits=logits)

        if action is None:
            action = dist.sample()

        log_prob = dist.log_prob(action)
        entropy  = dist.entropy()
        return action, log_prob, entropy, value

#  Environment helpers (SPACE INVADERS)

def make_single_spaceinvader_env(seed: int = 0):
    """
    Creates one SpaceInvaders environment with:
    - grayscale
    - frame-skip=4
    - resize to 84x84
    - stacked 4 frames (84,84,4)
    """

    def thunk():
        env = gym.make(
            "ALE/SpaceInvaders-v5",
            frameskip=1,
            repeat_action_probability=0.0,
            render_mode=None
        )

        env = AtariPreprocessing(
            env,
            frame_skip=4,
            grayscale_obs=True,
            screen_size=84,
            scale_obs=False,
        )

        # produce obs shape (84,84,4)
        env = FrameStackObservation(env, stack_size=4)

        env.reset(seed=seed)
        return env

    return thunk


def preprocess_obs(obs: np.ndarray):
    """
    For SpaceInvaders:
    env observation space is (4, 84, 84), i.e. channels-first already.
    Vectorized env gives (num_envs, 4, 84, 84).

    We just normalize to [0,1] and keep the shape as (N,4,84,84).
    """
    obs_t = torch.from_numpy(obs).float().to(DEVICE) / 255.0
    return obs_t

#  PPO TRAINING LOOP

def train_ppo_spaceinvaders(
    total_timesteps: int = 8_000_000,
    n_envs: int = 16,
    num_steps: int = 256,
    gamma: float = 0.99,
    gae_lambda: float = 0.95,
    clip_coef: float = 0.1,
    lr: float = 2.5e-4,
    update_epochs: int = 4,
    minibatch_size: int = 256,
    ent_coef: float = 0.01,
    vf_coef: float = 0.5,
    max_grad_norm: float = 0.5,
    seed: int = 42,
):
    set_seed(seed)

    # Vectorized envs
    env_fns = [make_single_spaceinvader_env(seed + i) for i in range(n_envs)]
    envs = gym.vector.SyncVectorEnv(env_fns)

    print("Observation space:", envs.single_observation_space)
    print("Action space:", envs.single_action_space)

    # Action space size
    action_dim = envs.single_action_space.n

    # PPO Model
    model = PPOAtariPolicy(action_dim).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, eps=1e-5)

    # Rollout buffers
    obs_buf      = torch.zeros(num_steps, n_envs, 4, 84, 84, device=DEVICE)
    actions_buf  = torch.zeros(num_steps, n_envs, dtype=torch.long, device=DEVICE)
    logprobs_buf = torch.zeros(num_steps, n_envs, device=DEVICE)
    rewards_buf  = torch.zeros(num_steps, n_envs, device=DEVICE)
    dones_buf    = torch.zeros(num_steps, n_envs, device=DEVICE)
    values_buf   = torch.zeros(num_steps, n_envs, device=DEVICE)

    # Initial obs
    obs, _ = envs.reset()
    obs_t = preprocess_obs(obs)

    episode_returns = np.zeros(n_envs, np.float32)
    episode_lengths = np.zeros(n_envs, np.int32)

    finished_returns = []

    num_updates = total_timesteps // (n_envs * num_steps)
    print(f"Total updates = {num_updates}")

    for update in range(1, num_updates + 1):

        #  COLLECT ROLLOUT
        for step in range(num_steps):
            with torch.no_grad():
                action, logp, entropy, value = model.get_action_and_value(obs_t)

            actions = action.cpu().numpy()

            next_obs, reward, terminated, truncated, infos = envs.step(actions)
            done = np.logical_or(terminated, truncated)

            reward_clip = np.clip(reward, -1.0, 1.0)

            # Store rollout data
            obs_buf[step]      = obs_t
            actions_buf[step]  = action
            logprobs_buf[step] = logp
            rewards_buf[step]  = torch.tensor(reward_clip, device=DEVICE).float()
            dones_buf[step]    = torch.tensor(done, device=DEVICE).float()
            values_buf[step]   = value

            # Episode tracking
            episode_returns += reward
            episode_lengths += 1

            for i in range(n_envs):
                if done[i]:
                    finished_returns.append(episode_returns[i])
                    episode_returns[i] = 0.0
                    episode_lengths[i] = 0

            obs = next_obs
            obs_t = preprocess_obs(obs)

        #  COMPUTE ADV + RETURNS (GAE)
        with torch.no_grad():
            _, _, _, next_values = model.get_action_and_value(obs_t)

        advantages = torch.zeros_like(rewards_buf)
        last_gae = torch.zeros(n_envs, device=DEVICE)

        for step in reversed(range(num_steps)):
            if step == num_steps - 1:
                next_non_terminal = 1.0 - dones_buf[step]
                next_vals = next_values
            else:
                next_non_terminal = 1.0 - dones_buf[step + 1]
                next_vals = values_buf[step + 1]

            delta = rewards_buf[step] + gamma * next_vals * next_non_terminal - values_buf[step]
            last_gae = delta + gamma * gae_lambda * next_non_terminal * last_gae
            advantages[step] = last_gae

        returns = advantages + values_buf

        # Flatten batch
        batch_obs = obs_buf.reshape(-1, 4, 84, 84)
        batch_actions = actions_buf.reshape(-1)
        batch_logprobs = logprobs_buf.reshape(-1)
        batch_advantages = advantages.reshape(-1)
        batch_returns = returns.reshape(-1)
        batch_values = values_buf.reshape(-1)

        # Normalize adv
        batch_advantages = (batch_advantages - batch_advantages.mean()) / (batch_advantages.std() + 1e-8)
        #  PPO UPDATE
        batch_size = n_envs * num_steps
        indices = np.arange(batch_size)

        for epoch in range(update_epochs):
            np.random.shuffle(indices)

            for start in range(0, batch_size, minibatch_size):
                end = start + minibatch_size
                mb_idx = indices[start:end]

                mb_obs = batch_obs[mb_idx]
                mb_actions = batch_actions[mb_idx]
                mb_logprobs_old = batch_logprobs[mb_idx]
                mb_adv = batch_advantages[mb_idx]
                mb_returns = batch_returns[mb_idx]

                _, new_logprob, entropy, new_values = model.get_action_and_value(
                    mb_obs, mb_actions
                )

                ratio = (new_logprob - mb_logprobs_old).exp()

                pg1 = -mb_adv * ratio
                pg2 = -mb_adv * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
                policy_loss = torch.max(pg1, pg2).mean()

                value_loss = 0.5 * (mb_returns - new_values).pow(2).mean()

                entropy_loss = entropy.mean()

                loss = (
                    policy_loss +
                    vf_coef * value_loss -
                    ent_coef * entropy_loss
                )

                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()

        if update % 20 == 0:
            avg = np.mean(finished_returns[-50:]) if len(finished_returns) > 0 else 0.0
            print(f"[Update {update}/{num_updates}] AvgReturn (last 50): {avg:.2f}")

    envs.close()

    np.save("spaceinvaders_ppo_returns.npy", finished_returns)
    torch.save(model.state_dict(), "spaceinvaders_ppo.pth")

    print("\nTraining completed!")
    print("Saved: spaceinvaders_ppo.pth + spaceinvaders_ppo_returns.npy")
    return model, finished_returns

#  GREEDY EVALUATION

def evaluate_ppo_spaceinvaders(model_path="spaceinvaders_ppo.pth", episodes=5, seed=123):
    env = make_single_spaceinvader_env(seed)()
    action_dim = env.action_space.n

    model = PPOAtariPolicy(action_dim).to(DEVICE)
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.eval()

    returns = []

    for ep in range(episodes):
        obs, _ = env.reset(seed=seed + ep)
        done = False
        total_reward = 0.0

        while not done:
            obs_t = preprocess_obs(obs[None, ...])
            with torch.no_grad():
                logits, value = model(obs_t)
                action = torch.argmax(logits, dim=-1).item()

            obs, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            total_reward += reward

        returns.append(total_reward)
        print(f"[EVAL] Episode {ep}: Reward = {total_reward}")

    env.close()
    print("Mean reward:", np.mean(returns))
    return returns

#  MAIN

if __name__ == "__main__":
    model, returns = train_ppo_spaceinvaders(
        total_timesteps=5_000_000,
        n_envs=16,
        num_steps=256,
    )

    # After training:
    # evaluate_ppo_spaceinvaders("spaceinvaders_ppo.pth", episodes=5)

Using device: cuda
Observation space: Box(0, 255, (4, 84, 84), uint8)
Action space: Discrete(6)
Total updates = 1220
[Update 20/1220] AvgReturn (last 50): 188.30
[Update 40/1220] AvgReturn (last 50): 195.60
[Update 60/1220] AvgReturn (last 50): 252.30
[Update 80/1220] AvgReturn (last 50): 271.40
[Update 100/1220] AvgReturn (last 50): 296.80
[Update 120/1220] AvgReturn (last 50): 336.90
[Update 140/1220] AvgReturn (last 50): 356.80
[Update 160/1220] AvgReturn (last 50): 400.60
[Update 180/1220] AvgReturn (last 50): 365.60
[Update 200/1220] AvgReturn (last 50): 364.40
[Update 220/1220] AvgReturn (last 50): 376.10
[Update 240/1220] AvgReturn (last 50): 370.70
[Update 260/1220] AvgReturn (last 50): 352.00
[Update 280/1220] AvgReturn (last 50): 410.10
[Update 300/1220] AvgReturn (last 50): 447.10
[Update 320/1220] AvgReturn (last 50): 451.70
[Update 340/1220] AvgReturn (last 50): 399.30
[Update 360/1220] AvgReturn (last 50): 425.90
[Update 380/1220] AvgReturn (last 50): 440.00
[Update 400/1

In [12]:
if __name__ == "__main__":
    #model, returns = train_ppo_spaceinvaders(
     #   total_timesteps=5_000_000,
      #  n_envs=16,
       # num_steps=256,
    #)

    # After training:
     evaluate_ppo_spaceinvaders("spaceinvaders_ppo.pth", episodes=10)

[EVAL] Episode 0: Reward = 575.0
[EVAL] Episode 1: Reward = 360.0
[EVAL] Episode 2: Reward = 465.0
[EVAL] Episode 3: Reward = 360.0
[EVAL] Episode 4: Reward = 360.0
[EVAL] Episode 5: Reward = 575.0
[EVAL] Episode 6: Reward = 465.0
[EVAL] Episode 7: Reward = 465.0
[EVAL] Episode 8: Reward = 285.0
[EVAL] Episode 9: Reward = 285.0
Mean reward: 419.5


In [None]:
import gymnasium as gym
from gymnasium.wrappers import AtariPreprocessing, FrameStackObservation, RecordVideo
import torch
import numpy as np

def record_greedy_spaceinvaders_video(
    model_path="spaceinvaders_ppo.pth",
    video_folder="videos_spaceinvaders_ppo",
    seed: int = 123,
):
    base_env = gym.make(
        "ALE/SpaceInvaders-v5",
        frameskip=1,
        repeat_action_probability=0.0,
        render_mode="rgb_array",   
    )

    base_env = AtariPreprocessing(
        base_env,
        frame_skip=4,
        grayscale_obs=True,  
        screen_size=84,
        scale_obs=False,
    )
    base_env = FrameStackObservation(base_env, stack_size=4)

    env = RecordVideo(
        base_env,
        video_folder=video_folder,
        episode_trigger=lambda ep_id: ep_id == 0,  # record first episode
        name_prefix="spaceinvaders_ppo_greedy",
    )

    # Load model
    action_dim = env.action_space.n
    model = PPOAtariPolicy(action_dim).to(DEVICE)
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.eval()

    #Run one greedy episode
    obs, info = env.reset(seed=seed)
    done = False
    total_reward = 0.0
    steps = 0

    while not done:
        obs_t = preprocess_obs(obs[None, ...]) 
        with torch.no_grad():
            logits, _ = model(obs_t)
            action = torch.argmax(logits, dim=-1).item()

        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        total_reward += reward
        steps += 1

    env.close()
    print(f"Recorded greedy episode: reward={total_reward}, steps={steps}")
    print(f"Video saved under folder: {video_folder}")

# Run
record_greedy_spaceinvaders_video()


  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"


Recorded greedy episode: reward=575.0, steps=620
Video saved under folder: videos_spaceinvaders_ppo
