In [11]:
!pip uninstall -y opencv-python opencv-python-headless
!pip install opencv-python-headless

Found existing installation: opencv-python 4.11.0.86
Uninstalling opencv-python-4.11.0.86:
  Successfully uninstalled opencv-python-4.11.0.86
Found existing installation: opencv-python-headless 4.11.0.86
Uninstalling opencv-python-headless-4.11.0.86:
  Successfully uninstalled opencv-python-headless-4.11.0.86
[0mCollecting opencv-python-headless
  Using cached opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Using cached opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (50.0 MB)
Installing collected packages: opencv-python-headless
Successfully installed opencv-python-headless-4.11.0.86
[0m

In [5]:
import cv2
# Confirm that OpenCV imports cleanly and show which version the kernel picked up.
print(cv2.__version__)

4.11.0


In [6]:
#from typing import Any
import os
import random
import time

import gymnasium as gym  #make sure you're using gymnasium
from gymnasium.wrappers import RecordVideo, RecordEpisodeStatistics

#import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# stable_baselines3 provides wrappers that simplify
# the preprocessing a lot; read more about them here:
# https://stable-baselines3.readthedocs.io/en/master/common/atari_wrappers.html
from stable_baselines3.common.atari_wrappers import (
    ClipRewardEnv,
    EpisodicLifeEnv,
    FireResetEnv,
    MaxAndSkipEnv,
    NoopResetEnv,
)
from stable_baselines3.common.buffers import ReplayBuffer
import stable_baselines3.common.atari_wrappers
from ale_py import ALEInterface
ale = ALEInterface()
from stable_baselines3.common.atari_wrappers import WarpFrame
from gymnasium.wrappers import FrameStackObservation


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from collections import deque


import hyperparameters0

# Instantiate the hyperparameter container used by the rest of the notebook.
params = hyperparameters0.Hyperparameters()

# Echo the settings that drive the run so they are recorded next to the results.
# The labels are plain strings: nothing is interpolated, so no f-prefix is needed.
print("learning-rate: ", params.learning_rate)  # passed to the optimizer as lr
print("timesteps: ", params.total_timesteps)  # number of iterations of the training loop
print("buffer: ", params.buffer_size)  # capacity handed to the replay buffer
print("exploration : ", params.exploration_fraction)  # fraction of timesteps over which epsilon decays
print("end e: ", params.end_e)  # floor value of the epsilon schedule
print("seed : ", params.seed)  # seed for random / numpy / torch and the environment

learning-rate:  0.0001
timesteps:  10000000
buffer:  100000
exploration :  0.1
end e:  0.01
seed :  1


In [None]:
def make_env(env_id, seed, idx, capture_video, run_name):
    """Return a zero-argument factory that builds one wrapped, seeded environment.

    Args:
        env_id: Environment id handed to ``gym.make``.
        seed: Seed applied to the initial reset and to the action and
            observation spaces.
        idx: Index of this environment; video is only recorded for index 0.
        capture_video: Whether video recording was requested.
        run_name: Sub-directory of ``videos/`` that receives the recordings.

    Returns:
        A callable that creates, wraps and seeds the environment when invoked.
    """
    def thunk():
        # Episode statistics are attached directly to the raw environment,
        # before any of the preprocessing wrappers below.
        wrapped = gym.wrappers.RecordEpisodeStatistics(
            gym.make(env_id, render_mode="rgb_array")
        )

        record_video = capture_video and idx == 0
        if record_video:
            # Only every 1000th episode is written to disk.
            wrapped = gym.wrappers.RecordVideo(
                wrapped,
                f"videos/{run_name}",
                episode_trigger=lambda episode: episode % 1000 == 0,
            )

        # Preprocessing chain, innermost first: no-op resets, frame skipping,
        # then per-life episodes.
        wrapped = EpisodicLifeEnv(
            MaxAndSkipEnv(NoopResetEnv(wrapped, noop_max=30), skip=4)
        )

        # FireResetEnv is only applied to games that expose a FIRE action.
        needs_fire = "FIRE" in wrapped.unwrapped.get_action_meanings()
        if needs_fire:
            wrapped = FireResetEnv(wrapped)

        # Reward clipping, frame warping, then a stack of 4 observations.
        wrapped = FrameStackObservation(WarpFrame(ClipRewardEnv(wrapped)), 4)

        wrapped.reset(seed=seed)
        for space in (wrapped.action_space, wrapped.observation_space):
            space.seed(seed)
        return wrapped
    return thunk


class QNetwork(nn.Module):
    """Convolutional Q-value estimator over a 4-channel image input.

    The output layer is sized from ``env.single_action_space.n``, so ``env``
    must expose that attribute. The flattened feature size of 3136
    (= 64 * 7 * 7) corresponds to 84x84 inputs.
    """

    def __init__(self, env):
        super().__init__()
        n_actions = env.single_action_space.n

        # Layers are created in the same order they are applied, so the
        # ``network.<index>`` parameter names stay stable.
        conv_stack = [
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        ]
        head = [
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        ]
        self.network = nn.Sequential(*conv_stack, *head)

    def forward(self, x):
        """Return one Q-value per action for each item in the batch ``x``.

        The input is divided by 255 here, so callers should pass unscaled
        pixel values rather than pre-normalized ones.
        """
        scaled = x / 255.0
        return self.network(scaled)


def linear_schedule(start_e: float, end_e: float, duration: float, t: int) -> float:
    """Linearly interpolate from ``start_e`` towards ``end_e`` and clamp at ``end_e``.

    Args:
        start_e: Value returned at ``t == 0``.
        end_e: Lower bound of the schedule; returned once the decay is over.
        duration: Number of steps over which the value moves from ``start_e``
            to ``end_e``. May be a float (the caller passes a fraction of the
            total timesteps).
        t: Current step.

    Returns:
        ``max(start_e + slope * t, end_e)``, or ``end_e`` when ``duration`` is
        not positive.
    """
    # A non-positive duration means there is no decay phase at all; return the
    # final value instead of dividing by zero.
    if duration <= 0:
        return end_e
    slope = (end_e - start_e) / duration
    return max(slope * t + start_e, end_e)

if __name__ == "__main__":
    # Training entry point: seeds the RNGs, builds the environment, the online
    # and target Q-networks and the replay buffer, then runs the
    # epsilon-greedy training loop and optionally saves the online network.
    print(params.batch_size)
    run_name = f"{params.env_id}__{params.exp_name}__{params.seed}__{int(time.time())}"

    random.seed(params.seed)
    np.random.seed(params.seed)
    torch.manual_seed(params.seed)
    torch.backends.cudnn.deterministic = params.torch_deterministic

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # env setup (a single environment; the action selection below relies on that)
    envs = gym.vector.SyncVectorEnv([make_env(params.env_id, params.seed, 0, params.capture_video, run_name)])

    # Everything that uses the environment runs inside try/finally so that the
    # environment is closed even when training raises or the cell is interrupted.
    try:
        assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"

        q_network = QNetwork(envs).to(device)
        optimizer = optim.AdamW(q_network.parameters(), lr=params.learning_rate, weight_decay=1e-3)
        # The target network starts as an exact copy of the online network.
        target_network = QNetwork(envs).to(device)
        target_network.load_state_dict(q_network.state_dict())

        rb = ReplayBuffer(
            params.buffer_size,
            envs.single_observation_space,
            envs.single_action_space,
            device,
            optimize_memory_usage=True,
            handle_timeout_termination=False,
        )

        # Number of steps over which epsilon decays; constant for the whole run.
        exploration_steps = params.exploration_fraction * params.total_timesteps

        obs, infos = envs.reset()

        # Gymnasium >= 1.0 vector environments reset a finished sub-environment on
        # the step *after* the episode ended (the default next-step autoreset).
        # That step ignores the action and returns the reset observation, so it is
        # not a real transition and must not be stored in the replay buffer.
        autoreset_pending = np.zeros(envs.num_envs, dtype=bool)

        for global_step in range(params.total_timesteps):
            epsilon = linear_schedule(params.start_e, params.end_e, exploration_steps, global_step)

            # Epsilon-greedy action selection.
            if random.random() < epsilon:
                actions = np.array([envs.single_action_space.sample()])
            else:
                with torch.no_grad():
                    # The trailing squeeze drops the last (size-1) axis of the
                    # observation before it is fed to the convolutional network.
                    obs_tensor = torch.tensor(obs, device=device, dtype=torch.float32).squeeze(-1)
                    q_values = q_network(obs_tensor)
                    actions = torch.argmax(q_values, dim=1).cpu().numpy()

            # Take a step in the environment
            next_obs, rewards, terminations, truncations, infos = envs.step(actions)
            dones = np.logical_or(terminations, truncations)

            # Print reward info
            if "episode" in infos.keys():
                print(f"global_step={global_step}, episodic_return={infos['episode']['r']}")

            # Copy so that later changes to next_obs cannot alias the stored data.
            real_next_obs = next_obs.copy()

            # Store the transition, except for the auto-reset step described above.
            if not autoreset_pending.any():
                processed_infos = [{} for _ in range(envs.num_envs)]
                rb.add(obs, real_next_obs, actions, rewards, dones, processed_infos)
            autoreset_pending = dones

            obs = next_obs

            # Training
            if global_step > params.learning_starts:
                if global_step % params.train_frequency == 0:
                    data = rb.sample(params.batch_size)

                    with torch.no_grad():
                        next_q_values = target_network(data.next_observations.float().squeeze(-1))
                        target_max = next_q_values.max(dim=1)[0]  # values, not indices

                        # Flatten rewards and done flags to one value per sample so
                        # they line up with target_max.
                        rewards_tensor = data.rewards.flatten()
                        dones_tensor = data.dones.flatten()

                        # One-step TD target; the bootstrap term is zeroed for
                        # transitions flagged as done.
                        td_target = rewards_tensor + params.gamma * target_max * (1 - dones_tensor)

                    current_q_values = q_network(data.observations.float().squeeze(-1))
                    # squeeze(1) removes only the gather axis, so the result keeps
                    # one entry per sample even when batch_size == 1.
                    old_val = current_q_values.gather(1, data.actions.long()).squeeze(1)

                    loss = F.mse_loss(old_val, td_target)

                    # Gradient step
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                # Update target network (blend with factor params.tau)
                if global_step % params.target_network_frequency == 0:
                    for target_param, q_param in zip(target_network.parameters(), q_network.parameters()):
                        target_param.data.copy_(
                            params.tau * q_param.data + (1.0 - params.tau) * target_param.data
                        )

        if params.save_model:
            model_path = f"runs/{run_name}/{params.exp_name}_model"
            os.makedirs(f"runs/{run_name}", exist_ok=True)
            torch.save(q_network.state_dict(), model_path)
            print(f"model saved to {model_path}")
    finally:
        envs.close()

32
cuda
global_step=113, episodic_return=[0.]
global_step=255, episodic_return=[1.]
global_step=401, episodic_return=[1.]
global_step=642, episodic_return=[3.]
global_step=816, episodic_return=[2.]
global_step=1031, episodic_return=[2.]
global_step=1195, episodic_return=[1.]
global_step=1437, episodic_return=[3.]
global_step=1599, episodic_return=[1.]
global_step=1792, episodic_return=[2.]
global_step=1912, episodic_return=[0.]
global_step=2028, episodic_return=[0.]
global_step=2176, episodic_return=[1.]
global_step=2294, episodic_return=[0.]
global_step=2515, episodic_return=[3.]
global_step=2663, episodic_return=[1.]
global_step=2781, episodic_return=[0.]
global_step=3044, episodic_return=[3.]
global_step=3164, episodic_return=[0.]
global_step=3426, episodic_return=[3.]
global_step=3546, episodic_return=[0.]
global_step=3710, episodic_return=[1.]
global_step=3828, episodic_return=[0.]
global_step=3992, episodic_return=[1.]
global_step=4156, episodic_return=[1.]
global_step=4272, epis