In [1]:
import gym
import torch
import random
import numpy as np
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from stable_baselines3.common.atari_wrappers import MaxAndSkipEnv
from stable_baselines3.common.buffers import ReplayBuffer



In [2]:
class DQN(nn.Module):
    """Convolutional Q-network: a stack of 4 image frames in, one Q-value per action out.

    The first linear layer expects 2592 flattened features, which is what the
    two convolutions produce (32 channels of 9x9) for an 84x84 input.
    """

    def __init__(self, nb_actions):
        """Create the layers; `nb_actions` sets the width of the output layer."""
        super().__init__()
        conv_stack = [
            nn.Conv2d(4, 16, 8, stride=4),
            nn.ReLU(),
            nn.Conv2d(16, 32, 4, stride=2),
            nn.ReLU(),
            nn.Flatten(),
        ]
        q_head = [
            nn.Linear(2592, 256),
            nn.ReLU(),
            nn.Linear(256, nb_actions),
        ]
        # Kept as a single Sequential named `network` so state-dict keys stay `network.<idx>.*`.
        self.network = nn.Sequential(*conv_stack, *q_head)

    def forward(self, x):
        """Return Q-values for `x` after dividing the input by 255."""
        scaled = x / 255.
        return self.network(scaled)

Initialize replay memory D to capacity N
Initialize action-value function Q with random weights

for episode = 1, M do
    Initialise sequence s₁ = {x₁} and preprocessed sequence φ₁ = φ(s₁)

    for t = 1, T do
        With probability ε select a random action aₜ
        otherwise select aₜ = argmax_a Q*(φ(sₜ), a; θ)

        Execute action aₜ in emulator and observe reward rₜ and image xₜ₊₁

        Set sₜ₊₁ = sₜ, aₜ, xₜ₊₁ and preprocess φₜ₊₁ = φ(sₜ₊₁)

        Store transition (φₜ, aₜ, rₜ, φₜ₊₁) in D

        Sample random minibatch of transitions (φⱼ, aⱼ, rⱼ, φⱼ₊₁) from D

        Set yⱼ = {
            rⱼ                                    for terminal φⱼ₊₁
            rⱼ + γ max_a' Q(φⱼ₊₁, a'; θ)         for non-terminal φⱼ₊₁
        }

        Perform a gradient descent step on (yⱼ − Q(φⱼ, aⱼ; θ))² according to equation 3
    end for
end for

In [25]:
def Deep_Q_Learning(env, replay_memory_size=100_000, nb_epochs=30_000_000, update_frequency=4, batch_size=32,
                    discount_factor=0.99, replay_start_size=80_000, initial_exploration=1, final_exploration=0.01,
                    exploration_steps=1_000_000, device='cuda'):
    """Train a DQN agent on `env` with experience replay and epsilon-greedy exploration.

    The comments quoting the "Deep Q-learning with experience replay" pseudocode
    mark which step of the algorithm each part of the loop implements.

    Args:
        env: Environment whose `reset()` returns `(obs, info)` and whose `step()`
            returns `(obs, reward, terminated, truncated, info)`. `info` must
            contain a `'lives'` entry; action `1` is sent during the warm-up steps.
        replay_memory_size: Capacity of the replay buffer.
        nb_epochs: Environment-step budget; no new episode starts once the step
            counter exceeds it (a running episode is always finished).
        update_frequency: A gradient step is taken every this many environment steps.
        batch_size: Number of transitions sampled per gradient step.
        discount_factor: Discount applied to the bootstrapped next-state value.
        replay_start_size: Environment steps collected before learning starts.
        initial_exploration: Epsilon at step 0.
        final_exploration: Lower bound that epsilon is annealed to.
        exploration_steps: Number of steps over which epsilon decreases linearly.
        device: Torch device used for the network and the sampled batches.

    Returns:
        The trained `DQN` module, so the caller can evaluate or checkpoint it.
    """

    # Initialize replay memory D to capacity N
    rb = ReplayBuffer(replay_memory_size, env.observation_space, env.action_space, device,
                      optimize_memory_usage=True, handle_timeout_termination=False)

    # Initialize action-value function Q with random weights
    q_network = DQN(env.action_space.n).to(device)
    optimizer = torch.optim.Adam(q_network.parameters(), lr=1.25e-4)

    epoch = 0  # counts environment steps, not episodes
    smoothed_rewards = []
    rewards = []

    # Update progress bar to only refresh every 1000 steps
    progress_bar = tqdm(total=nb_epochs, miniters=1000, unit_scale=True)

    try:
        while epoch <= nb_epochs:
            # Initialise sequence s1 = {x1} and preprocessed sequence φ1 = φ(s1)
            dead = False
            total_rewards = 0
            obs = env.reset()[0]

            # Start every episode with a random number (1 to 30) of steps of action 1
            # (presumably FIRE in Breakout; verify for other games). The loop always
            # runs at least once, so `info` is defined before it is read below.
            for _ in range(random.randint(1, 30)):
                obs, _, terminated, truncated, info = env.step(1)

            # for t= 1,T do
            while not dead:
                current_life = info['lives']

                # Linear decay from initial_exploration, clamped at final_exploration.
                epsilon = max((final_exploration - initial_exploration) / exploration_steps * epoch
                              + initial_exploration,
                              final_exploration)
                # With probability ϵ select a random action a
                if random.random() < epsilon:
                    action = np.array(env.action_space.sample())
                # otherwise select at = argmax_a Q∗(φ(st),a; θ)
                else:
                    # Action selection needs no gradients; skip building the autograd graph.
                    with torch.no_grad():
                        q_values = q_network(torch.Tensor(obs).unsqueeze(0).to(device))
                    action = torch.argmax(q_values, dim=1).item()

                # Background on what env.step returns (author's notes):
                #   The ALE emulator creates a virtual frame buffer that represents the
                #   Atari 2600's video output. The raw output from ALE is:
                #     - a frame buffer of 160x210 pixels in RGB format
                #     - the current RAM state of the emulated Atari
                #     - the current score/reward from the game memory
                #     - the current game state (lives, game over, etc.)
                #   Through the Gym wrappers the frame buffer is usually converted to
                #   grayscale, often downscaled to 84x84 pixels, and frames are often
                #   stacked (4 is common) to give temporal information.
                #   next_obs:   the screen data (observation) after taking the action
                #   reward:     the score change from the action
                #   terminated: True if the game naturally ended (like losing all lives)
                #   truncated:  True if the episode was artificially cut off (like max steps)
                #   info:       dictionary with additional information like current lives

                # Execute action at in emulator and observe reward rt and image xt+1
                next_obs, reward, terminated, truncated, info = env.step(action)
                dead = terminated or truncated

                # Losing a life is stored as the terminal flag of the transition,
                # while the episode itself only ends on terminated/truncated.
                done = info['lives'] < current_life
                # Set st+1 = st,at,xt+1 and preprocess φt+1 = φ(st+1)
                real_next_obs = next_obs.copy()
                total_rewards += reward  # unclipped score, used for the plot only
                reward = np.sign(reward)  # the stored reward is clipped to {-1, 0, 1}

                # Store transition (φt,at,rt,φt+1) in D
                # np.asarray: the random branch yields an array and the greedy branch a
                # Python int; hand the buffer the same type in both cases.
                rb.add(obs, real_next_obs, np.asarray(action), reward, done, info)
                obs = next_obs

                if epoch > replay_start_size and epoch % update_frequency == 0:
                    # Sample random minibatch of transitions (φj,aj,rj,φj+1) from D
                    data = rb.sample(batch_size)

                    # Set yj = rj for terminal φj+1, else rj + γ max_a′ Q(φj+1,a′; θ)
                    with torch.no_grad():
                        # Best predicted value of the next state.
                        max_q_value, _ = q_network(data.next_observations).max(dim=1)
                        # Target = observed reward + discounted predicted future value;
                        # (1 - done) removes the bootstrap term for terminal transitions.
                        y = data.rewards.flatten() + discount_factor * max_q_value * (1 - data.dones.flatten())

                    # Q-value the network currently assigns to the action that was taken.
                    # squeeze(1) drops only the action axis, so a batch of one stays 1-D.
                    current_q_value = q_network(data.observations).gather(1, data.actions).squeeze(1)

                    # Perform a gradient descent step on (yj−Q(φj,aj; θ))² according to equation 3
                    # (the Huber loss is used here in place of the plain squared error).
                    loss = F.huber_loss(current_q_value, y)

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                epoch += 1
                # Only update progress bar every 1000 steps
                if epoch % 1000 == 0:
                    progress_bar.update(1000)

                if (epoch % 50_000 == 0) and epoch > 0:
                    # Mean episode score since the previous plot; NaN when no episode
                    # finished in this window (avoids numpy's empty-mean warning).
                    smoothed_rewards.append(np.mean(rewards) if rewards else np.nan)
                    rewards = []
                    plt.plot(smoothed_rewards)
                    plt.title("Average Reward on Breakout")
                    plt.xlabel("Training Epochs")
                    plt.ylabel("Average Reward per Episode")
                    #plt.savefig('Imgs/average_reward_on_breakout.png')
                    plt.show()
                    plt.close()

            rewards.append(total_rewards)
    finally:
        # Release the progress bar even if training is interrupted or raises.
        progress_bar.close()

    return q_network

In [18]:
#!pip install --upgrade pip setuptools wheel
#!pip install opencv-python
#!pip install "gym[atari]"

In [19]:
# First remove existing installations to avoid conflicts
#!pip uninstall gym ale-py AutoROM atari-py -y

# Then install in the correct order
#!pip install gym
#!pip install "gym[atari]"
#!pip install ale-py
#!pip install "autorom[accept-rom-license]"
#!python -m autorom

# Or you can do it all in one line:
#!pip install gym "gym[atari]" ale-py "autorom[accept-rom-license]" && python -m autorom

In [20]:
import gymnasium

# Use the full module name: aliasing gymnasium to `gym` would replace the classic
# `gym` binding from the import cell, which the environment cells further down use.
all_envs = gymnasium.envs.registry.keys()
# `env_id` (not `env`) so this listing never overwrites an environment handle.
for env_id in sorted(all_envs):
    print(env_id)

Acrobot-v1
Ant-v2
Ant-v3
Ant-v4
Ant-v5
BipedalWalker-v3
BipedalWalkerHardcore-v3
Blackjack-v1
CarRacing-v3
CartPole-v0
CartPole-v1
CliffWalking-v0
FrozenLake-v1
FrozenLake8x8-v1
GymV21Environment-v0
GymV26Environment-v0
HalfCheetah-v2
HalfCheetah-v3
HalfCheetah-v4
HalfCheetah-v5
Hopper-v2
Hopper-v3
Hopper-v4
Hopper-v5
Humanoid-v2
Humanoid-v3
Humanoid-v4
Humanoid-v5
HumanoidStandup-v2
HumanoidStandup-v4
HumanoidStandup-v5
InvertedDoublePendulum-v2
InvertedDoublePendulum-v4
InvertedDoublePendulum-v5
InvertedPendulum-v2
InvertedPendulum-v4
InvertedPendulum-v5
LunarLander-v3
LunarLanderContinuous-v3
MountainCar-v0
MountainCarContinuous-v0
Pendulum-v1
Pusher-v2
Pusher-v4
Pusher-v5
Reacher-v2
Reacher-v4
Reacher-v5
Swimmer-v2
Swimmer-v3
Swimmer-v4
Swimmer-v5
Taxi-v3
Walker2d-v2
Walker2d-v3
Walker2d-v4
Walker2d-v5
phys2d/CartPole-v0
phys2d/CartPole-v1
phys2d/Pendulum-v0
tabular/Blackjack-v0
tabular/CliffWalking-v0


In [21]:
import gym

# List every registered environment id that mentions Breakout.
all_envs = gym.envs.registry.keys()
atari_envs = [env_id for env_id in all_envs if 'Breakout' in env_id]
print("\nAtari Breakout environments:")
# `env_id` (not `env`) so re-running this cell cannot overwrite an environment handle.
for env_id in sorted(atari_envs):
    print(env_id)


Atari Breakout environments:
ALE/Breakout-ram-v5
ALE/Breakout-v5
Breakout-ram-v0
Breakout-ram-v4
Breakout-ramDeterministic-v0
Breakout-ramDeterministic-v4
Breakout-ramNoFrameskip-v0
Breakout-ramNoFrameskip-v4
Breakout-v0
Breakout-v4
BreakoutDeterministic-v0
BreakoutDeterministic-v4
BreakoutNoFrameskip-v0
BreakoutNoFrameskip-v4


In [22]:
# Create the raw, unwrapped Breakout environment and report its spaces.
env = gym.make("BreakoutNoFrameskip-v4")

space_report = (
    ("Observation Space: ", env.observation_space),
    ("Action Space       ", env.action_space),
)
for label, space in space_report:
    print(label, space)


Observation Space:  Box(0, 255, (210, 160, 3), uint8)
Action Space        Discrete(4)


In [27]:
class MaxAndSkipEnv(gym.Wrapper):
    """Repeat each action `skip` times and return the element-wise max of the last two frames.

    This notebook-local class replaces the `MaxAndSkipEnv` name imported from
    stable_baselines3 in the import cell.
    """

    def __init__(self, env, skip=4):
        """Wrap `env`.

        Args:
            env: Environment to wrap; its `step()` must return
                `(obs, reward, terminated, truncated, info)`.
            skip: Number of times each action is repeated; must be at least 1.

        Raises:
            ValueError: If `skip` is smaller than 1 (otherwise `step` would
                return without ever having stepped the wrapped environment).
        """
        if skip < 1:
            raise ValueError(f"skip must be >= 1, got {skip}")
        super().__init__(env)
        # Holds the two most recent frames; use the space's dtype so that
        # non-uint8 observations are not silently truncated.
        self._obs_buffer = np.zeros((2,) + env.observation_space.shape,
                                    dtype=env.observation_space.dtype)
        self._skip = skip

    def step(self, action):
        """Apply `action` up to `skip` times, summing rewards.

        Returns:
            `(max_frame, total_reward, terminated, truncated, info)` where
            `info` comes from the last inner step that was executed.
        """
        total_reward = 0.0
        terminated = truncated = False

        for i in range(self._skip):
            obs, reward, term, trunc, info = self.env.step(action)
            # Only the last two frames of a full skip window are recorded.
            if i == self._skip - 2:
                self._obs_buffer[0] = obs
            if i == self._skip - 1:
                self._obs_buffer[1] = obs
            total_reward += reward
            terminated = terminated or term
            truncated = truncated or trunc
            if terminated or truncated:
                # Stop repeating once the episode is over. If this happens before
                # the last two sub-steps, the buffer still holds frames written
                # by an earlier call.
                break

        max_frame = self._obs_buffer.max(axis=0)
        return max_frame, total_reward, terminated, truncated, info

In [None]:
#from stable_baselines3.common.atari_wrappers import MaxAndSkipEnv


# Preprocessing pipeline: episode statistics -> 84x84 resize -> grayscale -> 4-frame stack -> action repeat.
# NOTE(review): MaxAndSkipEnv wraps the already frame-stacked environment, so the
# skipping/max operates on stacked observations — confirm this order is intended.
env = gym.make("BreakoutNoFrameskip-v4")
env = gym.wrappers.RecordEpisodeStatistics(env)
env = gym.wrappers.ResizeObservation(env, (84, 84))
env = gym.wrappers.GrayScaleObservation(env)
env = gym.wrappers.FrameStack(env, 4) # 3 frames for space invaders
env = MaxAndSkipEnv(env, skip=4)

try:
    # Bind whatever the trainer returns to `q_network`; the checkpoint cell reads that name.
    q_network = Deep_Q_Learning(env, replay_memory_size=100_000, device='cpu')
finally:
    # Close the emulator even when training raises or is interrupted from the notebook.
    env.close()

In [29]:
# Save the currently running model
# Needs `q_network` and `env` to exist at notebook scope when this cell runs.
checkpoint = {
    'model_state_dict': q_network.state_dict(),
    'action_space': env.action_space.n,
}
torch.save(checkpoint, 'dqn_checkpoint_manual.pt')

NameError: name 'q_network' is not defined

In [8]:
import gymnasium

# Report which Gymnasium release is installed in this kernel.
installed_version = gymnasium.__version__
print(installed_version)

1.0.0


In [None]:
from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm
from stable_baselines3.common.buffers import ReplayBuffer
from stable_baselines3.common.utils import safe_mean

import torch as th
import numpy as np
from torch.nn import functional as F
import matplotlib.pyplot as plt


class CustomDQN(OffPolicyAlgorithm):
    """DQN training loop packaged as a Stable-Baselines3 off-policy algorithm.

    Uses the same hyperparameters and update rule as `Deep_Q_Learning` earlier in
    this notebook: epsilon-greedy acting, a replay buffer, and a Huber loss
    against a one-step bootstrapped target.

    NOTE(review): `self.policy` is read in `__init__` right after the base
    constructor; confirm the base class has created the policy by then.
    NOTE(review): `train` treats `self.env` as a single environment with the
    `(obs, info)` / 5-tuple step API; confirm this matches what the base class
    stores in `self.env`.
    """

    def __init__(
            self,
            policy,
            env,
            replay_memory_size=100_000,
            nb_epochs=30_000_000,
            update_frequency=4,
            batch_size=32,
            discount_factor=0.99,
            replay_start_size=80_000,
            initial_exploration=1.0,
            final_exploration=0.01,
            exploration_steps=1_000_000,
            learning_rate=1.25e-4,
            device="cuda",
            **kwargs
    ):
        """Store the hyperparameters and build the replay buffer and optimizer.

        Args:
            policy: Policy (or policy class) handed to the base algorithm; it must
                expose a `q_net` module.
            env: Training environment.
            replay_memory_size: Capacity of the replay buffer.
            nb_epochs: Environment-step budget for `train`.
            update_frequency: A gradient step is taken every this many steps.
            batch_size: Number of transitions sampled per gradient step.
            discount_factor: Discount applied to the bootstrapped next-state value.
            replay_start_size: Steps collected before learning starts.
            initial_exploration: Epsilon at step 0.
            final_exploration: Lower bound that epsilon is annealed to.
            exploration_steps: Steps over which epsilon decreases linearly.
            learning_rate: Learning rate for Adam; also forwarded to the base class.
            device: Torch device.
            **kwargs: Forwarded unchanged to `OffPolicyAlgorithm`.
        """
        # learning_rate is consumed by this signature and so can never arrive through
        # **kwargs; forward it explicitly so the base class receives it.
        super(CustomDQN, self).__init__(policy, env, learning_rate=learning_rate, device=device, **kwargs)
        # Same buffer configuration as in Deep_Q_Learning: the memory-optimised layout
        # is used with timeout handling switched off.
        self.replay_buffer = ReplayBuffer(
            replay_memory_size, env.observation_space, env.action_space, device,
            optimize_memory_usage=True, handle_timeout_termination=False
        )
        self.q_network = self.policy.q_net  # Ensure your policy contains a q_net
        self.optimizer = th.optim.Adam(self.q_network.parameters(), lr=learning_rate)
        self.epochs = nb_epochs
        self.update_frequency = update_frequency
        self.batch_size = batch_size
        self.discount_factor = discount_factor
        self.replay_start_size = replay_start_size
        self.initial_exploration = initial_exploration
        self.final_exploration = final_exploration
        self.exploration_steps = exploration_steps
        self.smoothed_rewards = []
        self.rewards = []

    def train(self, gradient_steps: int, batch_size: int = 32):
        """Run the complete act/store/learn loop for `self.epochs` environment steps.

        `gradient_steps` and `batch_size` are accepted for interface compatibility
        but are not used; the batch size stored at construction time is used instead.
        """
        step = 0      # environment steps; drives epsilon, the warm-up and the update cadence
        episode = 0   # finished episodes; drives logging only

        while step < self.epochs:
            obs = self.env.reset()[0]
            dead = False
            total_rewards = 0

            while not dead:
                # Linear decay per environment step, clamped at final_exploration.
                epsilon = max(
                    (self.final_exploration - self.initial_exploration) / self.exploration_steps * step
                    + self.initial_exploration,
                    self.final_exploration,
                )
                if np.random.rand() < epsilon:
                    action = self.env.action_space.sample()
                else:
                    # Action selection needs no gradients.
                    with th.no_grad():
                        q_values = self.q_network(th.Tensor(obs).unsqueeze(0).to(self.device))
                    action = th.argmax(q_values, dim=1).item()

                next_obs, reward, terminated, truncated, info = self.env.step(action)
                dead = terminated or truncated

                # Pass `info` as well, matching the six-argument `rb.add` call in
                # Deep_Q_Learning; np.asarray gives the buffer an array action.
                self.replay_buffer.add(obs, next_obs, np.asarray(action), reward, terminated, info)
                obs = next_obs
                total_rewards += reward

                if step > self.replay_start_size and step % self.update_frequency == 0:
                    # Sample from the replay buffer
                    data = self.replay_buffer.sample(self.batch_size)
                    with th.no_grad():
                        max_q_values = self.q_network(data.next_observations).max(1)[0]
                        # (1 - done) removes the bootstrap term for terminal transitions.
                        targets = data.rewards.flatten() + self.discount_factor * max_q_values * (1 - data.dones.flatten())

                    # Compute loss and optimize
                    q_values = self.q_network(data.observations).gather(1, data.actions).squeeze(1)
                    loss = F.huber_loss(q_values, targets)

                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                step += 1

            self.rewards.append(total_rewards)
            if episode % 1000 == 0:
                self.logger.record("train/mean_reward", safe_mean(self.rewards[-100:]))
            episode += 1


Modify the RL Zoo registry (train.py):



In [None]:
# Snippet meant for RL Zoo's train.py (see the note above this cell), not for this notebook.
# NOTE(review): `ALGOS` is not defined anywhere in this notebook — presumably the
# algorithm registry inside train.py; confirm before use.
from custom_dqn import CustomDQN

# Register the class under the key that is passed to `--algo` on the command line.
ALGOS["custom_dqn"] = CustomDQN


In [None]:
from stable_baselines3.common.policies import BasePolicy
from torch import nn

class CustomDQNPolicy(BasePolicy):
    """Fully connected Q-network policy: observation vector -> 128 hidden units -> one value per action.

    NOTE(review): `lr_schedule` is passed positionally to `BasePolicy.__init__`;
    confirm the base class expects it in that position.
    """

    def __init__(self, observation_space, action_space, lr_schedule):
        """Build `q_net` from the space sizes and an Adam optimizer using `lr_schedule(1)`."""
        super().__init__(observation_space, action_space, lr_schedule)
        input_dim = observation_space.shape[0]
        n_actions = action_space.n
        hidden_dim = 128
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_actions),
        ]
        self.q_net = nn.Sequential(*layers)
        # The schedule is evaluated once, at 1, to obtain the optimizer's learning rate.
        initial_lr = lr_schedule(1)
        self.optimizer = th.optim.Adam(self.parameters(), lr=initial_lr)

    def forward(self, obs):
        """Return the Q-values `q_net` predicts for `obs`."""
        q_values = self.q_net(obs)
        return q_values


Train the Algorithm
Now you can use the Zoo CLI to train your CustomDQN:

In [None]:
python train.py --algo custom_dqn --env CartPole-v1 --hyperparams path/to/params.json


### Related methods



## Deep Neural Networks for Environment Estimation:
They learn patterns from raw input data through multiple layers of processing to predict how the environment will respond to actions.
## Restricted Boltzmann Machines:
They create a two-way (bidirectional) network that can learn complex patterns by repeatedly comparing and adjusting between visible input data and hidden learned features.
## Gradient Temporal-Difference Methods:
They improve learning stability by updating the neural network weights based on the difference between predicted and actual outcomes, but in a way that prevents the predictions from spiraling out of control.
## Neural Fitted Q-learning:
It updates the entire Q-value prediction network all at once using stored experiences to minimize prediction errors across all seen situations.
## Experience Replay with Neural Networks:
 It stores past experiences in memory and randomly replays them during training to help the neural network learn more efficiently from less data.
## HyperNEAT on Atari:
 It evolves neural networks through artificial evolution, using special rules that help it discover patterns across the game screen's spatial layout.