In [1]:
import gymnasium as gym
import numpy as np
from keras.models import Sequential
from keras.layers import Conv2D, Flatten, Dense
from keras.optimizers import Adam
from collections import deque
import random
from tqdm import tqdm



In [2]:
# Define constants and hyperparameters
num_episodes = 100
max_steps_per_episode = 1000
learning_rate = 0.001 # Reminder to use 0.001 but using 0.01 for testing algorithm/network with lower number of episodes
batch_size = 64
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.975 # Reminder to use 0.995 but using 0.975 for testing with lower number of episodes
memory = deque(maxlen=10000)  # Experience replay buffer
env_name = "ALE/Frogger-v5"

In [3]:
def build_model(input_shape, num_actions):
    model = Sequential([
        Conv2D(32, kernel_size=(8, 8), strides=(4, 4), activation='relu', input_shape=input_shape),
        Conv2D(64, kernel_size=(4, 4), strides=(2, 2), activation='relu'),
        Conv2D(64, kernel_size=(3, 3), strides=(1, 1), activation='relu'),
        Flatten(),
        Dense(512, activation='relu'),
        Dense(256, activation='relu'),
        Dense(num_actions, activation='linear')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return model

In [4]:
# Create the environment
env = gym.make(env_name, render_mode='human')
state_shape = env.observation_space.shape
num_actions = env.action_space.n

In [5]:
env.observation_space.shape

(210, 160, 3)

In [6]:
# Build the DQN model
model = build_model(state_shape, num_actions)

  super().__init__(


In [7]:
# Training loop
for episode in tqdm(range(num_episodes), desc='Episode Progress', position=0):
    state, _ = env.reset()
    episode_reward = 0
    done = False

    for step in range(max_steps_per_episode):
        if np.random.rand() <= epsilon:
            action = env.action_space.sample()  # Exploration
        else:
            q_values = model.predict(np.array([state]), verbose=None)[0]
            action = np.argmax(q_values)  # Exploitation

        # Ensure action is within bounds
        action = np.clip(action, 0, num_actions - 1)

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        memory.append((state, action, reward, next_state, done))

        state = next_state

        if done:
            break

    # Experience replay
    if len(memory) >= batch_size:
        minibatch = random.sample(memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + gamma * np.amax(model.predict(np.array([next_state]), verbose=None)[0])

            target_f = model.predict(np.array([state]), verbose=None)
            target_f[0][action] = target
            model.fit(np.array([state]), target_f, epochs=1, verbose=None)

    # Decay exploration rate
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    print(f"\rPrevious episode: Episode: {episode + 1}/{num_episodes}, Total Reward: {episode_reward}, Epsilon: {epsilon:.4f}", end="")

env.close()

Episode Progress:   1%|          | 1/100 [00:21<35:05, 21.26s/it]

Previous episode: Episode: 1/100, Total Reward: 11.0, Epsilon: 0.9750

Episode Progress:   2%|▏         | 2/100 [00:49<41:13, 25.24s/it]

Previous episode: Episode: 2/100, Total Reward: 9.0, Epsilon: 0.9506

Episode Progress:   3%|▎         | 3/100 [01:20<45:32, 28.17s/it]

Previous episode: Episode: 3/100, Total Reward: 8.0, Epsilon: 0.9269

Episode Progress:   4%|▍         | 4/100 [01:51<46:31, 29.08s/it]

Previous episode: Episode: 4/100, Total Reward: 12.0, Epsilon: 0.9037

Episode Progress:   4%|▍         | 4/100 [02:16<54:29, 34.06s/it]


KeyboardInterrupt: 