In [7]:
import gymnasium as gym
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from collections import deque
import random
from tqdm import tqdm

In [8]:
# Define the Deep Q-Network (DQN) model
def build_dqn(input_shape, num_actions):
    model = Sequential([
        Dense(64, input_shape=input_shape, activation='relu'),
        Dense(64, activation='relu'),
        Dense(num_actions, activation='linear')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model

In [9]:
# Define constants and hyperparameters
num_episodes = 1000
max_steps_per_episode = 1000
batch_size = 64
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
memory = deque(maxlen=10000)  # Experience replay buffer
env_name = "ALE/Frogger-v5"

In [10]:
# Create the environment
env = gym.make(env_name)
state_shape = env.observation_space.shape
num_actions = env.action_space.n

In [11]:
# Build the DQN model
model = build_dqn(state_shape, num_actions)

In [12]:
# Training loop
for episode in tqdm(range(num_episodes), desc='Episode Progress', position=0):
    state, _ = env.reset()
    episode_reward = 0
    done = False

    for step in range(max_steps_per_episode):
        if np.random.rand() <= epsilon:
            action = env.action_space.sample()  # Exploration
        else:
            q_values = model.predict(np.array([state]), verbose=None)[0]
            action = np.argmax(q_values)  # Exploitation

        # Ensure action is within bounds
        action = np.clip(action, 0, num_actions - 1)

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        memory.append((state, action, reward, next_state, done))

        state = next_state

        if done:
            break
    # Experience replay
    if len(memory) >= batch_size:
        minibatch = random.sample(memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + gamma * np.amax(model.predict(np.array([next_state]), verbose=None)[0])

            target_f = model.predict(np.array([state]), verbose=None)
            target_f[0][action] = target
            model.fit(np.array([state]), target_f, epochs=1, verbose=None)

    # Decay exploration rate
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    # print(f"Previous episode: Total Reward: {episode_reward}, Epsilon: {epsilon:.4f}")

env.close()

Episode Progress:   0%|          | 1/1000 [00:08<2:26:04,  8.77s/it]

Previous episode: Total Reward: 7.0, Epsilon: 0.9950


Episode Progress:   0%|          | 2/1000 [00:17<2:22:16,  8.55s/it]

Previous episode: Total Reward: 10.0, Epsilon: 0.9900


Episode Progress:   0%|          | 3/1000 [00:25<2:21:06,  8.49s/it]

Previous episode: Total Reward: 8.0, Epsilon: 0.9851


Episode Progress:   0%|          | 4/1000 [00:33<2:20:15,  8.45s/it]

Previous episode: Total Reward: 11.0, Epsilon: 0.9801


Episode Progress:   0%|          | 5/1000 [00:42<2:21:38,  8.54s/it]

Previous episode: Total Reward: 10.0, Epsilon: 0.9752


Episode Progress:   1%|          | 6/1000 [00:51<2:21:06,  8.52s/it]

Previous episode: Total Reward: 6.0, Epsilon: 0.9704


Episode Progress:   1%|          | 7/1000 [00:59<2:21:05,  8.53s/it]

Previous episode: Total Reward: 7.0, Epsilon: 0.9655


Episode Progress:   1%|          | 8/1000 [01:08<2:20:48,  8.52s/it]

Previous episode: Total Reward: 6.0, Epsilon: 0.9607


Episode Progress:   1%|          | 9/1000 [01:16<2:20:37,  8.51s/it]

Previous episode: Total Reward: 7.0, Epsilon: 0.9559


Episode Progress:   1%|          | 10/1000 [01:25<2:23:19,  8.69s/it]

Previous episode: Total Reward: 11.0, Epsilon: 0.9511


Episode Progress:   1%|          | 11/1000 [01:34<2:23:32,  8.71s/it]

Previous episode: Total Reward: 8.0, Epsilon: 0.9464


Episode Progress:   1%|          | 12/1000 [01:43<2:23:50,  8.73s/it]

Previous episode: Total Reward: 6.0, Epsilon: 0.9416


Episode Progress:   1%|▏         | 13/1000 [01:52<2:26:06,  8.88s/it]

Previous episode: Total Reward: 8.0, Epsilon: 0.9369


Episode Progress:   1%|▏         | 14/1000 [02:01<2:24:22,  8.79s/it]

Previous episode: Total Reward: 8.0, Epsilon: 0.9322


Episode Progress:   2%|▏         | 15/1000 [02:09<2:24:23,  8.80s/it]

Previous episode: Total Reward: 8.0, Epsilon: 0.9276


Episode Progress:   2%|▏         | 15/1000 [02:17<2:30:42,  9.18s/it]


KeyboardInterrupt: 