In [None]:
import gymnasium as gym
import numpy as np
import keras
from keras.models import Sequential, clone_model
from keras.layers import Conv2D, Flatten, Dense
from keras.optimizers import Adam
from collections import deque
from gymnasium.wrappers import FrameStack
import random
from tqdm import tqdm

In [2]:
env = gym.make("ALE/Tetris-v5", render_mode='human')
env.reset()
done = False

In [3]:
state_size = env.observation_space.shape
action_size = env.action_space.n
memory = deque(maxlen=2000)
batch_size = 64
gamma = 0.95
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
target_update_counter = 0
update_target_every = 5
episodes = 100
max_steps_per_episode = 5000
learning_rate = 0.001
update_target_every = 10  # Update target DQN every 10 episodes

In [4]:
def build_model(input_shape, action_size):
    model = Sequential([ # Each person should change the amount of Conv2D/Dense layers, as well as the filter amount and kernel_size/strides
        Conv2D(32, kernel_size=(8, 8), strides=(4, 4), activation='relu', input_shape=input_shape),
        Conv2D(64, kernel_size=(4, 4), strides=(2, 2), activation='relu'),
        Conv2D(64, kernel_size=(3, 3), strides=(1, 1), activation='relu'),
        Flatten(),
        Dense(512, activation='relu'),
        Dense(256, activation='relu'),
        Dense(action_size, activation='linear')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return model

In [5]:
dqn = build_model(state_size, action_size)
# Define target DQN model
target_dqn = clone_model(dqn)
target_dqn.set_weights(dqn.get_weights())

  super().__init__(


In [6]:
for episode in tqdm(range(episodes), desc='Episode Progress'):
    state, _ = env.reset()
    done = False
    for step in range(max_steps_per_episode):
        if np.random.rand() <= epsilon:
            action = env.action_space.sample()  # Exploration
        else:
            q_values = dqn.predict(np.array([state]), verbose=None)[0]
            action = np.argmax(q_values)  # Exploitation

        # Ensure action is within bounds
        action = np.clip(action, 0, action_size - 1)

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        memory.append((state, action, reward, next_state, done))

        state = next_state

        # Perform batch update every few steps
        if len(memory) >= batch_size:
            batch = random.sample(memory, batch_size)
            states = np.array([sample[0] for sample in batch])
            actions = np.array([sample[1] for sample in batch])
            rewards = np.array([sample[2] for sample in batch])
            next_states = np.array([sample[3] for sample in batch])
            dones = np.array([sample[4] for sample in batch])

            # Calculate target Q-values using target DQN
            target_q_values = target_dqn.predict(next_states, verbose=0)
            target_actions = np.argmax(target_q_values, axis=1)  # Get the index of the action with maximum Q-value
            target_values = dqn.predict(next_states, verbose=0)  # Get Q-values for the next states from the main DQN
            targets = rewards + (1 - dones) * gamma * target_values[np.arange(batch_size), target_actions]

            # Ensure targets have the appropriate shape for the loss calculation
            targets = targets.reshape(-1, 1)  # Reshape to (batch_size, 1) if necessary

            # Update Q-values using the target Q-values
            dqn.fit(states, targets, epochs=1, verbose=0)

        if done:
            break

    # Decay exploration rate
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    # Update target DQN weights periodically
    if episode % update_target_every == 0:
        target_dqn.set_weights(dqn.get_weights())

env.close()

Episode Progress:  15%|█▌        | 15/100 [1:03:44<6:01:12, 254.97s/it]


KeyboardInterrupt: 

In [7]:
dqn.save_weights('brennan.weights.h5')

: 