Reinforcement Learning Pong Agent


In [1]:
%pip install gymnasium
%pip install gymnasium[atari]
%pip install gymnasium[accept-rom-license]

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Imports

In [2]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import random
import cv2
import os
import matplotlib.pyplot as plt
from IPython.display import clear_output
# Report how many GPUs TensorFlow can see (0 presumably means training falls back to the CPU).
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))


Num GPUs Available:  0


Environment

In [3]:
# Pong environment rendered as RGB arrays, wrapped so that episodes are recorded to ./videos.
# The trigger returns True for every episode id, i.e. every episode is selected for recording.
env = gym.wrappers.RecordVideo(
    gym.make("ALE/Pong-v5", render_mode='rgb_array'),
    './videos',
    episode_trigger=lambda episode_id: True,
    video_length=0,
)

# Number of discrete actions; create_model() uses it as the size of the output layer.
action_space_n = env.action_space.n

Model

In [4]:
# hyperparameters
learning_rate = 0.0001  # step size handed to the Adam optimizer
discount_factor = 0.99  # gamma: weight of the target network's next-state value in the Q-target
epsilon = 1.0  # initial probability of taking a random action (epsilon-greedy)
epsilon_decay = 0.995  # epsilon is multiplied by this factor after every episode
epsilon_min = 0.01  # lower bound epsilon is clamped to while decaying
batch_size = 64  # number of transitions sampled from the replay buffer per gradient step
num_episodes = 1000  # number of episodes the training loop runs
frame_skip = 2 # number of frames to skip (only referenced by commented-out code in this notebook view)


# model architecture
def create_model(action_space_n):
    """Build the convolutional Q-network.

    The network takes an 84x84x4 input and applies, in order: three ReLU
    convolutions (32 filters 8x8 stride 4, 64 filters 4x4 stride 2,
    64 filters 3x3 stride 1), a flatten, a 512-unit ReLU dense layer and a
    linear output layer.

    Args:
        action_space_n: number of units in the output layer (one per action).

    Returns:
        An uncompiled Keras functional ``Model``.
    """
    frames_in = layers.Input(shape=(84, 84, 4))

    # Layers are listed in the order they are applied to the input.
    pipeline = [
        layers.Conv2D(32, (8, 8), strides=(4, 4), activation='relu'),
        layers.Conv2D(64, (4, 4), strides=(2, 2), activation='relu'),
        layers.Conv2D(64, (3, 3), strides=(1, 1), activation='relu'),
        layers.Flatten(),
        layers.Dense(512, activation='relu'),
        layers.Dense(action_space_n, activation='linear'),
    ]

    tensor = frames_in
    for layer in pipeline:
        tensor = layer(tensor)

    return models.Model(inputs=frames_in, outputs=tensor)

# agent initialization
# Two networks with the same architecture: `model` is the one being trained,
# `target_model` supplies the next-state values for the Q-targets.
model = create_model(action_space_n)
target_model = create_model(action_space_n)

# Start the target network as an exact copy of the online network's weights.
target_model.set_weights(model.get_weights())

# Optimizer and loss used by the gradient step in the training loop.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
huber_loss = tf.keras.losses.Huber()

# Experience Replay Buffer
class ExperienceReplayBuffer:
    """Fixed-size ring buffer of (state, action, reward, next_state, done) transitions.

    Transitions are appended until ``capacity`` is reached; after that every
    new transition overwrites the oldest stored one.
    """

    def __init__(self, capacity=10000):
        """Create an empty buffer.

        Args:
            capacity: maximum number of transitions kept; must be positive.

        Raises:
            ValueError: if ``capacity`` is not positive (a zero capacity would
                otherwise only fail later, inside ``add``).
        """
        if capacity <= 0:
            raise ValueError(f"capacity must be positive, got {capacity}")
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # index of the slot the next transition is written to

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def add(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest one when the buffer is full."""
        transition = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            # Still filling up: position always equals len(buffer) here.
            self.buffer.append(transition)
        else:
            self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity  # wrap around

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            An iterator yielding five tuples, in this order: states, actions,
            rewards, next_states, dones.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored
                transitions (or is negative, raised by ``random.sample``).
        """
        if batch_size > len(self.buffer):
            raise ValueError(
                f"cannot sample {batch_size} transitions from a buffer holding {len(self.buffer)}"
            )
        return zip(*random.sample(self.buffer, batch_size))

# Preprocessing function for the state
def preprocess_state(state):
    """Turn an RGB frame into a normalized 84x84 grayscale image.

    Args:
        state: RGB image array accepted by ``cv2.cvtColor`` with
            ``cv2.COLOR_RGB2GRAY``.

    Returns:
        A float32 array of shape (84, 84) whose values are the resized
        grayscale pixels divided by 255.
    """
    target_size = (84, 84)
    luminance = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
    downscaled = cv2.resize(luminance, target_size)
    scaled = downscaled.astype(np.float32) / 255.0
    return scaled

# plots
def plot_results(episode_rewards, episode_lengths, episode_losses, save_path=None):
    """Plot per-episode reward, length and loss as three stacked panels.

    Args:
        episode_rewards: sequence of total rewards, one entry per episode.
        episode_lengths: sequence of step counts, one entry per episode.
        episode_losses: sequence of loss values, one entry per episode.
        save_path: optional file path. When given, the figure is written there
            *before* ``plt.show()`` is called. Saving after ``show()`` is not
            reliable because ``show()`` may close the figure (the Jupyter
            inline backend does), leaving ``plt.savefig`` with an empty canvas.
    """
    # (data, legend label, y-axis label) for each panel, top to bottom.
    panels = (
        (episode_rewards, "Total Reward", "Total Reward"),
        (episode_lengths, "Episode Length", "Length"),
        (episode_losses, "Loss", "Loss"),
    )

    plt.figure(figsize=(12, 8))

    for row, (series, legend_label, y_label) in enumerate(panels, start=1):
        plt.subplot(len(panels), 1, row)
        plt.plot(series, label=legend_label)
        plt.xlabel("Episode")
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()

In [None]:
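# NOTE: disabled experiment (prioritized experience replay). The active training cell uses ExperienceReplayBuffer instead.
# class PrioritizedExperienceReplayBuffer: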
#     def __init__(self, capacity=10000, alpha=0.6, beta=0.4, beta_increment_per_sampling=0.001, epsilon=0.01):
#         self.capacity = capacity
#         self.alpha = alpha
#         self.beta = beta
#         self.beta_increment_per_sampling = beta_increment_per_sampling
#         self.epsilon = epsilon
#         self.buffer = []
#         self.priorities = np.zeros((capacity,), dtype=np.float32)
#         self.position = 0
#         self.priorities_sum = 1.0

#     def add(self, state, action, reward, next_state, done):
#         priority = np.max(self.priorities) if self.buffer else 1.0
#         experience = (state, action, reward, next_state, done)
#         if len(self.buffer) < self.capacity:
#             self.buffer.append(experience)
#         else:
#             self.buffer[self.position] = experience
#         self.priorities[self.position] = priority
#         self.position = (self.position + 1) % self.capacity

#     def sample(self, batch_size):
#         if len(self.buffer) == self.capacity:
#             priorities = self.priorities
#         else:
#             priorities = self.priorities[:self.position]
#         probabilities = priorities ** self.alpha
#         probabilities /= np.sum(probabilities)  # Normalizing probabilities to sum up to 1
#         indices = np.random.choice(len(self.buffer), size=batch_size, p=probabilities)
#         samples = [self.buffer[idx] for idx in indices]
#         weights = (len(self.buffer) * probabilities[indices]) ** (-self.beta)
#         weights /= weights.max()
#         self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
#         return samples, indices, weights

#     def update_priorities(self, indices, priorities):
#         self.priorities[indices] = priorities
#         self.priorities_sum = np.sum(self.priorities)

In [None]:
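# NOTE: disabled training loop for the prioritized replay buffer. Before re-enabling it, define
# `alpha` and `beta`: they are passed to the buffer constructor but not assigned in any visible cell.
# # Training metrics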
# episode_rewards = []
# episode_lengths = []
# episode_losses = []

# buffer = PrioritizedExperienceReplayBuffer(capacity=10000, alpha=alpha, beta=beta)

# record_env = None
# videos_dir = "videos"
# os.makedirs(videos_dir, exist_ok=True)

# best_reward = -float('inf')

# for episode in range(num_episodes):
#     if (episode + 1) % 25 == 0:
#         if record_env is not None:
#             record_env.close()
#         video_path = os.path.join(videos_dir, f"episode_{episode + 1}")
#         record_env = gym.wrappers.RecordVideo(env, video_path, episode_trigger=lambda episode_id: True, video_length=0)
#     else:
#         env.reset()

#     state, info = env.reset()
#     state = preprocess_state(state)
#     state = np.stack([state] * 4, axis=2)
#     done = False
#     total_reward = 0
#     step = 0
#     losses = []

#     while not done:
#         if np.random.rand() <= epsilon:
#             action = env.action_space.sample()
#         else:
#             action = np.argmax(model.predict(np.expand_dims(state, axis=0), verbose=0))

#         for _ in range(frame_skip):
#             next_state, reward, done, truncated, info = env.step(action)
#             done = done or truncated
#             total_reward += reward
#             if done:
#                 break

#         next_state, reward, done, truncated, info = env.step(action)
#         done = done or truncated

#         next_state = preprocess_state(next_state)
#         next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, axis=2), axis=2)

#         total_reward += reward

#         buffer.add(state, action, reward, next_state, done)

#         state = next_state
#         step += 1

#         if len(buffer.buffer) > batch_size:
#             samples, indices, weights = buffer.sample(batch_size)
#             states, actions, rewards, next_states, dones = zip(*samples)

#             future_rewards = target_model.predict(np.array(next_states), verbose=0)
#             dones = np.array(dones, dtype=int)
#             updated_q_values = rewards + discount_factor * np.max(future_rewards, axis=1) * (1 - dones)

#             masks = tf.one_hot(actions, action_space_n)

#             with tf.GradientTape() as tape:
#                 q_values = model(np.array(states))
#                 q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
#                 loss = huber_loss(updated_q_values, q_action)
#                 weighted_loss = tf.reduce_mean(loss * weights)  # Weighted loss
#                 losses.append(weighted_loss.numpy())

#             grads = tape.gradient(weighted_loss, model.trainable_variables)
#             optimizer.apply_gradients(zip(grads, model.trainable_variables))

#             buffer.update_priorities(indices, loss.numpy())

#     episode_rewards.append(total_reward)
#     episode_lengths.append(step)
#     episode_losses.append(np.mean(losses))

#     print(f"Episode {episode + 1} abgeschlossen mit {step} Schritten, Gesamtbelohnung: {total_reward}, Epsilon: {epsilon}")

#     if episode % 10 == 0:
#         target_model.set_weights(model.get_weights())

#     epsilon = max(epsilon_min, epsilon * epsilon_decay)

#     if total_reward > best_reward:
#         best_reward = total_reward
#         model.save("best_pong_v5_rl_model.keras")

#     if (episode + 1) % 100 == 0:
#         model.save(f"pong_v5_rl_model_episode_{episode + 1}.keras")

#     if (episode + 1) % 5 == 0:
#         plot_results(episode_rewards, episode_lengths, episode_losses)

# if record_env is not None:
#     record_env.close()

# model.save("pong_rl_model.keras")

Training

In [5]:
# training metrics (one entry per finished episode)
episode_rewards = []
episode_lengths = []
episode_losses = []

buffer = ExperienceReplayBuffer()
best_reward = -float('inf')

for episode in range(num_episodes):
    # Reset once per episode. A second back-to-back reset only throws away the first
    # observation and repeats the wrapper's reset work.
    state, info = env.reset()
    state = preprocess_state(state)  # 84x84 grayscale frame
    state = np.stack([state] * 4, axis=2)  # initial stack: the first frame repeated 4 times
    done = False
    total_reward = 0
    step = 0  # step counter for the episode
    losses = []

    while not done:  # one iteration per environment step
        if np.random.rand() <= epsilon:  # epsilon-greedy: explore
            action = env.action_space.sample()
        else:  # exploit: greedy action of the online network
            # Call the model directly instead of model.predict(): for a single input the
            # direct call avoids predict()'s per-call overhead and returns the same values.
            q_current = model(np.expand_dims(state, axis=0), training=False)
            action = np.argmax(q_current.numpy())

        # Frame skipping logic
        # for _ in range(frame_skip):
        #     next_state, reward, done, truncated, info = env.step(action)
        #     done = done or truncated
        #     total_reward += reward
        #     if done:
        #         break

        next_frame, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated  # either one ends the episode loop

        # Drop the oldest frame of the stack and append the newest one.
        next_frame = preprocess_state(next_frame)
        next_state = np.append(state[:, :, 1:], np.expand_dims(next_frame, axis=2), axis=2)

        total_reward += reward

        # Store `terminated`, not `done`: a truncated episode did not reach a terminal state,
        # so its last transition must still bootstrap from next_state in the Q-target.
        buffer.add(state, action, reward, next_state, terminated)

        state = next_state
        step += 1

        if len(buffer.buffer) > batch_size:
            states, actions, rewards, next_states, dones = buffer.sample(batch_size)

            # Q-learning target: r + gamma * max_a' Q_target(s', a'), without the bootstrap
            # term for terminal transitions.
            future_rewards = target_model(np.array(next_states), training=False).numpy()
            dones = np.array(dones, dtype=int)
            rewards = np.asarray(rewards, dtype=np.float32)

            updated_q_values = rewards + discount_factor * np.max(future_rewards, axis=1) * (1 - dones)

            masks = tf.one_hot(actions, action_space_n)  # selects the Q-value of the action taken

            with tf.GradientTape() as tape:
                q_values = model(np.array(states))
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = huber_loss(updated_q_values, q_action)

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            losses.append(loss.numpy())

        #print(f"Episode: {episode + 1}, Step: {step}, Step Reward: {reward}, Total Reward: {total_reward}, Epsilon: {epsilon}")

    episode_rewards.append(total_reward)
    episode_lengths.append(step)
    # np.mean([]) returns nan and emits a RuntimeWarning; record nan explicitly when no
    # gradient step ran during the episode.
    episode_losses.append(np.mean(losses) if losses else float('nan'))

    print(f"Episode {episode + 1} abgeschlossen mit {step} Schritten, Gesamtbelohnung: {total_reward}, Epsilon: {epsilon}")

    # Sync the target network with the online network every 10 episodes.
    if episode % 10 == 0:
        target_model.set_weights(model.get_weights())

    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    if total_reward > best_reward:  # save best model so far
        best_reward = total_reward
        model.save("best_pong_v5_rl_model.keras")

    if (episode + 1) % 100 == 0:  # checkpoint every 100 episodes
        model.save(f"pong_v5_rl_model_episode_{episode + 1}.keras")

    if (episode + 1) % 50 == 0:
        plot_results(episode_rewards, episode_lengths, episode_losses)
        # NOTE(review): plot_results() ends with plt.show(); with the Jupyter inline backend that
        # closes the figure, so this savefig most likely writes an empty image. The figure has
        # to be saved before show() is called to get a usable PNG.
        plt.savefig(f"pong_v5_rl_model_episode_{episode + 1}.png")

# NOTE(review): env is not closed in this cell; the RecordVideo wrapper may only finalize its
# last recording once env.close() is called.
model.save("pong_rl_model.keras")  # final model

  logger.warn(


KeyboardInterrupt: 