In [9]:
# Step 0: Install dependencies
#!pip install tensorflow
#!pip install gym
#!pip install keras
!pip install keras-rl2

Collecting keras-rl2
  Downloading keras_rl2-1.0.5-py3-none-any.whl.metadata (304 bytes)
Downloading keras_rl2-1.0.5-py3-none-any.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.1/52.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras-rl2
Successfully installed keras-rl2-1.0.5


In [2]:
# Step 1: Import dependencies
import tensorflow as tf
import gymnasium as gym
import keras
import random

In [3]:
# Step 2: Create the CartPole environment (gymnasium is imported as `gym`)
env = gym.make("CartPole-v1")

# Step 3: Length of the observation vector the Environment hands to the Agent
states = env.observation_space.shape[0]
# Step 4: Number of discrete actions the Agent can choose from
actions = env.action_space.n

# Report both sizes
print("States:", states)
print("Actions:", actions)

States: 4
Actions: 2


In [4]:
# Step 5: Play a few episodes with a random policy to sanity-check the environment

# Number of trials for playing the game
episodes = 10


for episode in range(1, episodes + 1):

  # Reset the Environment and the per-episode bookkeeping.
  # gymnasium's reset() returns an (observation, info) pair, so unpack it
  # instead of binding the whole tuple to `state`.
  state, info = env.reset()
  done = False
  score = 0

  # while episode is not done
  while not done:

    # Render only when the environment was created with a render mode.
    # Without one, render() has nothing to draw and gymnasium logs a warning
    # on every single step.
    if getattr(env, "render_mode", None) is not None:
      env.render()

    # Establishing random actions by the Agent
    action = env.action_space.sample()

    # Applying the random action of the Agent to the Environment
    # Unpack the five values from the step() method
    n_state, reward, terminated, truncated, info = env.step(action)

    score += reward

    # The episode ends either on failure (terminated) or on the time limit
    # (truncated); combine both into one "done" flag
    done = terminated or truncated

  print("Episode: {}, Score: {}".format(episode, score))

Episode: 1, Score: 20.0
Episode: 2, Score: 16.0
Episode: 3, Score: 25.0
Episode: 4, Score: 29.0
Episode: 5, Score: 30.0
Episode: 6, Score: 21.0
Episode: 7, Score: 15.0
Episode: 8, Score: 12.0
Episode: 9, Score: 14.0
Episode: 10, Score: 12.0


  gym.logger.warn(


In [1]:
# Step 6: Create a DQN model with Keras
import os
import shutil
from collections import deque

import gym
import numpy as np
import tensorflow as tf
from gym.wrappers import RecordVideo
from tensorflow.keras import layers, Model, optimizers

# 1. Define the Q-Network using Keras's Functional API.
def create_q_model(num_states, num_actions):
    """Build a small fully-connected network that outputs one value per action.

    Args:
        num_states: Length of the input vector fed to the network.
        num_actions: Number of units in the linear output layer.

    Returns:
        An uncompiled Keras ``Model`` with two 24-unit ReLU hidden layers
        followed by a linear output layer of ``num_actions`` units.
    """
    state_in = layers.Input(shape=(num_states,))

    # Stack the hidden layers in order; both use the same width and activation.
    hidden = state_in
    for units in (24, 24):
        hidden = layers.Dense(units, activation='relu')(hidden)

    # Linear head: the outputs are unbounded value estimates, not probabilities.
    q_out = layers.Dense(num_actions, activation='linear')(hidden)
    return Model(inputs=state_in, outputs=q_out)

# 2. A simple replay buffer to store experiences.
class ReplayBuffer:
    """Fixed-capacity FIFO store of experiences with uniform random sampling.

    Once ``max_size`` items are held, adding another silently discards the
    oldest one.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, max_size=10000):
        """Create an empty buffer.

        Args:
            max_size: Maximum number of experiences retained at any time.
        """
        # Capacity requested by the caller (kept for introspection).
        self.max_size = max_size
        # deque(maxlen=...) evicts the oldest entry in O(1) when full, whereas
        # popping index 0 of a list shifts every remaining element.
        self.buffer = deque(maxlen=max_size)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

    def add(self, experience):
        """Append ``experience``; the oldest entry is dropped when full."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct experiences chosen uniformly at random.

        Args:
            batch_size: Number of experiences to draw (without replacement).

        Returns:
            A list of the selected experiences.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored items.
        """
        available = len(self.buffer)
        if batch_size > available:
            raise ValueError(
                f"Cannot sample {batch_size} experiences from a buffer holding {available}."
            )
        indices = np.random.choice(available, size=batch_size, replace=False)
        return [self.buffer[i] for i in indices]

# 3. Hyperparameters for training.
SEED = 42                # Seed for NumPy, TensorFlow and the action-space sampler.
num_episodes = 100       # Total training episodes.
batch_size = 64          # Experiences drawn from the replay buffer per gradient step.
gamma = 0.99             # Discount factor.
epsilon = 1.0            # Starting exploration rate.
epsilon_min = 0.01       # Epsilon is no longer decayed once it reaches this value.
epsilon_decay = 0.995    # Multiplicative decay applied to epsilon after each episode.
learning_rate = 1e-3     # Step size passed to the Adam optimizer.

# Seed the generators this notebook draws from so that weight initialisation,
# epsilon-greedy draws and replay sampling repeat across runs.
# NOTE: the environment's own generator (initial states) is not seeded here.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 4. Create the Gym environment.
env_name = "CartPole-v1"
# For training, use a standard (non-recording) environment.
env = gym.make(env_name)
env.action_space.seed(SEED)  # makes env.action_space.sample() repeatable
num_actions = env.action_space.n
num_states = env.observation_space.shape[0]

# 5. Build the primary and target Q-networks.
# Both start from identical weights; the target copy is refreshed from the
# primary network periodically by the training loop.
primary_network = create_q_model(num_states, num_actions)
target_network  = create_q_model(num_states, num_actions)
target_network.set_weights(primary_network.get_weights())

optimizer = optimizers.Adam(learning_rate=learning_rate)
replay_buffer = ReplayBuffer(max_size=10000)

# 6. Define a training step function.
def train_step(batch):
    """Apply one gradient update to ``primary_network`` from a batch of experiences.

    Args:
        batch: Sequence of ``(state, action, reward, next_state, done)`` tuples.

    Returns:
        The scalar mean-squared-error loss tensor for this batch.
    """
    # Transpose the list of tuples into one array per field.
    states, actions, rewards, next_states, dones = (
        np.array(column) for column in zip(*batch)
    )

    # Bootstrapped targets come from the target network and are computed
    # outside the tape, so no gradient flows through them.
    # Note: using .numpy() here assumes eager execution.
    best_next_q = target_network(next_states).numpy().max(axis=1)
    target_q = rewards + (1 - dones) * gamma * best_next_q

    with tf.GradientTape() as tape:
        all_q = primary_network(states)
        # A one-hot mask keeps only the Q-value of the action actually taken.
        taken = tf.one_hot(actions, num_actions)
        predicted_q = tf.reduce_sum(all_q * taken, axis=1)
        td_error = target_q - predicted_q
        loss = tf.reduce_mean(tf.square(td_error))

    weights = primary_network.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, weights), weights))
    return loss

# 7. A helper function to run one episode (returns total reward).
def run_episode(env, epsilon):
    """Play one episode with an epsilon-greedy policy and return the summed reward.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``action_space.sample()``. Both the 4-tuple and the 5-tuple
            ``step`` return conventions are accepted.
        epsilon: Probability of taking a random action on each step; otherwise
            the action with the highest ``primary_network`` output is taken.

    Returns:
        Sum of the rewards received until the episode ends.
    """
    observation = env.reset()
    # Newer Gym versions return (observation, info); keep only the observation.
    if isinstance(observation, tuple):
        observation = observation[0]

    episode_return = 0
    finished = False
    while not finished:
        explore = np.random.rand() < epsilon
        if explore:
            action = env.action_space.sample()
        else:
            # The network expects a leading batch dimension.
            net_input = np.expand_dims(observation, axis=0).astype(np.float32)
            action = int(np.argmax(primary_network(net_input).numpy()[0]))

        outcome = env.step(action)
        if len(outcome) == 5:
            # New API: termination and truncation are reported separately.
            observation_next, reward, terminated, truncated, _ = outcome
            finished = terminated or truncated
        else:
            # Old API: a single done flag.
            observation_next, reward, finished, _ = outcome

        episode_return += reward
        observation = observation_next
    return episode_return

# 8. Function to record one episode using a given environment.
def record_episode(video_folder, epsilon):
    """Record one episode to ``video_folder`` and return its total reward.

    Any existing content at ``video_folder`` is deleted first, then a fresh
    environment wrapped in ``RecordVideo`` is played for a single episode.

    Args:
        video_folder: Directory that will receive the recording.
        epsilon: Exploration rate forwarded to ``run_episode``.

    Returns:
        The total reward returned by ``run_episode``.
    """
    # Remove previous video files if any. Done in pure Python so it works on
    # every platform and the path is never interpreted by a shell.
    if os.path.isdir(video_folder):
        shutil.rmtree(video_folder)
    elif os.path.exists(video_folder):
        os.remove(video_folder)
    os.makedirs(video_folder, exist_ok=True)

    # Wrap a new environment with video recording (every episode is recorded).
    video_env = RecordVideo(gym.make(env_name), video_folder=video_folder, episode_trigger=lambda x: True)
    try:
        return run_episode(video_env, epsilon)
    finally:
        # Close even if the episode raises, so the recorder is always released.
        video_env.close()

# 9. Baseline recording before any training: with epsilon=1.0 every action
#    is sampled at random.
pre_video_folder = "./video_pre"
print("Recording pre-training episode (random policy)...")
pre_reward = record_episode(video_folder=pre_video_folder, epsilon=1.0)
print(f"Pre-training episode reward: {pre_reward}")

# 10. Main DQN training loop.
print("Starting training...")
for episode in range(num_episodes):
    state = env.reset()
    # Newer Gym versions return (observation, info); keep only the observation.
    if isinstance(state, tuple):
        state = state[0]
    total_reward = 0
    done = False
    while not done:
        # Epsilon-greedy action selection.
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            state_input = np.expand_dims(state, axis=0).astype(np.float32)
            q_vals = primary_network(state_input)
            action = int(np.argmax(q_vals.numpy()[0]))

        step_result = env.step(action)
        if len(step_result) == 5:
            # New API: termination and truncation are reported separately.
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            # Old API: one done flag; the time-limit wrapper marks a cut-off
            # episode in info under "TimeLimit.truncated".
            next_state, reward, done, info = step_result
            terminated = done and not info.get("TimeLimit.truncated", False)

        total_reward += reward
        # Store `terminated`, not `done`: an episode cut off by the time limit
        # did not reach a terminal state, so its target must still bootstrap
        # from next_state instead of being treated as worth zero.
        replay_buffer.add((state, action, reward, next_state, float(terminated)))
        state = next_state

        # Learn once enough experiences are available to fill a batch.
        if len(replay_buffer.buffer) >= batch_size:
            batch = replay_buffer.sample(batch_size)
            _ = train_step(batch)

    # Decay epsilon, clamping so the last decay step cannot undershoot the floor.
    if epsilon > epsilon_min:
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Update the target network every 10 episodes.
    if episode % 10 == 0:
        target_network.set_weights(primary_network.get_weights())

    print(f"Episode: {episode}, Total Reward: {total_reward}, Epsilon: {epsilon:.3f}")

env.close()
print("Training complete.")

# 11. Recording after training: epsilon=0 disables exploration, so every
#     action is the greedy choice of the trained network.
post_video_folder = "./video_post"
print("Recording post-training episode (trained policy)...")
post_reward = record_episode(video_folder=post_video_folder, epsilon=0.0)
print(f"Post-training episode reward: {post_reward}")

# 12. (Optional) Display one of the videos in Colab.
from IPython.display import HTML
from base64 import b64encode
import glob

def show_video(video_folder):
    """Build an inline HTML player for the first ``.mp4`` file in a folder.

    Args:
        video_folder: Directory searched (non-recursively) for ``*.mp4`` files.

    Returns:
        An ``HTML`` object embedding the video as a base64 data URL, or the
        string ``"No video found."`` when the folder holds no ``.mp4`` file.
    """
    # Sort so that "first" is the same file on every run; glob order is
    # otherwise whatever the filesystem happens to return.
    videos = sorted(glob.glob(os.path.join(video_folder, "*.mp4")))
    if not videos:
        return "No video found."

    # Read through a context manager so the file handle is closed promptly.
    with open(videos[0], "rb") as video_file:
        encoded = b64encode(video_file.read()).decode()
    video_url = "data:video/mp4;base64," + encoded
    return HTML(f'<video width="400" controls><source src="{video_url}" type="video/mp4"></video>')

# Display the pre- and post-training videos inline, in that order (e.g. in a Colab cell).
for folder in (pre_video_folder, post_video_folder):
    display(show_video(folder))


  deprecation(
  deprecation(
  logger.warn(
  logger.deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


Recording pre-training episode (random policy)...


  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
  if not isinstance(terminated, (bool, np.bool8)):
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


Pre-training episode reward: 18.0
Starting training...
Episode: 0, Total Reward: 21.0, Epsilon: 0.995
Episode: 1, Total Reward: 21.0, Epsilon: 0.990
Episode: 2, Total Reward: 10.0, Epsilon: 0.985
Episode: 3, Total Reward: 14.0, Epsilon: 0.980
Episode: 4, Total Reward: 21.0, Epsilon: 0.975
Episode: 5, Total Reward: 26.0, Epsilon: 0.970
Episode: 6, Total Reward: 26.0, Epsilon: 0.966
Episode: 7, Total Reward: 31.0, Epsilon: 0.961
Episode: 8, Total Reward: 13.0, Epsilon: 0.956
Episode: 9, Total Reward: 56.0, Epsilon: 0.951
Episode: 10, Total Reward: 32.0, Epsilon: 0.946
Episode: 11, Total Reward: 15.0, Epsilon: 0.942
Episode: 12, Total Reward: 21.0, Epsilon: 0.937
Episode: 13, Total Reward: 17.0, Epsilon: 0.932
Episode: 14, Total Reward: 10.0, Epsilon: 0.928
Episode: 15, Total Reward: 16.0, Epsilon: 0.923
Episode: 16, Total Reward: 14.0, Epsilon: 0.918
Episode: 17, Total Reward: 22.0, Epsilon: 0.914
Episode: 18, Total Reward: 25.0, Epsilon: 0.909
Episode: 19, Total Reward: 17.0, Epsilon: 0

  deprecation(
  deprecation(
  logger.warn(
  logger.deprecation(
  if not isinstance(terminated, (bool, np.bool8)):
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


Post-training episode reward: 184.0
