#### **DDQN with Hyperparameter setting**

This is an example of Hyperparameter setting for CARTPOLE. <br>
With the actual setup it can take many hours (even with a GPU) <br>
Press the colab icon to run it in COLAB (in this way you save your laptop for other tasks)

<a target="_blank" href="https://colab.research.google.com/github/castorgit/RL_course/blob/main/00_LunarLander-COLAB_render.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>


In [1]:
import tensorflow as tf
import keras as k
print(k.__version__)
print(tf.__version__)

2024-12-21 09:53:46.964305: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-21 09:53:46.976206: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-21 09:53:46.990538: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-21 09:53:46.994798: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-21 09:53:47.005543: I tensorflow/core/platform/cpu_feature_guar

3.6.0
2.17.1


In [2]:
#### **Install Packages**
!pip install gymnasium[box2d]
!pip install numpy
!pip install random
!pip install matplotlib
!pip install tensorflow==2.17.1
!pip install keras==3.6.0
!pip pandas as pd

[1;31merror[0m: [1mexternally-managed-environment[0m

[31m×[0m This environment is externally managed
[31m╰─>[0m To install Python packages system-wide, try apt install
[31m   [0m python3-xyz, where xyz is the package you are trying to
[31m   [0m install.
[31m   [0m 
[31m   [0m If you wish to install a non-Debian-packaged Python package,
[31m   [0m create a virtual environment using python3 -m venv path/to/venv.
[31m   [0m Then use path/to/venv/bin/python and path/to/venv/bin/pip. Make
[31m   [0m sure you have python3-full installed.
[31m   [0m 
[31m   [0m If you wish to install a non-Debian packaged Python application,
[31m   [0m it may be easiest to use pipx install xyz, which will manage a
[31m   [0m virtual environment for you. Make sure you have pipx installed.
[31m   [0m 
[31m   [0m See /usr/share/doc/python3.12/README.venv for more information.

[1;35mnote[0m: If you believe this is a mistake, please contact your Python installation or OS dist

In [14]:
# Imports
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow warnings
import gymnasium as gym
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import deque
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
import time
from PIL import Image
from sklearn.model_selection import ParameterGrid
import pandas as pd

import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


#### **Environment Setup**

In [2]:
env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]  # 8 state variables
action_size = env.action_space.n  # 4 discrete actions
tf.random.set_seed(221)

#### **hyperparameter search space**

In [3]:
param_grid = {
    'learning_rate': [0.0001],
    'gamma': [0.99],
    'batch_size': [128],
    'tau': [0.1],
    'epsilon_decay': [0.99],
    'retrain_steps': [15, 10] 
}

In [4]:
learning_rate = 0.0001
gamma = 0.98
batch_size = 128
epsilon_start = 1.0
epsilon_min = 0.01
epsilon_decay = 0.99
tau = 0.15  # For soft target network update
buffer_capacity = 10000
max_episodes = 100                  # We train only 50 to see how the agent learns
max_steps = 1200
solved_threshold = 195
verbose = 0                        # 0: No trace 1: Trace

In [5]:
# Replay buffer
replay_buffer = deque(maxlen=buffer_capacity)

# Add experience to replay buffer
def store_experience(state, action, reward, next_state, done):
    replay_buffer.append((state, action, reward, next_state, done))

# Sample experiences from the replay buffer
def sample_experiences(batch_size):
    indices = np.random.choice(len(replay_buffer), batch_size, replace=False)
    batch = [replay_buffer[i] for i in indices]
    states, actions, rewards, next_states, dones = zip(*batch)
    
    return (
        np.vstack(states),
        np.array(actions),
        np.array(rewards),
        np.vstack(next_states),
        np.array(dones, dtype=np.float32)
    )

#### **Neural Network definition**

In [6]:
# Build the neural network
def build_model(state_size, action_size):
    inputs = Input(shape=(state_size,))  
    x = Dense(16, activation="relu")(inputs)
    x = Dense(64, activation="relu")(x)
    x = Dense(16, activation="relu")(x)
    outputs = Dense(action_size, activation="linear")(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss="mse")
    return model

#### **Support Functions**

In [7]:
# Soft update function for the target network
def soft_update(model, target_model, tau):
    target_weights = target_model.get_weights()
    model_weights = model.get_weights()
    new_weights = [
        tau * mw + (1 - tau) * tw for mw, tw in zip(model_weights, target_weights)
    ]
    target_model.set_weights(new_weights)

# Polski Optimization Function
def polski_optimization(weights, beta=0.01):
    return [w * (1 - beta) for w in weights]

# Double DQN target calculation
def experience_replay_with_ddqn(model, target_model, batch_size, gamma, tau, step):
    if len(replay_buffer) < batch_size:
        return

    states, actions, rewards, next_states, dones = sample_experiences(batch_size)

    # Predict Q-values for next states using both networks
    next_q_values = model.predict(next_states, verbose=0)
    best_actions = np.argmax(next_q_values, axis=1)
    target_q_values = target_model.predict(next_states, verbose=0)

    # Update Q-values using Double DQN formula
    targets = rewards + gamma * target_q_values[np.arange(batch_size), best_actions] * (1 - dones)

    # Update main Q-network
    q_values = model.predict(states, verbose=0)
    q_values[np.arange(batch_size), actions] = targets
    model.fit(states, q_values, epochs=1, verbose=0)

    # Apply soft update to target network
    if step % retrain_steps == 0:   
        soft_update(model, target_model, tau)

In [8]:
# Initialize models
#model = build_model(state_size, action_size)
#target_model = build_model(state_size, action_size)
#target_model.set_weights(model.get_weights())  # Sync target network initially


In [12]:
# Training loop
def train_ddqn_with_params(params, num_episodes, verbose):
    
    global learning_rate, gamma, batch_size, tau, epsilon_decay, retrain_steps
    learning_rate = params['learning_rate']
    gamma = params['gamma']
    batch_size = params['batch_size']
    tau = params['tau']
    epsilon_decay = params['epsilon_decay']
    retrain_steps = params['retrain_steps']
    
    print('**************************************************************')
    print('lr:', learning_rate, 'gamma', gamma, 'batch_size', batch_size, 'tau ', tau, 'epsilon decay', epsilon_decay,
                 'retrain_steps ', retrain_steps)
    
    # Initialize models
    model = build_model(state_size, action_size)
    target_model = build_model(state_size, action_size)
    target_model.set_weights(model.get_weights())  # Sync target network initially
    

    epsilon = epsilon_start
    episode_rewards = []
    rolling_avg_rewards = []

    start_time = time.time()

    for episode in range(max_episodes):
        state, _ = env.reset()
        state = np.reshape(state, [1, state_size])
        total_reward = 0

        for step in range(max_steps):
            # Epsilon-greedy policy
            if np.random.rand() <= epsilon:
                action = np.random.randint(action_size)  # Explore
            else:
                action_vals = model.predict(state, verbose=0)
                action = np.argmax(action_vals[0])  # Exploit

            # Perform action
            next_state, reward, done, _, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            total_reward += reward

            # Store experience
            store_experience(state, action, reward, next_state, done)

            # Update state
            state = next_state

            # Train using experience replay
            experience_replay_with_ddqn(model, target_model, batch_size, gamma, tau, step)

            if done:
                break

        # Decay epsilon
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        # Record reward
        episode_rewards.append(total_reward)
        rolling_avg = np.mean(episode_rewards[-20:])
        rolling_avg_rewards.append(rolling_avg)

        # Print progress
        if verbose:
           print(f"Episode: {episode+1:3}/{max_episodes}, Reward: {total_reward:+7.2f}, "
                f"Epsilon: {epsilon:.2f}, Rolling Avg: {rolling_avg:4.2f}, Steps: {step:3}")

        # Check if environment is solved
        if rolling_avg >= solved_threshold:
            print(f"Environment solved in {episode+1} episodes!")
            model.save("XXX_ddqn_model1.keras")
            break

    end_time = time.time()
    print(f"Rewards Rolling Avg: {rolling_avg:4.2f}")
    print(f"Training completed in {(end_time - start_time)/60:.2f} minutes")
    print(learning_rate, gamma, batch_size, tau, epsilon_decay, retrain_steps)
    
    return (rolling_avg)

In [15]:
# Perform grid search
results = []
for params in ParameterGrid(param_grid):
    avg_reward = train_ddqn_with_params(params, 50, verbose)
    results.append({**params, 'avg_reward': avg_reward})

# Convert results to DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='avg_reward', ascending=False)

# Display top results
print("Top Hyperparameters:")
print(results_df.head())

**************************************************************
lr: 0.0001 gamma 0.99 batch_size 128 tau  0.15 epsilon decay 0.995 retrain_steps  30


I0000 00:00:1734794748.877377    1307 service.cc:146] XLA service 0x7f6a30005370 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1734794748.877401    1307 service.cc:154]   StreamExecutor device (0): NVIDIA T600 Laptop GPU, Compute Capability 7.5
I0000 00:00:1734794750.925746    1307 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Rewards Rolling Avg: 17.50
Training completed in 2.00 minutes
0.0001 0.99 128 0.15 0.995 30
**************************************************************
lr: 0.0001 gamma 0.99 batch_size 128 tau  0.1 epsilon decay 0.995 retrain_steps  30
Rewards Rolling Avg: 30.35
Training completed in 3.45 minutes
0.0001 0.99 128 0.1 0.995 30
**************************************************************
lr: 0.0001 gamma 0.99 batch_size 128 tau  0.001 epsilon decay 0.995 retrain_steps  30
Rewards Rolling Avg: 19.75
Training completed in 3.04 minutes
0.0001 0.99 128 0.001 0.995 30
**************************************************************
lr: 0.0001 gamma 0.99 batch_size 128 tau  0.15 epsilon decay 0.995 retrain_steps  15
Rewards Rolling Avg: 18.00
Training completed in 2.46 minutes
0.0001 0.99 128 0.15 0.995 15
**************************************************************
lr: 0.0001 gamma 0.99 batch_size 128 tau  0.1 epsilon decay 0.995 retrain_steps  15
Rewards Rolling Avg: 19.70
Training compl

Rewards Rolling Avg: 35.80
Training completed in 4.02 minutes
0.0001 0.99 128 0.001 0.99 5
**************************************************************
lr: 0.001 gamma 0.99 batch_size 128 tau  0.15 epsilon decay 0.99 retrain_steps  30
Rewards Rolling Avg: 41.30
Training completed in 5.18 minutes
0.001 0.99 128 0.15 0.99 30
**************************************************************
lr: 0.001 gamma 0.99 batch_size 128 tau  0.1 epsilon decay 0.99 retrain_steps  30
Rewards Rolling Avg: 37.20
Training completed in 4.15 minutes
0.001 0.99 128 0.1 0.99 30
**************************************************************
lr: 0.001 gamma 0.99 batch_size 128 tau  0.001 epsilon decay 0.99 retrain_steps  30
Rewards Rolling Avg: 21.85
Training completed in 2.96 minutes
0.001 0.99 128 0.001 0.99 30
**************************************************************
lr: 0.001 gamma 0.99 batch_size 128 tau  0.15 epsilon decay 0.99 retrain_steps  15
Rewards Rolling Avg: 52.90
Training completed in 5.19 m

In [12]:
stop

NameError: name 'stop' is not defined

In [None]:
# Save the final model
model.save("lunarlander_ddqn_model1.keras")
files.download("lunarlander_ddqn_model1.keras")

In [None]:
# Plot rewards with rolling average
plt.figure(figsize=(10, 6))
plt.plot(episode_rewards, label='Rewards', color='blue')
plt.plot(rolling_avg_rewards, label='Rolling Avg (Last 100 Episodes)', color='orange')
plt.axhline(y=solved_threshold, color='red', linestyle='--', label='Solved Threshold')
plt.title('Double DQN Training Performance')
plt.xlabel('Episodes')
plt.ylabel('Rewards')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Testing for 50 episodes
start_time = time.time()

for e_test in range(50):  # Run 50 test episodes
    state, _ = env.reset()
    state = np.reshape(state, [1, state_size])
    total_reward = 0

    for t_test in range(max_steps):  # Use the same max_steps as training
        # Use the trained model for testing
        action_vals = model.predict(state, verbose=0)  # Predict action values
        action = np.argmax(action_vals[0])  # Choose the action with the highest Q-value

        next_state, reward, done, _, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        total_reward += reward
        state = next_state

        if done:
            print(f"Test Episode: {e_test + 1}/50, Reward: {total_reward:.2f}")
            break

end_time = time.time()
testing_duration = (end_time - start_time) / 60  # Convert to minutes
print(f"Testing completed in {testing_duration:.2f} minutes")


In [None]:
# Test the trained agent with video rendering
env = gym.make('LunarLander-v3', render_mode='rgb_array')  # Enable RGB rendering
frames = []  # Store frames for visualization

# Render a single test episode
state, _ = env.reset()
state = np.reshape(state, [1, state_size])
tot_rewards = 0

while True:
    # Use the trained model for action
    action_vals = model.predict(state, verbose=0)  # Predict action values
    action = np.argmax(action_vals[0])  # Choose the action with the highest Q-value

    next_state, reward, done, _, _ = env.step(action)
    frames.append(env.render())  # Save frame for rendering later
    next_state = np.reshape(next_state, [1, state_size])
    tot_rewards += reward
    state = next_state

    if done:
        print(f"Rendered Test Episode Reward: {tot_rewards:.2f}")
        break

env.close()

# Save the rendered episode as a GIF
def save_frames_as_gif(frames, path='./', filename='lunarlander_ddqn1.gif'):
    images = [Image.fromarray(frame) for frame in frames]
    gif_path = os.path.join(path, filename)
    images[0].save(gif_path, save_all=True, append_images=images[1:], duration=50, loop=0)
    print(f"Saved GIF to: {gif_path}")

save_frames_as_gif(frames, filename="lunarlander_ddqn1.gif")
files.download("lunarlander_ddqn1.gif")


In [None]:
Soft Updates Every Step: Use a small tau (e.g., 0.005).
Soft Updates Every N Steps/Episodes: Allows a higher tau (e.g., 0.1).
Hard Updates: Perform less frequently (e.g., every 1000 steps) target_model.set_weights(model.get_weights())