#### **CARTPOLE DDQN V2 (soft update Polyak + some improvements)**

This is a naive version vectorized of the DDQN Algorithm for the CARTPOLE env <br>
Vectorizing we obtain a version that is faster and it is 'functional' as it works <br>
Leave training for 300 episodes

The Double DQN (DDQN) uses two networks one as a target and one as the trained network <br>

The target network is a copy of the online network that is updated less frequently, <br>
and is used to generate the target values for the Q-learning update

It includes a soft update of the weights, instead of substituting the weights between the networks it blends them together using a Polyak averaging This was an improvement on the pure DDQN algorithm

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import gymnasium as gym
import numpy as np
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from collections import deque
import random

In [None]:
# Parameters
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
episodes = 300
batch_size = 64
gamma = 0.99  # Discount factor
epsilon = 1.0  # Initial exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
learning_rate = 0.001
memory = deque(maxlen=10000)  # Replay buffer size
train_every = 4  # Train the model every 4 steps

In [None]:
# Support Functions

# Build the neural network model

def build_model(state_size,action_size):
    model = Sequential()
    model.add(Dense(24, input_dim=state_size, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    model.compile(loss='mse',
                      optimizer=Adam(learning_rate=learning_rate))
    model.summary()
    return model



# Soft update function for target network
def soft_update(model, target_model, tau=0.1):
    target_weights = target_model.get_weights()
    model_weights = model.get_weights()
    new_weights = [
        tau * mw + (1 - tau) * tw for mw, tw in zip(model_weights, target_weights)
    ]
    target_model.set_weights(new_weights)

#### **Replay Function - DDQN This is the core of the DDQN Algorithm**

In [None]:
def replay(model, target_model, memory, batch_size, gamma):
    """
    Perform a replay step in Double Deep Q-Network (DDQN).

    Args:
        model (keras.Model): Main (online) model.
        target_model (keras.Model): Target model for stable Q-value estimation.
        memory (deque): Replay memory storing (state, action, reward, next_state, done) tuples.
        batch_size (int): Number of experiences to sample from memory.
        gamma (float): Discount factor for future rewards.
    """
    # Ensure there are enough samples in memory
    if len(memory) < batch_size:
        return

    # Sample a minibatch of experiences
    minibatch = random.sample(memory, batch_size)

    # Prepare batches
    states, next_states, actions, rewards, dones = [], [], [], [], []
    for state, action, reward, next_state, done in minibatch:
        states.append(state)
        next_states.append(next_state)
        actions.append(action)
        rewards.append(reward)
        dones.append(done)

    # Convert lists to numpy arrays for batch processing
    states = np.array(states)
    next_states = np.array(next_states)
    actions = np.array(actions)
    rewards = np.array(rewards)
    dones = np.array(dones)

    # Predict Q-values for current and next states
    q_values = model.predict(states, verbose=0)
    next_q_values_main = model.predict(next_states, verbose=0)  # Main model
    next_q_values_target = target_model.predict(next_states, verbose=0)  # Target model

    # Update Q-values for the sampled experiences
    for i in range(batch_size):
        if dones[i]:  # Terminal state
            q_values[i][actions[i]] = rewards[i]
        else:
            # Main model selects the best action for the next state
            best_next_action = np.argmax(next_q_values_main[i])
            # Target model evaluates the Q-value of that action
            q_values[i][actions[i]] = rewards[i] + gamma * next_q_values_target[i][best_next_action]

    # Train the main model with the updated Q-values
    model.fit(states, q_values, epochs=1, verbose=0)


In [1]:
# Main training loop

env = gym.make("CartPole-v1")

model = build_model()
target_model = build_model()
target_model.set_weights(model.get_weights())

for episode in range(episodes):
    state, _ = env.reset()
    state = np.reshape(state, [1, state_size])
    total_reward = 0
    done = False
    step = 0

    while not done:
        # Epsilon-greedy action selection
        if np.random.rand() <= epsilon:
            action = env.action_space.sample()
        else:
            q_values = model.predict(state, verbose=0)
            action = np.argmax(q_values[0])

        # Take action
        next_state, reward, done, _, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        total_reward += reward

        # Store experience in memory
        memory.append((state, action, reward, next_state, done))
        state = next_state

        # Train the model every `train_every` steps
        if step % train_every == 0:
            replay(model, target_model, memory, batch_size, gamma)

        step += 1

        if done:
            print(f"Episode: {episode + 1}/{episodes}, Score: {total_reward}, Epsilon: {epsilon:.2f}")
            break

    # Decay epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    # Update target network using soft updates
    soft_update(model, target_model)

# Close the environment
env.close()


2024-11-26 09:07:01.287580: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-26 09:07:01.297151: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-26 09:07:01.308917: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-26 09:07:01.312403: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-26 09:07:01.320813: I tensorflow/core/platform/cpu_feature_guar

Episode: 1/500, Score: 20.0, Epsilon: 1.00
Episode: 2/500, Score: 16.0, Epsilon: 0.99
Episode: 3/500, Score: 42.0, Epsilon: 0.99
Episode: 4/500, Score: 11.0, Epsilon: 0.99
Episode: 5/500, Score: 17.0, Epsilon: 0.98
Episode: 6/500, Score: 11.0, Epsilon: 0.98
Episode: 7/500, Score: 14.0, Epsilon: 0.97
Episode: 8/500, Score: 47.0, Epsilon: 0.97
Episode: 9/500, Score: 20.0, Epsilon: 0.96
Episode: 10/500, Score: 11.0, Epsilon: 0.96
Episode: 11/500, Score: 51.0, Epsilon: 0.95
Episode: 12/500, Score: 15.0, Epsilon: 0.95
Episode: 13/500, Score: 17.0, Epsilon: 0.94
Episode: 14/500, Score: 9.0, Epsilon: 0.94
Episode: 15/500, Score: 16.0, Epsilon: 0.93
Episode: 16/500, Score: 12.0, Epsilon: 0.93
Episode: 17/500, Score: 13.0, Epsilon: 0.92
Episode: 18/500, Score: 24.0, Epsilon: 0.92
Episode: 19/500, Score: 19.0, Epsilon: 0.91
Episode: 20/500, Score: 27.0, Epsilon: 0.91
Episode: 21/500, Score: 21.0, Epsilon: 0.90
Episode: 22/500, Score: 14.0, Epsilon: 0.90
Episode: 23/500, Score: 11.0, Epsilon: 0.9

Episode: 185/500, Score: 108.0, Epsilon: 0.40
Episode: 186/500, Score: 106.0, Epsilon: 0.40
Episode: 187/500, Score: 195.0, Epsilon: 0.39
Episode: 188/500, Score: 327.0, Epsilon: 0.39
Episode: 189/500, Score: 55.0, Epsilon: 0.39
Episode: 190/500, Score: 283.0, Epsilon: 0.39
Episode: 191/500, Score: 334.0, Epsilon: 0.39
Episode: 192/500, Score: 153.0, Epsilon: 0.38
Episode: 193/500, Score: 283.0, Epsilon: 0.38
Episode: 194/500, Score: 113.0, Epsilon: 0.38
Episode: 195/500, Score: 292.0, Epsilon: 0.38
Episode: 196/500, Score: 65.0, Epsilon: 0.38
Episode: 197/500, Score: 287.0, Epsilon: 0.37
Episode: 198/500, Score: 270.0, Epsilon: 0.37
Episode: 199/500, Score: 196.0, Epsilon: 0.37
Episode: 200/500, Score: 257.0, Epsilon: 0.37
Episode: 201/500, Score: 204.0, Epsilon: 0.37
Episode: 202/500, Score: 342.0, Epsilon: 0.37
Episode: 203/500, Score: 335.0, Epsilon: 0.36
Episode: 204/500, Score: 104.0, Epsilon: 0.36
Episode: 205/500, Score: 12.0, Epsilon: 0.36
Episode: 206/500, Score: 230.0, Epsil

Episode: 364/500, Score: 318.0, Epsilon: 0.16
Episode: 365/500, Score: 251.0, Epsilon: 0.16
Episode: 366/500, Score: 182.0, Epsilon: 0.16
Episode: 367/500, Score: 264.0, Epsilon: 0.16
Episode: 368/500, Score: 183.0, Epsilon: 0.16
Episode: 369/500, Score: 250.0, Epsilon: 0.16
Episode: 370/500, Score: 324.0, Epsilon: 0.16
Episode: 371/500, Score: 232.0, Epsilon: 0.16
Episode: 372/500, Score: 468.0, Epsilon: 0.16
Episode: 373/500, Score: 165.0, Epsilon: 0.15
Episode: 374/500, Score: 193.0, Epsilon: 0.15
Episode: 375/500, Score: 207.0, Epsilon: 0.15
Episode: 376/500, Score: 206.0, Epsilon: 0.15
Episode: 377/500, Score: 244.0, Epsilon: 0.15
Episode: 378/500, Score: 274.0, Epsilon: 0.15
Episode: 379/500, Score: 232.0, Epsilon: 0.15
Episode: 380/500, Score: 203.0, Epsilon: 0.15
Episode: 381/500, Score: 458.0, Epsilon: 0.15
Episode: 382/500, Score: 211.0, Epsilon: 0.15
Episode: 383/500, Score: 556.0, Epsilon: 0.15
Episode: 384/500, Score: 316.0, Epsilon: 0.15
Episode: 385/500, Score: 308.0, Ep

In [5]:
"""
   In this cell we calculate if the Challenge can be considered as solved
   Solved conditions are : in 100 episodes the average reward per episode over 195
"""

episodes = 100
threshold = 195
total_rewards = []

env = gym.make('CartPole-v1', render_mode='rgb_array')


for _ in range(episodes):
    state, _ = env.reset()
    done = False
    episode_reward = 0
        
    while not done:
       state_v = np.reshape(state, [1, state_size])
       q_values = model.predict(state_v, verbose=0)
       action = np.argmax(q_values[0])
            
       next_state, reward, terminated, truncated, _ = env.step(action)

       episode_reward += reward
            
       state = next_state
       
       if terminated or truncated:
           done = True
        
    total_rewards.append(episode_reward)
    
average_reward = np.mean(total_rewards)
solved = average_reward >= threshold
if solved:
    text = "solved"
else:
    text = "unsolved"
print ('The Challenge has been', text, 'with an average of', average_reward, 
       'avg/reward per episode after 100 episodes')

The Challenge has been unsolved with an average of 18.76 avg/reward per episode after 100 episodes
