#### **CARTPOLE DDQN V1 (Plain)**

This is a naive version of the CartPole DDQN algorithm. <br>
It will be very slow, but it is an educational tool for understanding all the steps in the DDQN algorithm. <br>

Double DQN (DDQN) uses two networks: an online network, which is the one being trained, and a target network. <br>

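The target network is a copy of the online network that is updated less frequently, <br>
and is used to generate the target values for the Q-learning update. <br>
In Double DQN the online network selects the best next action and the target network evaluates the Q-value of that action.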

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import gymnasium as gym
import numpy as np
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from collections import deque
import random

In [None]:
# Parameters

# --- Environment ---
# Create the CartPole environment and read the problem dimensions from it
env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]  # length of one observation vector
action_size = env.action_space.n             # number of discrete actions

# --- Training schedule ---
episodes = 500               # number of episodes to train for
batch_size = 24              # transitions sampled per replay step
memory = deque(maxlen=2000)  # replay buffer; oldest entries drop off when full

# --- Q-learning ---
gamma = 0.99                 # discount factor
learning_rate = 0.001        # step size for the Adam optimizer

# --- Epsilon-greedy exploration ---
epsilon = 1.0                # initial exploration rate
epsilon_min = 0.01           # decay stops once epsilon is at or below this value
epsilon_decay = 0.95         # multiplied into epsilon after each episode

In [None]:
# Build the neural network model

def build_model(state_size, action_size):
    """Build and compile the Q-network.

    The network is a small fully connected model that maps one state vector
    to one Q-value per action.

    Args:
        state_size (int): Number of features in a single observation.
        action_size (int): Number of discrete actions (one linear output each).

    Returns:
        keras.Model: The compiled Sequential model (MSE loss, Adam optimizer).

    Note:
        Reads the module-level ``learning_rate`` and prints the model summary
        as a side effect.
    """
    model = Sequential([
        # An explicit Input layer replaces the deprecated `input_dim=` kwarg,
        # which newer Keras versions warn about.
        Input(shape=(state_size,)),
        Dense(24, activation='relu'),
        Dense(24, activation='relu'),
        # Linear outputs: Q-values are unbounded regression targets
        Dense(action_size, activation='linear'),
    ])
    model.compile(loss='mse',
                  optimizer=Adam(learning_rate=learning_rate))
    model.summary()
    return model


# Soft update function for target network
def soft_update(model, target_model, tau=0.1):
    """Move the target network's weights a fraction ``tau`` toward the online ones.

    Each target weight becomes ``tau * online + (1 - tau) * target``.

    Args:
        model: Online network; only ``get_weights()`` is called on it.
        target_model: Target network; updated in place via ``set_weights()``.
        tau (float): Blend factor. ``1.0`` copies the online weights outright,
            ``0.0`` leaves the target unchanged.
    """
    current_target = target_model.get_weights()
    current_online = model.get_weights()
    keep = 1 - tau  # share of the old target weight that is retained

    blended = []
    for online_w, target_w in zip(current_online, current_target):
        blended.append(tau * online_w + keep * target_w)

    target_model.set_weights(blended)

#### **Replay Function: the core of the DDQN**

In [None]:
import random
import numpy as np

def _as_batch(samples):
    """Stack per-transition states into a single ``(batch, ...)`` array.

    States stored with a leading singleton batch axis (for example shape
    ``(1, state_size)``) would otherwise stack to ``(batch, 1, state_size)``;
    that extra axis is dropped so there is exactly one row per transition.
    """
    batch = np.asarray(samples)
    if batch.ndim >= 3 and batch.shape[1] == 1:
        batch = np.squeeze(batch, axis=1)
    return batch


def replay(model, target_model, memory, batch_size, gamma):
    """
    Perform a replay step in Double Deep Q-Network (DDQN).

    A minibatch is sampled from ``memory``. For every sampled transition the
    Q-value of the action that was taken is replaced by the DDQN target, and
    the main model is fitted for one epoch on the result. Q-values of the
    other actions keep the model's own predictions, so they contribute no
    error.

    Args:
        model (keras.Model): Main (online) model; it is the one trained here.
        target_model (keras.Model): Target model for stable Q-value estimation.
        memory (deque): Replay memory storing (state, action, reward, next_state, done) tuples.
            States may be flat vectors or carry a leading batch axis of size 1.
        batch_size (int): Number of experiences to sample from memory.
        gamma (float): Discount factor for future rewards.

    Returns:
        None. Does nothing when memory holds fewer than ``batch_size`` samples.
    """
    # Ensure there are enough samples in memory
    if len(memory) < batch_size:
        return

    # Sample a minibatch of experiences and split it into per-field tuples
    minibatch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*minibatch)

    # Convert to arrays for batch processing; states must be (batch, features)
    states = _as_batch(states)
    next_states = _as_batch(next_states)
    actions = np.asarray(actions, dtype=int)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones, dtype=bool)

    # Predict Q-values for current and next states
    q_values = model.predict(states, verbose=0)
    next_q_values_main = model.predict(next_states, verbose=0)  # Main model
    next_q_values_target = target_model.predict(next_states, verbose=0)  # Target model

    rows = np.arange(batch_size)
    # Main model selects the best action for each next state ...
    best_next_actions = np.argmax(next_q_values_main, axis=1)
    # ... and the target model evaluates the Q-value of that action
    bootstrapped = rewards + gamma * next_q_values_target[rows, best_next_actions]
    # Terminal transitions have no future return, so the target is the reward alone
    q_values[rows, actions] = np.where(dones, rewards, bootstrapped)

    # Train the main model with the updated Q-values
    model.fit(states, q_values, epochs=1, verbose=0)


In [None]:

# Naive per-sample replay and train function
# NOTE: running this cell redefines `replay` from the previous cell
def replay(model, target_model, memory=None, batch_size=None, gamma=None):
    """Naive replay step: build targets one transition at a time, then fit once.

    For each sampled transition the online model's prediction is used as the
    base target, and the entry for the action taken is overwritten with
    ``reward`` (terminal) or ``reward + gamma * max(target_model(next_state))``.

    NOTE(review): the bootstrap term takes the max over the target network's
    own Q-values, i.e. the standard DQN target. It does not select the action
    with the online network as a Double DQN update would.

    Args:
        model: Online network; it is the one trained here.
        target_model: Target network used for the bootstrap term.
        memory: Replay buffer of (state, action, reward, next_state, done)
            tuples whose states have shape ``(1, state_size)``. Defaults to
            the module-level ``memory``.
        batch_size (int): Minibatch size. Defaults to the module-level
            ``batch_size``.
        gamma (float): Discount factor. Defaults to the module-level ``gamma``.

    Returns:
        None. Does nothing when memory holds fewer than ``batch_size`` samples.
    """
    # The optional arguments let this function be called with the same five
    # arguments as the batched replay; anything omitted falls back to the
    # notebook-level settings, which keeps the two-argument call working.
    notebook_globals = globals()
    if memory is None:
        memory = notebook_globals["memory"]
    if batch_size is None:
        batch_size = notebook_globals["batch_size"]
    if gamma is None:
        gamma = notebook_globals["gamma"]

    if len(memory) < batch_size:
        return
    minibatch = random.sample(memory, batch_size)
    states, targets = [], []
    for state, action, reward, next_state, done in minibatch:
        # Start from the current prediction so untouched actions add no error
        target = model.predict(state, verbose=0)
        if done:
            target[0][action] = reward
        else:
            next_target = target_model.predict(next_state, verbose=0)
            target[0][action] = reward + gamma * np.max(next_target[0])
        # Drop the leading batch axis so the fit inputs stack to (batch, features)
        states.append(state[0])
        targets.append(target[0])
    model.fit(np.array(states), np.array(targets), epochs=1, verbose=0)

In [None]:


# Main training loop
# Both networks are built from the environment's dimensions; the target
# network starts out as an exact copy of the online network.
model = build_model(state_size, action_size)
target_model = build_model(state_size, action_size)
target_model.set_weights(model.get_weights())

for episode in range(episodes):
    state, _ = env.reset()
    # Keep a leading batch axis of 1 so the state can go straight to predict()
    state = np.reshape(state, [1, state_size])
    total_reward = 0
    done = False

    while not done:
        # Epsilon-greedy action selection
        if np.random.rand() <= epsilon:
            action = env.action_space.sample()
        else:
            q_values = model.predict(state, verbose=0)
            action = np.argmax(q_values[0])

        # Take action. Gymnasium reports two separate end-of-episode flags:
        # `terminated` (the task itself ended) and `truncated` (cut off, e.g.
        # by the time limit). Either one must end the episode loop.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        next_state = np.reshape(next_state, [1, state_size])
        total_reward += reward

        # Store experience in memory. Only `terminated` is stored as the done
        # flag: a truncated episode did not reach a terminal state, so its
        # last transition should still bootstrap from the next state.
        memory.append((state, action, reward, next_state, terminated))
        state = next_state

        # Train the model
        replay(model, target_model, memory, batch_size, gamma)

    # Episode finished: report the score
    print(f"Episode: {episode + 1}/{episodes}, Score: {total_reward}, Epsilon: {epsilon:.2f}")

    # Decay epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    # Update target network using soft updates
    soft_update(model, target_model)

# Close the environment
env.close()
