#### **CARTPOLE DQN V1 (Plain)**

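This is a naive version of the DQN algorithm applied to the CartPole environment. <br>
It is very slow (every replay step runs a separate network prediction for each sampled transition), but it serves as an educational tool for understanding every step of the DQN algorithm.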

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import gymnasium as gym
import numpy as np
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from collections import deque
import random

In [None]:
# Parameters
# Probe a throwaway environment for the observation/action dimensions so this
# cell does not depend on an `env` variable that is only created in a later cell
# (which would raise NameError on a fresh kernel with Restart & Run All).
_probe_env = gym.make("CartPole-v1")
state_size = _probe_env.observation_space.shape[0]
action_size = _probe_env.action_space.n
_probe_env.close()

episodes = 500  # Number of training episodes
batch_size = 64  # Transitions sampled from memory per replay step
gamma = 0.99  # Discount factor
epsilon = 1.0  # Initial exploration rate
epsilon_min = 0.01  # Epsilon is no longer decayed once it drops to this value
epsilon_decay = 0.995  # Multiplicative epsilon decay applied after each episode
learning_rate = 0.0005  # Adam learning rate
memory = deque(maxlen=2000)  # Replay buffer; oldest transitions are evicted first

In [None]:
# Build the neural network model
def build_model_1():
    """Build and compile the Q-network using the Keras Sequential API.

    Same architecture as ``build_model``: ``state_size`` inputs, two hidden
    ReLU layers of 24 units, and a linear output with one Q-value per action.
    The training cell in this notebook calls ``build_model``; this variant is
    kept as the Sequential-API counterpart.

    Returns:
        A compiled Keras model (Adam optimizer with ``learning_rate``, MSE loss).
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object rather than the
        # legacy `input_dim=` layer argument, which newer Keras versions warn about.
        Input(shape=(state_size,)),
        Dense(24, activation="relu"),
        Dense(24, activation="relu"),
        Dense(action_size, activation="linear"),
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss="mse")
    return model

def build_model():
    """Build and compile the Q-network using the Keras functional API.

    The network maps a state vector of length ``state_size`` through two
    hidden ReLU layers of 24 units each to ``action_size`` linear outputs
    (one Q-value per action).

    Returns:
        A compiled Keras model (Adam optimizer with ``learning_rate``, MSE loss).
    """
    state_input = Input(shape=(state_size,))

    # Stack the two identical hidden layers.
    features = state_input
    for _ in range(2):
        features = Dense(24, activation="relu")(features)

    # Linear head: raw Q-value estimates, one per action.
    q_values = Dense(action_size, activation="linear")(features)

    network = Model(inputs=state_input, outputs=q_values)
    network.compile(optimizer=Adam(learning_rate=learning_rate), loss="mse")
    return network


# Replay and train function
def replay(model, target_model):
    """Train ``model`` for one epoch on a random minibatch from ``memory``.

    Does nothing until ``memory`` holds at least ``batch_size`` transitions.
    For each sampled ``(state, action, reward, next_state, done)`` tuple, the
    online network's current Q-values are used as the regression target,
    except for the entry of the action taken, which is overwritten with:

    * ``reward`` when ``done`` is true, or
    * ``reward + gamma * max(target_model Q-values of next_state)`` otherwise.

    Args:
        model: Online Q-network; it is fitted in place.
        target_model: Network used only to evaluate ``next_state``.
    """
    # Not enough experience collected yet.
    if len(memory) < batch_size:
        return

    batch_inputs = []
    batch_labels = []
    for obs, act, rew, next_obs, terminal in random.sample(memory, batch_size):
        # Start from the current predictions so only the taken action's
        # Q-value contributes to the loss.
        q_row = model.predict(obs, verbose=0)
        if not terminal:
            bootstrap = target_model.predict(next_obs, verbose=0)
            q_row[0][act] = rew + gamma * np.max(bootstrap[0])
        else:
            q_row[0][act] = rew
        # Index [0] strips the leading batch axis of size 1.
        batch_inputs.append(obs[0])
        batch_labels.append(q_row[0])

    model.fit(np.array(batch_inputs), np.array(batch_labels), epochs=1, verbose=0)

# Soft update function for target network
def soft_update(model, target_model, tau=0.1):
    """Blend the online network's weights into the target network.

    Each weight array of the target becomes
    ``tau * online + (1 - tau) * target``.

    Args:
        model: Source of the new weights (read via ``get_weights``).
        target_model: Network whose weights are replaced via ``set_weights``.
        tau: Interpolation factor; the share taken from ``model``.
    """
    current_target = target_model.get_weights()
    blended = []
    for online_w, target_w in zip(model.get_weights(), current_target):
        blended.append(tau * online_w + (1 - tau) * target_w)
    target_model.set_weights(blended)

In [1]:
# Create the CartPole environment
env = gym.make("CartPole-v1")

# Online network (trained every step) and target network (used for the
# bootstrap values in replay); both start from identical weights.
model = build_model()
target_model = build_model()
target_model.set_weights(model.get_weights())

# Main training loop
try:
    for episode in range(episodes):
        state, _ = env.reset()
        state = np.reshape(state, [1, state_size])  # add a batch axis for predict()
        total_reward = 0
        done = False

        while not done:
            # Epsilon-greedy action selection
            if np.random.rand() <= epsilon:
                action = env.action_space.sample()
            else:
                q_values = model.predict(state, verbose=0)
                action = np.argmax(q_values[0])

            # Take action. Gymnasium reports two distinct end-of-episode flags:
            # `terminated` (the environment reached a terminal state) and
            # `truncated` (the episode was cut off, e.g. by a time limit).
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            next_state = np.reshape(next_state, [1, state_size])
            total_reward += reward

            # Store experience in memory. Only `terminated` is recorded as the
            # done flag: a truncated episode did not reach a terminal state, so
            # replay() should still bootstrap from next_state for it.
            memory.append((state, action, reward, next_state, terminated))
            state = next_state

            # Train the model
            replay(model, target_model)

        print(f"Episode: {episode + 1}/{episodes}, Score: {total_reward}, Epsilon: {epsilon:.2f}")

        # Decay epsilon
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay

        # Update target network using soft updates
        soft_update(model, target_model)
finally:
    # Close the environment even if training is interrupted (e.g. KeyboardInterrupt).
    env.close()


2024-11-26 09:00:53.854026: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-26 09:00:53.862168: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-26 09:00:53.872600: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-26 09:00:53.875589: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-26 09:00:53.883553: I tensorflow/core/platform/cpu_feature_guar

Episode: 1/500, Score: 22.0, Epsilon: 1.00
Episode: 2/500, Score: 21.0, Epsilon: 0.99
Episode: 3/500, Score: 14.0, Epsilon: 0.99
Episode: 4/500, Score: 35.0, Epsilon: 0.99
Episode: 5/500, Score: 14.0, Epsilon: 0.98
Episode: 6/500, Score: 18.0, Epsilon: 0.98


KeyboardInterrupt: 