In [1]:
# https://www.geeksforgeeks.org/a-beginners-guide-to-deep-reinforcement-learning/

# Solving the CartPole Problem using Deep Q-Network (DQN)

In [2]:
# Import Required Libraries
import numpy as np
import tensorflow as tf
import gym

In [3]:
# Define the DQN Model
class DQN(tf.keras.Model):
    """Small fully connected Q-network.

    Subclasses tf.keras.Model. Given a batch of environment states it returns
    one Q-value per action; the agent picks its next action from those values.
    """

    def __init__(self, num_actions):
        """Create the layers.

        Args:
            num_actions: number of units in the output layer, i.e. one
                predicted Q-value per possible action.
        """
        # Initialise the parent tf.keras.Model before attaching layers
        # (equivalent to super(DQN, self).__init__()).
        super().__init__()
        hidden_cfg = dict(units=24, activation='relu')
        self.dense1 = tf.keras.layers.Dense(**hidden_cfg)
        self.dense2 = tf.keras.layers.Dense(**hidden_cfg)
        # 'linear' activation leaves the outputs untouched: Q-values are
        # arbitrary real numbers, so no squashing function is applied.
        self.output_layer = tf.keras.layers.Dense(units=num_actions, activation='linear')

    def call(self, inputs):
        """Forward pass: states in, Q-values for every action out."""
        hidden = self.dense1(inputs)
        return self.output_layer(self.dense2(hidden))
 

In [4]:
# CartPole has 2 possible actions: push left or push right

num_actions = 2 
# Build the Q-network; its output layer has num_actions units.
dqn_agent = DQN(num_actions)
# Bare last expression: the notebook displays the model object's repr.
dqn_agent

<__main__.DQN at 0x24585026310>

In [5]:
# DQN hyperparameters
learning_rate = 1e-3     # step size handed to the optimizer
discount_factor = 0.99   # weight applied to the next state's best Q-value

# Epsilon-greedy exploration schedule:
#   exploration_prob      - initial probability of taking a random action
#   exploration_decay     - multiplicative decay applied once per episode
#   min_exploration_prob  - floor below which the probability never drops
exploration_prob, exploration_decay, min_exploration_prob = 1.0, 0.995, 0.1

In [6]:
# Initialize the CartPole environment (no render mode, so nothing is drawn)
env = gym.make('CartPole-v1')

# To watch the episodes in a window, create the environment with a render mode instead:
# env = gym.make('CartPole-v1', render_mode='human')


In [7]:
env.reset()
# reset() returns a tuple (observation, info). Per the CartPole environment
# documentation, the observation array contains, in order:
# 1) position of the cart along the linear surface
# 2) linear velocity of the cart
# 3) angle of the pole from the vertical
# 4) angular velocity of the pole

(array([ 0.03587867, -0.04158823,  0.00895021,  0.00234133], dtype=float32),
 {})

In [8]:

# reset() returns (observation, info); keep only the observation array.
state_reset = env.reset()[0]
state_reset

array([ 0.00649007,  0.01811054,  0.04681893, -0.01777743], dtype=float32)

In [9]:

# The model is designed to process batches of states, so even a single state
# must carry a leading batch dimension. state_reset.reshape(1,-1) (equivalently
# state[np.newaxis, :], as used in the training loop) turns the 1-D observation
# into a batch of one.
dqn_agent(state_reset.reshape(1,-1))

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.00304406, -0.00302192]], dtype=float32)>

In [10]:
# Q-value predicted for action index 1: row 0 (the only batch entry), column 1.
dqn_agent(state_reset.reshape(1,-1)).numpy()[0,1]

-0.003021919

In [11]:
# Index of the largest predicted Q-value, i.e. the greedy action for this state.
np.argmax(dqn_agent(state_reset.reshape(1,-1)))

1

In [12]:
# NOTE(review): env was created without a render_mode, so this call presumably
# only triggers the gym logger warning shown in the output - confirm, and
# create the env with render_mode='human' if a window is wanted.
env.render()

  gym.logger.warn(


In [13]:
# Define the Loss Function and Optimizer
# Mean squared error between predicted and target Q-values: predictions far
# from their targets are penalized quadratically.
loss_fn = tf.keras.losses.MeanSquaredError()
# Adam optimizer; its step size comes from the learning_rate hyperparameter.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

In [14]:
# Draw one random action from the environment's action space.
env.action_space.sample()

0

In [15]:
# Take one random step. The displayed 5-tuple is
# (observation, reward, terminated, truncated, info).
env.step(env.action_space.sample())

  if not isinstance(terminated, (bool, np.bool8)):


(array([ 0.00685228, -0.1776505 ,  0.04646339,  0.28930208], dtype=float32),
 1.0,
 False,
 False,
 {})

In [16]:
# Training the DQN
num_episodes = 1000
max_steps_per_episode = 500

# NOTE: exploration_prob is decayed in place below, so re-running this cell
# without re-running the hyperparameter cell resumes from the decayed value.
for episode in range(num_episodes):
    state = env.reset()[0]  # reset() returns (observation, info)
    episode_reward = 0

    for step in range(max_steps_per_episode):
        # Choose action using epsilon-greedy policy
        if np.random.rand() < exploration_prob:
            action = env.action_space.sample()  # Explore randomly
        else:
            # Exploit: feed the current state (with a leading batch dimension
            # added by np.newaxis) to the network and take the action with the
            # highest predicted Q-value.
            action = np.argmax(dqn_agent(state[np.newaxis, :]))

        # step() returns (observation, reward, terminated, truncated, info).
        # `terminated` means a true terminal state was reached; `truncated`
        # means the episode was cut short (e.g. by a time limit).
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Bellman target for the action taken. It is a constant with respect to
        # the gradient, so it is computed outside the tape and reduced to a
        # plain Python float before being written into the target array.
        # Bootstrapping is switched off only on true termination, not on
        # truncation, where the next state still has value.
        next_q_values = dqn_agent(next_state[np.newaxis, :])
        max_next_q = float(tf.reduce_max(next_q_values))
        td_target = reward + discount_factor * max_next_q * (1.0 - float(terminated))

        # The tape records the forward pass so the gradient of the loss with
        # respect to dqn_agent.trainable_variables can be computed afterwards.
        with tf.GradientTape() as tape:
            current_q_values = dqn_agent(state[np.newaxis, :])
            # Copy the predictions and overwrite only the entry of the action
            # taken, so the other action contributes zero error.
            target_q_values = current_q_values.numpy()
            target_q_values[0, action] = td_target
            # Keras losses take (y_true, y_pred).
            loss = loss_fn(target_q_values, current_q_values)

        gradients = tape.gradient(loss, dqn_agent.trainable_variables)
        # zip pairs each gradient with the variable it belongs to, which is the
        # form apply_gradients expects.
        optimizer.apply_gradients(zip(gradients, dqn_agent.trainable_variables))

        state = next_state
        episode_reward += reward

        # End the episode on termination or truncation.
        if done:
            break

    # Decay exploration probability
    exploration_prob = max(min_exploration_prob, exploration_prob * exploration_decay)
    if (episode + 1) % 100 == 0:
        print(f"Episode {episode + 1}: Reward = {episode_reward}")

Episode 100: Reward = 27.0
Episode 200: Reward = 16.0
Episode 300: Reward = 21.0
Episode 400: Reward = 26.0
Episode 500: Reward = 104.0


In [20]:
# Evaluating the Trained DQN
# Runs the greedy policy (no exploration) for a few episodes and reports the
# mean total reward per episode.
num_eval_episodes = 10
eval_rewards = []

for _ in range(num_eval_episodes):
    state = env.reset()[0]  # reset() returns (observation, info)
    eval_reward = 0

    for _step in range(max_steps_per_episode):
        # Always pick the action with the highest predicted Q-value.
        action = np.argmax(dqn_agent(state[np.newaxis, :]))
        # step() returns (observation, reward, terminated, truncated, info);
        # the episode is over when either flag is set.
        next_state, reward, terminated, truncated, _info = env.step(action)
        done = terminated or truncated
        eval_reward += reward
        state = next_state

        if done:
            break

    eval_rewards.append(eval_reward)

average_eval_reward = np.mean(eval_rewards)
print(f"Average Evaluation Reward: {average_eval_reward}")

Average Evaluation Reward: 110.3
