In [1]:
# Importing necessary libraries
import tensorflow as tf
from tensorflow.keras import layers, models
from collections import deque
import numpy as np
import random
import gym
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [2]:


# DQN (Deep Q-Network) class definition
class DQN(models.Model):
    def __init__(self, action_size):
        super(DQN, self).__init__()
        
        # Define the layers for the neural network
        self.dense1 = layers.Dense(32, activation='relu', input_shape=(84, 84, 4))
        self.dense2 = layers.Dense(64, activation='relu')
        self.dense3 = layers.Dense(action_size, activation='linear')
        self.flatten = layers.Flatten()

    # Feed forward network  
    def call(self, state):
        # Flatten the state
        s = self.flatten(state)
        # Pass through the first dense layer
        s = self.dense1(s)
        # Pass through the second dense layer
        s = self.dense2(s)
        # Output layer with linear activation
        return self.dense3(s)

# Agent class definition
class DQNagent:
    def __init__(self, state_size, action_size):
        # Initialize state and action sizes
        self.state_size = state_size
        self.action_size = action_size
        # Initialize a deque (double-ended queue) for experience replay
        self.memory = deque(maxlen=2000)
        # Initialize exploration rate (epsilon) to encourage exploration
        self.epsilon = 1.0
        # Minimum exploration rate
        self.epsilon_min = 0.01
        # Decay rate for exploration rate
        self.epsilon_decay = 0.995
        # Initialize discount factor (gamma) for future rewards
        self.gamma = 0.95
        # Build the DQN model
        self.model = self.build_model()

    # Method to build the DQN model
    def build_model(self):
        # Create an instance of the DQN class
        model = DQN(self.action_size)
        # Compile the model with Adam optimizer and Mean Squared Error loss
        model.compile(optimizer='adam', loss='mse')
        # Return the compiled model
        return model

    # Method to store experience in memory buffer
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    # Method to select action using epsilon-greedy policy
    def act(self, state):
        # Epsilon-greedy policy
        if np.random.rand() <= self.epsilon:
            # Explore: Choose a random action
            return random.randrange(self.action_size)
        # Exploit: Choose the action with highest Q-value from the DQN model
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    # Method to train the DQN model
    def replay(self, batch_size):
        # Check if the memory buffer size is sufficient for sampling
        if len(self.memory) < batch_size:
            return
        
        # Sample a batch of experiences from memory
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            # If the episode is done, set target to reward
            target = reward
            if not done:
                # Calculate target using Bellman equation
                target = (reward + self.gamma * np.amax(self.model.predict(next_state)[0]))
            # Get current Q-values from the DQN model
            target_f = self.model.predict(state)
            # Update the Q-value for the chosen action
            target_f[0][action] = target
            # Train the DQN model using the updated target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        # Decay epsilon (exploration rate) over time
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [4]:
# Create the CartPole environment
env = gym.make("CartPole-v1")

# Number of episodes to run
episodes = 10

# Initialize the DQN agent
agent = DQNagent(state_size=4, action_size=2)

# Loop through each episode
for epi in range(1, episodes + 1):
    # Reset the environment to start a new episode
    state = env.reset()
    done = False  # Whether the episode has ended or not
    score = 0  # Initialize the score for this episode

    # Run the episode until it's done
    while not done:
        # Choose an action for the agent using epsilon-greedy policy
        action = agent.act(state)

        # Take a step in the environment based on the chosen action
        step_result = env.step(action)
        
        # Check the length of the returned tuple
        if len(step_result) == 2:
            next_state, reward = step_result
            done = False  # Assuming episode is not done if only next_state and reward are returned
        elif len(step_result) == 3:
            next_state, reward, done = step_result
        elif len(step_result) == 4:
            next_state, reward, done, info = step_result
        else:
            raise ValueError("Unexpected number of values returned by env.step()")

        # Store the experience in the agent's memory
        agent.remember(state, action, reward, next_state, done)

        # Accumulate the reward to compute the score for this episode
        score += reward

        # Update the current state to the next state
        state = next_state

    # Train the agent using experiences from memory
    agent.replay(batch_size=32)

    # Render the environment
    plt.imshow(env.render(mode='rgb_array'))
    plt.axis("off")
    plt.title(f"Episodes:{epi}, Score: {score}")
    plt.show()
    clear_output(wait=True)

# Close the rendered frames
env.close()


ValueError: Unexpected number of values returned by env.step()