In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
import random
from collections import deque

# Create the CartPole environment
env = gym.make('CartPole-v1')

# Seed every random source this notebook draws from. The agent uses Python's
# `random` module (randrange / sample) as well as NumPy, so seeding only NumPy
# and TensorFlow is not enough to make a run repeatable.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
env.reset(seed=SEED)


(array([ 0.01369617, -0.02302133, -0.04590265, -0.04834723], dtype=float32),
 {})

In [2]:
def build_dqn_model(input_shape, output_shape):
    """Build and compile the fully connected Q-value network.

    Args:
        input_shape: Number of features in one observation (an int, despite
            the name); it becomes the width of the input layer.
        output_shape: Number of output units, one Q-value per action.

    Returns:
        A compiled Sequential model: two 24-unit ReLU hidden layers followed
        by a linear output layer, trained with MSE loss and Adam (lr=0.001).
    """
    model = models.Sequential([
        # Declare the input explicitly rather than passing the legacy
        # `input_dim=` kwarg to the first Dense layer.
        layers.Input(shape=(input_shape,)),
        layers.Dense(24, activation='relu'),
        layers.Dense(24, activation='relu'),
        # Linear output: Q-values are unbounded regression targets.
        layers.Dense(output_shape, activation='linear'),
    ])
    model.compile(loss='mse', optimizer=optimizers.Adam(learning_rate=0.001))
    return model

In [3]:
class DQNAgent:
    """Epsilon-greedy DQN agent with an experience-replay buffer.

    The agent stores transitions in a bounded deque, picks actions with an
    epsilon-greedy policy over the Q-network's predictions, and trains the
    network on random mini-batches sampled from the buffer.
    """

    def __init__(self, state_size, action_size, memory_size=2000, gamma=0.95,
                 epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995):
        """Create the agent and its Q-network.

        Args:
            state_size: Number of features in one observation.
            action_size: Number of discrete actions.
            memory_size: Maximum number of transitions kept for replay.
            gamma: Discount rate applied to the bootstrapped next-state value.
            epsilon: Initial exploration rate.
            epsilon_min: Exploration rate below which decay stops.
            epsilon_decay: Factor epsilon is multiplied by after each replay.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma  # discount rate
        self.epsilon = epsilon  # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.model = build_dqn_model(state_size, action_size)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer.

        Once the deque is full, the oldest transition is dropped.
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for `state` with an epsilon-greedy policy.

        Args:
            state: Observation with a leading batch axis of size 1.

        Returns:
            A random action index with probability epsilon, otherwise the
            index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0: predict() otherwise prints a progress line on every call.
        act_values = self.model.predict(state, verbose=0)
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """Train the Q-network on one random mini-batch of stored transitions.

        The whole batch is evaluated with two predict() calls and fitted with
        a single fit() call, rather than three model calls per transition.
        Does nothing (no training, no epsilon decay) while the buffer holds
        fewer than `batch_size` transitions.

        Args:
            batch_size: Number of transitions to sample.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        # vstack accepts both (1, state_size) rows and flat (state_size,) rows.
        states = np.vstack([transition[0] for transition in minibatch])
        actions = np.array([transition[1] for transition in minibatch], dtype=int)
        rewards = np.array([transition[2] for transition in minibatch], dtype=np.float32)
        next_states = np.vstack([transition[3] for transition in minibatch])
        dones = np.array([transition[4] for transition in minibatch], dtype=bool)

        # Copy so the prediction buffer is safe to edit in place.
        q_values = np.array(self.model.predict(states, verbose=0))
        next_q_values = np.asarray(self.model.predict(next_states, verbose=0))

        # Terminal transitions use the raw reward; the rest bootstrap from the
        # best predicted next-state value.
        bootstrapped = rewards + self.gamma * np.amax(next_q_values, axis=1)
        targets = np.where(dones, rewards, bootstrapped)

        # Only the Q-value of the action actually taken is moved toward its
        # target; the other outputs keep their predictions (zero error).
        q_values[np.arange(batch_size), actions] = targets
        self.model.fit(states, q_values, batch_size=batch_size, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [4]:
def train_dqn(episodes=1000, batch_size=32, max_steps=500):
    """Train a fresh DQNAgent on the module-level `env`.

    Args:
        episodes: Number of episodes to run.
        batch_size: Mini-batch size passed to `agent.replay`; replay starts
            once the buffer holds more than this many transitions.
        max_steps: Upper bound on the number of steps taken per episode.

    Returns:
        The trained DQNAgent, so it can be evaluated or reused afterwards.
    """
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)

    for e in range(episodes):
        state, _ = env.reset()  # reset() returns (observation, info)
        state = np.reshape(state, [1, state_size])
        for time in range(max_steps):
            # Uncomment to render the environment
            # env.render()

            action = agent.act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)

            # Penalise only a real failure. A truncated episode (e.g. cut off
            # by a step limit) is not a failure, so it keeps its reward and is
            # stored as non-terminal so its target still bootstraps.
            if terminated:
                reward = -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, terminated)
            state = next_state

            # Either flag ends the episode.
            if terminated or truncated:
                print(f"Episode {e+1}/{episodes}, Score: {time}, Epsilon: {agent.epsilon:.2}")
                break

            if len(agent.memory) > batch_size:
                agent.replay(batch_size)

    return agent


In [None]:
# Run training for 500 episodes (overriding train_dqn's default of 1000).
# train_dqn prints the score and current epsilon as each episode ends.
train_dqn(episodes=500)