In [1]:
#!/usr/bin/env python3 

In [1]:
!pip install gymnasium

Collecting gymnasium
  Using cached gymnasium-0.28.1-py3-none-any.whl (925 kB)
Collecting farama-notifications>=0.0.1
  Using cached Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting jax-jumpy>=1.0.0
  Using cached jax_jumpy-1.0.0-py3-none-any.whl (20 kB)
Installing collected packages: farama-notifications, jax-jumpy, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.28.1 jax-jumpy-1.0.0


In [2]:
# Import the necessary modules
import gym
import random
import numpy as np 
import matplotlib.pyplot as plt 
import tensorflow as tf 
from tensorflow.keras import models, layers 
from collections import deque 
from IPython.display import display, clear_output
import cv2

# Set random seeds for reproducibility.
# Python's `random` module must be seeded too: the agent uses random.randrange
# for exploration and random.sample for replay mini-batches.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
2025-07-30 15:07:43.356255: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  import imp


In [3]:
# Hyperparameters shared by the DQN agent and the training loop
gamma = 0.99            # discount factor applied to the bootstrapped next-state value
learningRate = 1e-3     # Adam learning rate
memorySize = 10_000     # maximum number of transitions kept in the replay buffer
batchSize = 64          # transitions sampled per replay step
epsilon = 1.0           # initial exploration rate
epsilonMin = 1e-2       # exploration-rate threshold below which decay stops
epsilonDecay = 0.995    # multiplicative decay applied to the exploration rate

In [None]:
# Define the DQN class 
class DQN:
    """Deep Q-Network agent with a replay buffer and a separate target network.

    The hyperparameters ``gamma``, ``learningRate``, ``memorySize``,
    ``batchSize``, ``epsilon``, ``epsilonMin`` and ``epsilonDecay`` are read
    from module-level globals.
    """

    def __init__(self, stateSize, actionSize):
        """Create the online and target networks and an empty replay buffer.

        Args:
            stateSize: Number of features in one observation.
            actionSize: Number of discrete actions.
        """
        self.stateSize = stateSize
        self.actionSize = actionSize
        self.memory = deque(maxlen=memorySize)
        self.epsilon = epsilon
        self.model = self._buildModel()
        self.targetModel = self._buildModel()
        self.updateTargetModel()

    def save(self, filePath):
        """Save the online model to ``filePath``."""
        self.model.save(filePath)

    def load(self, filePath):
        """Replace the online model with the one stored at ``filePath`` and
        copy its weights into the target model."""
        self.model = tf.keras.models.load_model(filePath)
        self.updateTargetModel()

    def _buildModel(self):
        """Build and compile the Q-network: two 24-unit ReLU layers followed by
        a linear layer with one output per action (MSE loss, Adam optimizer)."""
        model = models.Sequential([
            layers.Dense(24, input_dim=self.stateSize, activation='relu'),
            layers.Dense(24, activation='relu'),
            layers.Dense(self.actionSize, activation='linear')
        ])
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=learningRate))
        return model

    def updateTargetModel(self):
        """Copy the online model's weights into the target model."""
        self.targetModel.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, nextState, done):
        """Append one transition to the replay buffer.

        ``done`` marks transitions whose target must not bootstrap from
        ``nextState`` (see ``replay``).
        """
        self.memory.append((state, action, reward, nextState, done))

    def act(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: Observation batch of one, indexable as ``state[0]`` after
                prediction (the training loop passes shape ``(1, stateSize)``).

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.actionSize)
        actValues = self.model.predict(state, verbose=0)
        # Cast so both branches return the same type (argmax yields np.int64).
        return int(np.argmax(actValues[0]))

    def replay(self):
        """Run one training step on a random mini-batch from the replay buffer.

        Does nothing until the buffer holds at least ``batchSize`` transitions.
        After fitting, the exploration rate is decayed, never dropping below
        ``epsilonMin`` as a result of the decay.
        """
        if len(self.memory) < batchSize:
            return
        miniBatch = random.sample(self.memory, batchSize)
        # Stored states carry a leading batch axis of 1; [0] strips it.
        states = np.array([t[0][0] for t in miniBatch])
        actions = np.array([t[1] for t in miniBatch])
        rewards = np.array([t[2] for t in miniBatch])
        nextStates = np.array([t[3][0] for t in miniBatch])
        dones = np.array([t[4] for t in miniBatch], dtype=bool)

        targets = self.model.predict(states, verbose=0)
        targetsNext = self.targetModel.predict(nextStates, verbose=0)

        # TD target for the action actually taken: the reward alone on terminal
        # transitions, otherwise reward plus the discounted best next-state value.
        bootstrapped = rewards + gamma * np.max(targetsNext, axis=1)
        rowIndex = np.arange(len(miniBatch))
        targets[rowIndex, actions] = np.where(dones, rewards, bootstrapped)

        self.model.fit(states, targets, epochs=1, verbose=0)

        if self.epsilon > epsilonMin:
            # Clamp so the final decay step cannot undershoot the floor.
            self.epsilon = max(epsilonMin, self.epsilon * epsilonDecay)

In [5]:
import gymnasium as gym
import numpy as np
import cv2
import matplotlib.pyplot as plt
from IPython.display import clear_output

def _showFrameInline(env):
    """Render one frame from ``env`` and draw it inline with matplotlib."""
    frame = env.render()
    if frame is None:
        return
    frame = cv2.resize(frame, (400, 300))
    clear_output(wait=True)
    plt.imshow(frame)
    plt.axis('off')
    plt.show()


def trainDqn(episodes, renderMode='none', displayFreq=10):
    """Train a DQN agent on CartPole-v1 and return the score of each episode.

    Args:
        episodes: Number of episodes to run.
        renderMode: ``'human'`` creates the environment with
            ``render_mode='human'``; ``'none'`` disables explicit rendering;
            any other value (e.g. ``'notebook'``) draws frames inline with
            matplotlib.
        displayFreq: Explicit rendering happens on every ``displayFreq``-th
            episode. ``0`` disables it.

    Returns:
        A list with the total reward collected in each episode.
    """
    maxSteps = 500  # upper bound on steps per episode
    envRenderMode = 'human' if renderMode == 'human' else 'rgb_array'
    cartPoleEnv = gym.make('CartPole-v1', render_mode=envRenderMode)
    episodeScores = []

    # try/finally guarantees the environment is closed even if training is
    # interrupted or raises.
    try:
        stateSize = cartPoleEnv.observation_space.shape[0]
        actionSize = cartPoleEnv.action_space.n
        dqnAgent = DQN(stateSize, actionSize)

        for episode in range(episodes):
            resetResult = cartPoleEnv.reset()
            currentState = resetResult[0] if isinstance(resetResult, tuple) else resetResult
            currentState = np.reshape(currentState, [1, stateSize])
            episodeScore = 0

            renderThisEpisode = (
                renderMode != 'none'
                and bool(displayFreq)
                and (episode + 1) % displayFreq == 0
            )

            for _ in range(maxSteps):
                if renderThisEpisode:
                    if renderMode == 'human':
                        cartPoleEnv.render()
                    else:
                        _showFrameInline(cartPoleEnv)

                action = dqnAgent.act(currentState)
                nextState, reward, terminated, truncated = cartPoleEnv.step(action)[:4]
                done = terminated or truncated

                nextState = np.reshape(nextState, [1, stateSize])
                # Store `terminated`, not `done`: a time-limit truncation is not
                # a terminal state, so its target must still bootstrap from
                # nextState. `done` only controls when the episode loop stops.
                dqnAgent.remember(currentState, action, reward, nextState, terminated)
                currentState = nextState
                episodeScore += reward

                if done:
                    break

                dqnAgent.replay()

            # Recorded after the loop so an episode that hits the step cap
            # without `done` is still logged.
            print(f"Episode: {episode + 1}/{episodes}, Score: {episodeScore}, Epsilon: {dqnAgent.epsilon:.2f}")
            episodeScores.append(episodeScore)

            if episode % 10 == 0:
                dqnAgent.updateTargetModel()
    finally:
        cartPoleEnv.close()

    return episodeScores


In [7]:
# # Cell 5: Train and Plot
# episodes = 100

# # Choose renderMode: 'human', 'notebook', or 'none'
# scores = trainDqn(episodes, renderMode='notebook', displayFreq=10)

# # Plot the results
# plt.plot(scores)
# plt.xlabel('Episode')
# plt.ylabel('Score')
# plt.title('Training Progress')
# plt.show()