In [None]:
!apt-get install -y xvfb python-opengl > /dev/null 2>&1

In [None]:
!apt-get install -y xvfb

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
xvfb is already the newest version (2:21.1.4-2ubuntu1.7~22.04.13).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


In [None]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1

In [None]:
import numpy as np
import tensorflow as tf
import keras
from collections import deque
import random
import gym
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
# NOTE(review): re-creates the `np.bool8` alias on the numpy module; presumably
# the installed gym still references `np.bool8` while the installed NumPy no
# longer provides it -- confirm against the pinned versions.
np.bool8 = np.bool_
# Start a 400x300 pyvirtualdisplay display with visible=0 (presumably so that
# rendering works on a machine without a physical screen -- confirm).
display = Display(visible=0, size=(400, 300))
display.start()

# hyperparameters
# alpha: learning rate passed to the optimizer in NN().
# gamma: discount factor applied to the bootstrapped next-state value in train_model().
alpha, gamma = 0.001, 0.99
# epsilon: probability of taking a random action; mutated by train_model() via `global`.
# epsilon_min: level below which train_model() stops decaying epsilon.
epsilon, epsilon_min = 1.0, 0.1
# Multiplicative decay applied to epsilon once per episode.
epsilon_decay = 0.995
# Number of transitions sampled from the replay buffer for each training step.
batch_size = 32
# Capacity handed to ReplayBuffer.
memory_size = 10000
# Upper bound on the number of training episodes.
episodes = 500

env = gym.make("CartPole-v0")
# Length of the observation vector; used as the network input width.
stateSpaceSize = env.observation_space.shape[0]
# Number of discrete actions; used as the network output width.
actionSpaceSize = env.action_space.n

# 3-layer artificial neural network
def NN():
  """Build and compile the Q-value network.

  The network maps a state vector of length ``stateSpaceSize`` to one linear
  output per action (``actionSpaceSize`` outputs) through two hidden ReLU
  layers of 32 units each. It is compiled with mean-squared-error loss and
  plain SGD using the module-level learning rate ``alpha``.

  Returns:
    The compiled ``keras.Model``.
  """
  state_input = keras.Input(shape=(stateSpaceSize,), name="states")

  # Two identical hidden layers, stacked in order.
  hidden = state_input
  for _ in range(2):
    hidden = keras.layers.Dense(32, activation="relu")(hidden)

  # Linear head: one unbounded Q-value estimate per action.
  q_values = keras.layers.Dense(actionSpaceSize, activation="linear")(hidden)

  network = keras.Model(inputs=state_input, outputs=q_values)
  optimizer = keras.optimizers.SGD(learning_rate=alpha)
  network.compile(loss="mse", optimizer=optimizer)
  return network

# buffer to save experiences
class ReplayBuffer:
  """Bounded FIFO memory of (state, action, reward, next_state, done) tuples."""

  def __init__(self, capacity):
    """Create an empty buffer that keeps at most ``capacity`` transitions."""
    # A deque with maxlen drops its oldest entry when a new one is appended
    # to a full buffer.
    self.buffer = deque(maxlen=capacity)

  def add(self, state, action, reward, next_state, done):
    """Store one transition as a 5-tuple at the newest end of the buffer."""
    transition = (state, action, reward, next_state, done)
    self.buffer.append(transition)

  def sample(self, batch_size):
    """Return a list of ``batch_size`` transitions drawn without replacement.

    Raises:
      ValueError: if ``batch_size`` exceeds the number of stored transitions.
    """
    return random.sample(self.buffer, k=batch_size)

  def __len__(self):
    """Return the number of transitions currently stored."""
    return len(self.buffer)

# Single Q-network: train_model() uses it both to pick actions and to compute
# the bootstrapped targets (there is no separate target network).
model = NN()
# Experience replay memory holding at most memory_size transitions.
replay_buffer = ReplayBuffer(memory_size)

def train_model():
  """Train the module-level Q-network ``model`` on ``env`` with epsilon-greedy DQN.

  For each of up to ``episodes`` episodes the agent acts epsilon-greedily,
  stores every transition in ``replay_buffer`` and, once the buffer holds at
  least ``batch_size`` transitions, fits ``model`` on one random minibatch per
  environment step. Targets are ``r`` for transitions that ended the episode
  and ``r + gamma * max_a Q(s', a)`` otherwise, both computed with ``model``
  itself.

  Side effects:
    * updates the weights of ``model`` and the contents of ``replay_buffer``;
    * decays the module-level ``epsilon`` once per episode, never letting the
      decay step push it below ``epsilon_min``;
    * prints one progress line per episode.

  Training stops early after the first episode whose total reward exceeds 100.
  Both the 4-tuple (``done``) and the 5-tuple (``terminated``/``truncated``)
  ``env.step`` signatures are accepted, as are both ``env.reset`` return styles.
  """
  global epsilon
  batch_rows = np.arange(batch_size)  # row index for the vectorised target update

  for episode in range(episodes):
    # Older gym returns the observation only; newer gym returns (observation, info).
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = np.reshape(state, [1, stateSpaceSize])
    total_reward = 0
    done = False

    while not done:
      # epsilon-greedy: exploit with probability 1 - epsilon, otherwise explore
      if np.random.rand() > epsilon:
        Q = model.predict(state, verbose=0)
        action = int(np.argmax(Q[0]))
      else:
        action = int(np.random.choice(actionSpaceSize))

      step_result = env.step(action)
      if len(step_result) == 5:
        # Newer API: the episode ends on either termination or truncation.
        next_state, reward, terminated, truncated, _ = step_result
        done = bool(terminated or truncated)
      else:
        next_state, reward, done, _ = step_result
      next_state = np.reshape(next_state, [1, stateSpaceSize])
      total_reward += reward

      replay_buffer.add(state, action, reward, next_state, done)
      state = next_state

      if len(replay_buffer) >= batch_size:
        batch = replay_buffer.sample(batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Each stored state has shape (1, stateSpaceSize); stacking them gives
        # a (batch_size, stateSpaceSize) array.
        states = np.vstack(states)
        next_states = np.vstack(next_states)
        actions = np.asarray(actions)
        rewards = np.asarray(rewards)
        dones = np.asarray(dones, dtype=bool)

        # function approximation: start from the current predictions so that
        # only the Q-value of the action actually taken contributes to the loss.
        target_Q = model.predict(states, verbose=0)
        next_Q = model.predict(next_states, verbose=0)
        # No bootstrapping from the next state when the transition ended the episode.
        bootstrap = np.where(dones, 0.0, gamma * np.max(next_Q, axis=1))
        target_Q[batch_rows, actions] = rewards + bootstrap

        model.train_on_batch(states, target_Q)

    # epsilon decay, clamped so the multiplicative step cannot undershoot the floor
    if epsilon > epsilon_min:
      epsilon = max(epsilon_min, epsilon * epsilon_decay)

    print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Epsilon: {epsilon:.2f}")
    if total_reward > 100:
      break

# Run training now; progress is printed once per episode.
train_model()

  logger.warn(
  deprecation(
  deprecation(


Episode: 1, Total Reward: 12.0, Epsilon: 0.99
Episode: 2, Total Reward: 12.0, Epsilon: 0.99
Episode: 3, Total Reward: 32.0, Epsilon: 0.99
Episode: 4, Total Reward: 42.0, Epsilon: 0.98
Episode: 5, Total Reward: 36.0, Epsilon: 0.98
Episode: 6, Total Reward: 20.0, Epsilon: 0.97
Episode: 7, Total Reward: 15.0, Epsilon: 0.97
Episode: 8, Total Reward: 20.0, Epsilon: 0.96
Episode: 9, Total Reward: 15.0, Epsilon: 0.96
Episode: 10, Total Reward: 11.0, Epsilon: 0.95
Episode: 11, Total Reward: 29.0, Epsilon: 0.95
Episode: 12, Total Reward: 19.0, Epsilon: 0.94
Episode: 13, Total Reward: 26.0, Epsilon: 0.94
Episode: 14, Total Reward: 15.0, Epsilon: 0.93
Episode: 15, Total Reward: 13.0, Epsilon: 0.93
Episode: 16, Total Reward: 56.0, Epsilon: 0.92
Episode: 17, Total Reward: 24.0, Epsilon: 0.92
Episode: 18, Total Reward: 12.0, Epsilon: 0.91
Episode: 19, Total Reward: 46.0, Epsilon: 0.91
Episode: 20, Total Reward: 56.0, Epsilon: 0.90
Episode: 21, Total Reward: 35.0, Epsilon: 0.90
Episode: 22, Total Rew

As shown above, I tried to code a DQN to solve the cart pole problem.
A 3-layer fully connected neural network is built to approximate the Q-value function.
Stochastic gradient descent is used as the model optimizer (actually I think perhaps Adam is a better choice).
From the training process, we can tell that at the beginning, we are mainly exploring, and thus the rewards are low.
As epsilon decreases, the agent explores less and the rewards become higher.
To save time, I chose to break out of the training loop as soon as an episode's total reward exceeds 100.
Better results can be achieved with more episodes.