In [1]:
!pip uninstall tensorflow keras keras-rl2 gymnasium numpy pygame -y

!pip install tensorflow==2.10 keras-rl2 gymnasium pygame numpy


Found existing installation: tensorflow 2.10.0
Uninstalling tensorflow-2.10.0:
  Successfully uninstalled tensorflow-2.10.0
Found existing installation: keras 2.10.0
Uninstalling keras-2.10.0:
  Successfully uninstalled keras-2.10.0
Found existing installation: keras-rl2 1.0.5
Uninstalling keras-rl2-1.0.5:
  Successfully uninstalled keras-rl2-1.0.5
Found existing installation: gymnasium 1.1.1
Uninstalling gymnasium-1.1.1:
  Successfully uninstalled gymnasium-1.1.1
Found existing installation: numpy 1.24.4
Uninstalling numpy-1.24.4:
  Successfully uninstalled numpy-1.24.4
Found existing installation: pygame 2.6.1
Uninstalling pygame-2.6.1:
  Successfully uninstalled pygame-2.6.1
Collecting tensorflow==2.10
  Using cached tensorflow-2.10.0-cp38-cp38-win_amd64.whl.metadata (3.1 kB)
Collecting keras-rl2
  Using cached keras_rl2-1.0.5-py3-none-any.whl.metadata (304 bytes)
Collecting gymnasium
  Using cached gymnasium-1.1.1-py3-none-any.whl.metadata (9.4 kB)
Collecting pygame
  Using cached 

In [1]:
import gymnasium as gym
import numpy as np
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory


In [2]:
# Wrapper to adjust gymnasium env to keras-rl expected API
class EnvWrapper(gym.Wrapper):
    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        return obs  # only observation

    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        done = terminated or truncated
        return obs, reward, done, info

    def render(self, mode='human', **kwargs):
        return self.env.render()

In [3]:
# Create env
env = EnvWrapper(gym.make('CartPole-v1', render_mode='human'))

states = env.observation_space.shape[0]
actions = env.action_space.n

In [4]:
# Build model
def build_model(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))  # Window length = 1
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))  # Q-values
    return model

model = build_model(states, actions)

In [5]:
# Build agent
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
    return dqn

dqn = build_agent(model, actions)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

In [6]:
# Train agent
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

# Test agent
scores = dqn.test(env, nb_episodes=10, visualize=True)
print(f'Average reward: {np.mean(scores.history["episode_reward"])}')

# Save weights
dqn.save_weights('dqn_weights.h5f', overwrite=True)

Training for 50000 steps ...


  updates=self.state_updates,


     9/50000: episode: 1, duration: 0.646s, episode steps:   9, steps per second:  14, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.889 [0.000, 1.000],  loss: --, mae: --, mean_q: --


  updates=self.state_updates,


    19/50000: episode: 2, duration: 0.524s, episode steps:  10, steps per second:  19, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.900 [0.000, 1.000],  loss: 0.722762, mae: 0.827255, mean_q: 0.286972




    36/50000: episode: 3, duration: 0.367s, episode steps:  17, steps per second:  46, episode reward: 17.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.588 [0.000, 1.000],  loss: 0.572289, mae: 0.731530, mean_q: 0.466274
    45/50000: episode: 4, duration: 0.206s, episode steps:   9, steps per second:  44, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.333 [0.000, 1.000],  loss: 0.607518, mae: 0.712155, mean_q: 0.579460
    71/50000: episode: 5, duration: 0.555s, episode steps:  26, steps per second:  47, episode reward: 26.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.615 [0.000, 1.000],  loss: 0.466148, mae: 0.617858, mean_q: 0.689452
    81/50000: episode: 6, duration: 0.224s, episode steps:  10, steps per second:  45, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.900 [0.000, 1.000],  loss: 0.356811, mae: 0.551854, mean_q: 0.906310
    94/50000: episode: 7, duration: 0.286s, episode steps:  13, step

In [None]:
# Reload weights and test again
dqn.load_weights('dqn_weights.h5f')
dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 500.000, steps: 500
Episode 2: reward: 500.000, steps: 500
Episode 3: reward: 500.000, steps: 500
Episode 4: reward: 500.000, steps: 500
Episode 5: reward: 185.000, steps: 185


<keras.callbacks.History at 0x26c4b01f6a0>

: 