# Keras-RL DQN


In [1]:
import numpy as np
import gym

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

In [4]:
from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy

## Create the environment

In [6]:
# Instantiate the Gym environment that the agent is trained and evaluated on.
# The id is kept in its own variable so it can be changed in a single place.
env_name = "Acrobot-v1"
env = gym.make(env_name)

In [7]:
# Read the observation shape and the number of discrete actions from the
# environment's spaces.
# NOTE: despite its name, `num_observations` holds the observation *shape*
# (a tuple), not a count; it is concatenated onto a tuple when the network's
# input shape is built.
num_observations = env.observation_space.shape
num_actions = env.action_space.n

print(f"Action Space: {env.action_space.n}")
print(f"Observation Space: {num_observations}")

# Fail fast if the notebook is run against an environment whose spaces differ
# from the 3-action / 6-dimensional layout the rest of the cells were run with.
assert (num_actions, num_observations) == (3, (6,)), "Wrong environment!"

Action Space: 3
Observation Space: (6,)


## Build the neural network (ANN)

In [9]:
# Q-network: maps an observation window to one output value per action.
#
# The input shape is (1,) + observation shape; the leading 1 corresponds to
# the window_length=1 used by the replay memory, and Flatten collapses that
# window into a single feature vector.
# Three hidden Dense(64) + ReLU stages are followed by a linear output layer
# with `num_actions` units.
model = Sequential([
    Flatten(input_shape=(1,) + num_observations),
    Dense(64),
    Activation('relu'),
    Dense(64),
    Activation('relu'),
    Dense(64),
    Activation('relu'),
    Dense(num_actions),
    Activation('linear'),
])

# `summary()` prints the table itself and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 6)                 0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                448       
_________________________________________________________________
activation_4 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 64)                4160      
_________________________________________________________________
activation_5 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 64)                4160      
_________________________________________________________________
activation_6 (Activation)    (None, 64)               

## Configure the DQN agent

In [10]:
# Replay memory for the agent: capped at 50,000 entries, with a window length
# of 1 (matching the leading 1 in the network's input shape).
memory = SequentialMemory(window_length=1, limit=50_000)

In [11]:
# Exploration schedule: wrap an epsilon-greedy policy and anneal its `eps`
# attribute from 1.0 down to 0.1 over 150,000 steps (the same number of steps
# the agent is trained for).
# value_test=0.05 is presumably the epsilon applied while testing -- confirm
# against the keras-rl documentation.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.0,
    value_min=0.1,
    value_test=0.05,
    nb_steps=150_000,
)

In [12]:
# Assemble the agent from the Q-network, the replay memory and the exploration
# policy defined in the cells above.
# nb_steps_warmup / target_model_update are both 1000 steps; gamma is the
# discount factor and batch_size the number of samples per training batch
# (semantics per the keras-rl DQNAgent API -- verify there before tuning).
dqn = DQNAgent(
    model=model,
    nb_actions=num_actions,
    memory=memory,
    policy=policy,
    gamma=0.99,
    batch_size=32,
    nb_steps_warmup=1000,
    target_model_update=1000,
)

In [14]:
# Compile the agent with an Adam optimizer (learning rate 1e-3) and report the
# mean absolute error alongside the loss during training.
dqn.compile(
    optimizer=Adam(learning_rate=1e-3),
    metrics=['mae'],
)

## Fit the model

In [15]:
# Train the agent for 150,000 environment steps without rendering.
#
# `fit` returns a Keras History object. Binding it to `history` keeps the
# training record available for later inspection/plotting instead of throwing
# it away after a long run, and stops the bare `<...History at 0x...>` repr
# from being echoed as the cell's output.
history = dqn.fit(
    env,
    nb_steps=150_000,
    visualize=False,
    verbose=1,
)

Training for 150000 steps ...
Interval 1 (0 steps performed)
  999/10000 [=>............................] - ETA: 6s - reward: -1.0000

  if not isinstance(values, collections.Sequence):


20 episodes - episode_reward: -500.000 [-500.000, -500.000] - loss: 0.029 - mae: 3.440 - mean_q: -5.070 - mean_eps: 0.967

Interval 2 (10000 steps performed)
20 episodes - episode_reward: -497.700 [-500.000, -454.000] - loss: 0.188 - mae: 9.481 - mean_q: -14.020 - mean_eps: 0.910

Interval 3 (20000 steps performed)
22 episodes - episode_reward: -455.500 [-500.000, -239.000] - loss: 0.402 - mae: 14.335 - mean_q: -21.203 - mean_eps: 0.850

Interval 4 (30000 steps performed)
23 episodes - episode_reward: -417.783 [-500.000, -230.000] - loss: 0.678 - mae: 17.856 - mean_q: -26.402 - mean_eps: 0.790

Interval 5 (40000 steps performed)
26 episodes - episode_reward: -391.462 [-500.000, -223.000] - loss: 0.812 - mae: 20.593 - mean_q: -30.447 - mean_eps: 0.730

Interval 6 (50000 steps performed)
32 episodes - episode_reward: -316.469 [-500.000, -178.000] - loss: 0.961 - mae: 22.671 - mean_q: -33.465 - mean_eps: 0.670

Interval 7 (60000 steps performed)
37 episodes - episode_reward: -267.919 [-50

<tensorflow.python.keras.callbacks.History at 0x7ff463bbb4d0>

## Evaluate the model

In [16]:
# Run 5 evaluation episodes with rendering enabled.
#
# The environment is closed in a `finally` clause so that it is still released
# when `dqn.test` raises (e.g. rendering is unavailable on a headless machine
# or the run is interrupted).
try:
    dqn.test(
        env,
        nb_episodes=5,
        visualize=True,
    )
finally:
    env.close()

Testing for 5 episodes ...
Episode 1: reward: -62.000, steps: 63
Episode 2: reward: -64.000, steps: 65
Episode 3: reward: -72.000, steps: 73
Episode 4: reward: -81.000, steps: 82
Episode 5: reward: -70.000, steps: 71
