# DQN with Images

## Imports and preprocessing

In [1]:
# Basic
from PIL import Image
import numpy as np
import gym

# Convolutional Backbone Network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from tensorflow.keras.optimizers import Adam

# Keras-RL
from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

In [2]:
from gym.utils import play

pygame 2.1.2 (SDL 2.0.18, Python 3.7.13)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [4]:
# Create the BreakoutDeterministic-v4 environment with on-screen ('human') rendering.
env = gym.make("BreakoutDeterministic-v4", render_mode='human') #https://github.com/openai/gym/issues/1280
# Number of actions exposed by the environment; used later as the width of the
# network's output layer and as nb_actions for the agent.
nb_action = env.action_space.n
nb_action

4

In [3]:
# Preprocessing settings: frames are resized to 84x84 and each state is a
# window of 4 consecutive frames.

IMG_SHAPE = (84, 84)  # target size passed to PIL's resize in ImageProcessor
WINDOW_LENGTH = 4  # frames per state; used by SequentialMemory and input_shape

In [None]:
# Launch gym's play utility on the environment (manual check; nothing below depends on it).
# NOTE(review): env was created with render_mode='human' -- confirm that the installed
# gym version's play() accepts this render mode.
play.play(env)

In [4]:
# Based on those settings we create our processor. It is the same processor as in the last notebook,
# with the addition that it normalizes the data into the [0, 1] interval, which often decreases
# the necessary training time.
# We perform this normalization in the process_state_batch function,
# which is only executed on the current batch and not on the complete replay memory.
# Storing the frames as uint8 instead of float32 decreases RAM usage by a factor of 4.
# Additionally, we clip the reward to the interval [-1, 1], which might speed up the training.

In [5]:
class ImageProcessor(Processor):
    """Keras-RL processor that prepares frames, state batches and rewards.

    Frames are kept as small uint8 grayscale images; the conversion to
    float32 in [0, 1] is applied per batch in ``process_state_batch``.
    """

    def process_observation(self, observation):
        """Resize a raw frame to IMG_SHAPE, convert it to grayscale, return it as uint8."""
        # numpy array -> PIL image -> resized -> single-channel ('L') image
        frame = Image.fromarray(observation).resize(IMG_SHAPE).convert('L')
        # back to numpy; uint8 keeps the stored frames compact
        return np.array(frame).astype('uint8')

    def process_state_batch(self, batch):
        """Cast a batch to float32 and divide by 255 to scale it into [0, 1]."""
        return batch.astype('float32') / 255.0

    def process_reward(self, reward):
        """Clip the reward to the range [-1, 1]."""
        return np.clip(reward, -1.0, 1.0)

In [6]:
# Network input: WINDOW_LENGTH stacked frames of IMG_SHAPE -> 4 x 84 x 84
input_shape = (WINDOW_LENGTH, *IMG_SHAPE)
input_shape

(4, 84, 84)

## Constructing the Network 

In [7]:
# Convolution2D input_shape should be (Batch, 84, 84, 4)

In [8]:
# Q-network: permute -> three convolution stages -> dense head
model = Sequential()

# Move the frame axis to the end: (4, 84, 84) -> (84, 84, 4)
# https://keras.io/api/layers/reshaping_layers/permute/
model.add(Permute((2, 3, 1), input_shape=input_shape))

# (filters, kernel side, stride) of each convolution stage, in order
conv_specs = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]
for n_filters, kernel_side, stride in conv_specs:
    model.add(Convolution2D(filters=n_filters,
                            kernel_size=(kernel_side, kernel_side),
                            strides=(stride, stride),
                            kernel_initializer='he_normal'))
    model.add(Activation('relu'))

# Dense head on top of the flattened feature maps
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))

# Output layer: one linear unit per action ---> Q(s, a)
model.add(Dense(nb_action))
model.add(Activation('linear'))

model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute (Permute)            (None, 84, 84, 4)         0         
_________________________________________________________________
conv2d (Conv2D)              (None, 20, 20, 32)        8224      
_________________________________________________________________
activation (Activation)      (None, 20, 20, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
activation_1 (Activation)    (None, 9, 9, 64)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________

## Setting up the Agent

In [9]:
# Replay memory: we again use SequentialMemory, this time with window_length = 4
# (WINDOW_LENGTH) and limit = 1,000,000.
memory = SequentialMemory(limit=1_000_000,
                          window_length=WINDOW_LENGTH)

In [10]:
# Instantiate the ImageProcessor defined earlier; it is passed to the DQN agent as `processor`.
processor = ImageProcessor()

In [11]:
# We again use a LinearAnnealedPolicy to implement epsilon-greedy action selection with a decaying epsilon.
# As we need to train for at least a million steps, epsilon is annealed over 1,000,000 steps.
# Epsilon is a probability, so the floor (value_min) must stay inside [0, 1]. With a negative floor the
# linear schedule would cross zero half-way through annealing and exploration would stop entirely.
# value_min=0.1 keeps 10% random actions once annealing has finished.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              nb_steps=1_000_000,
                              attr='eps',
                              value_max=1.0,
                              value_min=0.1,
                              value_test=0.05)

In [12]:
# Define the agent. It is set up the same way as in the previous lectures, with two additions:
#   - train_interval=4: we train on every 4th step
#   - delta_clip=1: the delta (the error) is clipped to 1
# Both clipping and train_interval often improve the result.

dqn = DQNAgent(model=model,
               nb_actions=nb_action,
               policy=policy,
               memory=memory,
               processor=processor,
               nb_steps_warmup=50_000,
               gamma=.99,
               target_model_update=10_000,
               train_interval=4,
               delta_clip=1)

In [13]:
# Compile the agent with the Adam optimizer (learning rate 0.00025) and 'mae' as a tracked metric.
dqn.compile(optimizer=Adam(learning_rate=0.00025),
            metrics=['mae'])

2022-07-12 18:41:45.288736: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-12 18:41:45.289529: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 10. Tune using inter_op_parallelism_threads for best performance.


In [14]:
# Show the kernel's current working directory (IPython automagic for %pwd).
pwd

'/Users/Chabi/Documents/reinforcment_learning/Codes/09_DQN_Images'

In [15]:
# File names for the final weights and for the periodic checkpoints.
# NOTE(review): `weights_filenamne` (sic) is not used anywhere in the visible notebook --
# presumably meant for a later save of the weights; confirm before renaming or removing it.
weights_filenamne = 'DQN_B0.h5f'
checkpoint_filename = 'DQN_CHECKPOINT.h5f'

# Checkpoint callback configured with interval=100_000; it is passed to dqn.fit below.
checkpoint_callback = ModelIntervalCheckpoint(checkpoint_filename,
                                              interval=100_000)

In [16]:
# Load previously saved weights (checkpoint file with suffix 900000) into the model.
# The path is relative to the notebook's working directory (the 09_DQN_Images folder),
# so the notebook does not depend on one user's home directory.
model.load_weights("weights/dqn_BreakoutDeterministic-v4_weights_900000.h5f")


Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fa7b8308990> and <tensorflow.python.keras.layers.core.Permute object at 0x7fa768a82090>).

Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fa768ac2f10> and <tensorflow.python.keras.layers.core.Activation object at 0x7fa768a82c90>).

Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fa7b8456390> and <tensorflow.python.keras.layers.core.Activation object at 0x7fa7b8430f10>).

Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.core.Dense object at 0x7fa7b84a8a50> and <tensorflow.python.keras.layers.core.Flatten object at 0x7fa7b8477810>).

Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.core.Dense object at 0x7fa7b84d7dd0> and <tensorflow.pytho

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa78cb14610>

In [17]:
# Resume training from the loaded weights: start with a smaller epsilon (0.2 instead of 1.0).
# Epsilon is a probability, so the floor (value_min) must stay inside [0, 1]. With a negative floor the
# linear schedule would fall below zero early in the run and exploration would stop entirely.
# value_min=0.1 keeps 10% random actions once annealing has finished.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              nb_steps=1_000_000,
                              attr='eps',
                              value_max=0.2, # for this script
                              value_min=0.1,
                              value_test=0.05)


# Re-create the DQN agent around the model (now holding the loaded weights) with the
# updated policy, using the same hyperparameters as before, and compile it.
dqn = DQNAgent(model=model,
               nb_actions=nb_action,
               policy=policy,
               memory=memory,
               processor=processor,
               nb_steps_warmup=50_000,
               gamma=.99,
               target_model_update=10_000,
               train_interval=4,
               delta_clip=1)


dqn.compile(optimizer=Adam(learning_rate=0.00025),
            metrics=['mae'])

In [None]:
# Smoke test: run a short fit of 1000 steps to check that everything is wired up correctly.
# Progress is logged every 500 steps and rendering is disabled via visualize=False.
# NOTE(review): nb_steps (1000) is far below the agent's nb_steps_warmup (50_000) and the
# checkpoint interval (100_000) -- presumably no gradient update or checkpoint happens here; confirm.
dqn.fit(env,
        nb_steps=1000,
        callbacks=[checkpoint_callback],
        log_interval=500,
        visualize=False)

Training for 1000 steps ...
Interval 1 (0 steps performed)
Interval 2 (500 steps performed)

In [None]:
#dqn.test(env,
#         nb_episodes=1,
#         visualize=True)

In [None]:
# Final model: load the weights from the 1200000 checkpoint file and evaluate the agent

In [None]:
# Load the weights from the checkpoint file with suffix 1200000 into the model.
# The path is relative to the notebook's working directory (the 09_DQN_Images folder),
# so the notebook does not depend on one user's home directory.
model.load_weights("weights/dqn_BreakoutDeterministic-v4_weights_1200000.h5f")

In [None]:
# Epsilon-greedy policy without annealing, constructed with 0.1 (presumably the epsilon value -- confirm).
policy = EpsGreedyQPolicy(0.1)

In [None]:
# Build the final agent around the model and the fixed epsilon-greedy policy.
# Only model, action count, policy, memory and processor are passed; every other
# DQNAgent setting is left at the library default.
# NOTE(review): dqn.test may select actions through the agent's separate test policy
# rather than `policy` -- confirm that the epsilon-greedy policy is actually used during testing.
dqn = DQNAgent(model=model,
               nb_actions=nb_action,
               policy=policy,
               memory=memory,
               processor=processor)

# Compile with the same optimizer settings and metric as the training agents.
dqn.compile(optimizer=Adam(learning_rate=0.00025),
            metrics=['mae'])

In [None]:
# Evaluate the final agent for 5 episodes with visualization turned on.
dqn.test(env, nb_episodes=5, visualize=True)