# DQN on custom ENV

remember to install your own env!

## Import and Preprocessing

In [1]:
from PIL import Image # to transform the image in the Processor
import numpy as np
import gym

# CNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from tensorflow.keras.optimizers import Adam

# Keras-RL 
from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

In [2]:
env = gym.make('snake:snake-v0')
nb_actions = env.action_space.n
nb_actions

pygame 2.1.2 (SDL 2.0.18, Python 3.7.13)
Hello from the pygame community. https://www.pygame.org/contribute.html


4

In [3]:
# we will use shape 84x84 and window length 4 (timestep of 4 consecutive frames)
IMG_SHAPE = (84, 84)
WINDOW_LENGTH = 4

In [4]:
class ImageProcessor(Processor):
    
    def process_observation(self, observation):
        
        img = Image.fromarray(observation) # convert from numpy array to PIL Image
        img = img.resize(IMG_SHAPE) # resize image
        img = img.convert('L') # convert to grayscale
        img = np.array(img) # convert the image back to a numpy array
        
        return img.astype('uint8') # optimize training time for less memory
    
    def process_state_batch(self, batch):
        
        processed_batch = batch.astype('float32')/255.0 # divded by 255 to compress it into [0, 1]
        
        return processed_batch
    
    def process_reward(self, reward):
        
        return np.clip(reward, -1.0, 1.0) # normalized reward [-1, 1]

In [5]:
# define input shape 4 x 84 x 84
input_shape = (WINDOW_LENGTH, IMG_SHAPE[0], IMG_SHAPE[1])
input_shape

(4, 84, 84)

## Constructing the Network

In [6]:
model = Sequential()

# Convolution2D input_shape should be (Batch, 84, 84, 4)
model.add(Permute((2, 3, 1), input_shape=input_shape))

model.add(Convolution2D(filters=32,
                        kernel_size=(8, 8),
                        strides=(4, 4),
                        kernel_initializer='he_normal'))
model.add(Activation('relu'))

model.add(Convolution2D(filters=64,
                        kernel_size=(4, 4),
                        strides=(2, 2),
                        kernel_initializer='he_normal'))
model.add(Activation('relu'))

model.add(Convolution2D(filters=64,
                        kernel_size=(3, 3),
                        strides=(1, 1),
                        kernel_initializer='he_normal'))
model.add(Activation('relu'))

model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))

# output layer actions = Q(s, a)
model.add(Dense(nb_actions))
model.add(Activation('linear'))

model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute (Permute)            (None, 84, 84, 4)         0         
_________________________________________________________________
conv2d (Conv2D)              (None, 20, 20, 32)        8224      
_________________________________________________________________
activation (Activation)      (None, 20, 20, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
activation_1 (Activation)    (None, 9, 9, 64)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________

## Setting up the Agent

In [7]:
# we define the memory
memory = SequentialMemory(limit=1_000_000,
                          window_length=WINDOW_LENGTH)

In [8]:
# define processor
processor = ImageProcessor()

In [9]:
# We use a LinearAnnealedPolicy to implement the epsilon greedy action selection with decaying epsilon.
# As we need to train for at least a million steps, we set the number of steps to 1,000,000
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              nb_steps=1_000_000,
                              attr='eps',
                              value_max=1.0,
                              value_min=-1.0,
                              value_test=0.05)

In [10]:
# we define agent and compile it. the agent is define with add train_interval=4, we train every 4th step
# besides we clip delta (the error) to 1 both, clipping and train_interval often increase the result

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
               memory=memory,
               processor=processor,
               nb_steps_warmup=50_000,
               gamma=0.99,
               target_model_update=10_000,
               train_interval=4,
               delta_clip=1)

In [11]:
dqn.compile(optimizer=Adam(learning_rate=0.00025),
            metrics=['mae'])

2022-07-14 23:24:47.640203: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-14 23:24:47.641114: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 10. Tune using inter_op_parallelism_threads for best performance.


In [12]:
pwd

'/Users/Chabi/Documents/reinforcment_learning/Codes/10_ENV'

In [13]:
# saving weights as my train
weights_filename = 'test_bartek_dqn_snake_weights.h5f'
checkpoint_weights_filename = 'test_dqn_bartek_' + 'snake' + '_weights_{step}.h5f'
checkpoint_callback = ModelIntervalCheckpoint(checkpoint_weights_filename, interval=100_000)

In [None]:
dqn.fit(env,
        nb_steps=1_500_000,
        callbacks=[checkpoint_callback],
        log_interval=100_000,
        visualize=False)

#after training is done, we save the final weights one more time
dqn.save_weights(weights_filename, overwrite=True)

Training for 1500000 steps ...
Interval 1 (0 steps performed)
2211 episodes - episode_reward: -0.905 [-1.000, 1.000] - loss: 0.003 - mae: 0.178 - mean_q: 0.239 - mean_eps: 0.850 - score: 0.073

Interval 2 (100000 steps performed)
1522 episodes - episode_reward: -0.766 [-1.000, 2.000] - loss: 0.002 - mae: 0.353 - mean_q: 0.486 - mean_eps: 0.700 - score: 0.165

Interval 3 (200000 steps performed)
1628 episodes - episode_reward: -0.189 [-1.000, 5.000] - loss: 0.004 - mae: 0.584 - mean_q: 0.803 - mean_eps: 0.500 - score: 0.513

Interval 4 (300000 steps performed)
1707 episodes - episode_reward: 0.493 [-1.000, 7.000] - loss: 0.008 - mae: 0.886 - mean_q: 1.228 - mean_eps: 0.300 - score: 0.986

Interval 5 (400000 steps performed)
988 episodes - episode_reward: 2.180 [-1.000, 13.000] - loss: 0.013 - mae: 1.270 - mean_q: 1.768 - mean_eps: 0.100 - score: 2.230

Interval 6 (500000 steps performed)
497 episodes - episode_reward: 5.519 [0.000, 15.000] - loss: 0.016 - mae: 1.680 - mean_q: 2.342 - me

In [55]:
# Load weights
model.load_weights('snake_weights/dqn_snake_weights_1200000.h5f')

memory = SequentialMemory(limit=1_000_000,
                          window_length=WINDOW_LENGTH)

policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              nb_steps=1_000_000,
                              attr='eps',
                              value_max=1.0,
                              value_min=-1.0,
                              value_test=0.05)

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
               memory=memory,
               processor=processor,
               nb_steps_warmup=50_000,
               gamma=0.99,
               target_model_update=10_000,
               train_interval=4,
               delta_clip=1)

dqn.compile(optimizer=Adam(learning_rate=0.00025),
            metrics=['mae'])


Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fe6136bee10> and <tensorflow.python.keras.layers.core.Permute object at 0x7fe6136be9d0>).

Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fe5b0439790> and <tensorflow.python.keras.layers.core.Activation object at 0x7fe6136d3190>).

Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fe5b032d550> and <tensorflow.python.keras.layers.core.Activation object at 0x7fe5b032d990>).

Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.core.Dense object at 0x7fe5b0312750> and <tensorflow.python.keras.layers.core.Flatten object at 0x7fe5b0320690>).

Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.core.Dense object at 0x7fe5d0991bd0> and <tensorflow.pytho

In [61]:
env.sleep=0.2 # doesn't move increadibly fast

In [62]:
dqn.test(env,
         nb_episodes=1,
         visualize=True)

Testing for 1 episodes ...
Episode 1: reward: 4.000, steps: 116


<tensorflow.python.keras.callbacks.History at 0x7fe6138c05d0>