In [1]:
#Image utils
from PIL import Image  
import numpy as np
import gym

# Model imports 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from tensorflow.keras.optimizers import Adam

# Keras-RL
from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

In [2]:
# Shared settings: frame size used by the observation processor and the
# network, and the number of stacked frames that make up one state.
window_length = 4
img_shape = (84, 84)

# Snake environment and the number of actions it exposes.
env = gym.make("snake:snake-v0")
nb_actions = env.action_space.n

pygame 2.0.1 (SDL 2.0.14, Python 3.7.16)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
class ImageProcessor(Processor):
    """Keras-RL processor that shrinks raw frames into compact network inputs.

    Observations are resized (and optionally converted to grayscale) before
    they are stored, state batches are rescaled to [0, 1] when they are fed to
    the network, and rewards are clipped to [-1, 1].
    """

    def __init__(self, img_shape, gray_scale=True):
        """Store the preprocessing settings.

        Args:
            img_shape: ``(height, width)`` of the processed frame, i.e. the
                first two dimensions of the array returned by
                ``process_observation``. This matches how ``create_model``
                uses ``img_shape`` to build the network input shape.
            gray_scale: when true, frames are converted to grayscale.
        """
        self.img_shape = img_shape
        self.gray_scale = gray_scale

    def process_observation(self, observation):
        """Resize one raw frame and return it as a ``uint8`` array.

        Args:
            observation: image array accepted by ``PIL.Image.fromarray``
                (presumably an RGB ``uint8`` frame -- confirm against the
                environment).

        Returns:
            ``numpy.ndarray`` of dtype ``uint8`` whose first two dimensions
            are ``(img_shape[0], img_shape[1])``.
        """
        # PIL sizes are (width, height) while numpy arrays are (rows, cols),
        # so the order has to be swapped. Passing img_shape unchanged only
        # works for square shapes and would transpose any other frame size.
        height, width = self.img_shape[0], self.img_shape[1]

        img = Image.fromarray(observation)
        img = img.resize((width, height))
        if self.gray_scale:
            img = img.convert("L")

        # uint8 keeps the experience memory small; scaling to float happens
        # later, per batch, in process_state_batch.
        return np.array(img).astype('uint8')

    def process_state_batch(self, batch):
        """Convert a batch of stored states to ``float32`` values in [0, 1]."""
        processed_batch = batch.astype('float32') / 255.
        return processed_batch

    def process_reward(self, reward):
        """Clip the reward to [-1, 1] to keep the training targets bounded."""
        return np.clip(reward, -1., 1.)

# MODEL 

In [4]:
def create_model(window_length, img_shape, nb_actions):
    """Build the convolutional Q-network used by the DQN agent.

    The layout follows the network popularised by DeepMind's Deep Q-Network
    work on Atari 2600 games: a stack of frames goes in, one Q-value per
    action comes out.

    Args:
        window_length: number of stacked frames that form one state.
        img_shape: two-element sequence giving the spatial size of a frame.
        nb_actions: number of units in the final linear layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    rows, cols = img_shape[0], img_shape[1]

    # (filters, kernel size, stride) of each convolution, in order.
    conv_specs = (
        (32, 8, 4),
        (64, 4, 2),
        (64, 3, 1),
    )
    # (units, activation) of each fully connected layer, in order.
    dense_specs = (
        (512, 'relu'),
        (nb_actions, 'linear'),
    )

    net = Sequential()
    # States arrive as (window, rows, cols); move the frame axis last so the
    # convolutions see the stacked frames as channels.
    net.add(Permute((2, 3, 1), input_shape=(window_length, rows, cols)))

    for n_filters, kernel, stride in conv_specs:
        net.add(Convolution2D(n_filters, (kernel, kernel), strides=(stride, stride),
                              kernel_initializer='he_normal'))
        net.add(Activation('relu'))

    net.add(Flatten())
    for n_units, activation in dense_specs:
        net.add(Dense(n_units))
        net.add(Activation(activation))

    return net

# Build the Q-network for the configured frame stack and action count.
model = create_model(window_length, img_shape, nb_actions)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only append a stray "None" to the output.
model.summary()


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute (Permute)            (None, 84, 84, 4)         0         
_________________________________________________________________
conv2d (Conv2D)              (None, 20, 20, 32)        8224      
_________________________________________________________________
activation (Activation)      (None, 20, 20, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
activation_1 (Activation)    (None, 9, 9, 64)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________

# AGENT

## Training

In [5]:
# Training setup. If you only want to watch a trained agent, skip this cell
# and run the cell in the Testing section instead.

processor = ImageProcessor(img_shape, gray_scale=True)
memory = SequentialMemory(limit=1000000, window_length=window_length)

# Epsilon-greedy exploration with epsilon annealed linearly between
# value_max and value_min over nb_steps; value_test is used when testing.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=1000000,
)

dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    policy=policy,
    processor=processor,
    gamma=.99,
    nb_steps_warmup=50000,
    train_interval=4,
    target_model_update=10000,
    delta_clip=1,
)
dqn.compile(Adam(learning_rate=.00025), metrics=['mae'])

# Final weights file, plus a template for the periodic checkpoints
# ({step} is left in the name as a placeholder for the callback).
weights_filename = 'weights/test_dqn_snake_weights.h5f'
checkpoint_weights_filename = 'weights/test_dqn_snake_weights_{step}.h5f'
checkpoint_callback = ModelIntervalCheckpoint(checkpoint_weights_filename, interval=100000)



In [6]:
# Train the agent; checkpoint_callback is passed in so it can write
# intermediate weights during the run.
dqn.fit(
    env,
    nb_steps=1500000,
    visualize=False,
    log_interval=100000,
    callbacks=[checkpoint_callback],
)

# Store the final weights once training has finished, replacing any file
# left over from an earlier run.
dqn.save_weights(weights_filename, overwrite=True)

## Testing

In [8]:
# Restore the trained weights into `model`.
# To try a different saved version, pass its path without the trailing
# '.index' suffix.
model.load_weights("./weights/test_dqn_snake_weights.h5f")

# Rebuild the agent around the loaded model so this cell works even when the
# training cells were skipped.
memory = SequentialMemory(limit=1000000, window_length=window_length)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1, value_min=.1, value_test=.05,
                              nb_steps=100000)

processor = ImageProcessor(img_shape, gray_scale=True)

dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000)
dqn.compile(Adam(learning_rate=.00025), metrics=['mae'])

# gym.make() can return the environment wrapped (e.g. in TimeLimit). Assigning
# to the wrapper would only create an attribute on the wrapper, so the value
# is set on the underlying environment instead. `unwrapped` is the env itself
# when no wrapper is present.
# NOTE(review): `sleep` is presumably the per-frame delay in seconds used by
# the snake env when rendering -- confirm against the environment's source.
env.unwrapped.sleep = 0.02

In [9]:
# Play one rendered episode with the trained agent. The trailing semicolon
# keeps the returned History object's repr out of the cell output.
dqn.test(env, nb_episodes=1, visualize=True);

Testing for 1 episodes ...
Episode 1: reward: 22.000, steps: 548


<tensorflow.python.keras.callbacks.History at 0x1da96325048>