In [1]:
# Section in which we build our CNN - Brain

# Importing the libraries
import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from keras.optimizers import Adam

nRows = 105                  #Number of rows of a frame as fed to the CNN (presumably half of a 210-row raw frame - TODO confirm).
nColumns = 80                #Number of columns of a frame as fed to the CNN (presumably half of a 160-column raw frame - TODO confirm).

# Creating the Brain class
class Brain():
    """Convolutional Q-network: maps a stack of game frames to one Q-value per action.

    Layers: Conv2D(32, 3x3, ReLU) -> MaxPooling2D(2x2) -> Conv2D(64, 2x2, ReLU)
    -> Flatten -> Dense(256, ReLU) -> Dense(numOutputs, linear).
    The model is compiled with mean-squared-error loss and the Adam optimizer.

    Parameters
    ----------
    iS : tuple
        Input shape of one state, (rows, columns, channels).
    lr : float
        Learning rate handed to the Adam optimizer.
    """

    def __init__(self, iS = (nRows,nColumns,3), lr = 0.0005):  # iS = input shape    lr = learning rate
        self.learningRate = lr
        self.inputShape = iS
        # Three actions in total. The training loop calls env.step(action + 1), so they
        # correspond to environment actions 1-3 (FIRE, RIGHT, LEFT in env.get_action_meanings()).
        self.numOutputs = 3
        self.model = Sequential()
        # Adding layers to the model
        # First convolution layer: 32 filters of size 3x3 with the ReLU activation function.
        self.model.add(Conv2D(32, (3,3), activation = 'relu', input_shape = self.inputShape))
        # Max pooling with a 2x2 window, which halves the height and width of the feature maps.
        self.model.add(MaxPooling2D((2,2)))
        # Second convolution layer: 64 filters of size 2x2, again with ReLU.
        self.model.add(Conv2D(64, (2,2), activation = 'relu'))
        # Flatten the feature maps into a 1D vector that the dense layers can consume.
        self.model.add(Flatten())
        # Fully connected hidden layer with 256 neurons and ReLU.
        self.model.add(Dense(units = 256, activation = 'relu'))
        # Output layer: one unit per action. No activation is given, so it is linear (raw Q-values).
        self.model.add(Dense(units = self.numOutputs))
        # Compiling the model: loss is how the error is measured, optimizer is how weights are updated.
        # NOTE(review): `lr` is the keyword of older Keras releases; newer ones name it
        # `learning_rate` - confirm against the installed version before changing it.
        self.model.compile(loss = 'mean_squared_error', optimizer = Adam(lr = self.learningRate))

    # Making a function that will load a model from a file
    def loadModel(self, filepath):
        """Replace self.model with the model stored at `filepath` and return it.

        This is defined at class level. It used to be nested inside __init__, which made
        it a local function that was never reachable as a method.
        """
        self.model = load_model(filepath)
        return self.model

Using TensorFlow backend.


In [2]:
# Section containing the environment (Breakout game) - Environment
import gym
import atari_py
# Create the Breakout environment that the training cell steps through.
env = gym.make("Breakout-v4")
# Show the names of the available actions (kept as the last expression so the cell displays them).
env.get_action_meanings()

['NOOP', 'FIRE', 'RIGHT', 'LEFT']

In [3]:
# Section that builds the Experience Replay Memory - DQN

# Importing the libraries
import numpy as np

# IMPLEMENTING DEEP Q-LEARNING WITH EXPERIENCE REPLAY

class Dqn(object):
    """Experience replay memory that also builds Q-learning training batches.

    Each memory entry is ``[transition, game_over]`` where ``transition`` is
    ``[current_state, action, reward, next_state]``. States are expected to carry
    a leading batch axis of length 1 (e.g. shape ``(1, rows, columns, frames)``).
    """

    # INTRODUCING AND INITIALIZING ALL THE PARAMETERS AND VARIABLES OF THE DQN
    def __init__(self, max_memory = 100, discount = 0.9):
        """Create an empty memory.

        max_memory -- maximum number of transitions kept; the oldest are dropped first.
        discount   -- discount factor applied to the best next-state Q-value.
        """
        self.memory = list()
        self.max_memory = max_memory
        self.discount = discount

    # MAKING A METHOD THAT BUILDS THE MEMORY IN EXPERIENCE REPLAY
    def remember(self, transition, game_over):
        """Append a transition and evict the oldest one once max_memory is exceeded."""
        self.memory.append([transition, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    # MAKING A METHOD THAT BUILDS TWO BATCHES OF INPUTS AND TARGETS BY EXTRACTING TRANSITIONS FROM THE MEMORY
    def get_batch(self, model, batch_size = 10):
        """Sample transitions (with replacement) and build training inputs and targets.

        model      -- object exposing ``output_shape`` and ``predict(batch)``.
        batch_size -- number of transitions to draw; capped at the memory size.

        Returns ``(inputs, targets)``: the sampled current states and, per sample, the
        model's current Q-values with the entry of the taken action replaced by
        ``reward`` (terminal transition) or ``reward + discount * max Q(next_state)``.

        Raises IndexError if the memory is empty.
        """
        len_memory = len(self.memory)
        if len_memory == 0:
            raise IndexError('cannot build a batch from an empty replay memory')
        num_outputs = model.output_shape[-1]
        size = min(len_memory, batch_size)

        # Shape of one state without its leading batch axis (works for any state rank).
        state_shape = tuple(self.memory[0][0][0].shape[1:])
        inputs = np.zeros((size,) + state_shape)
        targets = np.zeros((size, num_outputs))
        if size == 0:
            return inputs, targets

        indices = np.random.randint(0, len_memory, size = size)
        batch = [self.memory[idx] for idx in indices]

        # One prediction for all sampled current states instead of one call per sample.
        current_states = np.concatenate([entry[0][0] for entry in batch], axis = 0)
        inputs[:] = current_states
        targets[:] = model.predict(current_states)

        actions = np.array([entry[0][1] for entry in batch], dtype = int)
        rewards = np.array([entry[0][2] for entry in batch], dtype = float)
        non_terminal = np.array([not entry[1] for entry in batch], dtype = bool)

        # Next-state values are only needed (and only computed) for non-terminal transitions.
        max_next_q = np.zeros(size)
        if non_terminal.any():
            next_states = np.concatenate(
                [entry[0][3] for entry, keep in zip(batch, non_terminal) if keep], axis = 0)
            max_next_q[non_terminal] = np.max(model.predict(next_states), axis = 1)

        # Terminal rows keep max_next_q == 0, so their target is just the reward.
        targets[np.arange(size), actions] = rewards + self.discount * max_next_q
        return inputs, targets

In [4]:
# Section where we will train our AI to play Breakout - Trainer

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import cv2

# Defining the parameters (hyperparameters)
memSize = 50000              #The maximum number of transitions kept in the experience replay memory.
batchSize = 10               #How many transitions are drawn from the experience replay memory for each training step.
learningRate = 0.0001        #The learning rate intended for the Adam optimizer in the Brain. NOTE(review): it only takes effect if passed to Brain(...); Brain() alone uses its own default.
gamma = 0.9                  #The discount factor used when the experience replay memory builds the Q-learning targets.
nLastStates = 3              #How many of the most recent frames make up one state. The CNN input is a 3D array of size nRows x nColumns x nLastStates.
epsilon = 1.                 #The initial epsilon, i.e. the probability of taking a random action. It is lowered during training.
epsilonDecayRate = 0.0002    #By how much epsilon is decreased after every single game/epoch.
minEpsilon = 0.05            #The floor for epsilon: it is no longer decreased once it is not above this value.
filepathToSave = 'model2.h5' #Where the model is saved whenever the score record is beaten.

# Creating the Brain and the Experience Replay Memory (the environment `env` comes from an earlier cell).
# The hyperparameters are passed explicitly: a bare Brain() would silently fall back to its own
# defaults (3 input channels, lr = 0.0005) and ignore nLastStates / learningRate defined above.
brain = Brain(iS = (nRows, nColumns, nLastStates), lr = learningRate)
model = brain.model
dqn = Dqn(memSize, gamma)

# A function that will initialize game states
def resetStates():
    """Reset the environment and return the initial (currentState, nextState) pair.

    The first grayscale frame is copied into every one of the nLastStates frame
    slots of a (1, nRows, nColumns, nLastStates) array. The same array object is
    returned twice.
    """
    firstFrame = rgb2gray(env.reset())
    stackedFrames = np.zeros((1, nRows, nColumns, nLastStates))
    # Broadcast the single 2D frame across the last (frame-history) axis.
    stackedFrames[0] = firstFrame[:, :, np.newaxis]
    return stackedFrames, stackedFrames

# A function that downsizes a frame and converts it to grayscale
def rgb2gray(rgb):
    """Downsize an RGB frame to nRows x nColumns and return it as a 2D grayscale array.

    The frame is shrunk first so the convolution layers have less data to process;
    the grayscale value is a weighted sum of the first three channels.
    """
    channelWeights = [0.2989, 0.5870, 0.1140]
    # cv2 takes the target size as (width, height), hence (nColumns, nRows).
    smallFrame = cv2.resize(rgb, dsize=(nColumns, nRows), interpolation=cv2.INTER_CUBIC)
    colourChannels = smallFrame[...,:3]
    return np.dot(colourChannels, channelWeights)

# Starting the main loop
epoch = 0
scores = list() #the final score of every finished game, in playing order
maxScore = 0    #the highest score obtained so far in the training
score = 0       #the score in each game/epoch
lifeLostPenalty = 3                 #subtracted from the reward on the step where a life is lost
nActions = model.output_shape[-1]   #number of actions the network can choose from

# Runs until interrupted manually (e.g. Kernel -> Interrupt).
while True:
    # Resetting the environment and game states
    currentState, nextState = resetStates()
    epoch += 1
    gameOver = False
    previousLifes = 5
    lifes = 5
    score = -80   # NOTE(review): each game starts at -80 while the save threshold maxScore starts at 0 - confirm this offset is intended
    bricks = 84   # 14 bricks in a row and there are 6 rows in total, so a starting amount of 84 bricks (currently unused)

    # Starting the second loop in which we play the game and teach our AI
    while not gameOver:
        # Choosing an action to play: random with probability epsilon, otherwise the one with the highest Q-value
        if np.random.rand() < epsilon:
            action = np.random.randint(0, nActions)
        else:
            qvalues = model.predict(currentState)[0]
            action = np.argmax(qvalues)

        # Updating the environment
        # +1 because the environment has 4 actions and its first one (index 0) is skipped.
        state, reward, done, info = env.step(action + 1)
        lifes = info['ale.lives']
        # Render the game
        env.render()
        # Converting the rgb image state to grayscale
        state = rgb2gray(state)

        # Adding new game frame to the next state and deleting the oldest frame from next state
        state = np.reshape(state, (1, nRows, nColumns, 1))
        nextState = np.append(nextState, state, axis = 3)
        nextState = np.delete(nextState, 0, axis = 3)

        # If a life was lost, penalise the reward itself so the AI actually learns from it
        # (previously only the printed score was reduced).
        if lifes < previousLifes:
            previousLifes = lifes
            reward -= lifeLostPenalty

        # The game ends when the environment says so or no lives are left. This must be
        # known BEFORE storing the transition, otherwise no terminal transition is ever remembered.
        gameOver = done or lifes == 0

        # Remembering the transition and training the AI
        dqn.remember([currentState, action, reward, nextState], gameOver)
        inputs, targets = dqn.get_batch(model, batchSize)
        model.train_on_batch(inputs, targets)

        # Sets the currentState to the nextState
        currentState = nextState

        # Updates the score (the life-loss penalty is already part of the reward)
        score += reward

    scores.append(score)

    # Checking if score record was beaten and if yes then saving the model
    if score > maxScore:
        maxScore = score
        model.save(filepathToSave)

    # Lowering the epsilon, never below minEpsilon
    if epsilon > minEpsilon:
        epsilon = max(minEpsilon, epsilon - epsilonDecayRate)

    # Showing the results each game
    print('Epoch: ' + str(epoch) + ' Score: ' + str(score) + ' Epsilon: {:.5f}'.format(epsilon))














Epoch: 1 Score: -43.51399999999983 Epsilon: 0.99980
Epoch: 2 Score: -38.860000000000035 Epsilon: 0.99960
Epoch: 3 Score: -43.499999999999844 Epsilon: 0.99940
Epoch: 4 Score: -40.37200000000006 Epsilon: 0.99920
Epoch: 5 Score: -43.96499999999979 Epsilon: 0.99900
Epoch: 6 Score: -39.364000000000054 Epsilon: 0.99880
Epoch: 7 Score: -50.15100000000021 Epsilon: 0.99860
Epoch: 8 Score: -49.584000000000245 Epsilon: 0.99840
Epoch: 9 Score: -39.44800000000006 Epsilon: 0.99820
Epoch: 10 Score: -39.61600000000006 Epsilon: 0.99800
Epoch: 11 Score: -54.188000000000216 Epsilon: 0.99780
Epoch: 12 Score: -39.02800000000004 Epsilon: 0.99760
Epoch: 13 Score: -39.112000000000045 Epsilon: 0.99740
Epoch: 14 Score: -39.112000000000045 Epsilon: 0.99720
Epoch: 15 Score: -43.540999999999784 Epsilon: 0.99700
Epoch: 16 Score: -43.87899999999979 Epsilon: 0.99680
Epoch: 17 Score: -39.53200000000006 Epsilon: 0.99660
Epoch: 18 Score: -42.660999999999845 Epsilon: 0.99640
Epoch: 19 Score: -38.272000000000

KeyboardInterrupt: 