# Minecraft Agent - Deep Reinforcement Learning

Work realised in collaboration with:

- [Clément Romac](https://github.com/ClementRomac)
- [Pierre Leroy](https://github.com/PierreLeroyBdx)

Inspired by the work of [Arthur Juliani](https://github.com/awjuliani/DeepRL-Agents) and the [Malmo Project](https://github.com/Microsoft/malmo).
Thanks to [Gym Minecraft](https://github.com/tambetm/gym-minecraft) & [Tensorflow](https://www.tensorflow.org/)

Work realised with the [installation of gym-minecraft on Ubuntu](rgfedrgfdgfdg)

In [None]:
import gym
import gym_minecraft
import numpy as np
import tensorflow as tf
import math
import random
import time
import datetime
import pickle
from enum import Enum
import matplotlib.pyplot as plt
from lxml import etree

## Environment Settings

The frame size is a useful variable: it lets us decide how large the network inputs are going to be.
The values matter a lot: the larger the frame, the longer the training session will take; in return, you give more information to your network.

In [None]:
#Stand-alone TensorFlow graph object; no other visible cell references it
graph1 = tf.Graph()
#Two frame dimensions in pixels: forwarded to env.init as videoResolution and used to size the network inputs
frame_size = [60,60]

In [None]:
#The mission XML was modified: reward of -0.1 for each action, 1000 for a win & -1000 for lava
#Create the gym-minecraft environment for the basic mission
env = gym.make('MinecraftBasic-v0')  
#Start Minecraft, render frames at frame_size and restrict the discrete commands to "move" and "turn"
env.init(start_minecraft = True, videoResolution = [frame_size[0], frame_size[1]], allowDiscreteMovement = ["move", "turn"]) #Movement set reduced for a faster convergence

In [None]:
#Read the reward values (win, out of time, per step) back from the mission XML of the environment.
#Strip the textual prefix and the XML declaration so that lxml can parse the remaining document
mission_spec = str(env.mission_spec).replace('MissionSpec:\n<?xml version="1.0" encoding="UTF-8" standalone="no" ?>\n', '')
tree = etree.fromstring(mission_spec)
for elem in tree.iter():
    #Terminal rewards are carried by the <Reward> elements of the Malmo namespace
    if(elem.tag == '{http://ProjectMalmo.microsoft.com}Reward'):
        #NOTE(review): attributes are read by position (index 0 = label, index 2 = value);
        #this relies on the attribute order of the serialized XML - TODO confirm
        attribs = elem.attrib.values()
        if attribs[0] == 'found_goal':
            win_reward = int(attribs[2])
        if attribs[0] == 'out_of_time':
            out_of_time_reward = int(attribs[2])
    #Reward given for every command sent; also read by position (index 2)
    #NOTE(review): int() only accepts integer strings, so a fractional reward in the XML would raise here
    if(elem.tag == '{http://ProjectMalmo.microsoft.com}RewardForSendingCommand'):
        step_reward = int(elem.attrib.values()[2])

## Neural Net Definition ( Feed Forward )

In [None]:
class FeedForward():
    """Fully connected Q-network working on flattened frames.

    The constructor reads the module-level names num_input, num_classes and
    learningRate, so they have to be defined before an instance is created.
    """
    def __init__(self):
        #Flattened input frames: one row of num_input values per sample
        self.x = tf.placeholder("float", [None, num_input])
        
        #Width of the two hidden layers
        self.n_hidden_1 = 1024
        self.n_hidden_2 = 1024
        
        #Weights and biases are drawn from a normal distribution
        self.weights = {
            'h1': tf.Variable(tf.random_normal([num_input, self.n_hidden_1])),
            'h2': tf.Variable(tf.random_normal([self.n_hidden_1, self.n_hidden_2])),
            'out': tf.Variable(tf.random_normal([self.n_hidden_2, num_classes]))
        }
        self.biases = {
            'b1': tf.Variable(tf.random_normal([self.n_hidden_1])),
            'b2': tf.Variable(tf.random_normal([self.n_hidden_2])),
            'out': tf.Variable(tf.random_normal([num_classes]))
        }
        
        #NOTE(review): no activation function is applied between the layers, so the
        #stack below is a composition of affine maps - confirm this is intended
        self.layer_1 = tf.add(tf.matmul(self.x, self.weights['h1']), self.biases['b1'])
        self.layer_2 = tf.add(tf.matmul(self.layer_1, self.weights['h2']), self.biases['b2'])
        
        self.logits_layer = tf.matmul(self.layer_2, self.weights['out']) + self.biases['out']
        
        #Scale the output to improve training: min-max scaling, with the min and max
        #taken over every element of the logits tensor (no axis is given)
        self.Qout = tf.div(tf.subtract(self.logits_layer, tf.reduce_min(self.logits_layer)), tf.subtract(tf.reduce_max(self.logits_layer), tf.reduce_min(self.logits_layer)))
        
        #Indexes of the actions the network shall take
        self.prediction = tf.argmax(self.Qout, 1)
        
        #Indexes of the actions that were actually played
        self.actions = tf.placeholder(shape = [None], dtype = tf.int32)
        #Multiply our Q values by a one-hot encoding to only keep the chosen ones.
        self.actions_onehot = tf.one_hot(self.actions, num_classes, dtype = tf.float32)
        #So that Q holds the Q values of the chosen actions
        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis = 1)

        #NextQ corresponds to the Q estimated by the Bellman Equation
        self.nextQ = tf.placeholder(shape = [None], dtype = tf.float32)

        #Scale the targets the same way as the output (min-max over the fed values)
        self.nextQ_scaled = tf.div(tf.subtract(self.nextQ, tf.reduce_min(self.nextQ)), tf.subtract(tf.reduce_max(self.nextQ), tf.reduce_min(self.nextQ)))

        #The loss is the mean squared difference between the two Q estimates
        self.loss = tf.reduce_mean(tf.square(self.nextQ_scaled - self.Q))
        
        #Register the important values as summaries
        tf.summary.histogram("nextQ", self.nextQ)
        tf.summary.histogram("Q", self.Q)
        tf.summary.scalar("LOSS_FUNCTION", self.loss)
        #merge_all gathers every summary registered so far in the default graph
        self.merged = tf.summary.merge_all()
        
        self.learningRate = learningRate
        #The Adam optimizer minimizes the loss
        self.trainer = tf.train.AdamOptimizer(learning_rate = self.learningRate)
        self.updateModel = self.trainer.minimize(self.loss)

## Neural Net Definition 2 ( Convolutional )

In [None]:
class Convolutional():
    """Convolutional Q-network: three conv + max-pooling stages followed by dense layers.

    The constructor reads the module-level names num_input, num_classes,
    frame_size and learningRate, so they have to be defined before an
    instance is created.
    """
    def __init__(self):
        #Flattened input frames: one row of num_input values per sample
        self.x = tf.placeholder("float", [None, num_input])

        #Reshape the flattened data into images with 3 channels (RGB)
        self.input_layer = tf.reshape(self.x, [-1, frame_size[1], frame_size[0], 3])

        #Convolutional Layer 1
        self.conv1 = tf.layers.conv2d(
            inputs = self.input_layer,
            filters = 16,
            kernel_size = [6, 6],
            strides=[1, 1],
            padding = "same",
            activation = tf.nn.relu)
        
        #Pooling Layer #1
        self.pool1 = tf.layers.max_pooling2d(inputs = self.conv1, pool_size = [2, 2], strides = 2)

        #Convolutional Layer 2
        self.conv2 = tf.layers.conv2d(
            inputs = self.pool1,
            filters = 32,           
            kernel_size = [4, 4],
            strides = [1, 1],
            padding = "same",
            activation = tf.nn.relu)
        
        #Pooling Layer #2
        self.pool2 = tf.layers.max_pooling2d(inputs = self.conv2, pool_size = [2, 2], strides = 2)

        #Convolutional Layer 3
        self.conv3 = tf.layers.conv2d(
            inputs = self.pool2,
            filters = 64,
            kernel_size = [4, 4],
            strides = [1, 1],
            padding = "same",
            activation = tf.nn.relu)
        
        #Pooling Layer #3
        self.pool3 = tf.layers.max_pooling2d(inputs = self.conv3, pool_size = [2, 2], strides = 2)

        #Flatten the data to pass it through the feed forward part
        #(the flat size is the product of the three non-batch dimensions of pool3)
        self.dims = self.pool3.get_shape().as_list()
        self.final_dimension = self.dims[1] * self.dims[2] * self.dims[3]
        self.conv3_flat = tf.reshape(self.pool3, [-1, self.final_dimension])
        
        #Feed Forward
        self.dense = tf.layers.dense(inputs = self.conv3_flat, units = 512, activation = tf.nn.relu)
        
        self.logits_layer = tf.layers.dense(inputs = self.dense, units = num_classes)
        
        #Scale the output to improve training: subtract the mean and divide by the (max - min) range,
        #all three statistics being taken over every element of the logits tensor
        self.Qout = tf.div(tf.subtract(self.logits_layer, tf.reduce_mean(self.logits_layer)), tf.subtract(tf.reduce_max(self.logits_layer), tf.reduce_min(self.logits_layer)))
        
        #Indexes of the actions the network shall take
        self.prediction = tf.argmax(self.Qout, 1)
        
        #Indexes of the actions that were actually played
        self.actions = tf.placeholder(shape = [None], dtype = tf.int32)
        #Multiply our Q values by a one-hot encoding to only keep the chosen ones.
        self.actions_onehot = tf.one_hot(self.actions, num_classes, dtype = tf.float32)
        #So that Q holds the Q values of the chosen actions
        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis = 1)

        #NextQ corresponds to the Q estimated by the Bellman Equation
        self.nextQ = tf.placeholder(shape = [None], dtype = tf.float32)

        #Scale the targets the same way as the output (mean-centred, divided by the range)
        self.nextQ_scaled = tf.div(tf.subtract(self.nextQ, tf.reduce_mean(self.nextQ)), tf.subtract(tf.reduce_max(self.nextQ), tf.reduce_min(self.nextQ)))
                
        #The loss is the mean squared difference between the two Q estimates
        self.loss = tf.reduce_mean(tf.square(self.nextQ_scaled - self.Q))
        
        #Register the important values as summaries (the histograms are disabled for this network)
        #tf.summary.histogram("nextQ", self.nextQ)
        #tf.summary.histogram("Q", self.Q)
        tf.summary.scalar("LOSS_FUNCTION", self.loss)
        #merge_all gathers every summary registered so far in the default graph
        self.merged = tf.summary.merge_all()
        
        self.learningRate = learningRate
        #The Adam optimizer minimizes the loss
        self.trainer = tf.train.AdamOptimizer(learning_rate = self.learningRate)
        self.updateModel = self.trainer.minimize(self.loss)

## Neural Net Definition 3 (LSTM)

In [None]:
class LSTM():
    """Recurrent Q-network: three strided convolutions, an LSTM and dense layers.

    The constructor reads the module-level names num_input, num_classes,
    num_nodes, frame_size and learningRate, so they have to be defined
    before an instance is created.

    Parameters:
        rnn_cell: recurrent cell handed to tf.nn.dynamic_rnn; the reshape of its
            output assumes the cell emits num_nodes values per time step.
        scope: prefix of the variable scope used for the recurrent layer.
    """
    def __init__(self, rnn_cell, scope):
        #Flattened input frames: one row of num_input values per frame
        self.x = tf.placeholder("float", [None, num_input])
        #Length of the frames' sequence 
        self.train_length = tf.placeholder(dtype=tf.int32)
        #Number of sequences fed at once
        self.batch_size = tf.placeholder(dtype=tf.int32,shape=[])

        #Reshape the flattened data into images with 3 channels (RGB)
        self.input_layer = tf.reshape(self.x, [-1, frame_size[1], frame_size[0], 3])

        #With "valid" padding each convolution outputs floor((size - kernel) / stride) + 1
        #values per spatial dimension.

        #Convolutional Layer 1
        self.conv1 = tf.layers.conv2d(
            inputs = self.input_layer,
            filters = 32,
            kernel_size = [6, 6],
            strides=[2, 2],
            padding = "valid",
            activation = tf.nn.relu)
        #Output size = 18 for 40x40 frames, 28 for 60x60 frames

        #Convolutional Layer 2
        self.conv2 = tf.layers.conv2d(
            inputs = self.conv1,
            filters = 64,           
            kernel_size = [4, 4],
            strides = [2, 2],
            padding = "valid",
            activation = tf.nn.relu)
        #Output size = 8 for 40x40 frames, 13 for 60x60 frames

        #Convolutional Layer 3
        self.conv3 = tf.layers.conv2d(
            inputs = self.conv2,
            filters = num_nodes,#depth of the LSTM
            kernel_size = [8, 8],
            strides = [1, 1],
            padding = "valid",
            activation = tf.nn.relu)
        #Output size = 1 for 40x40 frames, 6 for 60x60 frames

        #Flatten the data to pass it through the recurrent layer
        self.dims = self.conv3.get_shape().as_list()
        self.final_dimension = self.dims[1] * self.dims[2] * self.dims[3]
        self.conv3_flat = tf.reshape(self.conv3, [-1, self.final_dimension])
        #One feature vector per frame. The depth is final_dimension rather than num_nodes:
        #both are equal only when conv3 is 1x1 (40x40 frames); with larger frames a reshape
        #to num_nodes would not match the [batch, time] layout.
        self.rnn_input = tf.reshape(self.conv3_flat, [self.batch_size, self.train_length, self.final_dimension])
        
        #Initialize the LSTM state (zeros unless another state is fed through feed_dict)
        self.lstm_state_in = rnn_cell.zero_state(self.batch_size, tf.float32)
        self.rnn,self.rnn_state = tf.nn.dynamic_rnn(\
                inputs=self.rnn_input, cell=rnn_cell, dtype=tf.float32, initial_state=self.lstm_state_in, scope=scope + "_rnn")
        #Back to one row per frame for the dense layers
        self.rnn = tf.reshape(self.rnn,shape=[-1,num_nodes])
        
        #Feed Forward
        self.dense = tf.layers.dense(inputs = self.rnn, units = 512, activation = tf.nn.relu)
        
        self.logits_layer = tf.layers.dense(inputs = self.dense, units = num_classes)
        
        #Scale the output to improve training: subtract the mean and divide by the (max - min) range,
        #all three statistics being taken over every element of the logits tensor
        self.Qout = tf.div(tf.subtract(self.logits_layer, tf.reduce_mean(self.logits_layer)), tf.subtract(tf.reduce_max(self.logits_layer), tf.reduce_min(self.logits_layer)))
        
        #Indexes of the actions the network shall take
        self.prediction = tf.argmax(self.Qout, 1)
        
        #Indexes of the actions that were actually played
        self.actions = tf.placeholder(shape = [None], dtype = tf.int32)
        #Multiply our Q values by a one-hot encoding to only keep the chosen ones.
        self.actions_onehot = tf.one_hot(self.actions, num_classes, dtype = tf.float32)
        #So that Q holds the Q values of the chosen actions
        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis = 1)

        #NextQ corresponds to the Q estimated by the Bellman Equation
        self.nextQ = tf.placeholder(shape = [None], dtype = tf.float32)

        #Scale the targets the same way as the output (mean-centred, divided by the range)
        self.nextQ_scaled = tf.div(tf.subtract(self.nextQ, tf.reduce_mean(self.nextQ)), tf.subtract(tf.reduce_max(self.nextQ), tf.reduce_min(self.nextQ)))
                
        #The loss is the mean squared difference between the two Q estimates
        self.loss = tf.reduce_mean(tf.square(self.nextQ_scaled - self.Q))
        
        #Register the important values as summaries
        tf.summary.histogram("nextQ", self.nextQ)
        tf.summary.histogram("Q", self.Q)
        tf.summary.scalar("LOSS_FUNCTION", self.loss)
        #merge_all gathers every summary registered so far in the default graph
        self.merged = tf.summary.merge_all()
        
        self.learningRate = learningRate
        #The Adam optimizer minimizes the loss
        self.trainer = tf.train.AdamOptimizer(learning_rate = self.learningRate)
        self.updateModel = self.trainer.minimize(self.loss)

### Choose the Network
Obtained results: 
- Simple Feed Forward : Loss > e¹⁶
- Convolutional : Loss < e³
- LSTM : Loss

In [None]:
class ChooseNetwork(Enum):
    """Available network architectures; the value of each member is the class to instantiate."""
    Feed_Forward = FeedForward
    Convolutional = Convolutional
    LSTM = LSTM

In [None]:
#Architecture used by the training and testing cells (NetType.value is the network class)
NetType = ChooseNetwork.Convolutional

### Some Variables

In [None]:
#Learning rate handed to the Adam optimizer of every network
learningRate = 0.0001
#Size of a flattened frame: both frame dimensions times 3 colour channels
num_input = frame_size[0] * frame_size[1] * 3
#Number of network outputs: one per entry of the first action-name list of the environment
num_classes = len(env.action_names[0])
#Number of filters of the last convolution of the LSTM network; also used as the size of its recurrent state
num_nodes = 64

## Experience Definition 

The experience buffer stores the moves played by the agent so that they can be sampled later for training.

In [None]:
class experience_buffer():
    """Bounded FIFO store of experiences, each experience being a row of 5 values."""

    def __init__(self, buffer_size = 50000):
        self.buffer_size = buffer_size
        self.buffer = []

    def add(self, experience):
        """Append every row of `experience`, dropping the oldest rows once the capacity is reached."""
        overflow = len(self.buffer) + len(experience) - self.buffer_size
        if overflow >= 0:
            #Make room at the front before appending the new rows
            del self.buffer[0:overflow]
        self.buffer.extend(experience)

    def sample(self, size):
        """Return `size` stored rows drawn without replacement, as a [size, 5] array."""
        drawn = random.sample(self.buffer, size)
        return np.array(drawn).reshape([size, 5])
    
    def get(self):
        """Return the whole buffer as a [len(buffer), 5] array."""
        row_count = len(self.buffer)
        return np.array(self.buffer).reshape([row_count, 5])

In [None]:
class recurrent_experience_buffer():
    """Bounded FIFO store whose entries are appended one at a time.

    The training loop uses it in two ways: as a per-episode buffer, where every
    entry is a single step, and as the global buffer, where every entry is a
    whole episode (a sequence of steps of 5 values each).
    """
    def __init__(self, buffer_size = 1000):
        self.buffer = []
        self.buffer_size = buffer_size
    
    def add(self,experience):
        """Append `experience` as one entry, dropping the oldest entry once the capacity is reached."""
        if len(self.buffer) + 1 >= self.buffer_size:
            self.buffer[0:(1+len(self.buffer))-self.buffer_size] = []
        self.buffer.append(experience)
            
    def sample(self,batch_size,trace_length):
        """Return `batch_size` traces of `trace_length` consecutive steps as a [batch_size*trace_length, 5] array.

        Only the episodes holding at least `trace_length` steps are eligible.
        random.sample raises ValueError when fewer than `batch_size` episodes are eligible.
        """
        tmp_buffer = [episode for episode in self.buffer if len(episode)+1>trace_length]
        sampled_episodes = random.sample(tmp_buffer, batch_size)
        sampledTraces = []
        for episode in sampled_episodes:
            #Random start such that the trace fits inside the episode
            point = np.random.randint(0,len(episode)+1-trace_length)
            sampledTraces.append(episode[point:point+trace_length])
        sampledTraces = np.array(sampledTraces)
        return np.reshape(sampledTraces,[batch_size*trace_length,5])
    
    def get(self):
        """Return the stored entries as a [len(buffer), 5] array (each entry must hold 5 values)."""
        #The length has to come from self.buffer: a bare `buffer` is not defined here
        return np.reshape(np.array(self.buffer), [len(self.buffer), 5])

### Reshape all the data (for the feed forward)

In [None]:
def processState(states):
    """Flatten a frame into a one-dimensional array of num_input values."""
    flat_shape = (num_input,)
    return np.reshape(states, flat_shape)

### Update target Network

In [None]:
def updateTargetGraph(tfVars,tau):
    """Build the operations that move the target variables towards the main ones.

    The first half of `tfVars` is treated as the source variables and the second
    half as the matching target variables. Each returned operation assigns
    tau * source + (1 - tau) * target to a target variable.
    """
    half = len(tfVars) // 2
    return [
        target.assign((source.value() * tau) + ((1 - tau) * target.value()))
        for source, target in zip(tfVars[:half], tfVars[half:])
    ]

def updateTarget(op_holder,sess):
    """Run every operation of `op_holder` in order, one sess.run call per operation."""
    for assign_op in op_holder:
        sess.run(assign_op)

## Train Settings

In [None]:
batch_size =  32 #How many experiences to use for each training step.
trace_length = 2 #How long each experience trace will be when training
if NetType == ChooseNetwork.LSTM:
    myBuffer = recurrent_experience_buffer()
    #Number of traces per training step, so that a step still covers the same number of experiences.
    #Integer division: the training loop uses batch_size as an array dimension and as a range() bound.
    batch_size = batch_size // trace_length
else:
    myBuffer = experience_buffer()
    
update_freq = 4 #How often to perform a training step.
num_episodes = 1250000 #How many episodes of game environment to train network with
num_steps = 100
total_steps = 0
rList = [] #Total reward gained in each episode
jList = [] #Number of moves played in each episode
j_by_loss = [] #Number of moves of each episode that ended with the death of the agent
j_by_win = [] #Number of moves of each episode that ended with a win of the agent
j_by_nothing = [] #Number of moves of each episode that ended because the limit was reached
y = .95 #Discount factor on the target Q-values

## Exploration Settings

An exploration phase made of random actions runs before any exploitation during training, which gives us an initial set of experiences to work with.

The share of exploration then decreases as training progresses, so that exploitation gradually becomes the dominant behaviour (ε-greedy).

In [None]:
pre_train_episodes = 1250#5000 #How many episodes of random actions before training begins.
startE = 1 #Starting chance of random action
endE = 0.1 #Final chance of random action
annealing_episodes = 25000#50000. #How many episodes of training to reduce startE to endE.
e = startE #Current chance of random action
stepDrop = (startE - endE) / annealing_episodes #Amount subtracted from e after each training episode
#Sizes of the win / out-of-time / loss lists at the last report, used to count the results of the last window
nb_win = 0
nb_nothing = 0
nb_loss = 0
tau = 0.001 #Rate at which the target network is moved towards the main network
load_model = False #Whether to restore a checkpoint before training

### Creating an explicit and unique title for Tensorboard

In [None]:
#Current timestamp with the decimal point removed; its last 5 characters make the run name unique
date = "".join(str(time.time()).split("."))
#One labelled fragment per hyper-parameter
bs = "BatchSize:{}".format(batch_size)
strlr = "lr:{}".format(learningRate)
rand_step = "RandStep:{}".format(pre_train_episodes)
nb_to_reduce_e = "ReducE:{}".format(annealing_episodes)
#Directory receiving the TensorBoard summaries of this run
write_path = "_".join(["./train/" + bs, strlr, rand_step, nb_to_reduce_e, date[-5:]])

## Training

In [None]:
#Training loop: plays num_episodes episodes, stores every step in the experience buffer and,
#once the pre-training episodes are over, trains the main network every update_freq steps.
tf.reset_default_graph()
with tf.Session() as sess:
    
    #Use a Double Network
    #Using a double network makes the agent more robust
    #Deep Reinforcement Learning with Double Q-learning : Hado van Hasselt and Arthur Guez and David Silver
    if NetType == ChooseNetwork.LSTM:
        cell = tf.contrib.rnn.BasicLSTMCell(num_units = num_nodes, state_is_tuple = True)
        cellT = tf.contrib.rnn.BasicLSTMCell(num_units = num_nodes, state_is_tuple = True)
        
        #Use a Double Network more robust
        mainQN = NetType.value(cell, 'main')
        targetQN = NetType.value(cellT, 'target')
    else:
        mainQN = NetType.value()
        targetQN = NetType.value()
    
    #The main network is built first, so its variables form the first half of this list
    trainables = tf.trainable_variables()

    targetOps = updateTargetGraph(trainables,tau)
    
    #Save the network
    saver = tf.train.Saver()
    path_to_save = "./saves/" + str(datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) + "/"
    
    init = tf.global_variables_initializer()
    sess.run(init)
    
    writer = tf.summary.FileWriter(write_path)
    
    if load_model == True:
        #NOTE(review): path_to_save is a freshly timestamped directory, so a checkpoint is
        #unlikely to be found there - point it to an existing save before enabling load_model
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(path_to_save)
        saver.restore(sess,ckpt.model_checkpoint_path)
    
    for i in range(num_episodes):
        if NetType == ChooseNetwork.LSTM:
            episodeBuffer = recurrent_experience_buffer()
            lstm_state = (np.zeros([1, num_nodes]), np.zeros([1, num_nodes])) #Reset the recurrent layer's hidden state
        else:
            episodeBuffer = experience_buffer()
            
        s = env.reset()
        #env.render(mode="human")
        s = processState(s)
        rAll = 0
        d = False
        j = 0
        episode_frames = []
        episode_qvalues = []
        
        if i > pre_train_episodes:
            #Reducing portion of exploration
            if e > endE:
                e -= stepDrop
        
        ZPos = []
        XPos = []
        Yaw = []
        moves = []
        
        while not d:     
            j += 1
            #Make full exploration before the number of pre-train episodes and play with an e chance of random action during the training (e-greedy)
            if(np.random.rand(1) < e or i < pre_train_episodes):
                if NetType == ChooseNetwork.LSTM:
                    #The recurrent state is still advanced with the current frame when the action is random
                    lstm_state1 = sess.run(mainQN.rnn_state,\
                        feed_dict = {mainQN.x:[s/255.0], mainQN.train_length:1, \
                                     mainQN.lstm_state_in:lstm_state, mainQN.batch_size:1})
                index_action_predicted = env.action_space.sample()
                #One-hot stand-in for the Q values of a random move.
                #The loop variable must not be called i: on Python 2 it would overwrite the episode index.
                episode_qvalues.append([1 if action_idx == index_action_predicted else 0 for action_idx in range(len(env.action_names[0]))])
            else:
                if NetType == ChooseNetwork.LSTM:
                    prediction, qvalues, lstm_state1 = sess.run([mainQN.prediction, mainQN.Qout, mainQN.rnn_state],\
                        feed_dict={mainQN.x:[s/255.0], mainQN.train_length:1, \
                                   mainQN.lstm_state_in:lstm_state, mainQN.batch_size:1})
                else:      
                    prediction, qvalues = sess.run([mainQN.prediction, mainQN.Qout], \
                                                      feed_dict = {mainQN.x:[s/255.0]})
                index_action_predicted = prediction[0]
                episode_qvalues.append(qvalues[0])
            
            #Get new state and reward from environment
            s1_raw, r, d, info = env.step(index_action_predicted)
            if info["observation"]:
                ZPos.append(info['observation']['ZPos'])
                XPos.append(info['observation']['XPos'])
                Yaw.append(info['observation']['Yaw'])
            s1 = processState(s1_raw)
            moves.append(index_action_predicted)
            total_steps += 1
            #Stored experience: [state, action, reward, next state, done]
            episodeBuffer.add(np.reshape(np.array([s, index_action_predicted, r, s1, d]), [1, 5]))
            episode_frames.append(s1_raw)
            
            if i > pre_train_episodes:
                if total_steps % (update_freq) == 0:
                    
                    #Move the target network towards the main network
                    updateTarget(targetOps,sess)
                    
                    if NetType == ChooseNetwork.LSTM:
                        lstm_state_train = (np.zeros([batch_size, num_nodes]), \
                                            np.zeros([batch_size, num_nodes]))
                        
                        trainBatch = myBuffer.sample(batch_size,trace_length)
                        
                        #Estimate the action to choose with our first network
                        actionChosen = sess.run(mainQN.prediction, \
                                                feed_dict = {mainQN.x:np.vstack(trainBatch[:, 3]/255.0), \
                                                             mainQN.train_length:trace_length, \
                                                             mainQN.lstm_state_in:lstm_state_train, \
                                                             mainQN.batch_size:batch_size})
                        #Estimate all the Q values with our second network --> Double
                        allQValues = sess.run(targetQN.Qout, \
                                              feed_dict = {targetQN.x:np.vstack(trainBatch[:, 3]/255.0), \
                                                           targetQN.train_length:trace_length, \
                                                           targetQN.lstm_state_in:lstm_state_train, \
                                                           targetQN.batch_size:batch_size})

                        #Train our network using target and predicted Q values
                        #end_multiplier is 0 for terminal steps and 1 otherwise
                        end_multiplier = -(trainBatch[:, 4] -1)
                        maxQ = allQValues[range(batch_size*trace_length), actionChosen]
                        #Bellman Equation
                        targetQ = trainBatch[:, 2] + (y * maxQ * end_multiplier)

                        _, summaryPlot = sess.run([mainQN.updateModel, mainQN.merged], \
                                                  feed_dict = {mainQN.x:np.vstack(trainBatch[:, 0]/255.0), \
                                                               mainQN.nextQ:targetQ, \
                                                               mainQN.actions:trainBatch[:, 1], \
                                                               mainQN.train_length:trace_length, \
                                                               mainQN.lstm_state_in:lstm_state_train, \
                                                               mainQN.batch_size:batch_size})
                        
                        writer.add_summary(summaryPlot, total_steps)  
                    else:
                        trainBatch = myBuffer.sample(batch_size) #Get a random batch of experiences.
                    
                        #Estimate the action to choose with our first network
                        actionChosen = sess.run(mainQN.prediction, \
                                                feed_dict = {mainQN.x:np.vstack(trainBatch[:, 3]/255.0)})
                        #Estimate all the Q values with our second network --> Double
                        allQValues = sess.run(targetQN.Qout, \
                                              feed_dict = {targetQN.x:np.vstack(trainBatch[:, 3]/255.0)})

                        #Train our network using target and predicted Q values
                        #end_multiplier is 0 for terminal steps and 1 otherwise
                        end_multiplier = -(trainBatch[:, 4] -1)
                        maxQ = allQValues[range(batch_size), actionChosen]
                        #Bellman Equation
                        targetQ = trainBatch[:, 2] + (y * maxQ * end_multiplier)

                        #The states are divided by 255 like every other frame fed to the networks
                        _, summaryPlot = sess.run([mainQN.updateModel, mainQN.merged], \
                                                  feed_dict = {mainQN.x:np.vstack(trainBatch[:, 0]/255.0), \
                                                               mainQN.nextQ:targetQ, \
                                                               mainQN.actions:trainBatch[:, 1]})
                        
                        writer.add_summary(summaryPlot, total_steps)  
                    
            rAll += r
            s = s1
            if NetType == ChooseNetwork.LSTM:
                lstm_state = lstm_state1
                
            if d == True:
                #Classify the end of the episode from the last reward received
                if r == (win_reward+step_reward):
                    j_by_win.append(j)
                else:
                    if r == (out_of_time_reward+step_reward):
                        j_by_nothing.append(j)
                    else:
                        j_by_loss.append(j)
                break
                  
        myBuffer.add(episodeBuffer.buffer)
        jList.append(j)
        rList.append(rAll)
        rewards = np.array(rList)
        if i % (500) == 0:
            #Report over the episodes played since the previous report (500 episodes, hence the / 5)
            print("#######################################")
            print("% Win : " + str((len(j_by_win) - nb_win)/5) + "%")
            print("% Nothing : " + str((len(j_by_nothing) - nb_nothing)/5) + "%")
            print("% Loss : " + str((len(j_by_loss) - nb_loss)/5) + "%")
            
            #Only the entries added since the previous report; a negative slice would
            #return the whole list when nothing was added
            recent_wins = j_by_win[nb_win:]
            recent_losses = j_by_loss[nb_loss:]
            print("Nb J before win: " + str(np.mean(recent_wins) if recent_wins else float('nan')))
            print("Nb J before die: " + str(np.mean(recent_losses) if recent_losses else float('nan')))
                  
            print("Total Steps: " + str(total_steps))
            print("I: " + str(i))
            print("Epsilon: ", str(e))
                  
            nb_win = len(j_by_win)
            nb_nothing = len(j_by_nothing)
            nb_loss = len(j_by_loss)
            
            print("#### LAST EPISODE MOVES ####")
            last_episode_moves = episodeBuffer.get()
            for z in range(j):
                #Only the first five moves of the episode are displayed
                if z < 5:    
                    print("-----------------------")
                    plt.imshow(episode_frames[z])
                    plt.show()

                    print("- Buffer Move " + str(z) + " : " + env.action_names[0][last_episode_moves[z, 1]])
                    print("- Move Array " + str(z) + " : " + env.action_names[0][moves[z]])
                    #Positions are only recorded for the steps that returned an observation
                    if z != j-1 and z < len(ZPos):
                        print("ZPos : "+ str(ZPos[z]))
                        print("XPos : "+ str(XPos[z]))
                        print("Yaw : "+ str(Yaw[z]))
                    figure = plt.figure()
                    axes = figure.add_subplot(2, 1, 1)
                    axes.matshow([episode_qvalues[z]])
                    axes.set_xticks(range(len(env.action_names[0])))
                    actions_names = ["Straight", "Back", "Right", "Left"]
                    axes.set_xticklabels(actions_names)
                    plt.show()

                    print("         " + "          ".join(str(qval) for qval in episode_qvalues[z]))
                
        if i % (5000)== 0 and i != 0:
            #Save the network and all the other important values
            saver.save(sess, path_to_save + str(i) + '.ckpt')
            with open(path_to_save + str(i) + ".pickle", 'wb') as file:
                dictionnary = {
                    "epsilon": e,
                    "Total_steps": total_steps,
                    "Buffer": myBuffer,
                    "rAll": rAll,
                    "rList": rList,
                    "Num Episodes": i,
                    "jList": jList
                }
                
                pickle.dump(dictionnary, file, protocol = pickle.HIGHEST_PROTOCOL)
                
    saver.save(sess, path_to_save + str(i) + '.ckpt')
    #Flush the pending summaries and release the event file
    writer.close()

## Testing

In [None]:
#Testing loop: restores the main network from a checkpoint and plays one rendered episode greedily.
tf.reset_default_graph()
with tf.Session() as sess:
    
    if NetType == ChooseNetwork.LSTM:
        cell = tf.contrib.rnn.BasicLSTMCell(num_units = num_nodes, state_is_tuple = True)
        mainQN = NetType.value(cell, 'main')
        #Start the episode from a zeroed recurrent state instead of relying on a value
        #left behind by another cell
        lstm_state = (np.zeros([1, num_nodes]), np.zeros([1, num_nodes]))
    else:
        mainQN = NetType.value()
    
    saver = tf.train.Saver()
    #Placeholder: replace with the directory of the save to load
    path_to_save = "./saves/_save_date/"
    
    init = tf.global_variables_initializer()
    sess.run(init)
    
    print('Loading Model...')
    ckpt = tf.train.get_checkpoint_state(path_to_save)
    if ckpt is None:
        #get_checkpoint_state returns None when the directory holds no checkpoint
        raise IOError("No checkpoint found in " + path_to_save)
    saver.restore(sess,ckpt.model_checkpoint_path)
            
    s = env.reset()
    s = processState(s)
    d = False
    j = 0

    while not d:  
        env.render(mode="human")
        j += 1
        #Always play the action predicted by the network (no random exploration)
        if NetType == ChooseNetwork.LSTM:
            prediction, qvalues, lstm_state1 = sess.run([mainQN.prediction, mainQN.Qout, mainQN.rnn_state],\
                feed_dict={mainQN.x:[s/255.0], mainQN.train_length:1, \
                           mainQN.lstm_state_in:lstm_state, mainQN.batch_size:1})
        else:      
            prediction, qvalues = sess.run([mainQN.prediction, mainQN.Qout], \
                                              feed_dict = {mainQN.x:[s/255.0]})
        index_action_predicted = prediction[0]

        #Get new state and reward from environment
        s1_raw, r, d, info = env.step(index_action_predicted)

        s1 = processState(s1_raw)
        s = s1
        if NetType == ChooseNetwork.LSTM:
            lstm_state = lstm_state1

### Resources

- [Human-level control through deep reinforcement learning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf)
- [Deep Recurrent Q-Learning for Partially Observable MDPs - Matthew Hausknecht and Peter Stone](https://arxiv.org/pdf/1507.06527.pdf)
- [Deep Reinforcement Learning with Double Q-learning - Hado van Hasselt and Arthur Guez and David Silver](https://arxiv.org/pdf/1509.06461.pdf)
- [Teacher-Student Curriculum Learning - Tambet Matiisen and Avital Oliver and Taco Cohen and John Schulman](https://arxiv.org/pdf/1707.00183.pdf)
- [Deep Learning for Video Game Playing - Niels Justesen and Philip Bontrager and Julian Togelius and Sebastian Risi](https://arxiv.org/pdf/1708.07902.pdf)

### Thanks for reading ;)