# Breakout Deep Q-Learning 

* [1](#step1): Making sure the environment is working fine
* [2](#step2): Creating the Agent
* [3](#step3): Training the Agent
* [4](#step4): Testing the Agent

<a id='step1'></a>
## 1. Making sure that the environment is working 

In [1]:
import gym
import time
import numpy as np
import matplotlib.pyplot as plt

# Smoke test: play one episode with uniformly random actions and render it,
# to check that the Breakout environment can be created, stepped and displayed.
env = gym.make('BreakoutDeterministic-v4')

try:
    state = env.reset()
    env.render()
    is_done = False

    while not is_done:
        action = env.action_space.sample()
        state, reward, is_done, _ = env.step(action)

        env.render()
        time.sleep(0.02)  # slow the playback down so it can be watched
finally:
    # Close the viewer even if the episode is interrupted or a step raises.
    env.close()

<a id='step2'></a>
## 2. Creating the Agent 

##### 2.1 Creating the agent and initializing its replay memory

In [2]:
from agents.dqn_agent import Agent
from agents import utils
import pickle
import random

Using TensorFlow backend.


In [3]:
# Hyperparameters handed to the Agent constructor.

# Network input / output dimensions.
# NOTE(review): (82, 72, 4) is presumably (height, width, stacked frames) of the
# arrays built by utils.generate_input — confirm against agents/utils.py.
INPUT_SHAPE = (82, 72, 4)
n_actions = env.action_space.n  # size of the environment's discrete action space

# Replay memory sizing.
memory_size = 100000
memory_initial_frames = 50000

# Optimisation settings.
batch_size = 32
gamma = 0.99  # presumably the reward discount factor — confirm in agents/dqn_agent.py

In [5]:
# Build the DQN agent. The number of actions comes from the environment
# (n_actions = env.action_space.n) instead of a hard-coded literal, so the
# network output always matches the action space that actions are sampled from.
agent = Agent(
    input_shape=INPUT_SHAPE,
    n_actions=n_actions,
    batch_size=batch_size,
    memory_size=memory_size,
    memory_initial_frames=memory_initial_frames,
    gamma=gamma,
    env=env,
)

In [None]:
# Fill the agent's replay memory before any training step is run.
# NOTE(review): presumably collects `memory_initial_frames` transitions from `env`;
# the implementation lives in agents/dqn_agent.py — confirm there.
agent.initialize_replay_memory()

<a id='step3'></a>
## 3. Training the Agent 

In [7]:
# Training schedule constants read by the training cell.

# Stopping criterion: training ends once the step counter exceeds this value.
max_frames = 1000000

# Episode count used by the training cell when writing history checkpoints.
episodes = 10000

# The target network is refreshed every `update_target_frequency` training steps.
update_target_frequency = 5000

# Epsilon-greedy exploration: start fully random and subtract `pas`
# ("pas" = step) from epsilon after every training step.
epsilon = 1.0
pas = 0.9 / 600000

In [None]:
# Train the agent with an epsilon-greedy policy.
#
# Every agent step repeats the chosen action for 4 emulator frames; the 4 frames
# are merged into one network input by utils.generate_input and the 4 rewards
# are summed. Training stops after the first episode that ends with
# current_frame > max_frames.
current_frame = 0         # agent steps taken so far (each one spans 4 emulator frames)
total_reward_episode = 0  # raw (untransformed) score of the episode being played
history = []              # raw per-episode scores since the last checkpoint
current_episode = 0
epsilon_min = 0.1         # exploration floor: epsilon is never annealed below this

while True:
    # First state of the episode: the reset frame plus three frames of action 0.
    state = env.reset()
    state1, r1, is_done1, _ = env.step(0)
    state2, r2, is_done2, _ = env.step(0)
    state3, r3, is_done3, _ = env.step(0)
    total_reward_episode = r1 + r2 + r3

    initial_state = utils.generate_input(state, state1, state2, state3)
    is_terminal_state = False

    start_frame = current_frame
    while not is_terminal_state:
        # Epsilon-greedy policy for choosing the action.
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            # Add a leading batch axis of size 1 before querying the network.
            qs = agent.predict(initial_state.reshape(-1, *initial_state.shape))
            action = np.argmax(qs)

        # Repeat the action for 4 frames and keep every frame.
        # NOTE(review): env.step keeps being called even when an earlier frame of
        # this group already reported done — confirm the environment tolerates it.
        state, reward, is_done, _ = env.step(action)
        state1, reward1, is_done1, _ = env.step(action)
        state2, reward2, is_done2, _ = env.step(action)
        state3, reward3, is_done3, _ = env.step(action)
        step_reward = reward + reward1 + reward2 + reward3
        total_reward_episode += step_reward

        next_state = utils.generate_input(state, state1, state2, state3)
        total_reward = utils.transform_reward(step_reward)
        is_terminal_state = is_done or is_done1 or is_done2 or is_done3

        # Store the transition, then run one training step.
        agent.update_replay_memory([initial_state, action, total_reward, next_state, is_terminal_state])

        agent.train()
        initial_state = next_state
        current_frame += 1

        # Update the target network according to the update frequency.
        if current_frame % update_target_frequency == 0:
            agent.update_target()

        # Linear epsilon decay, clamped so it cannot undershoot the floor.
        if epsilon > epsilon_min:
            epsilon = max(epsilon_min, epsilon - pas)

    history.append(total_reward_episode)
    current_episode += 1

    # Every `episodes` episodes, dump the score history to its own file (then
    # clear it to free memory) and save the model. The same constant drives both
    # the interval and the file index so the two cannot drift apart.
    # NOTE(review): scores collected after the last checkpoint are not written
    # anywhere when the loop exits — persist `history` afterwards if needed.
    if current_episode % episodes == 0:
        index = current_episode // episodes
        with open("history" + str(index) + ".pk", "wb") as history_file:
            pickle.dump(history, history_file)
        history.clear()
        agent.save()

    # Single-line progress display: return to the start of the line and overwrite it.
    print('\r', end='')
    print(current_frame, ' of ', max_frames, "n_frames:", (current_frame - start_frame), " reward: ", total_reward_episode, " epsilon: ", epsilon, "Episode: ", current_episode, end='')

    if current_frame > max_frames:
        break

<a id='step4'></a>
## 4. Testing the Agent 

In [11]:
# Load previously saved weights from disk into the agent's model before the
# evaluation run below.
# NOTE(review): the path is hard-coded and relative to the notebook's working
# directory; the file must already exist there.
agent.model.load_weights('saved_models/model_1_000_000.h5')

In [15]:
# Play one rendered episode, always taking the action with the highest
# predicted Q-value (no exploration), and report the raw score.
try:
    state = env.reset()
    env.render()
    time.sleep(6)  # hold on the first frame before the agent starts playing

    # First state: the reset frame plus three frames of action 0.
    state1, r1, is_done1, _ = env.step(0)
    state2, r2, is_done2, _ = env.step(0)
    state3, r3, is_done3, _ = env.step(0)
    total_r = r1 + r2 + r3

    initial_state = utils.generate_input(state, state1, state2, state3)
    is_terminal_state = False

    while not is_terminal_state:
        # Add a leading batch axis of size 1 before querying the network.
        qs = agent.predict(initial_state.reshape(-1, *initial_state.shape))
        action = np.argmax(qs)

        # Repeat the chosen action for 4 frames, as in training.
        state, reward, is_done, _ = env.step(action)
        state1, reward1, is_done1, _ = env.step(action)
        state2, reward2, is_done2, _ = env.step(action)
        state3, reward3, is_done3, _ = env.step(action)
        total_r += reward + reward1 + reward2 + reward3

        # No transition is stored during evaluation, so only the next network
        # input and the terminal flag are needed here.
        is_terminal_state = is_done or is_done1 or is_done2 or is_done3
        env.render()
        initial_state = utils.generate_input(state, state1, state2, state3)
        time.sleep(0.15)  # slow the playback down so it can be watched
finally:
    # Close the viewer even if the episode is interrupted or a step raises.
    env.close()
print("Reward: ", total_r)

Reward:  20.0
