# 0. Install Dependencies

In [1]:
!pip install tensorflow==2.3.1 gym keras-rl2 gym[atari]



# 1. Test Random Environment with OpenAI Gym

In [23]:
import gym
import random

In [24]:
# e.g., if you imported the supported version of Freeway
#from ale_py.roms import Freeway

# Print all registered ROMs
import ale_py.roms as roms
print(roms.__all__)

['Tetris', 'Adventure', 'AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis', 'Atlantis2', 'Backgammon', 'BankHeist', 'BasicMath', 'BattleZone', 'BeamRider', 'Berzerk', 'Blackjack', 'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Casino', 'Centipede', 'ChopperCommand', 'CrazyClimber', 'Crossbow', 'Darkchambers', 'Defender', 'DemonAttack', 'DonkeyKong', 'DoubleDunk', 'Earthworld', 'ElevatorAction', 'Enduro', 'Entombed', 'Et', 'FishingDerby', 'FlagCapture', 'Freeway', 'Frogger', 'Frostbite', 'Galaxian', 'Gopher', 'Gravitar', 'Hangman', 'HauntedHouse', 'Hero', 'HumanCannonball', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kaboom', 'Kangaroo', 'KeystoneKapers', 'KingKong', 'Klax', 'Koolaid', 'Krull', 'KungFuMaster', 'LaserGates', 'LostLuggage', 'MarioBros', 'MiniatureGolf', 'MontezumaRevenge', 'MrDo', 'MsPacman', 'NameThisGame', 'Othello', 'Pacman', 'Phoenix', 'Pitfall', 'Pitfall2', 'Pong', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaque

In [25]:
# Create the SpaceInvaders-v0 environment and read the sizes the model needs:
# the observation shape unpacks into (height, width, channels) and
# `actions` is the number of actions reported by the action space.
env = gym.make('SpaceInvaders-v0')
height, width, channels = env.observation_space.shape
actions = env.action_space.n

In [26]:
# Show the human-readable name of each of the 6 available actions
env.unwrapped.get_action_meanings()

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

In [27]:
# Play five episodes with uniformly random actions to get a baseline score

episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        # Draw from the environment's own action count rather than a hard-coded
        # [0, 1, 2, 3, 4, 5] list, so the loop stays correct for any environment.
        action = random.randrange(env.action_space.n)
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()



Episode:1 Score:355.0
Episode:2 Score:35.0
Episode:3 Score:105.0
Episode:4 Score:105.0
Episode:5 Score:515.0


# 2. Create a Deep Learning Model with Keras.

In [28]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Convolution2D
from tensorflow.keras.optimizers import Adam

from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

In [29]:
# Define the convolutional Q-network
def build_model(height, width, channels, actions, window_length=3):
    """Build the CNN that outputs one linear value per action.

    Args:
        height, width, channels: dimensions of a single observation frame.
        actions: number of units in the final linear layer.
        window_length: size of the leading input axis (frames stacked per
            sample). Defaults to 3, the value that was previously hard-coded;
            it should match the `window_length` given to the agent's memory.

    Returns:
        An uncompiled `Sequential` model whose input shape is
        (window_length, height, width, channels).
    """
    model = Sequential()
    model.add(Convolution2D(32, (8,8), strides=(4,4), activation='relu',
                            input_shape=(window_length, height, width, channels)))
    model.add(Convolution2D(64, (4,4), strides=(2,2), activation='relu'))
    model.add(Convolution2D(64, (3,3), activation='relu'))
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dense(256, activation='relu'))
    # Linear output: one unbounded value per action
    model.add(Dense(actions, activation='linear'))
    return model

In [30]:
# Delete a previously built model, if there is one.
# Guarded so the cell also runs on a fresh kernel where `model` is not defined yet.
if 'model' in globals():
    del model

In [31]:
# Build the model
model = build_model(height, width, channels, actions)

In [32]:
# Show the summary of model
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 3, 51, 39, 32)     6176      
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 3, 24, 18, 64)     32832     
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 3, 22, 16, 64)     36928     
_________________________________________________________________
flatten_1 (Flatten)          (None, 67584)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 512)               34603520  
_________________________________________________________________
dense_5 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_6 (Dense)              (None, 6)                

# 3. Build Agent with Keras-RL

In [33]:
from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy

# All agents below are based on DQN
# An implementation of the DQN agent as described in Mnih (2013) and Mnih (2015).
# http://arxiv.org/pdf/1312.5602.pdf
# http://arxiv.org/abs/1509.06461

"""Implement the linear annealing policy

    Linear Annealing Policy computes a current threshold value and
    transfers it to an inner policy which chooses the action. The threshold
    value is following a linear function decreasing over time.
    """

"""Implement the epsilon greedy policy

    Eps Greedy policy either:

    - takes a random action with probability epsilon
    - takes current best action with prob (1 - epsilon)
    """

'Implement the epsilon greedy policy\n\n    Eps Greedy policy either:\n\n    - takes a random action with probability epsilon\n    - takes current best action with prob (1 - epsilon)\n    '

Agent 1 EpsGreedyQPolicy eps = 0.05 nb_steps = 10000  

In [None]:
# Build the agent 1

def build_agent(model, actions, value_test=.05, nb_steps=10000):
    """Build a dueling DQN agent with a linearly annealed epsilon-greedy policy.

    Args:
        model: Keras model used as the Q-network.
        actions: number of actions the agent can choose from.
        value_test: epsilon passed to the policy as `value_test` (default .05).
        nb_steps: number of steps over which the policy anneals epsilon from
            `value_max` (1.0) to `value_min` (0.1) (default 10000).

    Returns:
        An uncompiled DQNAgent.
    """
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=value_test, nb_steps=nb_steps)
    # window_length=3 matches the leading 3 of the model's input shape
    memory = SequentialMemory(limit=1000, window_length=3)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   enable_dueling_network=True, dueling_type='avg',
                   nb_actions=actions, nb_steps_warmup=1000)
    return dqn


dqn = build_agent(model, actions)
# `learning_rate` is the documented argument name; `lr` is only a backward-compatible alias
dqn.compile(Adam(learning_rate=1e-4))



In [14]:
# Expose only GPU 0 to CUDA to speed up training.
# NOTE(review): this is set after TensorFlow was imported and the model was built,
# so it may have no effect here — confirm, or move it to the top of the notebook.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Train the DQN agent on the game environment
dqn.fit(env, nb_steps=10000, visualize=False, verbose=2)



Training for 10000 steps ...




  490/10000: episode: 1, duration: 8.483s, episode steps: 490, steps per second:  58, episode reward: 80.000, mean reward:  0.163 [ 0.000, 25.000], mean action: 2.359 [0.000, 5.000],  loss: --, mean_q: --, mean_eps: --




 1462/10000: episode: 2, duration: 178.956s, episode steps: 972, steps per second:   5, episode reward: 240.000, mean reward:  0.247 [ 0.000, 30.000], mean action: 2.442 [0.000, 5.000],  loss: 7.163973, mean_q: 10.394617, mean_eps: 0.889210
 2215/10000: episode: 3, duration: 286.084s, episode steps: 753, steps per second:   3, episode reward: 155.000, mean reward:  0.206 [ 0.000, 30.000], mean action: 2.471 [0.000, 5.000],  loss: 0.916519, mean_q: 9.545321, mean_eps: 0.834580
 2994/10000: episode: 4, duration: 292.333s, episode steps: 779, steps per second:   3, episode reward: 240.000, mean reward:  0.308 [ 0.000, 30.000], mean action: 2.511 [0.000, 5.000],  loss: 1.143257, mean_q: 9.195589, mean_eps: 0.765640
 3934/10000: episode: 5, duration: 352.635s, episode steps: 940, steps per second:   3, episode reward: 215.000, mean reward:  0.229 [ 0.000, 30.000], mean action: 2.662 [0.000, 5.000],  loss: 0.648432, mean_q: 9.225279, mean_eps: 0.688285
 4730/10000: episode: 6, duration: 299.

<keras.callbacks.History at 0x215198d7580>

In [17]:
# Evaluate the trained agent over 10 episodes and print the mean episode reward
scores = dqn.test(env, nb_episodes=10, visualize=False)
print(np.mean(scores.history['episode_reward']))
env.close()

Testing for 10 episodes ...


  logger.warn(


Episode 1: reward: 190.000, steps: 887
Episode 2: reward: 210.000, steps: 1411
Episode 3: reward: 180.000, steps: 842
Episode 4: reward: 75.000, steps: 587
Episode 5: reward: 115.000, steps: 816
Episode 6: reward: 80.000, steps: 658
Episode 7: reward: 105.000, steps: 687
Episode 8: reward: 185.000, steps: 1080
Episode 9: reward: 320.000, steps: 833
Episode 10: reward: 105.000, steps: 667
156.5


Agent 2 EpsGreedyQPolicy eps = 0.2 nb_steps = 10000  

In [12]:
# Build the agent 2

def build_agent(model, actions, value_test=.2, nb_steps=10000):
    """Build a dueling DQN agent with a linearly annealed epsilon-greedy policy.

    Args:
        model: Keras model used as the Q-network.
        actions: number of actions the agent can choose from.
        value_test: epsilon passed to the policy as `value_test` (default .2).
        nb_steps: number of steps over which the policy anneals epsilon from
            `value_max` (1.0) to `value_min` (0.1) (default 10000).

    Returns:
        An uncompiled DQNAgent.
    """
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=value_test, nb_steps=nb_steps)
    # window_length=3 matches the leading 3 of the model's input shape
    memory = SequentialMemory(limit=1000, window_length=3)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   enable_dueling_network=True, dueling_type='avg',
                   nb_actions=actions, nb_steps_warmup=1000)
    return dqn

In [13]:
dqn = build_agent(model, actions)
# `learning_rate` is the documented argument name; `lr` is only a backward-compatible alias
dqn.compile(Adam(learning_rate=1e-4))



In [14]:
# Expose only GPU 0 to CUDA to speed up training.
# NOTE(review): this is set after TensorFlow was imported and the model was built,
# so it may have no effect here — confirm, or move it to the top of the notebook.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Train the DQN agent on the game environment
dqn.fit(env, nb_steps=10000, visualize=False, verbose=2)

Training for 10000 steps ...




  600/10000: episode: 1, duration: 7.774s, episode steps: 600, steps per second:  77, episode reward:  5.000, mean reward:  0.008 [ 0.000,  5.000], mean action: 2.477 [0.000, 5.000],  loss: --, mean_q: --, mean_eps: --




 1251/10000: episode: 2, duration: 100.120s, episode steps: 651, steps per second:   7, episode reward: 110.000, mean reward:  0.169 [ 0.000, 25.000], mean action: 2.521 [0.000, 5.000],  loss: 6.928415, mean_q: 3.872397, mean_eps: 0.898705
 1756/10000: episode: 3, duration: 197.022s, episode steps: 505, steps per second:   3, episode reward: 80.000, mean reward:  0.158 [ 0.000, 25.000], mean action: 2.671 [0.000, 5.000],  loss: 0.579766, mean_q: 3.381719, mean_eps: 0.864730
 2715/10000: episode: 4, duration: 363.882s, episode steps: 959, steps per second:   3, episode reward: 205.000, mean reward:  0.214 [ 0.000, 30.000], mean action: 2.462 [0.000, 5.000],  loss: 0.920727, mean_q: 4.207735, mean_eps: 0.798850
 3246/10000: episode: 5, duration: 201.139s, episode steps: 531, steps per second:   3, episode reward: 105.000, mean reward:  0.198 [ 0.000, 25.000], mean action: 2.510 [0.000, 5.000],  loss: 0.322328, mean_q: 4.497552, mean_eps: 0.731800
 4257/10000: episode: 6, duration: 380.53

<keras.callbacks.History at 0x2039a4f3b80>

In [17]:
# Evaluate the trained agent over 10 episodes and print the mean episode reward
scores = dqn.test(env, nb_episodes=10, visualize=False)
print(np.mean(scores.history['episode_reward']))
env.close()

Testing for 10 episodes ...
Episode 1: reward: 255.000, steps: 932
Episode 2: reward: 190.000, steps: 944
Episode 3: reward: 495.000, steps: 1035
Episode 4: reward: 35.000, steps: 521
Episode 5: reward: 45.000, steps: 606
Episode 6: reward: 280.000, steps: 1111
Episode 7: reward: 110.000, steps: 593
Episode 8: reward: 420.000, steps: 1604
Episode 9: reward: 105.000, steps: 807
Episode 10: reward: 50.000, steps: 372
198.5


Agent 3 EpsGreedyQPolicy eps = 0.2 nb_steps = 20000 

In [None]:
# Build the agent 3

def build_agent(model, actions, value_test=.2, nb_steps=20000):
    """Build a dueling DQN agent with a linearly annealed epsilon-greedy policy.

    Args:
        model: Keras model used as the Q-network.
        actions: number of actions the agent can choose from.
        value_test: epsilon passed to the policy as `value_test` (default .2).
        nb_steps: number of steps over which the policy anneals epsilon from
            `value_max` (1.0) to `value_min` (0.1) (default 20000).

    Returns:
        An uncompiled DQNAgent.
    """
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=value_test, nb_steps=nb_steps)
    # window_length=3 matches the leading 3 of the model's input shape
    memory = SequentialMemory(limit=1000, window_length=3)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   enable_dueling_network=True, dueling_type='avg',
                   nb_actions=actions, nb_steps_warmup=1000)
    return dqn


dqn = build_agent(model, actions)
# `learning_rate` is the documented argument name; `lr` is only a backward-compatible alias
dqn.compile(Adam(learning_rate=1e-4))



In [12]:
# Expose only GPU 0 to CUDA to speed up training.
# NOTE(review): this is set after TensorFlow was imported and the model was built,
# so it may have no effect here — confirm, or move it to the top of the notebook.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Train the DQN agent on the game environment
dqn.fit(env, nb_steps=20000, visualize=False, verbose=2)



Training for 20000 steps ...




   762/20000: episode: 1, duration: 10.924s, episode steps: 762, steps per second:  70, episode reward: 105.000, mean reward:  0.138 [ 0.000, 30.000], mean action: 2.524 [0.000, 5.000],  loss: --, mean_q: --, mean_eps: --




  1678/20000: episode: 2, duration: 269.352s, episode steps: 916, steps per second:   3, episode reward: 285.000, mean reward:  0.311 [ 0.000, 30.000], mean action: 2.525 [0.000, 5.000],  loss: 7.871606, mean_q: 7.289139, mean_eps: 0.939745
  2293/20000: episode: 3, duration: 240.216s, episode steps: 615, steps per second:   3, episode reward: 180.000, mean reward:  0.293 [ 0.000, 30.000], mean action: 2.561 [0.000, 5.000],  loss: 1.268222, mean_q: 8.170401, mean_eps: 0.910675
  2722/20000: episode: 4, duration: 173.505s, episode steps: 429, steps per second:   2, episode reward: 75.000, mean reward:  0.175 [ 0.000, 25.000], mean action: 2.476 [0.000, 5.000],  loss: 1.276355, mean_q: 8.079276, mean_eps: 0.887185
  4026/20000: episode: 5, duration: 512.924s, episode steps: 1304, steps per second:   3, episode reward: 650.000, mean reward:  0.498 [ 0.000, 200.000], mean action: 2.622 [0.000, 5.000],  loss: 4.177796, mean_q: 8.900995, mean_eps: 0.848193
  4643/20000: episode: 6, duration:

<keras.callbacks.History at 0x1a91aba07c0>

In [13]:
# Evaluate the trained agent over 10 episodes and print the mean episode reward
scores = dqn.test(env, nb_episodes=10, visualize=False)
print(np.mean(scores.history['episode_reward']))
env.close()

Testing for 10 episodes ...
Episode 1: reward: 345.000, steps: 1393
Episode 2: reward: 35.000, steps: 564
Episode 3: reward: 125.000, steps: 763
Episode 4: reward: 200.000, steps: 810
Episode 5: reward: 230.000, steps: 843
Episode 6: reward: 120.000, steps: 599
Episode 7: reward: 400.000, steps: 835
Episode 8: reward: 140.000, steps: 625
Episode 9: reward: 105.000, steps: 539
Episode 10: reward: 110.000, steps: 668
181.0


The performance of Agent 3 is better than that of Agent 1

Agent 4 EpsGreedyQPolicy eps = 0.05 nb_steps = 20000

In [12]:
# Build the agent 4

def build_agent(model, actions, value_test=.05, nb_steps=20000):
    """Build a dueling DQN agent with a linearly annealed epsilon-greedy policy.

    Args:
        model: Keras model used as the Q-network.
        actions: number of actions the agent can choose from.
        value_test: epsilon passed to the policy as `value_test` (default .05).
        nb_steps: number of steps over which the policy anneals epsilon from
            `value_max` (1.0) to `value_min` (0.1) (default 20000).

    Returns:
        An uncompiled DQNAgent.
    """
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=value_test, nb_steps=nb_steps)
    # window_length=3 matches the leading 3 of the model's input shape
    memory = SequentialMemory(limit=1000, window_length=3)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   enable_dueling_network=True, dueling_type='avg',
                   nb_actions=actions, nb_steps_warmup=1000)
    return dqn


dqn = build_agent(model, actions)
# `learning_rate` is the documented argument name; `lr` is only a backward-compatible alias
dqn.compile(Adam(learning_rate=1e-4))




In [14]:
# Expose only GPU 0 to CUDA to speed up training.
# NOTE(review): this is set after TensorFlow was imported and the model was built,
# so it may have no effect here — confirm, or move it to the top of the notebook.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Train the DQN agent on the game environment
dqn.fit(env, nb_steps=20000, visualize=False, verbose=2)



Training for 20000 steps ...




   739/20000: episode: 1, duration: 6.028s, episode steps: 739, steps per second: 123, episode reward: 135.000, mean reward:  0.183 [ 0.000, 30.000], mean action: 2.356 [0.000, 5.000],  loss: --, mean_q: --, mean_eps: --




  1540/20000: episode: 2, duration: 204.768s, episode steps: 801, steps per second:   4, episode reward: 175.000, mean reward:  0.218 [ 0.000, 30.000], mean action: 2.325 [0.000, 5.000],  loss: 1.551605, mean_q: 5.847624, mean_eps: 0.942850
  2150/20000: episode: 3, duration: 229.309s, episode steps: 610, steps per second:   3, episode reward: 30.000, mean reward:  0.049 [ 0.000, 10.000], mean action: 2.605 [0.000, 5.000],  loss: 0.171828, mean_q: 5.474777, mean_eps: 0.916998
  2698/20000: episode: 4, duration: 207.425s, episode steps: 548, steps per second:   3, episode reward: 65.000, mean reward:  0.119 [ 0.000, 20.000], mean action: 2.520 [0.000, 5.000],  loss: 0.158851, mean_q: 5.277823, mean_eps: 0.890942
  3538/20000: episode: 5, duration: 317.719s, episode steps: 840, steps per second:   3, episode reward: 125.000, mean reward:  0.149 [ 0.000, 25.000], mean action: 2.536 [0.000, 5.000],  loss: 0.242780, mean_q: 5.453539, mean_eps: 0.859712
  4597/20000: episode: 6, duration: 40

<keras.callbacks.History at 0x1a91ab80e80>

In [15]:
# Evaluate the trained agent over 10 rendered episodes and print the mean episode reward
scores = dqn.test(env, nb_episodes=10, visualize=True)
print(np.mean(scores.history['episode_reward']))
env.close()

Testing for 10 episodes ...


  logger.warn(


Episode 1: reward: 335.000, steps: 913
Episode 2: reward: 70.000, steps: 416
Episode 3: reward: 10.000, steps: 453
Episode 4: reward: 215.000, steps: 874
Episode 5: reward: 110.000, steps: 638
Episode 6: reward: 250.000, steps: 984
Episode 7: reward: 40.000, steps: 326
Episode 8: reward: 380.000, steps: 993
Episode 9: reward: 110.000, steps: 682
Episode 10: reward: 135.000, steps: 711
165.5


In the test runs recorded above, Agent 2 (mean reward 198.5) performed better than Agent 4 (mean reward 165.5).

Agent 5 BoltzmannQPolicy nb_steps = 10000

In [34]:
"""Implement the Boltzmann Q Policy

    Boltzmann Q Policy builds a probability law on q values and returns
    an action selected randomly according to this law.
    """

from rl.policy import BoltzmannQPolicy


# Build the agent 5

def build_agent(model, actions, policy=None):
    """Build a dueling DQN agent.

    Args:
        model: Keras model used as the Q-network.
        actions: number of actions the agent can choose from.
        policy: exploration policy to use; a fresh `BoltzmannQPolicy()` is
            created when omitted, which is what this cell always did before.

    Returns:
        An uncompiled DQNAgent.
    """
    if policy is None:
        policy = BoltzmannQPolicy()
    # window_length=3 matches the leading 3 of the model's input shape
    memory = SequentialMemory(limit=1000, window_length=3)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   enable_dueling_network=True, dueling_type='avg',
                   nb_actions=actions, nb_steps_warmup=1000)
    return dqn


dqn = build_agent(model, actions)
# `learning_rate` is the documented argument name; `lr` is only a backward-compatible alias
dqn.compile(Adam(learning_rate=1e-4))


In [19]:
# Expose only GPU 0 to CUDA to speed up training.
# NOTE(review): this is set after TensorFlow was imported and the model was built,
# so it may have no effect here — confirm, or move it to the top of the notebook.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Train the DQN agent on the game environment
dqn.fit(env, nb_steps=10000, visualize=False, verbose=2)



Training for 10000 steps ...




 1091/10000: episode: 1, duration: 42.917s, episode steps: 1091, steps per second:  25, episode reward: 225.000, mean reward:  0.206 [ 0.000, 30.000], mean action: 3.057 [0.000, 5.000],  loss: 3.858079, mean_q: 4.985610
 1792/10000: episode: 2, duration: 263.096s, episode steps: 701, steps per second:   3, episode reward: 105.000, mean reward:  0.150 [ 0.000, 30.000], mean action: 2.919 [0.000, 5.000],  loss: 0.863608, mean_q: 5.090210
 2591/10000: episode: 3, duration: 299.888s, episode steps: 799, steps per second:   3, episode reward: 215.000, mean reward:  0.269 [ 0.000, 30.000], mean action: 2.852 [0.000, 5.000],  loss: 0.484350, mean_q: 4.841578
 3350/10000: episode: 4, duration: 284.071s, episode steps: 759, steps per second:   3, episode reward: 155.000, mean reward:  0.204 [ 0.000, 30.000], mean action: 2.702 [0.000, 5.000],  loss: 0.404858, mean_q: 5.305809
 4169/10000: episode: 5, duration: 306.690s, episode steps: 819, steps per second:   3, episode reward: 220.000, mean re

<keras.callbacks.History at 0x1aa4ffe6700>

In [36]:
# Evaluate the trained agent over 10 rendered episodes and print the mean episode reward
scores = dqn.test(env, nb_episodes=10, visualize=True)
print(np.mean(scores.history['episode_reward']))
env.close()

Testing for 10 episodes ...




Episode 1: reward: 120.000, steps: 816
Episode 2: reward: 155.000, steps: 876
Episode 3: reward: 50.000, steps: 648
Episode 4: reward: 180.000, steps: 863
Episode 5: reward: 245.000, steps: 1117
Episode 6: reward: 245.000, steps: 1894
Episode 7: reward: 240.000, steps: 1940
Episode 8: reward: 135.000, steps: 838
Episode 9: reward: 210.000, steps: 866
Episode 10: reward: 180.000, steps: 892
176.0


Agent 5 chose to fire from the far left side of the screen.

Agent 6 MaxBoltzmannQPolicy nb_steps = 10000

In [37]:
 """
    A combination of the eps-greedy and Boltzman q-policy.

    Wiering, M.: Explorations in Efficient Reinforcement Learning.
    PhD thesis, University of Amsterdam, Amsterdam (1999)

    https://pure.uva.nl/ws/files/3153478/8461_UBA003000033.pdf
    """
from rl.policy import MaxBoltzmannQPolicy

# Build the agent 6

def build_agent(model, actions, policy=None):
    """Build a dueling DQN agent.

    Args:
        model: Keras model used as the Q-network.
        actions: number of actions the agent can choose from.
        policy: exploration policy to use; a fresh `MaxBoltzmannQPolicy()` is
            created when omitted, which is what this cell always did before.

    Returns:
        An uncompiled DQNAgent.
    """
    if policy is None:
        policy = MaxBoltzmannQPolicy()
    # window_length=3 matches the leading 3 of the model's input shape
    memory = SequentialMemory(limit=1000, window_length=3)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   enable_dueling_network=True, dueling_type='avg',
                   nb_actions=actions, nb_steps_warmup=1000)
    return dqn


dqn = build_agent(model, actions)
# `learning_rate` is the documented argument name; `lr` is only a backward-compatible alias
dqn.compile(Adam(learning_rate=1e-4))



In [24]:
# Expose only GPU 0 to CUDA to speed up training.
# NOTE(review): this is set after TensorFlow was imported and the model was built,
# so it may have no effect here — confirm, or move it to the top of the notebook.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Train the DQN agent on the game environment
dqn.fit(env, nb_steps=10000, visualize=False, verbose=2)

Training for 10000 steps ...
  544/10000: episode: 1, duration: 4.411s, episode steps: 544, steps per second: 123, episode reward: 110.000, mean reward:  0.202 [ 0.000, 30.000], mean action: 2.528 [0.000, 5.000],  loss: --, mean_q: --




 1486/10000: episode: 2, duration: 185.578s, episode steps: 942, steps per second:   5, episode reward: 160.000, mean reward:  0.170 [ 0.000, 30.000], mean action: 2.807 [0.000, 5.000],  loss: 1.214559, mean_q: -0.595408
 2385/10000: episode: 3, duration: 338.062s, episode steps: 899, steps per second:   3, episode reward: 135.000, mean reward:  0.150 [ 0.000, 30.000], mean action: 2.880 [0.000, 5.000],  loss: 1.011598, mean_q: -0.650642
 2986/10000: episode: 4, duration: 225.965s, episode steps: 601, steps per second:   3, episode reward: 140.000, mean reward:  0.233 [ 0.000, 30.000], mean action: 3.559 [0.000, 5.000],  loss: 1.846490, mean_q: -0.370972
 4040/10000: episode: 5, duration: 395.554s, episode steps: 1054, steps per second:   3, episode reward: 540.000, mean reward:  0.512 [ 0.000, 200.000], mean action: 3.202 [0.000, 5.000],  loss: 4.038543, mean_q: -0.242950
 5325/10000: episode: 6, duration: 482.070s, episode steps: 1285, steps per second:   3, episode reward: 370.000, 

<keras.callbacks.History at 0x1aa674b89a0>

In [40]:
# Evaluate the trained agent over 10 rendered episodes and print the mean episode reward
scores = dqn.test(env, nb_episodes=10, visualize=True)
print(np.mean(scores.history['episode_reward']))
env.close()

Testing for 10 episodes ...
Episode 1: reward: 510.000, steps: 1620
Episode 2: reward: 575.000, steps: 1244
Episode 3: reward: 235.000, steps: 676
Episode 4: reward: 205.000, steps: 559
Episode 5: reward: 305.000, steps: 744
Episode 6: reward: 435.000, steps: 1409
Episode 7: reward: 300.000, steps: 1119
Episode 8: reward: 485.000, steps: 1239
Episode 9: reward: 215.000, steps: 742
Episode 10: reward: 230.000, steps: 1162
349.5


Agent 6 chose to fire from the far right side of the screen.

Agent 7 GreedyQPolicy nb_steps = 10000

In [22]:
"""Implement the greedy policy

    Greedy policy returns the current best action according to q_values
    """

# The MaxBoltzmannQPolicy description that used to sit here was a copy-paste leftover
# from the Agent 6 cell; this cell uses GreedyQPolicy, described just above.
from rl.policy import GreedyQPolicy

# Build the agent 7

def build_agent(model, actions, policy=None):
    """Build a dueling DQN agent.

    Args:
        model: Keras model used as the Q-network.
        actions: number of actions the agent can choose from.
        policy: exploration policy to use; a fresh `GreedyQPolicy()` is
            created when omitted, which is what this cell always did before.

    Returns:
        An uncompiled DQNAgent.
    """
    if policy is None:
        policy = GreedyQPolicy()
    # window_length=3 matches the leading 3 of the model's input shape
    memory = SequentialMemory(limit=1000, window_length=3)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   enable_dueling_network=True, dueling_type='avg',
                   nb_actions=actions, nb_steps_warmup=1000)
    return dqn


dqn = build_agent(model, actions)
# `learning_rate` is the documented argument name; `lr` is only a backward-compatible alias
dqn.compile(Adam(learning_rate=1e-4))

In [23]:
# Expose only GPU 0 to CUDA to speed up training.
# NOTE(review): this is set after TensorFlow was imported and the model was built,
# so it may have no effect here — confirm, or move it to the top of the notebook.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Train the DQN agent on the game environment
dqn.fit(env, nb_steps=10000, visualize=False, verbose=2)

Training for 10000 steps ...
 1338/10000: episode: 1, duration: 138.801s, episode steps: 1338, steps per second:  10, episode reward: 365.000, mean reward:  0.273 [ 0.000, 30.000], mean action: 1.134 [1.000, 4.000],  loss: 3.657223, mean_q: 5.468041
 1894/10000: episode: 2, duration: 208.865s, episode steps: 556, steps per second:   3, episode reward: 105.000, mean reward:  0.189 [ 0.000, 30.000], mean action: 1.092 [1.000, 4.000],  loss: 1.062975, mean_q: 5.216114
 2572/10000: episode: 3, duration: 251.443s, episode steps: 678, steps per second:   3, episode reward: 160.000, mean reward:  0.236 [ 0.000, 30.000], mean action: 1.376 [1.000, 4.000],  loss: 0.835172, mean_q: 5.812083
 2973/10000: episode: 4, duration: 149.811s, episode steps: 401, steps per second:   3, episode reward: 65.000, mean reward:  0.162 [ 0.000, 20.000], mean action: 1.352 [1.000, 4.000],  loss: 0.645570, mean_q: 5.529462
 3314/10000: episode: 5, duration: 126.153s, episode steps: 341, steps per second:   3, epi

<keras.callbacks.History at 0x28a2e8944c0>

In [24]:
# Evaluate the trained agent over 10 rendered episodes and print the mean episode reward
scores = dqn.test(env, nb_episodes=10, visualize=True)
print(np.mean(scores.history['episode_reward']))
env.close()

Testing for 10 episodes ...


  logger.warn(


Episode 1: reward: 270.000, steps: 727
Episode 2: reward: 270.000, steps: 718
Episode 3: reward: 270.000, steps: 722
Episode 4: reward: 270.000, steps: 730
Episode 5: reward: 270.000, steps: 711
Episode 6: reward: 270.000, steps: 724
Episode 7: reward: 265.000, steps: 692
Episode 8: reward: 270.000, steps: 727
Episode 9: reward: 270.000, steps: 722
Episode 10: reward: 270.000, steps: 713
269.5


Agent 7 chose to fire from the far right side of the screen.

Agent 8 BoltzmannGumbelQPolicy nb_steps = 10000

In [11]:
"""Implements Boltzmann-Gumbel exploration (BGE) adapted for Q learning
    based on the paper Boltzmann Exploration Done Right
    (https://arxiv.org/pdf/1705.10257.pdf).

    BGE is invariant with respect to the mean of the rewards but not their
    variance. The parameter C, which defaults to 1, can be used to correct for
    this, and should be set to the least upper bound on the standard deviation
    of the rewards.

    BGE is only available for training, not testing. For testing purposes, you
    can achieve approximately the same result as BGE after training for N steps
    on K actions with parameter C by using the BoltzmannQPolicy and setting
    tau = C/sqrt(N/K)."""

from rl.policy import BoltzmannGumbelQPolicy

# Build the agent 8

def build_agent(model, actions, policy=None):
    """Build a dueling DQN agent.

    Args:
        model: Keras model used as the Q-network.
        actions: number of actions the agent can choose from.
        policy: exploration policy to use; a fresh `BoltzmannGumbelQPolicy()` is
            created when omitted, which is what this cell always did before.

    Returns:
        An uncompiled DQNAgent.
    """
    if policy is None:
        policy = BoltzmannGumbelQPolicy()
    # window_length=3 matches the leading 3 of the model's input shape
    memory = SequentialMemory(limit=1000, window_length=3)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   enable_dueling_network=True, dueling_type='avg',
                   nb_actions=actions, nb_steps_warmup=1000)
    return dqn


dqn = build_agent(model, actions)
# `learning_rate` is the documented argument name; `lr` is only a backward-compatible alias
dqn.compile(Adam(learning_rate=1e-4))



In [15]:
# Expose only GPU 0 to CUDA to speed up training.
# NOTE(review): this is set after TensorFlow was imported and the model was built,
# so it may have no effect here — confirm, or move it to the top of the notebook.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Train the DQN agent on the game environment
dqn.fit(env, nb_steps=10000, visualize=False, verbose=2)

Training for 10000 steps ...
  464/10000: episode: 1, duration: 3.770s, episode steps: 464, steps per second: 123, episode reward: 85.000, mean reward:  0.183 [ 0.000, 25.000], mean action: 2.403 [0.000, 5.000],  loss: --, mean_q: --
 1776/10000: episode: 2, duration: 295.968s, episode steps: 1312, steps per second:   4, episode reward: 335.000, mean reward:  0.255 [ 0.000, 200.000], mean action: 2.218 [0.000, 5.000],  loss: 27.783009, mean_q: 0.757882
 2145/10000: episode: 3, duration: 136.340s, episode steps: 369, steps per second:   3, episode reward: 45.000, mean reward:  0.122 [ 0.000, 15.000], mean action: 2.672 [0.000, 5.000],  loss: 3.589642, mean_q: 0.515953
 2804/10000: episode: 4, duration: 242.086s, episode steps: 659, steps per second:   3, episode reward: 105.000, mean reward:  0.159 [ 0.000, 30.000], mean action: 3.209 [0.000, 5.000],  loss: 0.500140, mean_q: 0.363015
 3477/10000: episode: 5, duration: 246.467s, episode steps: 673, steps per second:   3, episode reward: 

<keras.callbacks.History at 0x1bb185f7a90>

In [18]:
# Evaluate the trained agent over 10 rendered episodes and print the mean episode reward
scores = dqn.test(env, nb_episodes=10, visualize=True)
print(np.mean(scores.history['episode_reward']))
env.close()

Testing for 10 episodes ...


  logger.warn(


Episode 1: reward: 285.000, steps: 986
Episode 2: reward: 285.000, steps: 971
Episode 3: reward: 285.000, steps: 969
Episode 4: reward: 285.000, steps: 974
Episode 5: reward: 285.000, steps: 960
Episode 6: reward: 285.000, steps: 963
Episode 7: reward: 285.000, steps: 967
Episode 8: reward: 285.000, steps: 970
Episode 9: reward: 285.000, steps: 978
Episode 10: reward: 285.000, steps: 982
285.0


Agent 8 chose to fire from the far left side of the screen.

Agent 10 EpsGreedyQPolicy eps=0.1 nb_steps = 10000

In [37]:
from rl.policy import EpsGreedyQPolicy

# Build the agent 10

def build_agent(model, actions, policy=None):
    """Build a dueling DQN agent.

    Args:
        model: Keras model used as the Q-network.
        actions: number of actions the agent can choose from.
        policy: exploration policy to use; a fresh `EpsGreedyQPolicy()` is
            created when omitted, which is what this cell always did before.

    Returns:
        An uncompiled DQNAgent.
    """
    if policy is None:
        policy = EpsGreedyQPolicy()
    # window_length=3 matches the leading 3 of the model's input shape
    memory = SequentialMemory(limit=1000, window_length=3)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   enable_dueling_network=True, dueling_type='avg',
                   nb_actions=actions, nb_steps_warmup=1000)
    return dqn


dqn = build_agent(model, actions)
# `learning_rate` is the documented argument name; `lr` is only a backward-compatible alias
dqn.compile(Adam(learning_rate=1e-4))

In [38]:
# Expose only GPU 0 to CUDA to speed up training.
# NOTE(review): this is set after TensorFlow was imported and the model was built,
# so it may have no effect here — confirm, or move it to the top of the notebook.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Train the DQN agent on the game environment
dqn.fit(env, nb_steps=10000, visualize=False, verbose=2)

Training for 10000 steps ...
  698/10000: episode: 1, duration: 5.808s, episode steps: 698, steps per second: 120, episode reward: 340.000, mean reward:  0.487 [ 0.000, 30.000], mean action: 3.881 [0.000, 5.000],  loss: --, mean_q: --




 1962/10000: episode: 2, duration: 366.105s, episode steps: 1264, steps per second:   3, episode reward: 210.000, mean reward:  0.166 [ 0.000, 30.000], mean action: 3.882 [0.000, 5.000],  loss: 2.973462, mean_q: 1.114301
 2539/10000: episode: 3, duration: 215.324s, episode steps: 577, steps per second:   3, episode reward: 255.000, mean reward:  0.442 [ 0.000, 30.000], mean action: 3.821 [0.000, 5.000],  loss: 1.689626, mean_q: 0.988069
 3736/10000: episode: 4, duration: 447.433s, episode steps: 1197, steps per second:   3, episode reward: 425.000, mean reward:  0.355 [ 0.000, 200.000], mean action: 1.850 [0.000, 5.000],  loss: 11.793288, mean_q: 1.301746
 4506/10000: episode: 5, duration: 287.909s, episode steps: 770, steps per second:   3, episode reward: 180.000, mean reward:  0.234 [ 0.000, 30.000], mean action: 1.162 [0.000, 5.000],  loss: 17.854282, mean_q: 1.182583
 6153/10000: episode: 6, duration: 614.431s, episode steps: 1647, steps per second:   3, episode reward: 550.000, m

<keras.callbacks.History at 0x28918a5e940>

In [39]:
# Evaluate the trained agent over 10 rendered episodes and print the mean episode reward
scores = dqn.test(env, nb_episodes=10, visualize=True)
print(np.mean(scores.history['episode_reward']))
env.close()

Testing for 10 episodes ...


  logger.warn(


Episode 1: reward: 270.000, steps: 728
Episode 2: reward: 270.000, steps: 706
Episode 3: reward: 270.000, steps: 729
Episode 4: reward: 270.000, steps: 730
Episode 5: reward: 270.000, steps: 719
Episode 6: reward: 270.000, steps: 719
Episode 7: reward: 270.000, steps: 714
Episode 8: reward: 270.000, steps: 713
Episode 9: reward: 270.000, steps: 717
Episode 10: reward: 270.000, steps: 712
270.0


# 4. Saving and Reloading Agent from Memory

In [15]:
dqn.save_weights('SavedWeights/10k-Fast/dqn_weights.h5f')

In [15]:
dqn.save_weights('SavedWeights/10k-Fast/dqn_weights2.h5f')

In [None]:
dqn.save_weights('SavedWeights/10k-Fast/dqn_weights3.h5f')

In [15]:
dqn.save_weights('SavedWeights/10k-Fast/dqn_weights4.h5f')

In [20]:
dqn.save_weights('SavedWeights/10k-Fast/dqn_weights5.h5f')

In [25]:
dqn.save_weights('SavedWeights/10k-Fast/dqn_weights6.h5f')

In [25]:
dqn.save_weights('SavedWeights/10k-Fast/dqn_weights7.h5f')

In [16]:
dqn.save_weights('SavedWeights/10k-Fast/dqn_weights8.h5f')

[TIP] Next time specify overwrite=True!


In [40]:
dqn.save_weights('SavedWeights/10k-Fast/dqn_weights10.h5f')

In [22]:
# Remove the model and the agent from the namespace.
# NOTE(review): the load_weights cells that follow call `dqn`, which no longer exists
# after this cell runs — rebuild the model and agent (and compile) before loading weights.
del model, dqn

In [16]:
dqn.load_weights('SavedWeights/10k-Fast/dqn_weights.h5f')

In [16]:
dqn.load_weights('SavedWeights/10k-Fast/dqn_weights2.h5f')

In [None]:
dqn.load_weights('SavedWeights/10k-Fast/dqn_weights3.h5f')

In [13]:
dqn.load_weights('SavedWeights/10k-Fast/dqn_weights4.h5f')

In [35]:
dqn.load_weights('SavedWeights/10k-Fast/dqn_weights5.h5f')

In [38]:
dqn.load_weights('SavedWeights/10k-Fast/dqn_weights6.h5f')

In [None]:
dqn.load_weights('SavedWeights/10k-Fast/dqn_weights7.h5f')

In [17]:
dqn.load_weights('SavedWeights/10k-Fast/dqn_weights8.h5f')

In [41]:
dqn.load_weights('SavedWeights/10k-Fast/dqn_weights10.h5f')

# 5. Conclusion

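For EpsGreedyQPolicy, when eps is low, increasing nb_steps makes the performance worse, whereas when eps is high, increasing nb_steps makes the performance better. (Note: the 10-episode test means recorded above — 156.5 → 165.5 for eps = 0.05 and 198.5 → 181.0 for eps = 0.2 — do not show this trend, so this claim should be re-checked with more test episodes.)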

For most policies, the agent moves to the far left or the far right side to fire. The agents that choose the right side tend to get a higher score, possibly because the plane can shoot extra planes on its way from the far left side to the far right side.

MaxBoltzmannQPolicy is the best policy among those tested: it scored 312 on average, with a best score of 645 (the test run recorded above averaged 349.5, with a best episode of 575).