# Task-2 Reinforcement Learning

### Import Packages

In [1]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.callbacks import Callback
import random

Using TensorFlow backend.


In [2]:
# The shell's library search path was changed to make things work properly:
# export DYLD_FALLBACK_LIBRARY_PATH=$DYLD_FALLBACK_LIBRARY_PATH:/usr/lib

ENV_NAME = 'LunarLander-v2'


# Create the environment, seed NumPy and the environment with the same fixed
# value, and read the number of discrete actions.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Q-network: the length-1 observation window is flattened, passed through two
# 40-unit ReLU layers, and mapped to one linear output per action.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(40))
model.add(Activation('relu'))
model.add(Dense(40))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()


# Replay memory and exploration policy handed to the agent below. The policy
# starts with eps=1.0.
memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicy(eps=1.0)

# Callbacks for the beginning and end of each episode
class OnBegin(Callback):
    """Decay the exploration rate of a policy at the start of every episode.

    Args:
        eps_poilcy: Policy object exposing a mutable ``eps`` attribute. The
            misspelled parameter name is kept because callers pass it by
            keyword.
        decay_rate: Factor that ``eps`` is multiplied by each episode.
    """

    def __init__(self, eps_poilcy, decay_rate=0.95):
        # Let the base callback initialise its own state before adding ours.
        super(OnBegin, self).__init__()
        self.eps_poilcy = eps_poilcy
        self.decay_rate = decay_rate

    def on_episode_begin(self, episode, logs=None):
        """Multiply the policy's ``eps`` by ``decay_rate`` and print it.

        ``episode`` and ``logs`` are accepted to satisfy the callback
        interface but are not used. ``logs`` defaults to ``None`` rather than
        a shared mutable dict.
        """
        # The decay is applied before the episode runs, so the very first
        # episode already uses the initial eps multiplied by decay_rate.
        self.eps_poilcy.eps *= self.decay_rate
        print('eps = %s' % self.eps_poilcy.eps)

class OnEnd(Callback):
    """Record each episode's reward and a moving average of recent rewards.

    Args:
        nb_episodes: Capacity of the reward arrays; episodes beyond this
            count are ignored.
        avgwindow: Maximum number of most recent episode rewards that are
            averaged.
    """

    def __init__(self, nb_episodes=4000, avgwindow=20):
        # Let the base callback initialise its own state before adding ours.
        super(OnEnd, self).__init__()
        # Slots that have not been written yet hold the sentinel -1000.0.
        self.rewards = np.full(nb_episodes, -1000.0)
        self.avgrewards = np.full(nb_episodes, -1000.0)
        # 1-based episode numbers matching the reward arrays index for index.
        self.X = np.arange(1, nb_episodes + 1)
        self.avgwindow = avgwindow
        # Rewards of the most recent episodes, oldest first.
        self.rewardbuf = []
        # Own episode counter; the ``episode`` argument of the hook is unused.
        self.episode = 0
        self.nb_episodes = nb_episodes

    def on_episode_end(self, episode, logs):
        """Store ``logs['episode_reward']`` and the current windowed mean.

        Does nothing once ``nb_episodes`` rewards have been recorded.
        """
        if self.episode >= self.nb_episodes:
            # Arrays are full: drop further episodes instead of overflowing.
            return
        rw = logs['episode_reward']
        self.rewardbuf.append(rw)
        if len(self.rewardbuf) > self.avgwindow:
            # Keep at most ``avgwindow`` entries by discarding the oldest.
            self.rewardbuf.pop(0)
        self.rewards[self.episode] = rw
        self.avgrewards[self.episode] = np.mean(self.rewardbuf)
        self.episode += 1
     

# Build the DQN agent (double DQN disabled) around the model, replay memory
# and exploration policy defined above, then compile it with Adam.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
    enable_double_dqn=False,
)
dqn.compile(Adam(lr=0.002, decay=2.25e-05), metrics=['mae'])

# Per-episode hooks: the first decays the policy's eps, the second records
# episode rewards.
cbs = [
    OnBegin(eps_poilcy=policy, decay_rate=0.975),
    OnEnd(nb_episodes=4000, avgwindow=20),
]

# Train for 50000 steps. Rendering is switched off here because visualizing
# slows training down quite a lot. Training can be aborted prematurely with
# Ctrl + C.
dqn.fit(
    env,
    nb_steps=50000,
    visualize=False,
    verbose=2,
    callbacks=cbs,
)

# Persist the final weights once training has finished.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Evaluate the trained agent for 5 rendered episodes.
dqn.test(env, nb_episodes=5, visualize=True)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 8)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 40)                360       
_________________________________________________________________
activation_1 (Activation)    (None, 40)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 40)                1640      
_________________________________________________________________
activation_2 (Activation)    (None, 40)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 164       
_________________________________________________________________
acti



    71/50000: episode: 1, duration: 0.763s, episode steps: 71, steps per second: 93, episode reward: -135.040, mean reward: -1.902 [-100.000, 25.512], mean action: 1.507 [0.000, 3.000], mean observation: 0.023 [-1.594, 2.100], loss: 0.788042, mean_absolute_error: 0.800852, mean_q: 1.099213
eps = 0.9506249999999999
   196/50000: episode: 2, duration: 0.252s, episode steps: 125, steps per second: 497, episode reward: -201.706, mean reward: -1.614 [-100.000, 6.187], mean action: 1.344 [0.000, 3.000], mean observation: 0.230 [-1.640, 3.910], loss: 50.067806, mean_absolute_error: 1.318214, mean_q: 1.423686
eps = 0.9268593749999999
   290/50000: episode: 3, duration: 0.182s, episode steps: 94, steps per second: 515, episode reward: -161.535, mean reward: -1.718 [-100.000, 38.805], mean action: 1.447 [0.000, 3.000], mean observation: -0.034 [-1.431, 5.928], loss: 37.504807, mean_absolute_error: 1.575816, mean_q: 1.870928
eps = 0.9036878906249999
   380/50000: episode: 4, duration: 0.186s, epi

  3244/50000: episode: 27, duration: 0.569s, episode steps: 272, steps per second: 478, episode reward: 9.594, mean reward: 0.035 [-100.000, 17.270], mean action: 1.779 [0.000, 3.000], mean observation: 0.020 [-0.575, 1.411], loss: 12.091181, mean_absolute_error: 19.073853, mean_q: 13.015141
eps = 0.49218598109595263
  3392/50000: episode: 28, duration: 0.297s, episode steps: 148, steps per second: 499, episode reward: -42.417, mean reward: -0.287 [-100.000, 11.954], mean action: 1.709 [0.000, 3.000], mean observation: -0.006 [-1.115, 3.962], loss: 9.276083, mean_absolute_error: 19.593243, mean_q: 13.178141
eps = 0.4798813315685538
  4083/50000: episode: 29, duration: 1.988s, episode steps: 691, steps per second: 348, episode reward: -312.939, mean reward: -0.453 [-100.000, 17.155], mean action: 1.773 [0.000, 3.000], mean observation: 0.052 [-1.024, 1.664], loss: 7.499293, mean_absolute_error: 21.033991, mean_q: 13.237780
eps = 0.4678842982793399
  4275/50000: episode: 30, duration: 0.

 16235/50000: episode: 53, duration: 3.047s, episode steps: 1000, steps per second: 328, episode reward: -26.914, mean reward: -0.027 [-21.156, 21.510], mean action: 1.772 [0.000, 3.000], mean observation: 0.025 [-1.002, 1.404], loss: 5.157484, mean_absolute_error: 26.161810, mean_q: 22.484930
eps = 0.2548292333858097
 17235/50000: episode: 54, duration: 3.436s, episode steps: 1000, steps per second: 291, episode reward: -84.187, mean reward: -0.084 [-21.286, 10.696], mean action: 1.694 [0.000, 3.000], mean observation: -0.002 [-0.563, 1.410], loss: 5.712145, mean_absolute_error: 25.383011, mean_q: 22.877956
eps = 0.24845850255116445
 18235/50000: episode: 55, duration: 3.227s, episode steps: 1000, steps per second: 310, episode reward: -8.220, mean reward: -0.008 [-5.109, 4.992], mean action: 1.788 [0.000, 3.000], mean observation: 0.093 [-0.427, 2.067], loss: 6.133233, mean_absolute_error: 24.979727, mean_q: 24.075621
eps = 0.24224703998738534
 19184/50000: episode: 56, duration: 2.5

 37962/50000: episode: 79, duration: 3.264s, episode steps: 1000, steps per second: 306, episode reward: -42.839, mean reward: -0.043 [-5.550, 4.708], mean action: 1.888 [0.000, 3.000], mean observation: 0.098 [-0.480, 1.416], loss: 6.522413, mean_absolute_error: 21.613592, mean_q: 26.912512
eps = 0.13193780538690256
 38962/50000: episode: 80, duration: 3.075s, episode steps: 1000, steps per second: 325, episode reward: -33.906, mean reward: -0.034 [-4.608, 5.226], mean action: 1.719 [0.000, 3.000], mean observation: 0.048 [-0.665, 1.405], loss: 4.957975, mean_absolute_error: 21.615864, mean_q: 26.855505
eps = 0.12863936025223
 39962/50000: episode: 81, duration: 3.193s, episode steps: 1000, steps per second: 313, episode reward: -30.554, mean reward: -0.031 [-4.475, 5.083], mean action: 1.905 [0.000, 3.000], mean observation: 0.071 [-0.605, 1.397], loss: 5.651938, mean_absolute_error: 21.670254, mean_q: 27.063391
eps = 0.12542337624592423
 40962/50000: episode: 82, duration: 3.356s, e

<keras.callbacks.History at 0x1581fb75e48>