In [1]:
import gym
import random
import numpy as np
from collections import deque
import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.optimizers as optimizers
import tensorflow.keras.initializers as initializers
from IPython.display import clear_output

In [2]:
class DQN(tf.keras.Model):
    def __init__(self, action_size):
        super(__class__, self).__init__()
        self.dense1 = layers.Dense(64, activation='relu')
        self.dense2 = layers.Dense(64, activation='relu')
        self.out = layers.Dense(action_size)
        
    def call(self, x):
        x = self.dense1(x)
        x = self.dense2(x)
        value = self.out(x)
        return value
    
    
class Agent:
    def __init__(self, env, state_size, action_size, gamma=.99, lr=0.0001, update_step=4):
        self.env = env
        self.state_size = state_size
        self.action_size = action_size
        
        self.model = DQN(action_size)
        self.target_model = DQN(action_size)
        self.model.build(input_shape=(None, state_size))
        self.target_model.build(input_shape=(None, state_size))
        self.update_target_model()
        
        self.memory = deque(maxlen=100000)
        self.train_start = 1000
        self.step = 0
        
        self.gamma = gamma
        self.lr = lr
        self.update_step = update_step
    
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())
        
    def get_action(self, state, epsilon=0.):
        if np.random.rand() < epsilon:
            return self.env.action_space.sample()
        else:
            return np.argmax(self.model(state.reshape(1, -1))[0])
    
    def learn(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) < self.train_start:
            return            
        mini_batch = random.sample(self.memory, 64)
        states = np.array([sample[0] for sample in mini_batch])
        actions = np.array([sample[1] for sample in mini_batch])
        rewards = np.array([sample[2] for sample in mini_batch])
        next_states = np.array([sample[3] for sample in mini_batch])
        dones = np.array([sample[4] for sample in mini_batch])
        def loss_fn():
            q_values = self.model(states)
            one_hot_actions = tf.one_hot(actions, self.action_size)
            values = tf.reduce_sum(one_hot_actions * q_values, axis=1)
            target_q_values = self.target_model(next_states)
            target_values = rewards + (1 - dones) * self.gamma * np.amax(tf.stop_gradient(target_q_values), axis=-1)
            return tf.reduce_mean(tf.square(target_values - values))
        optimizers.Adam(lr=self.lr).minimize(loss_fn, var_list=self.model.trainable_variables)
        self.step = (self.step + 1) % self.update_step
        if self.step % self.update_step == 0:
            self.update_target_model()

In [4]:
SEED = 123
    
env = gym.make('LunarLander-v2')
env.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)


DISCOUNT_FACTOR = 0.99
LEARNING_RATE = 0.0001
TARGET_NETWORK_UPDATE_STEP = 10

agent = Agent(env, state_size=8, action_size=4, 
              gamma=DISCOUNT_FACTOR, 
              lr=LEARNING_RATE,
              update_step=TARGET_NETWORK_UPDATE_STEP)

scores = []
steps = []
    
for episode in range(1, 3001):
    state = env.reset()
    done = False
    score = 0
    step = 0
    samples = []
    while not done:
        action = agent.get_action(state, epsilon=epsilon)
        next_state, reward, done, _ = env.step(action)
        agent.learn(state, action, reward, next_state, done)
        score += reward
        step += 1
        state = next_state
    scores.append(score)
    steps.append(step)
        
    if episode % 10 == 0:
        print('Episode: {}\tTotal score: {}\tAverage score: {}\tAverage step: {}\tEpsilon: {}'.format(
            episode, 
            round(np.sum(scores), 3),
            round(np.mean(scores[-10:]), 3),
            round(np.mean(steps[-10:]), 3),
            round(epsilon, 3),
        ))
        mean_score = np.mean(scores[-100:])
        if mean_score < 0:
            epsilon = 0.3
        elif mean_score < 100:
            epsilon = 0.2
        elif mean_score < 150:
            epsilon = 0.1
        elif mean_score < 200:
            epsilon = 0.01
        else:
            epsilon = 0.
            
env.close()

Episode: 10 Total score: -6100.700683248552 Average score: -610.0700683248552 Average step: 109.7 Epsilon: 0.3
Episode: 20 Total score: -8277.27812847204 Average score: -217.65774452234868 Average step: 82.2 Epsilon: 0.3
Episode: 30 Total score: -9806.360601248034 Average score: -152.9082472775994 Average step: 76.5 Epsilon: 0.3
Episode: 40 Total score: -11646.379280618836 Average score: -184.00186793708022 Average step: 79.2 Epsilon: 0.3
Episode: 50 Total score: -13892.267057275803 Average score: -224.5887776656969 Average step: 75.0 Epsilon: 0.3
Episode: 60 Total score: -15771.676629057556 Average score: -187.94095717817532 Average step: 114.9 Epsilon: 0.3
Episode: 70 Total score: -17579.376522537354 Average score: -180.76998934797956 Average step: 398.3 Epsilon: 0.3
Episode: 80 Total score: -18855.08622055382 Average score: -127.57096980164677 Average step: 296.5 Epsilon: 0.3
Episode: 90 Total score: -20754.65796119969 Average score: -189.95717406458726 Average step: 493.1 Epsilon: 

In [10]:
env = gym.make('LunarLander-v2')

for episode in range(1, 11):
    done = False
    state = env.reset()
    score = 0
    step = 0
    while not done:
        env.render()
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        score += reward
        step += 1
        state = next_state
    print('Episode: {} Score: {} Step: {}'.format(episode, score, step))
    
env.close()

Episode: 1 Score: 171.82579332549008 Step: 471
Episode: 2 Score: 196.0449061319535 Step: 552
Episode: 3 Score: 9.732980368787366 Step: 125
Episode: 4 Score: 188.50929643596723 Step: 540
Episode: 5 Score: 24.08196124880571 Step: 116
Episode: 6 Score: 278.8592559934008 Step: 186
Episode: 7 Score: 217.84903023579727 Step: 436
Episode: 8 Score: -31.33883071276597 Step: 395
Episode: 9 Score: 170.1853306691373 Step: 498
Episode: 10 Score: 212.68453246150605 Step: 426


In [5]:
import pickle

with open('dqp_scores.pickle', 'wb') as f:
    pickle.dump(scores, f)
    
with open('dqn_weights.pickle', 'wb') as f:
    pickle.dump(agent.model.get_weights(), f)

In [16]:
import pickle

# with open('scores.pickle', 'rb') as f:
#     print(pickle.load(f))

new_agent = Agent(env, state_size, action_size, gamma=.99, lr=0.001)
with open('weights.pickle', 'rb') as f:
    new_agent.model.set_weights(pickle.load(f))

env = gym.make('LunarLander-v2')

for episode in range(1, 2):
    done = False
    state = env.reset()
    score = 0
    step = 0
    while not done:
        env.render()
        action = new_agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        score += reward
        step += 1
        state = next_state
    print('Episode: {} Score: {} Step: {}'.format(episode, score, step))
env.close()

Episode: 1 Score: -39.893084530631285 Step: 282
