In [1]:

import sys
import numpy as np
import tensorflow.keras.backend as K

from tensorflow.keras.optimizers import Adam,RMSprop
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Reshape, LSTM, Lambda,Add,Embedding,Reshape
from tensorflow.keras.regularizers import l2
from collections import deque
import random

In [21]:
class D3QNAgent:
    """ Agent Class (Network) for DDQN
    """

    def __init__(self, state_dim, action_dim,gamma=0.95,epsilon=1.0,epsilon_min=0.01
                ,epsilon_decay=0.9999, batch_size=32,ddqn=True,Soft_Update=False,dueling=True):
        self.state_dim = state_dim
        self.action_dim = action_dim

        
        self.EPISODES = 2000
        self.memory = deque(maxlen=10000)
        
        self.gamma = gamma    # discount rate
        self.epsilon = epsilon  # exploration rate
        self.epsilon_min = epsilon_min # minimum exploration probability
        self.epsilon_decay = epsilon_decay # exponential decay rate for exploration prob
        self.batch_size = batch_size 
        self.train_start = 1000

        # defining model parameters
        self.ddqn = ddqn # use doudle deep q network
        self.Soft_Update = Soft_Update # use soft parameter update
        self.dueling = dueling # use dealing netowrk

        self.TAU = 0.1 # target network soft update hyperparameter
        
        # Initialize Deep Q-Network
        t = (self.state_dim,)
        self.model = self.network(t,self.action_dim,self.dueling)
        # Build target Q-Network
        self.target_model = self.network(t,self.action_dim,self.dueling)
        self.target_model.set_weights(self.model.get_weights())

    def huber_loss(self, y_true, y_pred):
        return K.mean(K.sqrt(1 + K.square(y_pred - y_true)) - 1, axis=-1)

    def network(self, input_shape, action_space, dueling):
        """ Build Deep Q-Network
        """
        X_input = Input(shape=input_shape)
        X = Embedding(500, 10, input_length=1)(X_input)
        X = Flatten()(X)
        X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)
        X = Dense(128, activation="relu", kernel_initializer='he_uniform')(X)
        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)

        if dueling:
            state_value = Dense(1, kernel_initializer='he_uniform')(X)
            #Q = V + A - mean(A)
            #V = s[:, 0]
            state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value)

            action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X)
            #Q = V + A - mean(A)
            #A = a[:, :]
            # mean(A) = K.mean(a[:, :], keepdims=True)
            action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage)

            X = Add()([state_value, action_advantage])
        else:
            # Output Layer with # of actions: 2 nodes (left, right)
            X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)

        model = Model(inputs = X_input, outputs = X, name='D3QN_model')
        model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])

        model.summary()
        return model
        
    def update_target_model(self):
        if not self.Soft_Update:
            self.target_model.set_weights(self.model.get_weights())
            return
        else :
            q_model_W = self.model.get_weights()
            target_model_W = self.target_model.get_weights()
            index = 0
            for q_weight, target_weight in zip(q_model_W, target_model_W):
                target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU
                target_model_W[index] = target_weight
                index += 1
            self.target_model.set_weights(target_model_W)
            return

    def choose_action(self,obs):
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_dim) 
        else:
            return np.argmax(self.model.predict(obs))
        
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
            
    def learn(self):
        if len(self.memory) < self.train_start:
            return
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
        # Randomly sample minibatch from the memory
        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, done = self.preprocess_mem(minibatch)
        
        target = self.model.predict(states)
    
        #double dqn: Q_target(s_next,max_a(Q(s,a)))
        target_next = self.model.predict(next_states)
        target_val = self.target_model.predict(next_states)
        
        for i in range(len(minibatch)):
            # correction on the Q value for the action used
            if done[i]:
                target[i][actions[i]] = rewards[i]
            else:
                if self.ddqn: # Double - DQN
                    # current Q Network selects the action
                    # a'_max = argmax_a' Q(s', a')
                    a = np.argmax(target_next[i])
                    # target Q Network evaluates the action
                    # Q_max = Q_target(s', a'_max)
                    target[i][actions[i]] = rewards[i] + self.gamma * (target_val[i][a])   
                else: # Standard - DQN
                    # DQN chooses the max Q value among next actions
                    # selection and evaluation of action is on the target Q Network
                    # Q_max = max_a' Q_target(s', a')
                    target[i][action[i]] = rewards[i] + self.gamma * (np.amax(target_next[i]))
                    
        self.model.fit(states, target, batch_size=self.batch_size, verbose=0)
        
        
    def preprocess_mem(self,minibatch):
        state = np.zeros((self.batch_size, self.state_dim))
        next_state = np.zeros((self.batch_size, self.state_dim))
        action, reward, done = [], [], []
        for i in range(self.batch_size):
            state[i] = minibatch[i][0]
            action.append(minibatch[i][1])
            reward.append(minibatch[i][2])
            next_state[i] = minibatch[i][3]
            done.append(minibatch[i][4])
        return state,action,reward,next_state,done

In [25]:
import gym
env_name = 'Taxi-v2'
env = gym.make(env_name)
state_size = 1
agent = D3QNAgent(state_size, env.action_space.n)

Model: "D3QN_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 1, 10)        5000        input_3[0][0]                    
__________________________________________________________________________________________________
flatten_3 (Flatten)             (None, 10)           0           embedding_9[0][0]                
__________________________________________________________________________________________________
dense_12 (Dense)                (None, 256)          2816        flatten_3[0][0]                  
_________________________________________________________________________________________

In [None]:
import matplotlib.pyplot as plt
EPISODES = 1000
scores=[]
average =[]
update =0
for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, 1])
    done = False
    i = 0
    total_reward = 0
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, 1])

        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        scores.append(total_reward)
        i += 1
        update += 1
        if done:
            print("episode: {},{}/{}, score: {}, e: {:.2},avg:{} ".format(i,e, EPISODES, total_reward, agent.epsilon,np.mean(scores[-20:])))
        
        if update % 100 == 0:
            agent.update_target_model()
      
          
        agent.learn()

episode: 200,0/1000, score: -686, e: 1.0,avg:-653.55 
episode: 200,1/1000, score: -677, e: 1.0,avg:-656.7 
episode: 200,2/1000, score: -803, e: 1.0,avg:-753.9 
episode: 200,3/1000, score: -677, e: 1.0,avg:-645.0 
episode: 200,4/1000, score: -830, e: 1.0,avg:-792.15 
episode: 200,5/1000, score: -749, e: 0.98,avg:-709.8 
episode: 200,6/1000, score: -830, e: 0.96,avg:-790.35 
episode: 200,7/1000, score: -893, e: 0.94,avg:-851.1 
episode: 42,8/1000, score: -75, e: 0.94,avg:-77.8 
episode: 200,9/1000, score: -767, e: 0.92,avg:-730.5 
episode: 200,10/1000, score: -668, e: 0.9,avg:-619.8 
episode: 200,11/1000, score: -731, e: 0.88,avg:-711.15 
episode: 200,12/1000, score: -605, e: 0.87,avg:-568.5 
episode: 200,13/1000, score: -713, e: 0.85,avg:-675.6 
episode: 200,14/1000, score: -641, e: 0.83,avg:-618.45 
episode: 200,15/1000, score: -677, e: 0.82,avg:-631.5 
episode: 111,16/1000, score: -261, e: 0.81,avg:-264.7 
episode: 200,17/1000, score: -641, e: 0.79,avg:-622.05 
episode: 200,18/1000, s

episode: 200,148/1000, score: -227, e: 0.062,avg:-217.5 
episode: 200,149/1000, score: -254, e: 0.061,avg:-235.95 
episode: 200,150/1000, score: -254, e: 0.06,avg:-239.55 
episode: 59,151/1000, score: -47, e: 0.059,avg:-49.35 
episode: 200,152/1000, score: -227, e: 0.058,avg:-217.5 
episode: 200,153/1000, score: -236, e: 0.057,avg:-221.55 
episode: 200,154/1000, score: -227, e: 0.056,avg:-209.85 
episode: 54,155/1000, score: -42, e: 0.056,avg:-52.45 
episode: 200,156/1000, score: -200, e: 0.054,avg:-190.5 
episode: 200,157/1000, score: -227, e: 0.053,avg:-209.85 
episode: 200,158/1000, score: -218, e: 0.052,avg:-208.5 
episode: 200,159/1000, score: -218, e: 0.051,avg:-208.5 
episode: 200,160/1000, score: -227, e: 0.05,avg:-215.25 
episode: 200,161/1000, score: -245, e: 0.049,avg:-235.5 
episode: 200,162/1000, score: -218, e: 0.048,avg:-208.5 
episode: 200,163/1000, score: -245, e: 0.047,avg:-235.5 
episode: 200,164/1000, score: -200, e: 0.046,avg:-190.5 
episode: 200,165/1000, score: -

episode: 200,293/1000, score: -200, e: 0.01,avg:-190.5 
episode: 200,294/1000, score: -209, e: 0.01,avg:-199.5 
episode: 200,295/1000, score: -209, e: 0.01,avg:-199.5 
episode: 200,296/1000, score: -209, e: 0.01,avg:-199.5 
episode: 200,297/1000, score: -209, e: 0.01,avg:-199.5 
episode: 200,298/1000, score: -209, e: 0.01,avg:-199.5 
episode: 200,299/1000, score: -209, e: 0.01,avg:-199.5 
episode: 200,300/1000, score: -200, e: 0.01,avg:-190.5 
episode: 200,301/1000, score: -200, e: 0.01,avg:-190.5 
episode: 200,302/1000, score: -209, e: 0.01,avg:-199.5 
episode: 200,303/1000, score: -209, e: 0.01,avg:-199.5 
episode: 200,304/1000, score: -209, e: 0.01,avg:-199.5 
episode: 200,305/1000, score: -218, e: 0.01,avg:-208.5 
episode: 200,306/1000, score: -200, e: 0.01,avg:-190.5 
episode: 200,307/1000, score: -209, e: 0.01,avg:-199.5 
episode: 200,308/1000, score: -209, e: 0.01,avg:-199.5 
episode: 200,309/1000, score: -200, e: 0.01,avg:-190.5 
episode: 200,310/1000, score: -209, e: 0.01,avg: