In [28]:

import sys
import numpy as np
import tensorflow.keras.backend as K

from tensorflow.keras.optimizers import Adam,RMSprop
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Reshape, LSTM, Lambda,Add
from tensorflow.keras.regularizers import l2
from collections import deque
import random

In [79]:
class D3QNAgent:
    """ Agent Class (Network) for DDQN
    """

    def __init__(self, state_dim, action_dim,gamma=0.95,epsilon=1.0,epsilon_min=0.01
                ,epsilon_decay=0.999, batch_size=32,ddqn=True,Soft_Update=False,dueling=True):
        self.state_dim = state_dim
        self.action_dim = action_dim

        
        self.EPISODES = 1000
        self.memory = deque(maxlen=4000)
        
        self.gamma = gamma    # discount rate
        self.epsilon = epsilon  # exploration rate
        self.epsilon_min = epsilon_min # minimum exploration probability
        self.epsilon_decay = epsilon_decay # exponential decay rate for exploration prob
        self.batch_size = batch_size 
        self.train_start = 1000

        # defining model parameters
        self.ddqn = ddqn # use doudle deep q network
        self.Soft_Update = Soft_Update # use soft parameter update
        self.dueling = dueling # use dealing netowrk

        self.TAU = 0.1 # target network soft update hyperparameter
        
        # Initialize Deep Q-Network
        t = (self.state_dim,)
        self.model = self.network(t,self.action_dim,self.dueling)
        # Build target Q-Network
        self.target_model = self.network(t,self.action_dim,self.dueling)
        self.target_model.set_weights(self.model.get_weights())

    def huber_loss(self, y_true, y_pred):
        return K.mean(K.sqrt(1 + K.square(y_pred - y_true)) - 1, axis=-1)

    def network(self, input_shape, action_space, dueling):
        """ Build Deep Q-Network
        """
        X_input = Input(input_shape)
        X = X_input
        X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X)
        X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)
        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)

        if dueling:
            state_value = Dense(1, kernel_initializer='he_uniform')(X)
            #Q = V + A - mean(A)
            #V = s[:, 0]
            state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value)

            action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X)
            #Q = V + A - mean(A)
            #A = a[:, :]
            # mean(A) = K.mean(a[:, :], keepdims=True)
            action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage)

            X = Add()([state_value, action_advantage])
        else:
            # Output Layer with # of actions: 2 nodes (left, right)
            X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)

        model = Model(inputs = X_input, outputs = X, name='D3QN_model')
        model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])

        model.summary()
        return model
        
    def update_target_model(self):
        if not self.Soft_Update:
            self.target_model.set_weights(self.model.get_weights())
            return
        else :
            q_model_W = self.model.get_weights()
            target_model_W = self.target_model.get_weights()
            index = 0
            for q_weight, target_weight in zip(q_model_W, target_model_W):
                target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU
                target_model_W[index] = target_weight
                index += 1
            self.target_model.set_weights(target_model_W)
            return

    def choose_action(self,obs):
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_dim) 
        else:
            return np.argmax(self.model.predict(obs))
        
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
            
    def learn(self):
        if len(self.memory) < self.train_start:
            return
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
        # Randomly sample minibatch from the memory
        minibatch = random.sample(self.memory, self.batch_size)
#         print(minibatch.shape)
        states, actions, rewards, next_states, done = self.preprocess_mem(minibatch)
        
        target = self.model.predict(states)
    
        #double dqn: Q_target(s_next,max_a(Q(s,a)))
        target_next = self.model.predict(next_states)
        target_val = self.target_model.predict(next_states)
        
        for i in range(len(minibatch)):
            # correction on the Q value for the action used
            if done[i]:
                target[i][actions[i]] = rewards[i]
            else:
                if self.ddqn: # Double - DQN
                    # current Q Network selects the action
                    # a'_max = argmax_a' Q(s', a')
                    a = np.argmax(target_next[i])
                    # target Q Network evaluates the action
                    # Q_max = Q_target(s', a'_max)
                    target[i][actions[i]] = rewards[i] + self.gamma * (target_val[i][a])   
                else: # Standard - DQN
                    # DQN chooses the max Q value among next actions
                    # selection and evaluation of action is on the target Q Network
                    # Q_max = max_a' Q_target(s', a')
                    target[i][action[i]] = rewards[i] + self.gamma * (np.amax(target_next[i]))
                    
        self.model.fit(states, target, batch_size=self.batch_size, verbose=0)
        
        
    def preprocess_mem(self,minibatch):
        state = np.zeros((self.batch_size, self.state_dim))
        next_state = np.zeros((self.batch_size, self.state_dim))
        action, reward, done = [], [], []
        for i in range(self.batch_size):
            state[i] = minibatch[i][0]
            action.append(minibatch[i][1])
            reward.append(minibatch[i][2])
            next_state[i] = minibatch[i][3]
            done.append(minibatch[i][4])
        return state,action,reward,next_state,done

In [82]:
import gym
env_name = 'CartPole-v1'
env = gym.make(env_name)
state_size = env.observation_space.shape[0]
agent = D3QNAgent(state_size, env.action_space.n)

Model: "D3QN_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_42 (InputLayer)           [(None, 4)]          0                                            
__________________________________________________________________________________________________
dense_205 (Dense)               (None, 512)          2560        input_42[0][0]                   
__________________________________________________________________________________________________
dense_206 (Dense)               (None, 256)          131328      dense_205[0][0]                  
__________________________________________________________________________________________________
dense_207 (Dense)               (None, 64)           16448       dense_206[0][0]                  
_________________________________________________________________________________________

In [84]:
EPISODES = 10000
scores=[]
for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    done = False
    if np.mean(scores[-50:])>195:
        break
    i = 0
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        if not done :
            reward = reward
        else:
            reward = -100

        agent.remember(state, action, reward, next_state, done)
        state = next_state
        i += 1
        if done:
            agent.update_target_model()
            scores.append(i)
            print("episode: {}/{}, score: {}, e: {:.2}, ".format(e, EPISODES, i, agent.epsilon))
          
        agent.learn()

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


episode: 0/10000, score: 500, e: 0.01, 
