In [None]:
### Dueling DQN is another modified version of DQN.

### As we have previously seen, for DQN, we try to predict the Q[s,a] or the Q value for an action on a state

### The Q[s,a] value is the evaluation for the same.

### Dueling DQN, decomposes the Q[s,a] function into two components V[s] and A[s,a]

### V[s] is the value function of the state, on which we are taking the action in general, i.e, expected sum of the future rewards 
### from the state.

### A[s,a] is the advantage function of an action in a state: it shows how much better that action is than the other actions available in the same state.

### A[s,a] = Q[s,a] - V[s]
### or, Q[s,a] = V[s] + A[s,a]

### So, we use the same DQN structure to have two branches, one that predicts the value function V[s], and other that predicts the advantage
### function.

### We have a base network with parameters w that is shared by both estimators. The value-function branch adds its own parameters x, and the advantage
### branch adds its own parameters z. The value branch has a single output node, while the advantage branch has one node per action.

### According to the authors, it serves two purposes

### 1. Sometimes it is not necessary to evaluate every action in a state; knowing the value of the state itself is enough. In such cases,
### this arrangement is useful, because V[s] is estimated separately from the per-action advantages.

### 2. When we predict Q values directly, as in DQN, the estimate for one action in a state can become biased if the agent sees that action
### many times and those samples contain some bias or noise. In Dueling DQN we also predict V[s], which is shared by all the actions of the state, so this
### bias is reduced.

### Equation:

### Q[s,a: w, x, z] = V[s: w, x] + A[s,a: w, z]

### There is an issue with this approach: since Q[s,a] is the sum of V and A, it is impossible to recover unique values of V and A from a given Q
### (adding a constant to V and subtracting the same constant from A leaves Q unchanged). This is called the identifiability problem.

### The authors state that forcing the Q value of the best action to be equal to V[s] solves this problem:

### Q[s,a: w, x, z] = V[s: w, x] + (A[s,a: w, z] - max over a' of A[s,a': w, z]), where a' ranges over the action space.

### So the advantage of the best action becomes 0, and every other action gets a negative advantage (the worse the action, the more negative the value).

### Later that was modified from max to mean. So, the equation was modified to:

### Q[s,a: w, x, z] = V[s: w, x] + (A[s,a: w, z] - (1/|A|) * sum over a' of A[s,a': w, z]), where a' ranges over the action space and |A| is the number of actions.

### For Dueling DQN, everything else remains the same as in DQN; only the model definition is modified.

### Dueling DQN on cartpole

In [None]:
import gym
import tensorflow as tf
import numpy as np
from memory_module import replayBuffer

In [None]:
#### Replay buffer, constructed with a size argument of 100k
memory = replayBuffer(100000)

#### CartPole environment; the episode step cap is overridden to 400
#### through the environment object's private `_max_episode_steps` attribute
env = gym.make('CartPole-v0')
env._max_episode_steps = 400

In [None]:
class Dueling_DQN:
  """Dueling DQN agent for a gym environment with a discrete action space.

  The Q network is split into a state-value branch V[s] and an advantage
  branch A[s,a]; they are recombined as
  Q[s,a] = V[s] + (A[s,a] - mean over a' of A[s,a']).

  Two copies of the network are kept: `train_DQN`, which is optimised, and
  `fixed_DQN`, the target network whose weights are periodically copied
  from `train_DQN`.

  The replay buffer is used through `buffer.push(s, a, r, s_, not_done)`
  and `buffer.sample(batch_size)`, which is unpacked into
  `(s, a, r, s_, not_done)`.
  NOTE(review): the sampled values are presumably numpy arrays (`s.shape`
  is read in `on_batch`) -- confirm against memory_module.replayBuffer.
  """

  def __init__(self,env,buffer):
    """Store the environment and buffer and set the hyper-parameters.

    Args:
      env: gym environment; `observation_space.shape` and `action_space.n`
        are read from it.
      buffer: replay buffer exposing `push` and `sample`.
    """
    self.env=env
    self.buffer=buffer    ### Replay buffer
    self.state_dimension=env.observation_space.shape   ### Input state dimension
    self.no_of_action=env.action_space.n              ### No of actions
    ### Learning rate actually handed to the optimizer. (Previously this
    ### attribute held an unused 0.01 while the optimizer was built with 0.00025.)
    self.learning_rate=0.00025
    self.gamma=0.99             ### Default discount factor used by on_batch
    ### `learning_rate` replaces the deprecated `lr` keyword.
    self.optimizer=tf.keras.optimizers.RMSprop(learning_rate=self.learning_rate, rho=0.95, epsilon=0.01)
    self.train_DQN=None         #### Training network
    self.fixed_DQN=None         #### Target ("fixed") network

  def get_model(self):
    """Build and compile one dueling Q network.

    The model takes two inputs, the state and a per-action mask, and
    outputs the Q values multiplied element-wise by that mask. Passing an
    all-ones mask therefore yields the Q values of every action, while a
    one-hot mask keeps only the Q value of the chosen action.

    Returns:
      A compiled `tf.keras.Model` (MSE loss, `self.optimizer`).
    """
    ### Q = f(s,a: w)

    state_input=tf.keras.layers.Input(self.state_dimension,name="State_input")  ### state input

    action_input=tf.keras.layers.Input((self.no_of_action,),name="Action_input") ### Action mask input

    ### Shared base network
    net=tf.keras.layers.Dense(256,activation='relu')(state_input)
    net=tf.keras.layers.Dense(256,activation='relu')(net)

    ### Value branch: a single output node for the whole state.
    value_out=tf.keras.layers.Dense(1,name="Value_function")(net)

    ### Advantage branch: one output node per action.
    advantage_out=tf.keras.layers.Dense(self.no_of_action, name="Advantage_function")(net)

    ### A[s,a] - (1/|A|) * sum over a' of A[s,a']  (mean-centred advantage)
    normalized_adv=tf.add(advantage_out, -tf.math.reduce_mean(advantage_out, axis=1,keepdims=True))

    ### Q[s,a] = V[s] + centred A[s,a]; value_out broadcasts over the action axis.
    Q_all=tf.add(value_out,normalized_adv)

    ### Keep only the Q values selected by the action mask.
    Q_vals=tf.multiply(Q_all,action_input)

    model=tf.keras.Model(inputs=[state_input,action_input],outputs=[Q_vals],name="Dueling_DQN")

    ### We want to minimise (Q_target[s,a] - Q[s,a : w])^2, hence MSE.
    model.compile(loss="mse",optimizer=self.optimizer)

    return model

  def update_fixed(self):
    """Copy the weights of the training network into the target network."""
    self.fixed_DQN.set_weights(self.train_DQN.get_weights())

  def get_epsilon(self,episode,steady_epsilon=0.01,steady_episode=100000):
    """Return epsilon for the epsilon-greedy policy.

    Epsilon decays linearly from 1 (at `episode` 0) to `steady_epsilon`
    (at `steady_episode`) and stays at `steady_epsilon` afterwards.

    Args:
      episode: position on the schedule (train() passes the step index).
      steady_epsilon: final, constant epsilon.
      steady_episode: position at which the decay ends.
    """
    ### `>=` (rather than `>`) returns the same value at the end point and
    ### avoids a division by zero when steady_episode is 0.
    if episode>=steady_episode:
      return steady_epsilon

    ### Line through (0, 1) and (steady_episode, steady_epsilon)
    slope=(steady_epsilon - 1)/(steady_episode - 0)
    return slope*episode + 1

  def get_action(self,state,epsilon):
    """Pick an action for `state` with an epsilon-greedy policy.

    Args:
      state: 1D array describing a single state.
      epsilon: probability of taking a uniformly random action.

    Returns:
      The index of the selected action.
    """
    if np.random.random()<epsilon:
      ### Random action: probability epsilon/|actions| for each action.
      return np.random.randint(self.no_of_action)

    ### The model expects a batch, so the 1D state becomes a (1, n) array.
    reshaped_state=state.reshape(1,-1)

    ### An all-ones mask makes the model return the Q values of all actions.
    action_input=np.ones((1,self.no_of_action))
    ### verbose=0 keeps predict() from printing a progress bar on every step.
    action_probs=self.train_DQN.predict([reshaped_state,action_input],verbose=0)

    ### action_probs has shape (1, no_of_action); argmax over axis 1 gives
    ### the greedy action of each row, and we take the only row.
    optimal_action=np.argmax(action_probs,axis=1)[0]

    return optimal_action

  def _one_hot(self,actions):
    """Return a float32 (batch, no_of_action) one-hot encoding of `actions`."""
    indices=np.asarray(actions,dtype=int).reshape(-1)
    return np.eye(self.no_of_action,dtype=np.float32)[indices]

  def on_batch(self,s,a,r,s_,not_done,gamma=None):
    """Run one training step of the training network on a sampled batch.

    Args:
      s: batch of states.
      a: batch of action indices taken in `s`.
      r: batch of rewards.
      s_: batch of next states.
      not_done: batch of flags that are falsy where the transition ended
        the episode.
      gamma: discount factor; `None` (default) means `self.gamma`.
    """
    if gamma is None:
      gamma=self.gamma

    batch_size=s.shape[0]

    ### Q values of every action in the next state, from the target network.
    action_probs=self.fixed_DQN.predict([s_,np.ones((batch_size,self.no_of_action))],verbose=0)

    ### Bellman target: r + gamma * max over a' of Q_target[s', a'].
    ### Where not_done is 0 only the reward is kept.
    q_targets= r + gamma*np.multiply(not_done,np.max(action_probs,axis=1))

    ### q_targets has shape (batch,). The network outputs masked Q values of
    ### shape (batch, no_of_action), so each target is placed in the column of
    ### the action that was taken and the other columns stay 0,
    ### e.g. [[0,v1],[v2,0],...].
    action_mask=self._one_hot(a)
    q_target_formatted=np.multiply(q_targets.reshape(-1,1),action_mask)
    self.train_DQN.train_on_batch([s,action_mask],q_target_formatted)

  def get_experience(self,n_steps=50000):
    """Fill the replay buffer with transitions from a uniformly random policy.

    Args:
      n_steps: number of environment steps to record (default 50k).
    """
    curr_state=self.env.reset()
    for _ in range(n_steps):
      act=self.env.action_space.sample()   ### random action
      next_state,reward,done,_=self.env.step(act)
      self.buffer.push(curr_state,act,reward,next_state,not done)

      if done:
        curr_state=self.env.reset()   ### episode finished: start a new one
      else:
        curr_state=next_state

  def train(self,total_steps=1000000,target_update_every=5000,train_every=4,batch_size=32,log_every=50):
    """Build both networks, pre-fill the buffer and run the training loop.

    Args:
      total_steps: number of environment steps to train for.
      target_update_every: the target network is synchronised whenever the
        step index is a multiple of this value.
      train_every: a batch is sampled and trained on whenever the step
        index is a multiple of this value.
      batch_size: number of transitions sampled per training step.
      log_every: progress is printed every `log_every` completed episodes.
    """
    self.train_DQN=self.get_model()
    self.fixed_DQN=self.get_model()
    self.get_experience()
    ### All initialization steps done

    episode_reward=0
    no_of_comp=0
    curr_state=self.env.reset()
    for step in range(total_steps):
      ### Act epsilon-greedily; the step index drives the epsilon schedule.
      act=self.get_action(curr_state,self.get_epsilon(step))
      next_state,reward,done,_=self.env.step(act)
      episode_reward+=reward

      self.buffer.push(curr_state,act,reward,next_state,not done)

      if done:
        curr_state=self.env.reset()
        if no_of_comp%log_every==0:
          print('On step {}, no. of complete episodes {} episode reward {}'.format(step,no_of_comp,episode_reward))
        episode_reward=0  ### reset the running reward for the next episode
        no_of_comp+=1
      else:
        curr_state=next_state

      if step%target_update_every==0:    ### periodic target-network sync
        self.update_fixed()

      if step%train_every==0:    ### train on a freshly sampled batch
        s,a,r,s_,nd=self.buffer.sample(batch_size)
        self.on_batch(s,a,r,s_,nd)


In [None]:
#### Build the agent around the environment and replay buffer created above
dqn = Dueling_DQN(env=env, buffer=memory)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [None]:
#### Run the full training loop; progress is printed every 50 completed episodes
dqn.train()

On step 22, no. of complete episodes 0 episode reward 23.0
On step 1201, no. of complete episodes 50 episode reward 12.0
On step 2363, no. of complete episodes 100 episode reward 16.0
On step 3465, no. of complete episodes 150 episode reward 21.0
On step 4380, no. of complete episodes 200 episode reward 15.0
On step 5340, no. of complete episodes 250 episode reward 15.0
On step 6234, no. of complete episodes 300 episode reward 17.0
On step 7277, no. of complete episodes 350 episode reward 36.0
On step 8313, no. of complete episodes 400 episode reward 17.0
On step 9344, no. of complete episodes 450 episode reward 15.0
On step 10390, no. of complete episodes 500 episode reward 10.0
On step 11434, no. of complete episodes 550 episode reward 62.0
On step 12489, no. of complete episodes 600 episode reward 22.0
On step 13602, no. of complete episodes 650 episode reward 20.0
On step 15017, no. of complete episodes 700 episode reward 41.0
On step 16475, no. of complete episodes 750 episode rew