In [None]:
### For any given state, the DQN algorithm always treats the action with the maximum
### Q-value as the best action.

### As already discussed, this optimistic approach creates a maximization bias.

### During the update we bootstrap one estimate from another estimate, which adds noise, and using the
### same network for both action selection and action evaluation makes a systematic bias very likely.

### Early in training it is often seen that, because of this noise, the action with the maximum Q-value
### is not the best action at that point; the truly best action may have a low Q-value.

### To reduce the bias we use two different networks, i.e. we decouple action selection from action evaluation:
### one network is used to select the action for a state (argmax / greedy policy), and the other network is
### used to evaluate that action when updating the Q-function.

### Because two networks are involved, the estimates can be deemed unbiased, and the maximization bias is reduced.

### So, the update equation is modified to:

### Q[st,at : w] = Q[st,at : w] + alpha * (rt + gamma * Q'[s(t+1), a: w'] - Q[st,at: w])

### a = argmax(Q[s(t+1), : w])   ### the action with the maximum Q-value among all actions in that state, as estimated by the function parameterized by w (exploration is added when acting in the environment)



In [None]:
### The implementation is almost the same as DQN; the code changes only in the action selection and in the update step.

### The action that the oracle (fixed) network evaluates is the one produced by the training DQN.

In [None]:
import gym
import tensorflow as tf
import numpy as np
from memory_module import replayBuffer

In [None]:
#### Instantiate the CartPole-v0 environment and override its per-episode step limit to 400
#### (set through the private `_max_episode_steps` attribute of the object returned by gym.make).
#### Instantiate the replay buffer with a size of 100k samples.
#### Both objects are module-level and are handed to the agent when it is constructed.

env=gym.make('CartPole-v0')
env._max_episode_steps=400
memory=replayBuffer(100000)

In [None]:
class Double_DQN:
  """Double DQN agent: an online (training) network selects actions, a fixed (oracle) network evaluates them.

  Args:
    env: environment exposing `observation_space.shape`, `action_space.n`, `action_space.sample()`,
      `reset()` and a 4-tuple `step(action)` (the API used throughout this class).
    buffer: replay buffer exposing `push(state, action, reward, next_state, not_done)` and
      `sample(batch_size)` returning `(s, a, r, s_, not_done)`.
  """

  def __init__(self,env,buffer):
    self.env=env
    self.buffer=buffer    ### Replay buffer
    self.state_dimension=env.observation_space.shape   ### Input state dimension
    self.no_of_action=env.action_space.n              ### No of actions
    self.learning_rate=0.01   ### NOTE: not read anywhere in this class; the optimizer below carries its own rate
    self.gamma=0.99           ### Discount factor handed to on_batch by train()
    ### `lr` is a deprecated alias; `learning_rate` is the supported keyword.
    self.optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.00025, rho=0.95, epsilon=0.01)
    self.train_DQN=None         #### Training (online) network, built in train()
    self.fixed_DQN=None         #### Oracle (fixed / target) network, built in train()

  @staticmethod
  def _one_hot(actions,n_actions):
    """Return a float32 one-hot matrix of shape (len(actions), n_actions) for integer action indices."""
    indices=np.asarray(actions,dtype=int).reshape(-1)
    return np.eye(n_actions,dtype=np.float32)[indices]

  def get_model(self):
    """Build and compile one Q-network, Q = f(s, a : w).

    The model takes the state and an action mask as inputs. It produces one Q-value per action
    and multiplies them element-wise by the mask, e.g. [q1, q2] * [0, 1] = [0, q2], so only the
    Q-values of the masked-in actions are non-zero in the output.

    Returns:
      A compiled `tf.keras.Model` named "DQN" using MSE loss and `self.optimizer`.
    """
    state_input=tf.keras.layers.Input(self.state_dimension,name="State_input")  ### state input

    action_input=tf.keras.layers.Input((self.no_of_action,),name="Action_input") ### Action mask input

    net=tf.keras.layers.Dense(256,activation='relu')(state_input)
    net=tf.keras.layers.Dense(256,activation='relu')(net)
    output=tf.keras.layers.Dense(self.no_of_action,name="function_out")(net)

    ### Masking keeps only the Q-values of the actions selected by the mask.
    Q_values=tf.multiply(output,action_input, name="masking_output")

    model=tf.keras.Model(inputs=[state_input,action_input],outputs=[Q_values],name="DQN")

    ### As we want to minimize (Q[s,a]-Q'[s,a : w])^2 we use MSE.
    model.compile(loss="mse",optimizer=self.optimizer)

    return model

  def update_fixed(self):
    """Copy the training network's weights into the fixed (target) network."""
    self.fixed_DQN.set_weights(self.train_DQN.get_weights())

  def get_epsilon(self,episode,steady_epsilon=0.01,steady_episode=100000):
    """Return epsilon for the epsilon-greedy policy.

    Epsilon decays linearly from 1 (at 0) to `steady_epsilon` (at `steady_episode`) and stays
    at `steady_epsilon` afterwards. Within this class the method is called with the global
    training step as `episode`.
    """
    if episode>steady_episode:  ## Past the decay window: constant epsilon
      return steady_epsilon
    ### Line through (0, 1) and (steady_episode, steady_epsilon)
    slope=(steady_epsilon - 1)/(steady_episode - 0)
    ### slope*episode is the (negative) decrease; adding 1 because epsilon starts from 1.
    return slope*episode + 1

  def get_action(self,state,epsilon):
    """Pick an action for a single state with an epsilon-greedy policy on the training network.

    Args:
      state: 1D array for one state; it is reshaped to a batch of one before prediction.
      epsilon: probability of returning a uniformly random action.

    Returns:
      The index of the chosen action.
    """
    if np.random.random()<epsilon:
      ### Exploration: each action is chosen with probability epsilon/|actions|.
      return np.random.randint(self.no_of_action)

    ### The model expects a 2D batch, e.g. array([[-0.02, -0.03, -0.03, -0.03]]).
    reshaped_state=state.reshape(1,-1)

    ### A mask of ones asks the model for the Q-values of every action.
    action_input=np.ones((1,self.no_of_action))
    action_probs=self.train_DQN.predict([reshaped_state,action_input],verbose=0)

    ### action_probs has shape (1, no_of_action); argmax over axis=1 gives the greedy action per row.
    optimal_action=np.argmax(action_probs,axis=1)[0]

    return optimal_action

  def on_batch(self,s,a,r,s_,not_done,step,gamma=0.99):
    """Run one Double DQN update of the training network on a sampled mini-batch.

    Target for each sample: r + gamma * not_done * Q_fixed(s', argmax_a Q_train(s', a)).
    The next-state action is selected greedily by the training network (no exploration inside
    the target) and evaluated by the fixed network.

    Args:
      s: batch of states; `s.shape[0]` is used as the batch size.
      a: actions taken in `s`, as integer indices.
      r: rewards received.
      s_: batch of next states.
      not_done: 1/True where the episode continued, 0/False where it terminated.
      step: global training step; kept for interface compatibility and not used here.
      gamma: discount factor.
    """
    batch_size=s.shape[0]

    ### Each next state is flattened to one row, and a mask of ones requests all Q-values.
    next_states=np.asarray(s_).reshape(batch_size,-1)
    all_actions=np.ones((batch_size,self.no_of_action))

    ### Action selection: greedy w.r.t. the training network, one batched forward pass.
    online_q_next=self.train_DQN.predict([next_states,all_actions],verbose=0)
    best_next_actions=np.argmax(online_q_next,axis=1)

    ### Action evaluation: Q-value of the selected action according to the fixed network.
    target_q_next=np.asarray(self.fixed_DQN.predict([next_states,all_actions],verbose=0))
    q_values=target_q_next[np.arange(batch_size),best_next_actions]

    ### If done, not_done=0, so only the reward is used for that sample.
    q_targets= r + gamma*np.multiply(not_done,q_values)

    ### q_targets is [v1, v2, ... vn]; the network needs [[0,v1],[v2,0],...], i.e. the value placed
    ### in the column of the action that was actually taken. The same mask is the model's action input.
    action_mask=self._one_hot(a,self.no_of_action)
    q_target_formatted=np.multiply(q_targets.reshape(-1,1),action_mask)
    self.train_DQN.train_on_batch([s,action_mask],q_target_formatted)

  def get_experience(self,n_steps=50000):
    """Fill the replay buffer with `n_steps` transitions collected under a uniformly random policy."""
    curr_state=self.env.reset()
    for _ in range(n_steps):
      act=self.env.action_space.sample()   ### Random action from the action space
      next_state,reward,done,_=self.env.step(act) ### Taking the action
      self.buffer.push(curr_state,act,reward,next_state,not done)  ### Recording the transition

      if done:
        curr_state=self.env.reset()   ### Episode finished: reset the environment
      else:
        curr_state=next_state        ### Otherwise continue from the next state

  def train(self,total_steps=1000000,target_update_every=5000,train_every=4,batch_size=32,log_every=50,warmup_steps=50000):
    """Build both networks, collect warm-up experience, then run the training loop.

    Args:
      total_steps: number of environment steps to train for.
      target_update_every: the fixed network is synced whenever `step % target_update_every == 0`.
      train_every: a mini-batch update runs whenever `step % train_every == 0`.
      batch_size: number of transitions sampled from the buffer per update.
      log_every: a progress line is printed when the count of completed episodes is a multiple of this.
      warmup_steps: number of random-policy transitions collected before training starts.
    """
    self.train_DQN=self.get_model()
    self.fixed_DQN=self.get_model()
    self.get_experience(warmup_steps)
    ### All initialization steps done
    episode_reward=0
    no_of_comp=0
    curr_state=self.env.reset()
    for step in range(total_steps):
      act=self.get_action(curr_state,self.get_epsilon(step))  #### Epsilon-greedy action for the current state
      next_state,reward,done,_=self.env.step(act) ### Taking the action
      episode_reward+=reward  ## Accumulating the reward of the running episode

      self.buffer.push(curr_state,act,reward,next_state,not done)  ### Pushing the transition into the buffer

      if done:
        curr_state=self.env.reset()
        if no_of_comp%log_every==0:
          print('On step {}, no. of complete episodes {} episode reward {}'.format(step,no_of_comp,episode_reward))
        episode_reward=0  ### Resetting the episode reward
        no_of_comp+=1
      else:
        curr_state=next_state

      if step%target_update_every==0:    ### Periodically sync the fixed / target DQN (also at step 0)
        self.update_fixed()

      if step%train_every==0:    ### Periodically sample a fresh batch and update the training DQN
        s,a,r,s_,nd=self.buffer.sample(batch_size)
        self.on_batch(s,a,r,s_,nd,step,gamma=self.gamma)
  



  
  


In [None]:
### Build the agent around the module-level environment and replay buffer.
### The two Q-networks are not created here; they are built when train() is called.
dqn=Double_DQN(env,memory)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [None]:
### Run training: builds both networks, fills the buffer with random-policy experience, then runs the
### learning loop. A progress line is printed when the number of completed episodes is a multiple of 50.
dqn.train()

On step 49, no. of complete episodes 0 episode reward 50.0
On step 1191, no. of complete episodes 50 episode reward 14.0
On step 2280, no. of complete episodes 100 episode reward 55.0
On step 3523, no. of complete episodes 150 episode reward 31.0
On step 4649, no. of complete episodes 200 episode reward 11.0
On step 5736, no. of complete episodes 250 episode reward 9.0
On step 6909, no. of complete episodes 300 episode reward 26.0
On step 8324, no. of complete episodes 350 episode reward 21.0
On step 9550, no. of complete episodes 400 episode reward 17.0
On step 10890, no. of complete episodes 450 episode reward 30.0
On step 12294, no. of complete episodes 500 episode reward 10.0
On step 13673, no. of complete episodes 550 episode reward 14.0
On step 14934, no. of complete episodes 600 episode reward 26.0
On step 16302, no. of complete episodes 650 episode reward 26.0
On step 17741, no. of complete episodes 700 episode reward 48.0
On step 19543, no. of complete episodes 750 episode rew