<a href="https://colab.research.google.com/github/abhisheksuran/Atari_DQN/blob/master/DDDQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf 
import numpy as np 
import gym
from tensorflow.keras.models import load_model

In [2]:
!pip3 install box2d-py

Collecting box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/06/bd/6cdc3fd994b0649dcf5d9bad85bd9e26172308bbe9a421bfc6fdbf5081a6/box2d_py-2.3.8-cp36-cp36m-manylinux1_x86_64.whl (448kB)
[K     |▊                               | 10kB 17.4MB/s eta 0:00:01[K     |█▌                              | 20kB 2.1MB/s eta 0:00:01[K     |██▏                             | 30kB 2.8MB/s eta 0:00:01[K     |███                             | 40kB 3.1MB/s eta 0:00:01[K     |███▋                            | 51kB 2.5MB/s eta 0:00:01[K     |████▍                           | 61kB 2.7MB/s eta 0:00:01[K     |█████▏                          | 71kB 3.0MB/s eta 0:00:01[K     |█████▉                          | 81kB 3.4MB/s eta 0:00:01[K     |██████▋                         | 92kB 3.5MB/s eta 0:00:01[K     |███████▎                        | 102kB 3.4MB/s eta 0:00:01[K     |████████                        | 112kB 3.4MB/s eta 0:00:01[K     |████████▊                       | 12

In [3]:
# Build the LunarLander environment used by every cell below, and keep the
# per-dimension bounds of its observation space at module level.
env = gym.make("LunarLander-v2")
low, high = env.observation_space.low, env.observation_space.high

In [4]:
class DDDQN(tf.keras.Model):
    """Dueling Q-network: a shared two-layer trunk feeding a value head and an advantage head.

    The number of advantage outputs is read from the module-level
    ``env.action_space.n`` when the network is constructed.
    """

    def __init__(self):
        super().__init__()
        # Shared trunk: two 128-unit ReLU layers.
        self.d1 = tf.keras.layers.Dense(128, activation='relu')
        self.d2 = tf.keras.layers.Dense(128, activation='relu')
        # Linear heads: scalar state value and one advantage per action.
        self.v = tf.keras.layers.Dense(1, activation=None)
        self.a = tf.keras.layers.Dense(env.action_space.n, activation=None)

    def _shared_features(self, inputs):
        """Run ``inputs`` through the two trunk layers."""
        return self.d2(self.d1(inputs))

    def call(self, input_data):
        """Return Q-values for a batch of states.

        Q is the state value plus the advantage centred by its mean over
        axis 1 (the action axis of the advantage head's output).
        """
        features = self._shared_features(input_data)
        value = self.v(features)
        advantages = self.a(features)
        mean_advantage = tf.math.reduce_mean(advantages, axis=1, keepdims=True)
        return value + (advantages - mean_advantage)

    def advantage(self, state):
        """Return only the raw (un-centred) advantage-head output for ``state``."""
        return self.a(self._shared_features(state))


In [5]:
class exp_replay():
    """Fixed-capacity ring buffer of transitions for experience replay.

    Transitions are stored in pre-allocated NumPy arrays; once ``buffer_size``
    entries have been written, new entries overwrite the oldest slots.

    Note: ``done_mem`` holds a *not-done* mask (True while the episode
    continues, False for a terminal transition), so it can be multiplied
    directly into the bootstrapped part of a TD target.
    """

    def __init__(self, buffer_size=1000000, state_shape=None):
        """Allocate storage for ``buffer_size`` transitions.

        Args:
            buffer_size: maximum number of transitions kept.
            state_shape: shape of a single observation. When omitted, the
                shape is taken from the module-level ``env.observation_space``.
        """
        if state_shape is None:
            state_shape = env.observation_space.shape
        state_shape = tuple(state_shape)

        self.buffer_size = buffer_size
        self.state_mem = np.zeros((self.buffer_size, *state_shape), dtype=np.float32)
        self.action_mem = np.zeros((self.buffer_size), dtype=np.int32)
        self.reward_mem = np.zeros((self.buffer_size), dtype=np.float32)
        self.next_state_mem = np.zeros((self.buffer_size, *state_shape), dtype=np.float32)
        # `np.bool` was a deprecated alias that newer NumPy releases removed;
        # `np.bool_` is the actual boolean scalar type.
        self.done_mem = np.zeros((self.buffer_size), dtype=np.bool_)
        # Total number of transitions ever written (not wrapped).
        self.pointer = 0

    def add_exp(self, state, action, reward, next_state, done):
        """Write one transition, overwriting the oldest slot once the buffer is full."""
        idx = self.pointer % self.buffer_size
        self.state_mem[idx] = state
        self.action_mem[idx] = action
        self.reward_mem[idx] = reward
        self.next_state_mem[idx] = next_state
        # Stored inverted: True means "episode continues after this step".
        self.done_mem[idx] = not bool(done)
        self.pointer += 1

    def sample_exp(self, batch_size=64):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, not_done_mask)``.

        Raises:
            ValueError: if fewer than ``batch_size`` transitions are stored.
        """
        max_mem = min(self.pointer, self.buffer_size)
        if batch_size > max_mem:
            raise ValueError(
                "Cannot sample {} transitions from a buffer holding {}".format(batch_size, max_mem)
            )
        batch = np.random.choice(max_mem, batch_size, replace=False)
        states = self.state_mem[batch]
        actions = self.action_mem[batch]
        rewards = self.reward_mem[batch]
        next_states = self.next_state_mem[batch]
        dones = self.done_mem[batch]
        return states, actions, rewards, next_states, dones


In [6]:
class agent():
    """Epsilon-greedy double-DQN agent built on the dueling ``DDDQN`` network.

    The online network (``q_net``) picks the best next action and a
    periodically synced copy (``target_net``) evaluates it. Transitions are
    stored in an ``exp_replay`` buffer and sampled in mini-batches.
    """

    def __init__(self, gamma=0.99, replace=100, lr=0.001):
        """Create the networks, optimizer and replay memory.

        Args:
            gamma: discount factor applied to the bootstrapped next-state value.
            replace: number of training steps between target-network syncs.
            lr: learning rate of the Adam optimizer used for ``q_net``.
        """
        self.gamma = gamma
        self.epsilon = 1.0
        self.min_epsilon = 0.01
        self.epsilon_decay = 1e-3
        self.replace = replace
        self.trainstep = 0
        self.memory = exp_replay()
        self.batch_size = 64
        self.q_net = DDDQN()
        self.target_net = DDDQN()
        opt = tf.keras.optimizers.Adam(learning_rate=lr)
        # Only the online network is trained; the target network is used for
        # inference only and therefore needs no optimizer or loss.
        self.q_net.compile(loss='mse', optimizer=opt)

        # Subclassed Keras models create their variables on first call. Run a
        # dummy forward pass so both networks own weights, then sync them;
        # without this the first update_target() copies an empty weight list
        # and the target network keeps its own random initialisation.
        dummy_state = np.zeros((1, *env.observation_space.shape), dtype=np.float32)
        self.q_net(dummy_state)
        self.target_net(dummy_state)
        self.update_target()

    def act(self, state):
        """Return an action for ``state``: random with probability ``epsilon``, else greedy."""
        if np.random.rand() <= self.epsilon:
            return np.random.choice(env.action_space.n)

        # Q differs from the advantage only by a per-state offset, so the
        # argmax over the advantage head is the greedy action.
        advantages = self.q_net.advantage(np.array([state]))
        return np.argmax(advantages)

    def update_mem(self, state, action, reward, next_state, done):
        """Store one transition in the replay memory."""
        self.memory.add_exp(state, action, reward, next_state, done)

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target_net.set_weights(self.q_net.get_weights())

    def update_epsilon(self):
        """Decay ``epsilon`` linearly by ``epsilon_decay``, never below ``min_epsilon``.

        Returns:
            The updated epsilon.
        """
        self.epsilon = max(self.min_epsilon, self.epsilon - self.epsilon_decay)
        return self.epsilon

    def train(self):
        """Run one double-DQN update on a sampled mini-batch.

        Does nothing until the replay memory holds at least ``batch_size``
        transitions. Every ``replace`` training steps the target network is
        synced first. Epsilon is decayed after each update.
        """
        if self.memory.pointer < self.batch_size:
            return

        if self.trainstep % self.replace == 0:
            self.update_target()

        # The last element is a not-done mask: False for terminal transitions.
        states, actions, rewards, next_states, not_dones = self.memory.sample_exp(self.batch_size)

        # Call the models directly instead of predict(): for a single small
        # batch this avoids predict()'s per-call overhead.
        q_target = self.q_net(states).numpy().copy()
        next_q_online = self.q_net(next_states).numpy()
        next_q_target = self.target_net(next_states).numpy()

        # Double DQN: the online net selects the next action, the target net scores it.
        best_next_action = np.argmax(next_q_online, axis=1)
        rows = np.arange(self.batch_size, dtype=np.int32)
        # Only the taken action's Q-value is moved; other entries keep the
        # network's own prediction so they contribute zero loss.
        q_target[rows, actions] = (
            rewards + self.gamma * next_q_target[rows, best_next_action] * not_dones
        )

        self.q_net.train_on_batch(states, q_target)
        self.update_epsilon()
        self.trainstep += 1

    def save_model(self):
        """Save both networks' weights to HDF5 files in the working directory.

        Weights are saved rather than whole models because Keras cannot
        serialise a subclassed model to HDF5 with ``Model.save``. The
        ``.weights.h5`` suffix is accepted by both older and newer Keras.
        """
        self.q_net.save_weights("model.weights.h5")
        self.target_net.save_weights("target_model.weights.h5")

    def load_model(self):
        """Restore both networks' weights from the files written by ``save_model``."""
        self.q_net.load_weights("model.weights.h5")
        self.target_net.load_weights("target_model.weights.h5")





In [None]:
# Train a fresh agent for a fixed number of episodes. Each environment step is
# stored in replay memory and followed by one training update; the episode's
# return and the current epsilon are logged when the episode terminates.
agentoo7 = agent()
steps = 400
for s in range(steps):
  state = env.reset()
  total_reward = 0
  while True:
    action = agentoo7.act(state)
    next_state, reward, done, _ = env.step(action)
    agentoo7.update_mem(state, action, reward, next_state, done)
    agentoo7.train()
    state = next_state
    total_reward += reward

    if done:
      print("total reward after {} episode is {} and epsilon is {}".format(s, total_reward, agentoo7.epsilon))
      break

total reward after 0 episode is -111.92346954157561 and epsilon is 0.991
total reward after 1 episode is -271.61288142788675 and epsilon is 0.8969999999999999
total reward after 2 episode is -85.82763318050152 and epsilon is 0.8329999999999999
total reward after 3 episode is -161.77385305579173 and epsilon is 0.7649999999999998
total reward after 4 episode is -26.19204501091575 and epsilon is 0.6829999999999997
total reward after 5 episode is -456.2594406780948 and epsilon is 0.5529999999999996
total reward after 6 episode is -285.42813478035725 and epsilon is 0.3059999999999994
total reward after 7 episode is -270.03981550295214 and epsilon is 0.14299999999999924
total reward after 8 episode is -120.946282864365 and epsilon is 0.01
total reward after 9 episode is -118.83585612843262 and epsilon is 0.01
total reward after 10 episode is -45.4930394307914 and epsilon is 0.01
total reward after 11 episode is -176.72384024412753 and epsilon is 0.01
total reward after 12 episode is -31.0450