<a href="https://colab.research.google.com/github/abhisheksuran/Atari_DQN/blob/master/DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
import gym
from tensorflow.keras.models import load_model
!pip3 install box2d-py

Collecting box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/06/bd/6cdc3fd994b0649dcf5d9bad85bd9e26172308bbe9a421bfc6fdbf5081a6/box2d_py-2.3.8-cp36-cp36m-manylinux1_x86_64.whl (448kB)
[K     |▊                               | 10kB 19.4MB/s eta 0:00:01[K     |█▌                              | 20kB 4.6MB/s eta 0:00:01[K     |██▏                             | 30kB 6.0MB/s eta 0:00:01[K     |███                             | 40kB 6.1MB/s eta 0:00:01[K     |███▋                            | 51kB 5.2MB/s eta 0:00:01[K     |████▍                           | 61kB 5.7MB/s eta 0:00:01[K     |█████▏                          | 71kB 6.1MB/s eta 0:00:01[K     |█████▉                          | 81kB 6.6MB/s eta 0:00:01[K     |██████▋                         | 92kB 7.0MB/s eta 0:00:01[K     |███████▎                        | 102kB 7.1MB/s eta 0:00:01[K     |████████                        | 112kB 7.1MB/s eta 0:00:01[K     |████████▊                       | 12

In [2]:
# Shared Gym environment ("LunarLander-v2"): the agent reads its observation
# shape from it and the training loop resets/steps it.
env = gym.make("LunarLander-v2")

In [3]:
class model(tf.keras.Model):
  """Fully connected Q-network.

  Two 256-unit ReLU hidden layers followed by a linear output layer that
  emits one Q-value per action.

  Args:
    n_actions: number of output units (one per action). Defaults to 4, the
      value that used to be hard-coded, so `model()` behaves as before.
  """

  def __init__(self, n_actions=4):
    super().__init__()
    self.d1 = tf.keras.layers.Dense(256, activation='relu')
    self.d2 = tf.keras.layers.Dense(256, activation='relu')
    # Linear activation: Q-values are unbounded regression outputs.
    self.out = tf.keras.layers.Dense(n_actions, activation=None)

  def call(self, input_data):
    """Forward pass: map a batch of states to a batch of Q-value rows."""
    x = self.d1(input_data)
    x = self.d2(x)
    x = self.out(x)
    return x

  def action_value(self, state):
    """Return the Q-values for a batch of states as a NumPy array.

    Args:
      state: array-like batch of states (one row per state).

    Returns:
      NumPy array with one row of Q-values per input state.
    """
    # Call the model directly instead of predict(): predict() sets up a
    # data pipeline on every call, which is costly for the single-state and
    # small-batch inputs this method is used with.
    q_values = self(tf.convert_to_tensor(state, dtype=tf.float32))
    return q_values.numpy()

In [4]:
class experience():
  """Fixed-size circular replay buffer of transitions.

  Each slot stores one (state, action, reward, next_state, done) tuple in
  pre-allocated NumPy arrays. Once `buffer_size` transitions have been
  added, new ones overwrite the oldest slots.

  Args:
    buffer_size: number of slots in the buffer.
    state_dim: iterable with the shape of a single state (unpacked after
      the buffer axis when allocating the state arrays).
  """

  def __init__(self, buffer_size, state_dim):
    self.buffer_size = buffer_size
    # Total number of transitions ever added; wrapped with % on write.
    self.pointer = 0
    self.state_mem = np.zeros((self.buffer_size, *state_dim), dtype=np.float32)
    self.action_mem = np.zeros(self.buffer_size, dtype=np.int32)
    self.next_state_mem = np.zeros((self.buffer_size, *state_dim), dtype=np.float32)
    # Rewards are stored as floats: an integer buffer would silently
    # truncate fractional rewards on assignment.
    self.reward_mem = np.zeros(self.buffer_size, dtype=np.float32)
    # np.bool_ rather than the np.bool alias, which was removed from NumPy.
    self.done_mem = np.zeros(self.buffer_size, dtype=np.bool_)

  def add_exp(self, state, action, reward, next_state, done):
    """Store one transition, overwriting the oldest slot when full."""
    idx = self.pointer % self.buffer_size
    self.state_mem[idx] = state
    self.action_mem[idx] = action
    self.reward_mem[idx] = reward
    self.next_state_mem[idx] = next_state
    self.done_mem[idx] = done
    self.pointer += 1

  def sample_exp(self, batch_size):
    """Draw `batch_size` distinct stored transitions uniformly at random.

    Args:
      batch_size: number of transitions to sample.

    Returns:
      Tuple (state, action, reward, next_state, done) of arrays whose first
      axis has length `batch_size`.

    Raises:
      ValueError: raised by np.random.choice when fewer than `batch_size`
        transitions are stored (sampling is without replacement).
    """
    # Only the filled part of the buffer is eligible for sampling.
    max_mem = min(self.pointer, self.buffer_size)
    batch = np.random.choice(max_mem, batch_size, replace=False)
    state = self.state_mem[batch]
    action = self.action_mem[batch]
    reward = self.reward_mem[batch]
    next_state = self.next_state_mem[batch]
    done = self.done_mem[batch]
    return state, action, reward, next_state, done

In [5]:
class agent():
  """DQN agent with an online Q-network, a target network and replay memory.

  Relies on the module-level `env` for the observation shape and on the
  `model` and `experience` classes defined in this notebook.

  Attributes set in __init__:
    q_net / target_net: online and target Q-networks.
    epsilon, epsilon_decay, min_epsilon: epsilon-greedy exploration schedule
      (epsilon is reduced by `epsilon_decay` after every training step).
    memory: replay buffer of transitions.
    batch_size: number of transitions sampled per training step.
    gamma: discount factor used in the TD target.
    replace: the target network is synced every `replace` training steps.
    trainstep: number of training steps performed so far.
    action_space: list of the action indices available to `act`.
  """

  def __init__(self):
    self.q_net = model()
    self.target_net = model()
    opt = tf.keras.optimizers.Adam(learning_rate=0.001)
    self.q_net.compile(optimizer=opt, loss='mse')
    self.target_net.compile(optimizer=opt, loss='mse')
    self.epsilon = 1.0
    self.epsilon_decay = 1e-3
    self.min_epsilon = 0.01
    self.memory = experience(buffer_size=1000000, state_dim=env.observation_space.shape)
    self.batch_size = 64
    self.gamma = 0.99
    self.replace = 100
    self.trainstep = 0
    self.action_space = [i for i in range(4)]
    # Build both networks with a dummy forward pass and sync them right away.
    # Until a Keras model has been called it has no weights, so the first
    # update_target() in train() would otherwise copy an empty weight list
    # and leave the target network with its own random initialisation for
    # the first `replace` training steps.
    dummy_state = tf.zeros((1, *env.observation_space.shape))
    self.q_net(dummy_state)
    self.target_net(dummy_state)
    self.update_target()

  def act(self, state):
    """Choose an action epsilon-greedily for a single state.

    With probability `epsilon` a random entry of `action_space` is returned;
    otherwise the argmax of the online network's Q-values for `state`.
    """
    if np.random.rand() <= self.epsilon:
      action = np.random.choice(self.action_space)
    else:
      # Add a leading batch axis: the network works on batches of states.
      state = np.array([state])
      action = self.q_net.action_value(state)
      action = np.argmax(action)

    return action

  def train(self):
    """Run one DQN training step on a batch sampled from replay memory.

    Does nothing until the memory holds at least `batch_size` transitions.
    Otherwise: syncs the target network every `replace` steps, fits the
    online network towards the TD targets, decays epsilon and increments
    `trainstep`.
    """
    if self.memory.pointer < self.batch_size:
      return

    if self.trainstep % self.replace == 0:
      self.update_target()

    states, actions, rewards, next_states, dones = self.memory.sample_exp(self.batch_size)

    # Start from the network's own predictions so that only the Q-value of
    # the action actually taken contributes to the MSE loss.
    q_target = np.copy(self.q_net.action_value(states))
    next_state_val = np.asarray(self.target_net.action_value(next_states))
    q_next = np.max(next_state_val, axis=1)

    # Terminal transitions have no bootstrap term.
    td_target = np.where(dones, rewards, rewards + self.gamma * q_next)
    q_target[np.arange(len(actions)), actions] = td_target

    self.q_net.train_on_batch(states, q_target)
    self.update_epsilon()
    self.trainstep += 1

  def update_mem(self, state, action, reward, next_state, done):
    """Append one transition to the replay memory."""
    self.memory.add_exp(state, action, reward, next_state, done)

  def update_target(self):
    """Copy the online network's weights into the target network."""
    self.target_net.set_weights(self.q_net.get_weights())

  def update_epsilon(self):
    """Decrease epsilon by `epsilon_decay`, never going below `min_epsilon`.

    Returns:
      The updated epsilon.
    """
    # max() clamps in the same step; the previous conditional could
    # undershoot min_epsilon for one step before snapping back.
    self.epsilon = max(self.min_epsilon, self.epsilon - self.epsilon_decay)
    return self.epsilon

  def save_model(self):
    """Save the online and target networks to model.h5 / target_model.h5."""
    # NOTE(review): Keras may refuse to save a subclassed model in HDF5
    # format depending on the TensorFlow version - confirm, and consider
    # save_weights() if it does.
    self.q_net.save("model.h5")
    self.target_net.save("target_model.h5")

  def load_model(self):
    """Restore both networks from the files written by save_model."""
    # Inside this method the bare name `load_model` is the Keras function
    # imported at module level, not this method.
    self.q_net = load_model("model.h5")
    # The target network is restored from its own file (it was previously
    # loaded from model.h5, discarding the saved target weights).
    self.target_net = load_model("target_model.h5")
    # NOTE(review): objects returned by Keras load_model may not be `model`
    # instances and could lack action_value() - verify before relying on it.


In [None]:
# Train the agent. `steps` is the number of episodes to run (one
# env.reset() per iteration of the outer loop).
agentoo7 = agent()
steps = 500

for s in range(steps):
  state = env.reset()
  done = False
  total_reward = 0
  t = 0  # number of environment steps taken in this episode
  while not done:
    # Act, store the transition, then run one training step on replay memory.
    action = agentoo7.act(state)
    next_state, reward, done, _ = env.step(action)
    agentoo7.update_mem(state, action, reward, next_state, done)
    agentoo7.train()
    state = next_state
    total_reward += reward
    t += 1
  # The inner loop only exits once the episode is done, so report here.
  print("total reward after {} episode is {} and epsilon is {}".format(s, total_reward, agentoo7.epsilon))

total reward after 0 episode is -42.61811453267964 and epsilon is 0.951
total reward after 1 episode is -410.4795281853554 and epsilon is 0.8649999999999999
total reward after 2 episode is -445.9066218745269 and epsilon is 0.7139999999999997
total reward after 3 episode is -142.0928661719887 and epsilon is 0.6159999999999997
total reward after 4 episode is -267.5745308339568 and epsilon is 0.4539999999999995
total reward after 5 episode is -305.9556669046533 and epsilon is 0.26899999999999935
total reward after 6 episode is -175.2796966257096 and epsilon is 0.14999999999999925
total reward after 7 episode is -278.22697309162163 and epsilon is 0.0929999999999992
total reward after 8 episode is -196.788924444431 and epsilon is 0.01
total reward after 9 episode is -70.10930961587783 and epsilon is 0.01
total reward after 10 episode is -66.53212194089366 and epsilon is 0.01
total reward after 11 episode is -87.47889383235864 and epsilon is 0.01
total reward after 12 episode is -47.56540055