<a href="https://colab.research.google.com/github/abhisheksuran/Atari_DQN/blob/master/atari_DDDQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# importing some lib...:p

import tensorflow as tf 
from tensorflow.keras.layers import  Dense, Dropout, Conv2D, MaxPooling2D, Activation, Flatten, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from collections import deque
import numpy as np
import gym
import random
import os
from tensorflow.keras.losses import Huber
import tqdm
from skimage import transform 
from skimage.color import rgb2gray

In [2]:
! pip install gym[atari]



In [3]:
# loading environment
env = gym.make('Breakout-v0')

In [4]:
# getting idea about environment
print("The size of our frame is: ", env.observation_space)
print("The action size is : ", env.action_space.n)

The size of our frame is:  Box(210, 160, 3)
The action size is :  4


In [22]:
class model(tf.keras.Model):
    def __init__(self):
      super().__init__()
      self.c1 = Conv2D(filters = 32,kernel_size = [8,8],strides = [4,4],padding = "VALID", activation='relu')
      self.c2 = Conv2D(filters = 64,kernel_size = [4,4],strides = [2,2],padding = "VALID", activation = 'relu')
      self.c3 = Conv2D(filters = 64,kernel_size = [3,3],strides = [2,2],padding = "VALID", activation = 'relu')
      self.d1 = Dense(512, activation = 'relu')
      self.f = Flatten()
      self.d2 = tf.keras.layers.Dense(128, activation='relu')
      self.d3 = tf.keras.layers.Dense(128, activation='relu')  
      self.v = tf.keras.layers.Dense(1, activation=None)
      self.a = tf.keras.layers.Dense(env.action_space.n, activation=None)

    def call(self, input_data):
      x = self.c1(input_data)
      x = self.c2(x)
      x = self.c3(x)
      x = self.d1(x)
      x = self.f(x)
      v = self.d2(x)
      v = self.v(v)
      a = self.d3(x)
      a = self.a(a)
      Q = v +(a -tf.math.reduce_mean(a, axis=1, keepdims=True))
      return Q

    def advantage(self, state):
      x = self.c1(state)
      x = self.c2(x)
      x = self.c3(x)
      x = self.d1(x)
      x = self.f(x)
      a = self.d3(x)
      a = self.a(a)
      return a


In [9]:

# processing images from environment

def image_process(frame):
    
    gray = rgb2gray(frame)
    cropped_frame = gray[8:-12,4:-12]
    normalized_frame = cropped_frame/255.0
    preprocessed_frame = transform.resize(normalized_frame, [110,80])
    #preprocessed_frame =  np.array(preprocessed_frame).reshape(-1, 105, 80, 1)    
    return preprocessed_frame 


# 4 images represents a single state


stack_size = 4 


stacked_frames  =  deque([np.zeros((110,80), dtype=np.int) for i in range(stack_size)], maxlen=4)


def stack_state(stacked_frames, state, is_new_episode):
    
    frame = image_process(state)
    
    #print(frame.shape)

    if is_new_episode:

        stacked_frames = deque([np.zeros((110,80), dtype=np.int) for i in range(stack_size)], maxlen=4)


        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)


        stacked_state = np.stack(stacked_frames, axis=0)

    else:

        stacked_frames.append(frame)


        stacked_state = np.stack(stacked_frames, axis=0) 
    stacked_state =  np.array(stacked_state).reshape(-1,110, 80, 4)   
    stac = stacked_state
    #print(stac.shape)
    return stac, stacked_frames

In [20]:
#our agent 007 :P

class my_agent:
    def __init__(self,env):
        self.env = env
        #self.state_input = Input((105,80,4), name='state_input')
        #self.action_input = action
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_decay = 1e-3 
        self.memory = deque(maxlen=100000)
        self.q_net = model()
        self.target_net = model()
        optimizer = Adam(learning_rate=0.01)
        huber = Huber()
        self.q_net.compile(optimizer, loss=huber)
        self.target_net.compile(optimizer, loss=huber)
        self.alighn_target_model()
        self.min_epsilon = 0.01
        self.trainstep = 0
        self.replace = 100
        self.mem_pointer = 0
  
        
    def store(self, state, action, reward, next_state, terminated):
        self.memory.append((state, action, reward, next_state, terminated))
        
    
    def _update_epsilon(self):
        self.epsilon = self.epsilon - self.epsilon_decay if self.epsilon > self.min_epsilon else self.min_epsilon
        #return self.epsilon
        
        
   
    def alighn_target_model(self):
        self.target_net.set_weights(self.q_net.get_weights())     
        
     
    def act(self, state):
        
        if np.random.rand() <= self.epsilon:
            action = np.random.choice([i for i in range(env.action_space.n)])
            
            return action

        state= state        
        q_values = self.q_net.advantage(state)
        a = np.argmax(q_values[0])
        #print(a)
        return a
        
   
    def retrain(self, batch_size):
        
        if self.trainstep % self.replace == 0:
            self.alighn_target_model()


        minibatch = random.sample(self.memory, batch_size)
        
        for state, action, reward, next_state, terminated in minibatch:
            
            state = state  
      
            next_state = next_state
           
            
            target = self.q_net.predict(state)
            max_action = np.argmax(self.q_net.predict(next_state)[0])
           
            if terminated:
                target[0][action] = reward
                
            else:
                t = self.target_net.predict(next_state)
                
                target[0][action] = reward + self.gamma * t[0][max_action]
              
            
            self.q_net.fit(state, target, epochs=1, verbose=0)
            self._update_epsilon()
 
     
    def load(self, path):
        self.q_net.load_weights(path)

    def save(self, path):
        self.q_net.save_weights(path)
        



In [None]:
state = env.reset()

agent = my_agent(env)

# some hyperparameter...

batch_size = 64                             
num_of_episodes = 1000
# our agent will play and learn


for e in tqdm.tqdm(range(0, num_of_episodes)):
    
    state = env.reset()
    state , stacked_frames = stack_state(stacked_frames, state, True)
    #print(state.shape)

    t_reward = 0
    terminated = False
    done = False
    #for timestep in range(timesteps_per_episode):
    while not done:
       # env.render()
        state = state 
        #print(state.shape)
        action = agent.act(state)
        #action = np.array([1])
        print(action)
        
        next_state, reward, terminated, info = env.step(action)
        next_state , stacked_frames = stack_state(stacked_frames, next_state , False)

        agent.store(state, action, reward, next_state, terminated)
        agent.mem_pointer += 1

        state  = next_state
        t_reward += reward


        if terminated:
            done = True
         
            print("Total reward is {} and  epsilon is {}".format(t_reward, agent.epsilon))
            break

        if len(agent.memory) > batch_size:
            agent.retrain(batch_size)



  0%|          | 0/1000 [00:00<?, ?it/s][A[A

1
3
1
1
1
0
3
1
2
3
1
1
1
3
3
1
0
1
0
0
2
2
1
3
2
1
3
1
2
3
3
0
2
2
1
1
2
1
2
0
2
0
2
1
0
1
3
2
2
2
2
0
0
1
0
1
2
1
1
1
0
0
2
3
1
2
0
2


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

2
0
0
2
3
2
2
3
2
3
2
3
3
1
1
2
3
2
1
1
1
0
3
2
1
0
1
0
1
1
1
0
2
2
1
3
0
1
3
1
3
1
3
0
3
1
1
0
2
1
1
3
1
3
0
3
2
1
1
0
0
1
2
0
0
0
1
0
1
1
0
0
0
3
1
0
3
1
3
1
1
1
0
2
2
0
3
1
0
0
1
2
1
3
3
1
0
0
2
1
1
2
1
0
0
1
1
2
0
2
1
2
3
1
3
1
0
1
2
3
0
1
3
0
3
1
3
3
1
2
1
1
1
3
2
3
1
2
1
3
0
0
0
1
2
1
2
0
2
1
3
0
0
1
3
1
0
1




  0%|          | 1/1000 [20:31<341:42:21, 1231.37s/it][A[A

3
Total reward is 1.0 and  epsilon is 0.01
3
2
2
1
0
1
0
3
0
0
3
1
3
0
0
0
3
3
2
2
0
3
2
1
1
3
0
2
1
0
1
1
0
1
1
3
1
0
2
1
0
1
2
1
1
1
1
0
2
3
2
1
3
2
0
0
0
1
3
0
1
1
1
3
2
3
2
3
1
0
3
3
1
3
1
3
3
2
2
2
0
3
0
2
3
3
2
3
3
2
1
3
1
1
3
3
2
1
2
2
0
1
2
1
1
2
0
1
1
2
2
0
2
0
2
1
2
3
1
1
2
2
1
1
1
3
1
2
2
2
1
1
3
2
1
0
1
3
2
2
0
0
0
3
1
0
2
2
2
3
1
0
0
0
0
1
1
2
0
1
2
1
1
1
0
1
2
1
0
0
0
1
1
0
2
1
3
0
3
2
0
1
3
3
0
2
1
0
3
3
1
1
0
3
0
3
0
1
1
2
0
0
2
3
1
0
3
0
1
3
0
3
2
0
1
1
1
1
0
1
2
0
0
1
0
3
3
1
1
1
3
2
0
3
0
3
3
2
1
1
1
1
3
0
2
1
1
1
2
3
0
1
1
0
2
1
2
3
3
2
1
1
1
2
3
3
2
0
3
3
1
3
0
3
0
1
2
1
0
0
1
3
1
3
0
0
0
1
0
0
3
2
1
1
0
1
3
1
1
0
1
0
0
3
0
1
0
1
1
1
3
2
0
2
3
2
1
1
0
3
0
0
0
0
2
3
1
3
0
2
0
2
2




  0%|          | 2/1000 [1:02:35<448:51:11, 1619.11s/it][A[A

1
Total reward is 3.0 and  epsilon is 0.01
1
0
1
1
2
1
1
3
1
3
3
3
1
0
1
1
1
1
3
2
0
0
0
2
3
2
0
1
2
2
1
1
1
0
2
2
3
0
2
1
2
2
1
3
2
3
2
0
2
2
1
2
2
2
1
0
0
0
1
3
3
1
1
1
0
2
2
1
1
1
0
2
1
3
0
2
3
2
0
0
1
3
1
1
0
2
0
1
1
1
0
0
0
0
0
3
1
3
0
3
0
3
0
2
2
2
0
3
2
2
1
1
0
1
3
2
0
0
2
0
0
0
2
3
0
3
2
2
1
3
1
0
1
0
0
0
0
1
3
0
2
1
1
0
3
0
0
2
3
1
3
0
3
0
1
3
0
1
1
2
3
0
1
0
0
3
0
0
0
1
1
1
1
3
1
1
0
3
2
3
2
1
2
3
0
3
1
1
2
0
3
1
0
1
1
0
1
0
0
1
0
0
1
3
0
2
0
2
3
2
1
1
0
3
0
1
1
0
2
0
0
2
0
0
3
2
0
2
0
0
0
0
3
0
0
0
2
0
1
0
3
3
1
3
0
3
1
1
1
3
0
0
0
0
0
3
1
2
2
0
1
0
1
3
0
0
2
1




  0%|          | 3/1000 [1:36:23<482:26:11, 1742.00s/it][A[A

1
Total reward is 2.0 and  epsilon is 0.01
1
0
2
0
2
3
0
0
2
3
3
2
1
1
3
1
0
2
0
1
3
2
0
3
2
1
3
0
2
0
3
2
2
2
1
3
3
1
0
0
3
2
3
1
0
3
0
3
1
1
2
2
2
2
3
0
1
2
0
0
1
0
2
0
1
0
0
2
2
1
2
3
1
3
1
0
0
3
2
1
1
1
3
1
0
2
0
3
0
0
0
1
0
0
2
1
2
0
2
1
2
1
0
2
3
2
1
1
2
2
0
2
0
3
0
3
2
1
0
0
0
2
2
0
0
3
3
3
0
1
3
2
1
0
2
1
2
0
1
0
1
0
0
0
0
1
2
3
2
1
0
1
2
0
3
1
3
1
2
3
0
0
0
3
3




  0%|          | 4/1000 [1:57:19<441:32:43, 1595.95s/it][A[A

2
Total reward is 0.0 and  epsilon is 0.01
2
3
0
3
2
1
2
3
2
1
0
0
1
1
1
1
1
1
3
1
3
3
1
0
0
2
1
0
3
0
0
0
2
2
3
0
3
0
3
0
3
2
2
2
3
0
0
2
1
2
0
1
0
3
2
1
0
1
2
3
0
3
2
1
2
0
0
2
3
1
2
3
2
0
1
2
1
0
0
3
0
0
1
1
2
1
0
1
3
1
1
0
0
2
1
2
1
3
0
1
1
3
2
2
0
0
3
1
3
2
0
2
0
0
2
3
1
0
1
1
0
3
2
1
2
1
2
3
1
2
1
2
1
0
3
0
3
0
0
1
0
0
1
0
3
1
2
3
2
1
1
0
3
0
0
3
2
2
3
1
0
0
1
3
0
0
0
2
0
1
3
1
0
3
0
1
0
0
3
0
1
3




  0%|          | 5/1000 [2:20:11<422:34:48, 1528.93s/it][A[A

0
Total reward is 0.0 and  epsilon is 0.01
0
0
3
1
2
3
2
1
1
3
0
0
0
1
0
2
1
1
2
0
3
0
2
3
0
3
1
1
1
3
1
3
2
1
1
1
0
1
0
0
0
1
1
1
3
1
0
1
1
0
3
0
2
2
1
2
0
3
0
1
0
0
1
1
3
1
3
0
2
3
1
2
1
3
1
1
0
3
2
1
2
0
3
3
0
1
3
0
0
2
3
0
1
3
3
3
2
1
3
3
2
2
2
1
1
2
2
0
2
3
1
3
1
1
3
2
0
1
0
3
2
3
0
1
2
3
3
1
1
3
2
0
3
3
2
2
2
1
2
1
3
0
3
2
3
1
1
0
2
3
3
0
3
1
1
0
3
0
1
3
0
3
2
2
0
1
2
0
2
3
1
1
3
2
3
0
2
3
0
1
1
1
3
1
3
2
3
3
2
1
1
2
3
3
2
0
1
3
1
0
0
1
2
1
1
0
0
0
0
3
0
1
2
0
3
0
3
0
2
0
2
1
1
0
3
2
3
3
3
3
0
1
1
0
2
2
0
3
0
0
3
1
1
2
1
1
2
2
2
2
2
3
3
3
1
0
3
2
3
2
0
1
0
2
0
0
3
0
2
3
3
3
2
3
2
0
2
0
3
1
3
2
2
2
3
3
3
0
0
2
1
0




  1%|          | 6/1000 [2:56:50<477:38:51, 1729.91s/it][A[A

1
Total reward is 2.0 and  epsilon is 0.01
1
3
2
3
1
2
1
1
3
3
0
3
3
2
3
0
0
1
3
0
3
2
3
2
3
3
0
1
2
2
2
3
0
1
0
3
3
1
3
1
1
3
2
2
2
0
0
2
0
0
3
1
3
0
0
3
0
1
2
3
2
3
3
1
0
1
0
3
3
2
1
0
0
3
3
3
3
1
0
3
2
3
1
3
3
2
0
3
1
1
3
2
3
0
2
3
0
3
2
1
3
1
2
2
0
2
0
3
0
0
3
2
1
0
2
1
3
1
1
3
1
0
3
2
2
0
1
1
3
1
0
1
3
1
1
0
1
2
1
3
0
1
1
2
3
0
2
1
2
0
1
3
2
1
2
1
0
3
2
2
2
3
2
2
1
1
0
2
0
3
3
2
3
1
3
3
0
0
1
3
0
1
1
2
2
2
3
3
3
3
0
3
2
3
3
2
1
3
1
3
0
3
3
0
1
0
3
2
2
0
0
3
0
3
0
0
1
3
1
3
3
1
2
3
3
3
2
0
2
3
0
0
0
2
2
2
0
1
3
3
3
2
2
1
0
1
0
1
0
0
2
0
3
0
3
0
3
0
2
0
2
0
2
1
2
1
1
2
3
3
0
2
2
2
0
2
3
0
0
3
0
0
2
1
0
2
1
2
1
1
2
0
0
2
2
1
3
0
3
0
2
3
3
0
1
1
1
0
3
3
3
0
0
2
3
0
1
1
3
2
2
3
1
2
3
3
1
3
0
2
1
1
3
3
0
0
1
3
0
0
0
2
0
1
2
1
0
0
2
0
2
3
3
0
1
3
0
1
2
2
3
0
1
3
1
1
2
1
3
0
3
2
0
0
3
2
1
3
3
3
3
2
2
0
2
2
2
0
2
0
0
2
2
3
0
0
0
1
2
1




  1%|          | 7/1000 [3:46:50<582:14:42, 2110.86s/it][A[A

1
Total reward is 4.0 and  epsilon is 0.01
1
0
3
2
2
1
2
1
3
3
3
1
0
2
1
3
3
0
1
0
3
2
1
0
1
2
2
3
1
0
0
2
3
2
0
1
3
1
2
1
3
2
3
1
1
3
2
2
0
2
3
3
1
0
0
0
1
0
1
2
2
3
1
0
3
0
1
1
1
1
2
2
0
3
3
0
1
3
2
3
2
0
0
2
1
1
3
1
1
0
3
0
3
3
2
1
2
3
0
0
0
1
3
0
3
3
3
1
0
2
2
2
1
3
3
2
0
2
2
0
1
2
2
0
3
1
0
0
0
2
2
1
0
1
0
2
1
3
1
3
2
0
3
3
0
0
3
1
3
1
1
0
2
0
3
3
0
0
2
1
3
2
1
1
1
1
1
3
3
0
1
2
0
0
3
3
1
3
1
0
3
0
0
1
2
0
1
0
3
3
0
1
1
0
3
3
3
1
0
2
3
1
0
1
1
1
2
2
0
3
2
3
3
1
3
0
1
2
3
1
3
3
1
1
3
1
3
3




  1%|          | 8/1000 [4:15:23<548:48:28, 1991.64s/it][A[A

0
Total reward is 1.0 and  epsilon is 0.01
0
3
0
3
0
2
3
1
1
3
3
3
1
1
3
2
1
2
0
2
1
0
3
1
2
1
2
0
1
1
1
2
1
0
1
2
1
2
1
2
1
0
0
3
0
3
3
1
1
0
2
1
3
0
2
3
1
2
3
2
2
0
2
3
2
1
2
3
1
1
0
1
2
0
3
1
2
0
0
3
3
0
1
1
3
3
3
0
0
3
0
1
2
3
0
1
3
0
1
3
1
2
3
0
3
2
0
0
1
2
0
3
0
0
1
3
2
3
3
2
0
3
2
1
1
2
3
1
1
2
1
2
0
2
1
2
0
2
2
2
1
0
1
2
2
0
3
2
0
2
1
2
1
0
2
3
2
0
3
1
3
0
1
3
1
0
3
2
3
1
0
3
1
3
0
1
0
2
0
3
2
2
3
2
0
2
1
3
3
2
3
0
0
0
3
3
1
2
0
3
1
0
0
2
2
1
1
3
