In [17]:
!pip install ale-py
!pip install gym[accept-rom-license,atari]



In [18]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from collections import deque
import random
from gym import envs
import matplotlib.pyplot as plt
from matplotlib import animation
from keras.layers import Activation,Dense,Conv2D,Flatten,Input
from keras.models import Sequential,load_model,clone_model
from keras.optimizers import Adam
from keras import backend as K
import cv2


In [19]:
def resize_state(state):
    """Turn a raw colour frame into a cropped 80x80 greyscale uint8 image.

    Steps, in order: keep rows 35..194 of the frame, average over the last
    (channel) axis, resize to 80x80 with nearest-neighbour sampling, and cast
    the result to ``np.uint8``.
    """
    cropped = state[35:195, :]
    greyscale = np.average(cropped, axis=2)
    shrunk = cv2.resize(greyscale, (80, 80), interpolation=cv2.INTER_NEAREST)
    return np.array(shrunk, dtype=np.uint8)

In [20]:
def plot_state(state):
  """Display ``state`` as an image with the axes hidden."""
  plt.imshow(state)
  plt.axis("off")
  plt.show()

def update_scene(num,frames,patch):
  """Animation callback: put frame number ``num`` on ``patch``.

  Returns the same ``patch`` object after its data has been replaced.
  """
  current_frame = frames[num]
  patch.set_data(current_frame)
  return patch

def plot_animation(frames, filename='show.gif', interval=40, writer='imagemagick'):
  """Render ``frames`` as an animation, save it to disk and return it.

  Args:
    frames: indexable sequence of images; ``frames[0]`` seeds the plot and
      ``update_scene`` swaps in the remaining frames one by one.
    filename: path the animation is saved to (default ``'show.gif'``).
    interval: delay between frames in milliseconds (default 40).
    writer: matplotlib movie-writer name passed to ``save``
      (default ``'imagemagick'``).

  Returns:
    The ``animation.FuncAnimation`` object that was saved.
  """
  fig = plt.figure()
  patch = plt.imshow(frames[0])
  plt.axis('off')
  anime = animation.FuncAnimation(
      fig, update_scene, fargs=(frames, patch), frames=len(frames),
      repeat=False, interval=interval
  )
  # Close exactly this figure (not "whatever is current") so the static first
  # frame is not also shown inline next to the animation.
  plt.close(fig)
  anime.save(filename, writer=writer)
  return anime

In [21]:
class Memory():
    """Bounded replay storage kept as four parallel deques.

    Entry ``k`` of ``states``, ``actions``, ``rewards`` and ``done`` belongs
    to the same call of ``remember``. Each deque is capped at ``max_len``, so
    the oldest entries fall off once the cap is reached.
    """

    def __init__(self,max_len):
        self.max_len = max_len
        # One independent bounded deque per recorded field.
        self.states, self.actions, self.rewards, self.done = (
            deque(maxlen=max_len) for _ in range(4)
        )

    def remember(self, next_state, next_reward, next_action, next_done):
        """Append one (state, reward, action, done) record to the buffers."""
        records = (
            (self.states, next_state),
            (self.actions, next_action),
            (self.rewards, next_reward),
            (self.done, next_done),
        )
        for buffer, value in records:
            buffer.append(value)

In [22]:
class DDQNAgent():
    """Double-DQN agent that learns from stacks of four 80x80 frames.

    The agent owns an online Q-network (``model``), a target Q-network
    (``target_model``) and a replay ``Memory``. Network output ``k``
    corresponds to environment action ``action_space[k]``.
    """

    def __init__(self,env):
        """Create the networks, the replay memory and the hyper-parameters.

        Args:
            env: the environment the agent acts in (only stored here).
        """
        self.env = env
        self.state_shape = (80,80,4)
        # Environment action ids the agent may emit; network outputs map to
        # these by position.
        self.action_space=[0,2,3]
        self.memory = Memory(max_len=100000)
        self.total_timesteps = 0
        self.gamma = 0.95            # discount factor
        self.epsilon = 1.0           # current exploration probability
        self.epsilon_decay = 0.9995  # multiplicative decay per train() call
        self.epsilon_min = 0.05      # decay stops once epsilon is not above this
        self.learning_rate = 0.0001
        self.model = self.build_model()
        self.target_model = clone_model(self.model)
        # clone_model() creates new layers with freshly initialised weights,
        # so the clone must be synchronised explicitly or the first targets
        # come from an unrelated random network.
        self.update_target_model()
        self.replay = []
        self.learns =0

    def build_model(self):
        """Build and compile the convolutional Q-network.

        Returns:
            A compiled ``Sequential`` model that takes ``self.state_shape``
            inputs and outputs one Q-value per entry of ``self.action_space``.
        """
        def he_init():
            # A fresh initializer per layer; instances are not shared.
            return tf.keras.initializers.VarianceScaling(scale=2)

        model = Sequential()
        model.add(Input(self.state_shape))
        model.add(Conv2D(filters=32, kernel_size=(8,8), strides=4, data_format="channels_last",
                         activation='relu', kernel_initializer=he_init()))
        model.add(Conv2D(filters=64, kernel_size=(4,4), strides=2, data_format="channels_last",
                         activation='relu', kernel_initializer=he_init()))
        model.add(Conv2D(filters=64, kernel_size=(3,3), strides=1, data_format="channels_last",
                         activation='relu', kernel_initializer=he_init()))
        model.add(Flatten())
        model.add(Dense(512, activation='relu', kernel_initializer=he_init()))
        model.add(Dense(len(self.action_space), activation='linear'))
        optimizer = Adam(self.learning_rate)
        model.compile(optimizer, loss=tf.keras.losses.Huber())
        model.summary()
        return model

    def remember(self, next_state, next_reward, next_action, next_done):
        """Store one record in the replay memory (thin wrapper)."""
        self.memory.remember(next_state, next_reward, next_action, next_done)

    @staticmethod
    def _q_values(model, batch):
        """Run ``model`` on ``batch`` and return the Q-values as a writable array.

        The model is called directly instead of through ``Model.predict``:
        for inputs that fit in one batch this is the faster path and it does
        not print a progress bar on every call.
        """
        inputs = np.asarray(batch, dtype=np.float32)
        # np.array() copies, so callers may overwrite entries of the result.
        return np.array(model(inputs, training=False))

    def choose_action(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: batch holding a single stacked state, as expected by the
                online network.

        Returns:
            An element of ``self.action_space``: a uniformly random one with
            probability ``self.epsilon``, otherwise the one with the highest
            predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            # Sample from the configured action set rather than a second
            # hard-coded copy of it.
            return random.choice(self.action_space)
        q_values = self._q_values(self.model, state)
        return self.action_space[int(np.argmax(q_values))]

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def train(self, batch_size=64):
        """Run one Double-DQN update on a random minibatch from memory.

        Memory entry ``t`` holds the frame and reward produced by the action
        stored at ``t-1``, plus the action chosen after seeing frame ``t``.
        A sampled ``index`` therefore gives: state = frames ``index-3..index``,
        action = ``actions[index]``, next state = frames ``index-2..index+1``,
        reward/done = ``rewards[index+1]`` / ``done[index+1]``.

        Side effects: fits the online model for one epoch, decays
        ``self.epsilon``, increments ``self.learns`` and synchronises the
        target network every 10000 updates. Returns ``None``; does nothing
        when the memory is too small to hold a single transition.
        """
        memory = self.memory
        # Sampling needs index >= 4 and index + 1 < len(memory).
        if len(memory.states) < 6:
            return

        states = []
        next_states = []
        actions_taken = []
        next_rewards = []
        next_done = []

        while len(states) < batch_size:
            index = np.random.randint(4, len(memory.states) - 1)

            # Reject stacks containing a terminal frame: they would mix
            # frames from two different episodes.
            if any(memory.done[i] for i in range(index - 3, index + 1)):
                continue

            window = [memory.states[i] for i in range(index - 3, index + 2)]
            # Stack frames on the last axis and scale pixels to [0, 1].
            states.append(np.moveaxis(np.array(window[:4]), 0, 2) / 255.0)
            next_states.append(np.moveaxis(np.array(window[1:]), 0, 2) / 255.0)
            actions_taken.append(memory.actions[index])
            next_rewards.append(memory.rewards[index + 1])
            next_done.append(memory.done[index + 1])

        states = np.array(states)
        next_states = np.array(next_states)

        # Start from the current predictions so only the taken action's
        # output contributes to the loss.
        labels = self._q_values(self.model, states)
        # Double DQN: the online network selects the next action and the
        # target network evaluates it.
        online_next = self._q_values(self.model, next_states)
        target_next = self._q_values(self.target_model, next_states)
        best_next = np.argmax(online_next, axis=1)

        for i in range(batch_size):
            action = self.action_space.index(actions_taken[i])
            bootstrap = target_next[i][best_next[i]]
            labels[i][action] = next_rewards[i] + (1 - next_done[i]) * self.gamma * bootstrap

        self.model.fit(states, labels, batch_size=batch_size, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        self.learns += 1

        if self.learns % 10000 == 0:
            self.update_target_model()



In [23]:
import gc
last_100_avg = [-21]
scores = deque(maxlen=100)
max_score = -21
env = gym.make("PongDeterministic-v4",render_mode="rgb_array")
env.reset()
agent = DDQNAgent(env)

for i in range(1000):
    gc.collect()

    # Start a new episode; the frame returned by one no-op step (action 0)
    # is used as the starting frame.
    env.reset()
    starting_frame = resize_state(env.step(0)[0])

    # Pad the memory with three copies of the starting frame so that the
    # first real frame already completes a four-frame stack.
    dummy_action = 0
    dummy_reward = 0
    dummy_done = False
    for _ in range(3):
        agent.memory.remember(starting_frame, dummy_reward, dummy_action, dummy_done)

    done = False
    score = 0
    while not done:

        agent.total_timesteps += 1
        if agent.total_timesteps % 10000 == 0:
            # Checkpoint, then clear the Keras session and reload the weights.
            agent.model.save_weights('recent_weights.h5')
            keras.backend.clear_session()
            agent.model.load_weights('recent_weights.h5')
            agent.target_model.set_weights(agent.model.get_weights())

            print('\nWeights saved!')

        # Execute the action that was chosen on the previous step.
        next_state, next_reward, terminated, truncated, info = env.step(agent.memory.actions[-1])
        # An episode is over when it terminates OR is cut off by the
        # environment; ignoring `truncated` would keep stepping a finished
        # environment. Truncation is stored as `done` too, so sampled frame
        # stacks never straddle two episodes.
        next_done = terminated or truncated

        # Build the network input: the three most recent stored frames plus
        # the new one, stacked on the last axis and scaled to [0, 1].
        next_state = resize_state(next_state)
        new_state = [agent.memory.states[-3], agent.memory.states[-2], agent.memory.states[-1], next_state]
        new_state = np.moveaxis(new_state, 0, 2) / 255
        new_state = np.expand_dims(new_state, 0)

        next_action = agent.choose_action(new_state)

        agent.memory.remember(next_state, next_reward, next_action, next_done)
        score += next_reward
        done = next_done

        # Learn only while the episode is running and once the memory holds
        # more than 10000 records.
        if not done and len(agent.memory.states) > 10000:
            agent.train()

    scores.append(score)
    if score > max_score:
        max_score = score
    # NOTE(review): this tests the best score so far, so once it triggers it
    # prints for every later episode as well - confirm that is intended.
    if max_score > 20:
      print("Beat the computer at Episode"+str(i))
      print('Epsilon: ' + str(agent.epsilon))
    print('\nEpisode: ' + str(i))
    print('Score: ' + str(score))
    print('Max Score: ' + str(max_score))
    print('Epsilon: ' + str(agent.epsilon))



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 19, 19, 32)        8224      
                                                                 
 conv2d_1 (Conv2D)           (None, 8, 8, 64)          32832     
                                                                 
 conv2d_2 (Conv2D)           (None, 6, 6, 64)          36928     
                                                                 
 flatten (Flatten)           (None, 2304)              0         
                                                                 
 dense (Dense)               (None, 512)               1180160   
                                                                 
 dense_1 (Dense)             (None, 3)                 1539      
                                                                 
Total params: 1259683 (4.81 MB)
Trainable params: 125968




Episode: 10
Score: -20.0
Max Score: -20.0
Epsilon: 0.9016526843969468

































Episode: 11
Score: -21.0
Max Score: -20.0
Epsilon: 0.5295864140177553





























Episode: 12
Score: -21.0
Max Score: -20.0
Epsilon: 0.3403538508311784





























Episode: 13
Score: -21.0
Max Score: -20.0
Epsilon: 0.22540139257172281





































Episode: 14
Score: -20.0
Max Score: -20.0
Epsilon: 0.13392133983322763







































Episode: 15
Score: -21.0
Max Score: -20.0
Epsilon: 0.0811360454849559































Episode: 16
Score: -21.0
Max Score: -20.0
Epsilon: 0.05373283597470724































Episode: 17
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185



































Episode: 18
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185



































Episode: 19
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185













































Episode: 20
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185













Weights saved!























Episode: 21
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185



































Episode: 22
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185





































Episode: 23
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185

































Episode: 24
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185









































Episode: 25
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185

































Episode: 26
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185





























Episode: 27
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185





































Episode: 28
Score: -20.0
Max Score: -20.0
Epsilon: 0.04999916370086185

































Episode: 29
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185



































Episode: 30
Score: -20.0
Max Score: -20.0
Epsilon: 0.04999916370086185

































Episode: 31
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185















Weights saved!















Episode: 32
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185





































Episode: 33
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185































Episode: 34
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185































Episode: 35
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185

































Episode: 36
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185



































Episode: 37
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185



































Episode: 38
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185





































Episode: 39
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185































Episode: 40
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185





































Episode: 41
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185

































Episode: 42
Score: -21.0
Max Score: -20.0
Epsilon: 0.04999916370086185



























Weights saved!















Episode: 43
Score: -19.0
Max Score: -19.0
Epsilon: 0.04999916370086185



































Episode: 44
Score: -21.0
Max Score: -19.0
Epsilon: 0.04999916370086185





































Episode: 45
Score: -21.0
Max Score: -19.0
Epsilon: 0.04999916370086185







































Episode: 46
Score: -21.0
Max Score: -19.0
Epsilon: 0.04999916370086185






























KeyboardInterrupt: 

## Test the model


In [25]:
# Fresh environment and agent for evaluation; constructing DDQNAgent builds
# new networks (and prints the model summary).
env = gym.make("PongDeterministic-v4",render_mode="rgb_array")
env.reset()
agent = DDQNAgent(env)
# Restore the online network from the checkpoint file that the training cell
# writes every 10000 timesteps.
agent.model.load_weights('recent_weights.h5')
# choose_action() acts randomly with probability epsilon, so this keeps a
# small amount of exploration during the evaluation rollout.
agent.epsilon = 0.049675

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_3 (Conv2D)           (None, 19, 19, 32)        8224      
                                                                 
 conv2d_4 (Conv2D)           (None, 8, 8, 64)          32832     
                                                                 
 conv2d_5 (Conv2D)           (None, 6, 6, 64)          36928     
                                                                 
 flatten_1 (Flatten)         (None, 2304)              0         
                                                                 
 dense_2 (Dense)             (None, 512)               1180160   
                                                                 
 dense_3 (Dense)             (None, 3)                 1539      
                                                                 
Total params: 1259683 (4.81 MB)
Trainable params: 1259

In [27]:
# Play one evaluation episode, collecting the raw frames for an animation.
env.reset()
starting_frame = resize_state(env.step(0)[0])
# Accumulated reward of this episode; starts at 0 and is updated every step
# (previously it started at -21 and only ever received the final reward).
score = 0
dummy_action = 0
dummy_reward = 0
dummy_done = False
final_frame = []
# Three copies of the starting frame so the first real frame completes a
# four-frame stack.
for _ in range(3):
  agent.memory.remember(starting_frame, dummy_reward, dummy_action, dummy_done)
done = False
while not done:
  next_state, next_reward, terminated, truncated, info = env.step(agent.memory.actions[-1])
  # Stop on termination or on truncation by the environment.
  next_done = terminated or truncated
  final_frame.append(next_state)
  next_state = resize_state(next_state)
  new_state = [agent.memory.states[-3], agent.memory.states[-2], agent.memory.states[-1], next_state]
  new_state = np.moveaxis(new_state, 0, 2) / 255
  new_state = np.expand_dims(new_state, 0)

  next_action = agent.choose_action(new_state)
  # Store every transition exactly once (the terminal one used to be stored twice).
  agent.memory.remember(next_state, next_reward, next_action, next_done)
  score += next_reward
  done = next_done
plot_animation(final_frame)















<matplotlib.animation.FuncAnimation at 0x2651744d050>