In [None]:
import numpy as np
import random
from IPython.display import clear_output
from collections import deque


import gym

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [None]:
# Build the Taxi environment, then keep the object exposed through `.env`
# (presumably the environment underneath gym's outer wrapper - confirm
# against the installed gym version).
# NOTE(review): the "Taxi-v2" id may not be registered in newer gym releases;
# verify against the pinned gym version.
wrapped_enviroment = gym.make("Taxi-v2")
enviroment = wrapped_enviroment.env

# Draw the initial board so the reader can see the starting layout.
enviroment.render()

In [None]:
class DDDQN(tf.keras.Model):
    """Dueling Q-network over discrete, integer-indexed states.

    A state index is embedded, passed through two 50-unit ReLU layers and then
    split into a scalar state-value head (``v``) and a per-action advantage
    head (``a``). ``call`` combines them as ``Q = v + (a - mean(a))``.

    Args:
        state_size: Number of distinct state indices (embedding vocabulary).
        embeding_size: Width of the embedding vector produced for one state.
        action_size: Number of discrete actions (width of the advantage head).
    """

    def __init__(self,state_size,embeding_size,action_size):
        super(DDDQN,self).__init__()
        # `input_length` is not passed: it is a deprecated Embedding argument
        # and is not needed for a subclassed model.
        self.embed = Embedding(state_size, embeding_size)
        # Flatten the embedding to (batch, embeding_size). The width comes
        # from the constructor argument rather than a hard-coded 10, so any
        # embedding size works.
        self.reshape = Reshape((embeding_size,))
        self.d1 = Dense(50,activation='relu')
        self.d2 = Dense(50,activation='relu')
        self.a = Dense(action_size,activation='linear')
        self.v = Dense(1,activation='linear')

    def _features(self, input_data):
        """Shared trunk: embedding -> reshape -> two dense ReLU layers."""
        x = self.embed(input_data)
        x = self.reshape(x)
        x = self.d1(x)
        return self.d2(x)

    def call(self,input_data):
        """Return Q-values of shape (batch, action_size) for state indices."""
        x = self._features(input_data)
        v = self.v(x)
        a = self.a(x)
        # Subtract the mean advantage so that v and a are identifiable.
        return v + (a - tf.math.reduce_mean(a, axis=1, keepdims=True))

    def advantage(self,input_data):
        """Return only the advantage head output for the given state indices.

        ``v`` and the mean advantage are the same for every action of a given
        state, so the argmax over advantages equals the argmax over Q-values.
        """
        return self.a(self._features(input_data))

In [None]:
class Agent:
    """Epsilon-greedy double-DQN agent with a replay buffer.

    Args:
        enviroment: Environment exposing ``action_space`` (with ``n`` and
            ``sample()``) and ``observation_space.n``.
        lr: Learning rate for the Adam optimizers.
        epsilon: Initial exploration probability.
        eps_dec: Amount subtracted from ``epsilon`` after every ``train`` call.
        eps_min: Lower bound for ``epsilon``.
        replace: The target network is synced every ``replace`` train calls.
    """

    def __init__(self, enviroment, lr=0.005,epsilon=1,eps_dec=0.02,eps_min=0.01,replace = 100):
        # Keep a handle on the environment so action sampling does not depend
        # on a notebook-global variable.
        self.enviroment = enviroment
        self.expirience_replay = deque(maxlen = 20000)

        self.gamma = 0.96
        self.replace = replace
        self.setps = 0  # number of completed train() calls

        self.epsilon = epsilon
        self.eps_dec = eps_dec
        self.eps_min = eps_min

        self.action_size = enviroment.action_space.n
        self.state_size = enviroment.observation_space.n
        self.q_network = DDDQN(self.state_size,10,self.action_size)
        self.target_network = DDDQN(self.state_size,10,self.action_size)

        self.q_network.compile(optimizer=Adam(learning_rate=lr),loss='mean_squared_error')
        self.target_network.compile(optimizer=Adam(learning_rate=lr),loss='mean_squared_error')

        # Subclassed Keras models create their weights on the first call. Run
        # one dummy forward pass through each network so the initial sync
        # below actually copies weights instead of copying an empty list.
        dummy_state = self._as_batch(0)
        self.q_network(dummy_state)
        self.target_network(dummy_state)

        self.set_target_model_weights()

    @staticmethod
    def _as_batch(state):
        """Wrap one state index into a length-1 float32 array for the networks.

        ``np.asarray`` is used instead of ``np.array(..., copy=False)``, which
        raises on NumPy 2 whenever a copy is unavoidable (always the case for
        a Python list).
        """
        return np.asarray([state], dtype=np.float32)

    def exprience(self, state, action, reward, next_state, terminated):
        """Append one transition to the replay buffer."""
        self.expirience_replay.append([state, action, reward, next_state, terminated])

    def set_target_model_weights(self):
        """Copy the online network's weights into the target network."""
        self.target_network.set_weights(self.q_network.get_weights())

    def get_action(self, state):
        """Pick an action epsilon-greedily for the given state index."""
        if np.random.rand() < self.epsilon:
            return self.enviroment.action_space.sample()
        advantages = np.asarray(self.q_network.advantage(self._as_batch(state)))
        return int(np.argmax(advantages, axis=1)[0])

    def train(self, batch_size):
        """Fit the online network on ``batch_size`` sampled transitions.

        Each sampled transition is fitted on its own (one ``fit`` call per
        sample). The target is ``reward + gamma * Q_target(s', argmax_a
        Q_online(s', a))``; terminal transitions use the reward alone.
        Afterwards the target network is synced when due, epsilon is decayed
        and the step counter advances.

        Raises:
            ValueError: From ``random.sample`` when the buffer holds fewer
                than ``batch_size`` transitions.
        """
        samples = random.sample(self.expirience_replay,batch_size)
        for state, action, reward, next_state, terminated in samples:
            state = self._as_batch(state)
            next_state = self._as_batch(next_state)

            # Writable copy of the current predictions; only the entry of the
            # action actually taken is overwritten with the new target.
            q_target = np.array(self.q_network(state), dtype=np.float32)

            if terminated:
                # No future return after a terminal transition.
                bootstrap = 0.0
            else:
                # Double DQN: the online network selects the action and the
                # target network evaluates it.
                online_next = np.asarray(self.q_network(next_state))
                best_action = int(np.argmax(online_next, axis=1)[0])
                target_next = np.asarray(self.target_network(next_state))
                bootstrap = float(target_next[0, best_action])

            q_target[0,action] = reward + self.gamma*bootstrap
            self.q_network.fit(state, q_target, epochs=1, verbose=0)

        if self.setps%self.replace == 0:
            self.set_target_model_weights()

        # Clamp so epsilon never drops below eps_min, not even for one step.
        self.epsilon = max(self.eps_min, self.epsilon - self.eps_dec)

        self.setps += 1
            

In [None]:
# Run-level settings read by the training and playback loops.
num_of_episodes = 100
timesteps_per_episode = 1000
batch_size = 32  # transitions sampled per agent.train() call

# Agent with its default hyperparameters, bound to the environment created earlier.
agent = Agent(enviroment)


In [None]:
from time import sleep
# Training loop: interact with the environment, store every transition and
# train the agent once the replay buffer is large enough.
for e in range(0, num_of_episodes):
    state = enviroment.reset()

    # Initialize variables
    reward = 0
    terminated = False

    for timestep in range(timesteps_per_episode):

        action = agent.get_action(state)
        next_state, reward, terminated, info = enviroment.step(action)
        agent.exprience(state, action, reward, next_state, terminated)

        # Train only once the buffer holds more than batch_size + 5 transitions.
        if len(agent.expirience_replay) > 5+batch_size:
            agent.train(batch_size)

        if terminated:
            break
        state = next_state

        # Progress report every 50 steps: episode index and the actual step
        # number (previously `timestep % 50`, which is always 0 here).
        if timestep % 50==0:
            print(e,timestep)
            


In [None]:
from time import sleep
# Playback loop: run the agent and redraw the board after every
# non-terminal step. No transitions are stored and no training happens here.
for e in range(num_of_episodes):
    state = enviroment.reset()

    # Per-episode defaults, overwritten by the first environment step.
    reward, terminated = 0, False

    for timestep in range(timesteps_per_episode):
        action = agent.get_action(state)
        next_state, reward, terminated, info = enviroment.step(action)

        # Leave the episode as soon as the environment reports termination;
        # the terminal frame itself is not rendered.
        if terminated:
            break

        state = next_state
        clear_output(wait=True)  # replace the previous frame instead of appending
        enviroment.render()
        print(timestep)
        sleep(0.1)  # slow the animation down enough to follow