In [1]:
import numpy as np
import random
from IPython.display import clear_output
from collections import deque


import gym

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [2]:
# Create the Taxi-v2 task and keep the object exposed by its `.env` attribute;
# every later cell talks to this `enviroment` global.
# NOTE(review): `.env` presumably peels off gym's outer wrapper (e.g. the
# episode time limit) -- confirm against the installed gym version.
enviroment = gym.make("Taxi-v2").env
# Draw the grid for the environment's current state.
enviroment.render()

+---------+
|[34;1mR[0m: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [3]:
class DDDQN(tf.keras.Model):
    """Dueling Q-network over discrete state indices.

    A state index is embedded, pushed through two 50-unit ReLU layers and then
    split into a scalar value head and a per-action advantage head.

    Args:
        state_size: number of distinct state indices (embedding vocabulary).
        embeding_size: width of the embedding vector produced for each state.
        action_size: number of discrete actions (width of the advantage head).
    """

    def __init__(self,state_size,embeding_size,action_size):
        super(DDDQN,self).__init__()
        self.embed = Embedding(state_size, embeding_size, input_length=1)
        # Flatten the embedding to (batch, embeding_size). The target shape is
        # taken from the argument so widths other than 10 work as well.
        self.reshape = Reshape((embeding_size,))
        self.d1 = Dense(50,activation='relu')
        self.d2 = Dense(50,activation='relu')
        self.a = Dense(action_size,activation='linear')
        self.v = Dense(1,activation='linear')

    def _features(self, input_data):
        """Run the trunk shared by both heads: embed -> reshape -> d1 -> d2."""
        x = self.embed(input_data)
        x = self.reshape(x)
        x = self.d1(x)
        return self.d2(x)

    def call(self,input_data):
        """Return Q-values, one row per input state and one column per action.

        The heads are combined as ``v + (a - mean(a))`` with the mean taken
        over the action axis.
        """
        x = self._features(input_data)
        v = self.v(x)
        a = self.a(x)

        Q = v + (a - tf.math.reduce_mean(a, axis = 1,keepdims=True))
        return Q

    def advantage(self,input_data):
        """Return only the advantage head's output for ``input_data``."""
        return self.a(self._features(input_data))

In [4]:
class Agent:
    """Epsilon-greedy double-DQN learner built on two DDDQN networks.

    Args:
        enviroment: object exposing ``action_space`` and ``observation_space``
            (each with ``.n``); ``action_space.sample()`` is used to explore.
        lr: learning rate handed to both Adam optimizers.
        epsilon: initial probability of taking a random action.
        eps_dec: amount subtracted from epsilon after every ``train`` call.
        eps_min: lower bound for epsilon.
        replace: the target network is re-synced every ``replace`` train calls.
    """

    def __init__(self, enviroment, lr=0.005,epsilon=1,eps_dec=0.02,eps_min=0.01,replace = 100):
        # Keep the environment we were given so exploration does not depend on
        # a notebook-level global of the same name.
        self.enviroment = enviroment
        self.expirience_replay = deque(maxlen = 20000)

        self.gamma = 0.96
        self.replace = replace
        self.setps = 0  # number of completed train() calls

        self.epsilon = epsilon
        self.eps_dec = eps_dec
        self.eps_min = eps_min

        self.action_size = enviroment.action_space.n
        self.state_size = enviroment.observation_space.n
        self.q_network = DDDQN(self.state_size,10,self.action_size)
        self.target_network = DDDQN(self.state_size,10,self.action_size)

        self.q_network.compile(optimizer=Adam(learning_rate=lr),loss='mean_squared_error')
        self.target_network.compile(optimizer=Adam(learning_rate=lr),loss='mean_squared_error')

        # A subclassed Keras model only creates its variables on first call.
        # Run both networks once so the sync below copies real weights instead
        # of an empty list.
        dummy_state = self._state_batch(0)
        self.q_network(dummy_state)
        self.target_network(dummy_state)

        self.set_target_model_weights()

    @staticmethod
    def _state_batch(state):
        """Wrap one state index as a float32 batch of size one."""
        # np.asarray instead of np.array(..., copy=False): building an array
        # from a list always copies, which NumPy 2 rejects when copy=False.
        return np.asarray([state], dtype=np.float32)

    def exprience(self, state, action, reward, next_state, terminated):
        """Append one transition to the replay buffer."""
        self.expirience_replay.append([state, action, reward, next_state, terminated])

    def set_target_model_weights(self):
        """Copy the online network's weights into the target network."""
        self.target_network.set_weights(self.q_network.get_weights())

    def get_action(self, state):
        """Pick an action: random with probability epsilon, otherwise greedy."""
        if np.random.rand() < self.epsilon:
            return self.enviroment.action_space.sample()
        # DDDQN.call adds the same value term to every action, so the argmax
        # of the advantages is also the argmax of the Q-values.
        actions = self.q_network.advantage(self._state_batch(state))
        return tf.math.argmax(actions, axis=1).numpy()[0]

    def train(self, batch_size):
        """Run one gradient step per transition in a random replay sample.

        After the updates, the target network is synced when the step counter
        is a multiple of ``replace``, epsilon is decayed (never below
        ``eps_min``) and the step counter is incremented.

        Raises:
            ValueError: from ``random.sample`` when the buffer holds fewer
                than ``batch_size`` transitions.
        """
        samples = random.sample(self.expirience_replay,batch_size)
        for state, action, reward, next_state, terminated in samples:
            state = self._state_batch(state)
            next_state = self._state_batch(next_state)

            # Start from the current predictions so only the taken action's
            # entry contributes to the loss.
            q_target = np.copy(self.q_network(state))

            if terminated:
                # No future return after a terminal transition.
                target = reward
            else:
                # Double DQN: the online network chooses the next action,
                # the target network scores it.
                best_action = tf.math.argmax(self.q_network(next_state), axis=1).numpy()[0]
                Double_q = self.target_network(next_state)[0,best_action].numpy()
                target = reward + self.gamma*Double_q

            q_target[0,action] = target
            # One gradient step on this sample, without fit()'s per-call setup.
            self.q_network.train_on_batch(state, q_target)

        if self.setps%self.replace == 0:
            self.set_target_model_weights()
        self.epsilon = max(self.eps_min, self.epsilon - self.eps_dec)

        self.setps += 1
            

In [5]:
# Run-length and batch settings read by the training and playback loops.
batch_size = 32
num_of_episodes = 100
timesteps_per_episode = 1000

# Learner with its default hyper-parameters, bound to the Taxi environment.
agent = Agent(enviroment)


In [9]:
from time import sleep

# Training: act, store the transition, and learn once the buffer is big enough.
for episode in range(num_of_episodes):
    state = enviroment.reset()

    # Per-episode bookkeeping; both are overwritten by the first step.
    reward, terminated = 0, False

    for step in range(timesteps_per_episode):
        action = agent.get_action(state)
        next_state, reward, terminated, info = enviroment.step(action)
        agent.exprience(state, action, reward, next_state, terminated)

        # Wait until the buffer holds a few more transitions than one batch.
        buffer_ready = len(agent.expirience_replay) > batch_size + 5
        if buffer_ready:
            agent.train(batch_size)

        if terminated:
            break
        state = next_state

        # Progress marker every 50 steps of a still-running episode.
        if step % 50 == 0:
            print(episode, step / 50)
            


0 0.0
0 1.0
0 2.0
0 3.0
0 4.0
0 5.0
0 6.0
0 7.0
0 8.0
0 9.0
0 10.0
0 11.0
0 12.0
0 13.0
0 14.0
0 15.0
0 16.0
0 17.0
0 18.0
0 19.0
1 0.0
1 1.0
1 2.0
1 3.0
1 4.0
1 5.0
1 6.0
1 7.0
1 8.0
1 9.0
1 10.0
1 11.0
1 12.0
1 13.0
1 14.0
1 15.0
1 16.0
1 17.0
1 18.0
1 19.0
2 0.0
2 1.0
2 2.0
2 3.0
2 4.0
2 5.0
2 6.0
2 7.0
2 8.0
2 9.0
2 10.0
2 11.0
2 12.0
2 13.0
2 14.0
2 15.0
2 16.0
2 17.0
2 18.0
2 19.0
3 0.0
3 1.0
3 2.0
3 3.0
3 4.0
3 5.0
3 6.0
3 7.0
3 8.0
3 9.0
3 10.0
3 11.0
3 12.0
3 13.0
3 14.0
3 15.0
3 16.0
3 17.0
3 18.0
3 19.0
4 0.0
4 1.0
4 2.0
4 3.0
4 4.0
4 5.0
4 6.0
4 7.0
4 8.0
4 9.0
4 10.0
4 11.0
4 12.0
4 13.0
4 14.0
4 15.0
4 16.0
4 17.0
4 18.0
4 19.0
5 0.0
5 1.0
5 2.0
5 3.0
5 4.0
5 5.0
5 6.0
5 7.0
5 8.0
5 9.0
5 10.0
5 11.0
5 12.0
5 13.0
5 14.0
5 15.0
5 16.0
5 17.0
5 18.0
5 19.0
6 0.0
6 1.0
6 2.0
6 3.0
6 4.0
6 5.0
6 6.0
6 7.0
6 8.0
6 9.0
6 10.0
6 11.0
6 12.0
6 13.0
6 14.0
6 15.0
6 16.0
6 17.0
6 18.0
6 19.0
7 0.0
7 1.0
7 2.0
7 3.0
7 4.0
7 5.0
7 6.0
7 7.0
7 8.0
7 9.0
7 10.0
7 11.0
7 12.0
7 13.0
7 

KeyboardInterrupt: 

In [10]:
from time import sleep

# Playback: let the agent act and animate the grid. Nothing is stored in the
# replay buffer and no training happens in this loop.
for episode in range(num_of_episodes):
    state = enviroment.reset()

    # Per-episode bookkeeping; both are overwritten by the first step.
    reward, terminated = 0, False

    for step in range(timesteps_per_episode):
        action = agent.get_action(state)
        next_state, reward, terminated, info = enviroment.step(action)

        if terminated:
            break

        # Episode continues: advance, then redraw the frame in place.
        state = next_state
        clear_output(wait=True)
        enviroment.render()
        print(step)
        sleep(0.1)

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| :[43m [0m|[34;1mB[0m: |
+---------+
  (East)
804


KeyboardInterrupt: 