In [10]:
import numpy as np
import random
from IPython.display import clear_output
from collections import deque


import gym

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape
from tensorflow.keras.optimizers import Adam

## No dedicated replay-buffer library is used; a plain `collections.deque` serves as the replay buffer.

In [11]:
# Create the Taxi environment, take the object behind the `.env` attribute
# of what `gym.make` returns, and draw its current state.
enviroment = gym.make("Taxi-v2").env
enviroment.render()

# Report the sizes of the discrete state and action spaces.
state_count = enviroment.observation_space.n
action_count = enviroment.action_space.n
print('Number of states: {}'.format(state_count))
print('Number of actions: {}'.format(action_count))

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[43mB[0m: |
+---------+

Number of states: 500
Number of actions: 6


# Agent

In [12]:
class Agent:
    """Deep Q-learning agent with a replay buffer and a target network.

    Two identically built Keras models are kept: ``q_network`` is the one
    that gets fitted, ``target_network`` supplies the bootstrap values used
    in the training targets and is refreshed only when
    ``set_target_model_weights`` is called.

    Attributes:
        expirience_replay: deque (maxlen 2000) of
            ``[state, action, reward, next_state, terminated]`` transitions.
        gamma: discount factor applied to the target network's best value.
        epsilon: probability of taking a random action in ``get_action``.
        action_size: ``enviroment.action_space.n``.
        state_size: ``enviroment.observation_space.n``.
    """

    def __init__(self, enviroment, optimizer):
        """Build both networks and synchronise their weights.

        Args:
            enviroment: object exposing ``action_space`` (with ``.n`` and
                ``.sample()``) and ``observation_space`` (with ``.n``).
            optimizer: Keras optimizer handed to ``model.compile`` for both
                networks.
        """
        # Once 2000 transitions are stored, the deque discards the oldest.
        self.expirience_replay = deque(maxlen = 2000)
        self.gamma = 0.9
        self.epsilon = 0.1
        # Keep the action space of the environment this agent was built for,
        # so exploration does not depend on a module-level global.
        self._action_space = enviroment.action_space
        self.action_size = enviroment.action_space.n
        self.state_size = enviroment.observation_space.n
        self._optimizer = optimizer
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.set_target_model_weights()

    def exprience(self, state, action, reward, next_state, terminated):
        """Append one transition to the replay buffer."""
        self.expirience_replay.append([state, action, reward, next_state, terminated])

    def _build_compile_model(self):
        """Return a compiled model mapping a state index to per-action values.

        The state index goes through a 10-dimensional embedding, is flattened
        to shape ``(10,)``, then passes two 50-unit ReLU layers and a linear
        output layer with ``action_size`` units. Loss is mean squared error.
        """
        model = Sequential()
        model.add(Embedding(self.state_size, 10, input_length=1))
        model.add(Reshape((10,)))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))

        model.compile(loss='mse', optimizer=self._optimizer)
        return model

    def set_target_model_weights(self):
        """Copy the Q-network's current weights into the target network."""
        self.target_network.set_weights(self.q_network.get_weights())

    def get_action(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: network input for the current state (the notebook passes
                an array reshaped to ``[1, 1]``).

        Returns:
            With probability ``epsilon`` a sample from the agent's action
            space; otherwise the index of the largest Q-network output for
            ``state``.
        """
        if np.random.rand() < self.epsilon:
            return self._action_space.sample()
        q_values = self.q_network(state)
        return np.argmax(q_values[0])

    def train(self, batch_size):
        """Fit the Q-network on ``batch_size`` randomly sampled transitions.

        Each sampled transition is fitted individually: the Q-network's own
        prediction is the regression target, with only the entry of the taken
        action replaced by ``reward`` (terminal transition) or
        ``reward + gamma * max(target_network(next_state))``.

        Does nothing when the buffer holds fewer than ``batch_size``
        transitions, instead of letting ``random.sample`` raise.
        """
        if len(self.expirience_replay) < batch_size:
            return

        samples = random.sample(self.expirience_replay, batch_size)
        for state, action, reward, next_state, terminated in samples:
            # verbose=0 keeps per-sample predict calls from printing
            # progress output on Keras versions where that is the default.
            target = self.q_network.predict(state, verbose=0)

            if terminated:
                target[0][action] = reward
            else:
                next_values = self.target_network.predict(next_state, verbose=0)
                target[0][action] = reward + self.gamma * np.amax(next_values)
            self.q_network.fit(state, target, epochs=1, verbose=0)

In [13]:
# Settings read by the loops further down: the number of transitions sampled
# per training call, the number of episodes to run, and the step cap per
# episode.
batch_size = 32
num_of_episodes = 10
timesteps_per_episode = 512

# Build the agent with an Adam optimizer and show the Q-network layout.
optimizer = Adam(learning_rate=0.01)
agent = Agent(enviroment, optimizer)
agent.q_network.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1, 10)             5000      
_________________________________________________________________
reshape_2 (Reshape)          (None, 10)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 50)                550       
_________________________________________________________________
dense_7 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_8 (Dense)              (None, 6)                 306       
Total params: 8,406
Trainable params: 8,406
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training loop: play `num_of_episodes` episodes, storing every transition in
# the agent's replay buffer and fitting the Q-network after each
# non-terminal step once the buffer holds more than `batch_size` transitions.
for e in range(0, num_of_episodes):
    # The networks take the state as an array of shape [1, 1].
    state = enviroment.reset()
    state = np.reshape(state, [1, 1])

    # Initialize variables
    reward = 0
    terminated = False

    for timestep in range(timesteps_per_episode):

        action = agent.get_action(state)

        next_state, reward, terminated, info = enviroment.step(action)
        next_state = np.reshape(next_state, [1, 1])
        agent.exprience(state, action, reward, next_state, terminated)

        state = next_state

        if terminated:
            # Refresh the target network at the end of a finished episode.
            # `Agent` defines `set_target_model_weights`; the previously
            # called `alighn_target_model` does not exist on the class.
            agent.set_target_model_weights()
            break

        if len(agent.expirience_replay) > batch_size:
            agent.train(batch_size)

    # Progress report every 10th episode.
    if (e + 1) % 10 == 0:
        print("**********************************")
        print("Episode: {}".format(e + 1))
        enviroment.render()
        print("**********************************")

In [9]:
from time import sleep

# Playback loop: let the agent act in the environment and redraw the board
# after every non-terminal step. Nothing is stored or trained here.
for e in range(num_of_episodes):
    state = np.reshape(enviroment.reset(), [1, 1])

    # Per-episode bookkeeping, overwritten by the first step below.
    reward = 0
    terminated = False

    for timestep in range(timesteps_per_episode):
        action = agent.get_action(state)
        next_state, reward, terminated, info = enviroment.step(action)
        next_state = np.reshape(next_state, [1, 1])

        # Stop the episode right away; the terminal frame is not drawn.
        if terminated:
            break

        state = next_state
        # Replace the previous frame instead of stacking output.
        clear_output(wait=True)
        enviroment.render()
        print(timestep)
        sleep(0.1)

+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | :[43m [0m|
|[34;1mY[0m| : |B: |
+---------+
  (Dropoff)
62


KeyboardInterrupt: 