# Gym Taxi con Q-Learning y Deep Q-Learning

In [2]:
# used to help with visualizing in Colab                 
from IPython.display import display, clear_output   
from time import sleep  

## Inicializacion del entorno
Probamos el enviroment y sampleamos el movimiento del taxi y sus diferentes acciones:

0. Move South 
1. Move North
2. Move east 
3. Move west
4. Pickup passenger 
5. Drop off passenger

Hay 4 localizaciones concretas en el grid:

0. Red
1. Green
2. Yellow
3. Blue

In [3]:
# used to help with visualizing in Colab                 
from IPython.display import display, clear_output   
from time import sleep  
import gym
env = gym.make('Taxi-v3')
env.reset()

for _ in range(100):
    clear_output(wait=True) 

    observation, reward, done, info = env.step(env.action_space.sample())
    env.render()
    # if done:
    #     observation, info = env.reset(return_info=True)
    sleep(0.3)

env.close()

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)


In [4]:
import gym
import random

random.seed(1234)

streets = gym.make("Taxi-v3").env #New versions keep getting released; if -v3 doesn't work, try -v2 or -v4
streets.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m|[43m [0m: |B: |
+---------+



Each state is defined by a 4 entries tuple: （taxi_row, taxi_col, passenger_location, destination). For example, the image shows state (2,3,2,0), which means we are at position row index 2 (note that python index start at 0 so this means row 3), and column index 3, the passenger is at Yellow, encoded by 2 and our destination is red, encoded by 0

In [5]:
initial_state = streets.encode(2, 3, 2, 0)

streets.s = initial_state

streets.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



## Aprendizaje de la Q-Table
Entrenamiento del modelo

In [6]:
import numpy as np

q_table = np.zeros([streets.observation_space.n, streets.action_space.n])
print(np.size(q_table))
# a 2D array that represent every possible state and action in the virtual space and initialize all of them to 0
learning_rate = 0.1
discount_factor = 0.6
exploration = 0.1
epochs = 10000

for taxi_run in range(epochs):
    state = streets.reset()
    done = False
    
    while not done:
        random_value = random.uniform(0, 1)
        if (random_value < exploration):
            action = streets.action_space.sample() # Explore a random action
        else:
            # print(f'Q_table {q_table[state]} el max es { np.argmax(q_table[state])}')
            action = np.argmax(q_table[state]) # Use the action with the highest q-value
            
        next_state, reward, done, info = streets.step(action)
        
        prev_q = q_table[state, action]
        next_max_q = np.max(q_table[next_state])
        new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
        q_table[state, action] = new_q
        
        state = next_state
        

3000


In [7]:
q_table[streets.encode(1,0,2,0)]  

array([-2.12208668, -2.21134532, -2.27445838, -2.21401063, -7.20615896,
       -6.1696944 ])

In [8]:
from IPython.display import clear_output
from time import sleep
lengths=[]
for tripnum in range(1, 11):
    state = streets.reset()
   
    done = False
    trip_length = 0
    
    while not done and trip_length < 25:
        action = np.argmax(q_table[state])
        next_state, reward, done, info = streets.step(action)
        clear_output(wait=True)
        print("Trip number " + str(tripnum) + " Step " + str(trip_length))
        print(streets.render(mode='ansi'))
        sleep(.2)
        state = next_state
        trip_length += 1
    lengths.append(trip_length)
    
    sleep(.2)
avg_len=sum(lengths)/10
print(f'La media de pasos es: {avg_len}')

Trip number 10 Step 24
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m|[43m [0m: |B: |
+---------+
  (South)

La media de pasos es: 14.0


## Definimos las funciones
Ahora tenemos el Q-Learning para aprender nuestra tabla y probarla

In [9]:
def q_learning(learning_rate,discount_factor,exploration,epochs):
    q_table = np.zeros([streets.observation_space.n, streets.action_space.n])
    # a 2D array that represent every possible state and action in the virtual space and initialize all of them to 0
    for taxi_run in range(epochs):
        state = streets.reset()
        done = False
            
        while not done:
            random_value = random.uniform(0, 1)
            if (random_value < exploration):
                action = streets.action_space.sample() # Explore a random action
            else:
                action = np.argmax(q_table[state]) # Use the action with the highest q-value
                    
            next_state, reward, done, info = streets.step(action)
                
            prev_q = q_table[state, action]
            next_max_q = np.max(q_table[next_state])
            new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
            q_table[state, action] = new_q
                
            state = next_state



def average_trip_length():
    lengths=[]
    for tripnum in range(1, 11):
        state = streets.reset()
        done = False
        trip_length = 0
        
        while not done and trip_length < 25:
            action = np.argmax(q_table[state])
            next_state, reward, done, info = streets.step(action)
            clear_output(wait=True)
            state = next_state
            trip_length += 1
        lengths.append(trip_length)
    avg_len=sum(lengths)/10
    return avg_len

## Ajuste de hiperparametros

In [10]:
learning_rate = 0.1
discount_factor = [0.5,0.6,0.7,0.8,0.9]
exploration = 0.1
epochs = 1000
difdis=[0,0,0,0,0]
for j in range(1,10):
    for i in range(len(discount_factor)):
        q_learning(learning_rate,discount_factor[i],exploration,epochs)
        difdis[i]+=average_trip_length()

print(np.array(difdis)/10)

[12.15 12.59 12.2  12.15 12.62]


In [11]:
learning_rate = [0.1,0.2,0.3,0.4,0.5]
discount_factor = 0.9
exploration = 0.1
epochs = 1000
difdis=[0,0,0,0,0]
for j in range(1,10):
    for i in range(len(learning_rate)):
        q_learning(learning_rate[i],discount_factor,exploration,epochs)
        difdis[i]+=average_trip_length()

print(np.array(difdis)/10)

[12.47 12.99 13.08 12.78 12.76]


In [12]:
learning_rate = 0.5
discount_factor = 0.5
exploration = [0.1,0.2,0.3,0.4]
epochs = 1000
difdis=[0,0,0,0]
for j in range(1,10):
    for i in range(len(exploration)):
        q_learning(learning_rate,discount_factor,exploration[i],epochs)
        difdis[i]+=average_trip_length()

print(np.array(difdis)/10)

[12.43 12.26 12.55 12.64]


In [13]:
learning_rate = 0.4
discount_factor = 0.5
exploration = 0.3
epochs = 1000
difdis=[]
for j in range(1,10):
        q_learning(learning_rate,discount_factor,exploration,epochs)
        difdis.append(average_trip_length())

print(sum(difdis)/10)

12.370000000000001


## Deep Q-Learning

In [14]:
from model import model
from agent import DQNAgent
from config import config

ImportError: cannot import name 'model' from 'model' (c:\Users\Arribas\Desktop\Code\TFG_Atari_NeuralNetworks\Taxi\model.py)