### Imports

In [1]:
import gym
import numpy
import random
from os import system, name
from time import sleep
import time

### Set up the environment and the problem's parameters:

In [2]:
# Create the Taxi-v3 environment and keep the object exposed by the `.env`
# attribute of what gym.make returns.
# NOTE(review): presumably `.env` strips the outermost wrapper (e.g. the
# per-episode step limit) - confirm against the installed gym version.
env = gym.make("Taxi-v3").env 

In [3]:
# Episode budgets for the training, evaluation and display phases.
training_episodes = 20000
test_episodes = 10
display_episodes = 10

# Episode-finished flag, initially unset.
done = False

# Q-table of shape (observation_space.n, action_space.n), starting at all zeros.
q_table = numpy.zeros((env.observation_space.n, env.action_space.n))

In [4]:
# Hyperparameters of the Q-learning agent.
alpha: float = 0.1    # learning rate
gamma: float = 0.6    # discount factor
epsilon: float = 0.1  # exploration - exploitation tradeoff

### Training of the agent using *Q-learning*:

In [5]:
class Qlearning:
    """Tabular Q-learning trainer for an environment with discrete states and actions.

    The Q-table handed to the constructor is updated in place by `apply`, so the
    caller's array holds the learned values once training has finished.
    """

    def __init__(self, environment, gamma, alpha, epsilon, training_episodes, q_table, done):
        """Store the environment, the hyperparameters and the Q-table.

        Args:
            environment: object providing `reset()`, `step(action)` and
                `action_space.sample()`.
            gamma: discount factor applied to the best next-state value.
            alpha: learning rate (step size of each Q-value update).
            epsilon: probability of taking a random action instead of the
                greedy one.
            training_episodes: number of episodes run by `apply`.
            q_table: 2-D array indexed as `[state, action]`; modified in place.
            done: accepted for backward compatibility and ignored; every
                episode tracks its own termination flag inside `apply`.
        """
        self.environment = environment
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon
        self.episodes = training_episodes
        self.Q_table = q_table

    @staticmethod
    def _unpack_step(step_result):
        """Reduce a `step()` result to `(next_state, reward, episode_over)`.

        A 5-element result is read as
        `(state, reward, terminated, truncated, info)` and the episode is over
        when either flag is set. Any other length is read as the older
        `(state, reward, done, ...)` layout.
        """
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _info = step_result
            return next_state, reward, bool(terminated or truncated)
        next_state, reward, done, *_ = step_result
        return next_state, reward, bool(done)

    def apply(self):
        """Run `self.episodes` training episodes of epsilon-greedy Q-learning.

        Every step applies the update

            Q[s, a] <- Q[s, a] + alpha * (reward + gamma * max(Q[s']) - Q[s, a])

        and the episode index is printed every 100 episodes.
        """
        for i in range(self.episodes):
            # reset() returns (observation, info); only the state index is needed.
            state = self.environment.reset()[0]
            done = False

            while not done:
                # Explore with probability epsilon, otherwise exploit the table.
                if random.uniform(0, 1) < self.epsilon:
                    action = self.environment.action_space.sample()
                else:
                    action = int(numpy.argmax(self.Q_table[state]))

                next_state, reward, done = self._unpack_step(self.environment.step(action))

                old_value = self.Q_table[state, action]
                # Rows are only written for states an action is taken from, so
                # the row of a state that ends the episode keeps its initial values.
                next_max = numpy.max(self.Q_table[next_state])
                # The temporal-difference error is (target - old_value); the whole
                # error, not only its bootstrap part, is scaled by alpha.
                td_target = reward + self.gamma * next_max
                self.Q_table[state, action] = old_value + self.alpha * (td_target - old_value)

                state = next_state

            if i % 100 == 0:  # Progress report every 100 episodes.
                print(f"Episode: {i}")

        print("Training finished.\n")


In [6]:
# Wire the environment, the hyperparameters and the shared Q-table into the trainer.
q_learning = Qlearning(
    environment=env,
    gamma=gamma,
    alpha=alpha,
    epsilon=epsilon,
    training_episodes=training_episodes,
    q_table=q_table,
    done=done,
)

In [7]:
# Train the agent: runs the configured number of episodes, printing the episode
# index every 100 episodes, and updates the Q-table given to the trainer.
q_learning.apply()

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0
Episode: 100
Episode: 200
Episode: 300
Episode: 400
Episode: 500
Episode: 600
Episode: 700
Episode: 800
Episode: 900
Episode: 1000
Episode: 1100
Episode: 1200
Episode: 1300
Episode: 1400
Episode: 1500
Episode: 1600
Episode: 1700
Episode: 1800
Episode: 1900
Episode: 2000
Episode: 2100
Episode: 2200
Episode: 2300
Episode: 2400
Episode: 2500
Episode: 2600
Episode: 2700
Episode: 2800
Episode: 2900
Episode: 3000
Episode: 3100
Episode: 3200
Episode: 3300
Episode: 3400
Episode: 3500
Episode: 3600
Episode: 3700
Episode: 3800
Episode: 3900
Episode: 4000
Episode: 4100
Episode: 4200
Episode: 4300
Episode: 4400
Episode: 4500
Episode: 4600
Episode: 4700
Episode: 4800
Episode: 4900
Episode: 5000
Episode: 5100
Episode: 5200
Episode: 5300
Episode: 5400
Episode: 5500
Episode: 5600
Episode: 5700
Episode: 5800
Episode: 5900
Episode: 6000
Episode: 6100
Episode: 6200
Episode: 6300
Episode: 6400
Episode: 6500
Episode: 6600
Episode: 6700
Episode: 6800
Episode: 6900
Episode: 7000
Episode: 7100
Epis

### Agent's evaluation after learning with *Q-learning*:

Running the following code cell opens a window that visualizes what the trained agent is doing.

In [8]:
# Upper bound on the number of rendered steps per evaluation episode.
MAX_EVAL_STEPS = 20

# One rendering environment serves every evaluation episode. It has its own
# name so the training environment bound to `env` is not replaced, and it is
# closed in `finally` so the window is released even if an episode raises.
eval_env = gym.make("Taxi-v3", render_mode="human").env
try:
    for episode in range(test_episodes):
        state = eval_env.reset()[0]  # reset() returns (observation, info)
        done = False
        print("\n\nEPISODE ", episode+1, "\n\n")
        time.sleep(1)

        for step in range(MAX_EVAL_STEPS):
            eval_env.render()
            time.sleep(0.3)  # slow the animation down enough to follow it

            # Always act greedily with respect to the learned Q-table.
            action = numpy.argmax(q_table[state])
            # step() returns (observation, reward, terminated, truncated, info).
            new_state, reward, terminated, truncated, *_ = eval_env.step(action)
            done = terminated or truncated

            if done:
                eval_env.render()
                if reward == 20:  # a reward of 20 is treated as a successful drop-off
                    print("You reached your destination!")
                    time.sleep(1.5)
                break

            state = new_state
finally:
    eval_env.close()



EPISODE  1 




  if not isinstance(terminated, (bool, np.bool8)):


You reached your destination!


EPISODE  2 


You reached your destination!


EPISODE  3 


You reached your destination!


EPISODE  4 


You reached your destination!


EPISODE  5 


You reached your destination!


EPISODE  6 


You reached your destination!


EPISODE  7 


You reached your destination!


EPISODE  8 


You reached your destination!


EPISODE  9 


You reached your destination!


EPISODE  10 


You reached your destination!
