## 1. Import libraries

In [10]:
import gym
import numpy
import random
from os import system, name
from time import sleep

## 2. Setup

In [11]:
def clearConsole():
    """Clear the terminal by running the platform's shell clear command.

    Runs ``cls`` when ``os.name`` is ``'nt'`` (Windows) and ``clear`` on every
    other platform. The shell's exit status is ignored and nothing is returned.

    NOTE(review): this acts on the process's terminal only. The captured
    output of the evaluation cell in this notebook shows consecutive frames
    stacked one after another, so it does not appear to clear notebook cell
    output -- confirm whether a notebook-native clear is wanted instead.
    """
    shell_command = 'cls' if name == 'nt' else 'clear'
    system(shell_command)


# Start the session with a clean console.
clearConsole()

In [12]:
# Fixed seed so that Restart Kernel -> Run All reproduces the same training run.
SEED = 42
random.seed(SEED)  # Drives the epsilon-greedy coin flip (random.uniform) during training.

# NOTE(review): `.env` takes the environment wrapped by gym.make()'s result,
# presumably to drop the wrapper that enforces a step limit -- confirm against
# the installed gym version.
env = gym.make("Taxi-v3").env
env.seed(SEED)  # Start states drawn by env.reset().
env.action_space.seed(SEED)  # Exploratory actions drawn by env.action_space.sample().

# Q-table filled with zeros: one row per state, one column per action.
# The matrix will be 500x6 as there are 500 states and 6 actions.
q_table = numpy.zeros([env.observation_space.n, env.action_space.n])

training_episodes = 20000  # Number of episodes to run while training.
display_episodes = 10  # Number of episodes to run after training.

# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.6  # Discount rate
epsilon = 0.1  # Chance of selecting a random action instead of maximising reward.

# Per-episode metrics, kept for plotting.
all_epochs = []
all_penalties = []




## 3. Training the Agent

In [13]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
#
# Reset the metric lists first so that re-running this cell does not append a
# second training run onto the first one.
all_epochs.clear()
all_penalties.clear()

for i in range(training_episodes):
    state = env.reset()
    done = False
    epochs, penalties = 0, 0

    while not done:
        # Explore with probability epsilon, otherwise take the action with the
        # highest current Q-value for this state.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = numpy.argmax(q_table[state])

        next_state, reward, done, info = env.step(action)

        # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
        old_value = q_table[state, action]
        next_max = numpy.max(q_table[next_state])

        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        # A reward of exactly -10 is counted as a penalty.
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    # Record this episode's length and penalty count for later plotting.
    all_epochs.append(epochs)
    all_penalties.append(penalties)

    if i % 100 == 0:
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 0
Episode: 100
Episode: 200
Episode: 300
Episode: 400
Episode: 500
Episode: 600
Episode: 700
Episode: 800
Episode: 900
Episode: 1000
Episode: 1100
Episode: 1200
Episode: 1300
Episode: 1400
Episode: 1500
Episode: 1600
Episode: 1700
Episode: 1800
Episode: 1900
Episode: 2000
Episode: 2100
Episode: 2200
Episode: 2300
Episode: 2400
Episode: 2500
Episode: 2600
Episode: 2700
Episode: 2800
Episode: 2900
Episode: 3000
Episode: 3100
Episode: 3200
Episode: 3300
Episode: 3400
Episode: 3500
Episode: 3600
Episode: 3700
Episode: 3800
Episode: 3900
Episode: 4000
Episode: 4100
Episode: 4200
Episode: 4300
Episode: 4400
Episode: 4500
Episode: 4600
Episode: 4700
Episode: 4800
Episode: 4900
Episode: 5000
Episode: 5100
Episode: 5200
Episode: 5300
Episode: 5400
Episode: 5500
Episode: 5600
Episode: 5700
Episode: 5800
Episode: 5900
Episode: 6000
Episode: 6100
Episode: 6200
Episode: 6300
Episode: 6400
Episode: 6500
Episode: 6600
Episode: 6700
Episode: 6800
Episode: 6900
Episode: 7000
Episode: 7100
Epis

## 4. Display and evaluate agent's performance

In [15]:
# Roll out the greedy policy (no exploration) for `display_episodes` episodes,
# rendering every step and accumulating timestep / penalty totals.
total_epochs, total_penalties = 0, 0

# Upper bound on steps per displayed episode. The loop otherwise ends only when
# the environment reports `done`, so a Q-table that has not converged could keep
# the greedy policy cycling forever. 200 mirrors the step limit Taxi-v3 is
# normally registered with -- NOTE(review): confirm for the installed gym version.
max_display_steps = 200

# A named loop variable is used instead of `_`: the value is printed below, and
# in IPython `_` is the variable that holds the previous cell output.
for episode in range(display_episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0

    done = False

    while not done and epochs < max_display_steps:
        # Always exploit: pick the action with the highest learned Q-value.
        action = numpy.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        # A reward of exactly -10 is counted as a penalty.
        if reward == -10:
            penalties += 1

        epochs += 1

        clearConsole()

        env.render()

        print("--------------------------------------------------")
        print(f"Episode: {episode}")
        print(f"Timestep: {epochs}")
        print(f"State: {state}")
        print(f"Action: {action}")
        print(f"Reward: {reward}")
        sleep(0.15)  # Pause so each rendered frame stays visible.

    total_penalties += penalties
    total_epochs += epochs

+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
--------------------------------------------------
Episode: 0
Timestep: 1
State: 2
Action: 3
Reward: -1
+---------+
|[42mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Pickup)
--------------------------------------------------
Episode: 0
Timestep: 2
State: 18
Action: 4
Reward: -1
+---------+
|R: | : :G|
|[42m_[0m: | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
--------------------------------------------------
Episode: 0
Timestep: 3
State: 118
Action: 0
Reward: -1
+---------+
|R: | : :G|
| : | : : |
|[42m_[0m: : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
--------------------------------------------------
Episode: 0
Timestep: 4
State: 218
Action: 0
Reward: -1
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[42m_[0m| : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
--------

In [16]:
# Summarise the evaluation run: per-episode means of the accumulated totals.
print(f"Results after {display_episodes} episodes:")

for label, total in (("timesteps", total_epochs), ("penalties", total_penalties)):
    print(f"Average {label} per episode: {total / display_episodes}")

Results after 10 episodes:
Average timesteps per episode: 11.4
Average penalties per episode: 0.0
