Import packages

In [1]:
import gym
from IPython.display import clear_output
import time
import numpy as np
import matplotlib.pyplot as plt

Create the environment

In [2]:
# Single shared environment instance; every later cell (space sizes, policy,
# training loop, evaluation loop) reads this module-level `env`.
env = gym.make("Taxi-v3")

Set hyperparameters

In [3]:
# --- Run length -------------------------------------------------------------
n_episodes = 10000   # number of training episodes
timesteps = 100      # maximum number of steps taken in a single episode
episode = 0          # running episode counter that drives the epsilon decay

# --- Q-learning update ------------------------------------------------------
alpha = 0.81         # learning rate applied to the temporal-difference error
gamma = 0.96         # discount factor applied to the next state's best value

# --- Epsilon-greedy exploration ---------------------------------------------
# `epsilon` is the probability of taking a random action. After every training
# episode it is recomputed as
#     min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode)
# so the value below only applies to the very first episode.
epsilon = 0.25
min_epsilon = 0.01
max_epsilon = 1.0
decay_rate = 0.0005

Read the sizes of the action and state spaces from the environment

In [4]:
# Sizes of the environment's spaces; together they give the Q-table its
# (n_S, n_A) shape.
n_S = env.observation_space.n  # number of states  -> Q-table rows
n_A = env.action_space.n       # number of actions -> Q-table columns

Initialise Q-table

In [5]:
# One row per state, one column per action; every action value starts at 0.
Q_table = np.zeros(shape=(n_S, n_A))

Performance metrics

In [6]:
# Per-episode training statistics, filled in by the training loop.
# Rewards start at zero; lengths default to the step cap so that an episode
# which never terminates is recorded as having used all `timesteps` steps.
training_episode_rewards = np.zeros(n_episodes)
training_episode_lengths = np.ones(n_episodes) * timesteps

Definition of policy

In [7]:
def policy(observation):
    """Choose an action for ``observation`` with an epsilon-greedy rule.

    With probability ``epsilon`` (module-level, updated by the training loop)
    a random action is sampled from the environment's action space; otherwise
    the action with the highest value in the ``Q_table`` row for
    ``observation`` is returned (the first such action on ties).

    Parameters
    ----------
    observation
        Index of the current state, used as a row index into ``Q_table``.

    Returns
    -------
    The selected action.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return env.action_space.sample()
    return np.argmax(Q_table[observation, :])

Training loop

In [8]:
# Tabular Q-learning training loop.
#
# For each episode the agent follows the epsilon-greedy `policy`, updates
# `Q_table` after every step and records the episode's total reward and length.
for i_episode in range(n_episodes):  # loop for each episode
    observation = env.reset()  # initialise S
    episode_reward = 0
    # Default length if the episode never terminates within the step cap.
    # Reset here so that re-running this cell cannot leave stale values behind.
    training_episode_lengths[i_episode] = timesteps

    for t in range(timesteps):  # loop for each step of the episode
        # Choose A from S using the epsilon-greedy policy derived from Q-table
        action = policy(observation)
        # Take action A, observe R and S'
        next_observation, reward, done, info = env.step(action)
        episode_reward += reward

        # Q-learning update rule. The update must also be applied to the final
        # transition of an episode, otherwise the reward received on that step
        # is never learned. No future value is bootstrapped from a terminal
        # state.
        # NOTE(review): `done` is treated as a true terminal state here; if the
        # environment also raises it on a time-limit cutoff, keep `timesteps`
        # below that limit - TODO confirm.
        if done:
            td_target = reward
        else:
            td_target = reward + gamma * np.max(Q_table[next_observation, :])
        Q_table[observation, action] += alpha * (td_target - Q_table[observation, action])

        if done:
            training_episode_lengths[i_episode] = t + 1
            break
        observation = next_observation

    training_episode_rewards[i_episode] = episode_reward

    # Reduce epsilon per episode (because we need less and less exploration).
    # The counter is derived from the loop index so that re-running this cell
    # restarts the decay schedule instead of continuing from a stale count.
    episode = i_episode + 1
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

# NOTE(review): the evaluation cell further down reuses `env` after this
# close() - presumably harmless for this environment, but confirm.
env.close()

Plot rewards

In [None]:
# Scatter of the total reward collected in each training episode.
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.plot(training_episode_rewards, '.', markersize=.5)
plt.xlabel('episode')
plt.ylabel('reward')
# Horizontal reference lines: reward 20 in red, reward 0 in yellow.
plt.axhline(y=20, color='r')
plt.axhline(y=0, color='y')

Plot episode lengths

In [None]:
# Scatter of the number of steps each training episode lasted
# (episodes that never terminated are recorded at the `timesteps` cap).
plt.plot(training_episode_lengths, '.', markersize=.5)
plt.xlabel('episode')
plt.ylabel('episode length [timesteps]')
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)

Use the learned Q-table to play the game

In [None]:
# Evaluation settings and result containers.
n_episodes_test = 20
# Total reward of each evaluation episode, appended by the evaluation loop.
test_episode_rewards = []
# Episode lengths default to the step cap, i.e. "did not finish in time".
test_episode_lengths = np.ones(n_episodes_test) * timesteps

In [None]:
# Play `n_episodes_test` episodes with the purely greedy policy (no
# exploration), rendering every step, and record each episode's total reward
# and length.
#
# `test_episode_rewards` is created in the previous cell; re-run that cell
# before re-running this one, otherwise results accumulate in the list.
for i_episode in range(n_episodes_test):
    observation = env.reset()
    time.sleep(1)  # short pause between episodes
    episode_reward = 0
    for t in range(timesteps):
        time.sleep(.1)  # slow the animation down
        clear_output(wait=True)
        print("Episode: ", i_episode+1)
        env.render()
        # Always take the action with the highest learned value.
        action = np.argmax(Q_table[observation, :])
        observation, reward, done, info = env.step(action)
        episode_reward += reward
        if done:
            test_episode_lengths[i_episode] = t + 1
            break
    # Record the reward exactly once per episode, whether it terminated or hit
    # the step cap. Assigning by index into the (initially empty) list raised
    # IndexError for episodes that ran out of steps.
    test_episode_rewards.append(episode_reward)
env.close()
        
        

Look at trained run rewards

In [None]:
# Total reward of each evaluation episode played with the greedy policy.
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.plot(test_episode_rewards, '.')
plt.xlabel('episode')
plt.ylabel('reward')
# Horizontal reference lines: reward 20 in red, reward 0 in yellow.
plt.axhline(y=20, color='r')
plt.axhline(y=0, color='y')

Look at trained run episode lengths

In [None]:
# Number of steps each evaluation episode lasted
# (episodes that never terminated are recorded at the `timesteps` cap).
plt.plot(test_episode_lengths, '.')
plt.xlabel('episode')
plt.ylabel('episode length [timesteps]')
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)