In [None]:
from __future__ import annotations
from collections import defaultdict
from matplotlib.patches import Patch
from tqdm import tqdm
from IPython.display import clear_output

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import random
import time
import gymnasium as gym

In [None]:
# Build the Taxi-v3 environment used by every later cell.
env = gym.make(id="Taxi-v3")

In [None]:
# --- Environment dimensions -------------------------------------------------
aSize = env.action_space.n        # number of discrete actions
sSize = env.observation_space.n   # number of discrete states

# Q-table: one row per state, one column per action, initialised to zero.
qTable = np.zeros((sSize, aSize))

# --- Reproducibility --------------------------------------------------------
# Training draws from Python's `random`, from the action-space sampler and
# from the environment's own RNG; seed all of them so that
# "Restart Kernel -> Run All" yields the same curves every time.
randomSeed = 42
random.seed(randomSeed)
np.random.seed(randomSeed)        # in case other cells draw from np.random
env.action_space.seed(randomSeed)
env.reset(seed=randomSeed)        # seeds the environment RNG for later resets

# --- Training schedule ------------------------------------------------------
episodesCount = 10000   # number of training episodes
maxStep = 100           # hard cap on steps per episode

# --- Q-learning hyper-parameters --------------------------------------------
learningRate = 0.1      # weight given to the new estimate in each update
discountRate = 0.99     # discount applied to the best next-state value

# --- Epsilon-greedy exploration ---------------------------------------------
explorationRate = 1     # current probability of taking a random action
maxExploration = 1      # starting value of the exploration rate
minExploration = 0.01   # floor that the exploration rate decays towards
eDecay = 0.001          # exponential decay rate, applied per episode

# --- Bookkeeping ------------------------------------------------------------
qTables = []                          # Q-table copies stored by the training cell
intervals = [0, 1000, 5000, 10000]    # episode marks at which a copy is stored

rewardsList = []                      # total reward collected in each episode

In [None]:
# Tabular Q-learning with an epsilon-greedy policy.
#
# `intervals` is interpreted as "number of completed episodes": a copy of the
# Q-table is stored in `qTables` once that many episodes have been trained, so
# 0 is the untrained table and `episodesCount` is the final one.

# Always capture the fully trained table.
if episodesCount not in intervals:
    intervals.append(episodesCount)

# Start from a clean slate so that re-running this cell on its own retrains
# from scratch instead of continuing from the previous run and appending
# duplicate snapshots.
qTable = np.zeros((sSize, aSize))
explorationRate = maxExploration
qTables = []
rewardsList = []
errorList = []

# Snapshot of the untrained table (zero completed episodes).
if 0 in intervals:
    qTables.append(qTable.copy())

for episode in range(episodesCount):
    state = env.reset()[0]
    done = False
    currentReward = 0
    totalError = 0  # sum of |Q_new - Q_old| over this episode's updates

    for step in range(maxStep):

        # Epsilon-greedy: exploit the best known action with probability
        # (1 - explorationRate), otherwise take a random action.
        explorationRate_threshold = random.uniform(0, 1)
        if explorationRate_threshold > explorationRate:
            action = np.argmax(qTable[state, :])
        else:
            action = env.action_space.sample()

        oldQ = qTable[state, action]
        # step() returns (observation, reward, terminated, truncated, info);
        # keep both end-of-episode flags rather than discarding `truncated`.
        currently, reward, terminated, truncated, _ = env.step(action)

        # A terminal state has no future return, so do not bootstrap from it.
        # A truncated (time-limited) state is not terminal and still bootstraps.
        winnersMove = 0.0 if terminated else np.max(qTable[currently, :])
        newQ = (1 - learningRate) * oldQ + learningRate * (reward + discountRate * winnersMove)
        qTable[state, action] = newQ

        error = abs(newQ - oldQ)
        totalError += error

        state = currently
        currentReward += reward

        # End the episode on either termination or truncation.
        done = terminated or truncated
        if done:
            break

    # Exponentially decay exploration from maxExploration towards minExploration.
    explorationRate = minExploration + \
        (maxExploration - minExploration) * np.exp(-eDecay * episode)

    rewardsList.append(currentReward)
    errorList.append(totalError)

    # `episode` is zero-based, so `episode + 1` episodes are complete here.
    # Comparing the completed count lets the final mark (episodesCount) match.
    if (episode + 1) in intervals:
        qTables.append(qTable.copy())

In [None]:
def plotRE(rewardsList, errorList):
    """Plot per-episode reward and training error against the episode index.

    The reward curve is drawn on the left y-axis and the error curve on a
    twin right y-axis, so two differently scaled series share one x-axis.

    Args:
        rewardsList: Sequence with one reward value per episode.
        errorList: Sequence with one training-error value per episode; it is
            plotted against the same episode indices as ``rewardsList``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    episodes = range(len(rewardsList))

    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Left axis: reward, colour-coded to match its line.
    ax1.set_xlabel('Episodes')
    ax1.set_ylabel('Reward', color='tab:green')
    ax1.plot(episodes, rewardsList, color='tab:green', label='Reward per Episode')
    ax1.tick_params(axis='y', labelcolor='tab:green')

    # Right axis: training error on its own scale.
    ax2 = ax1.twinx()
    ax2.set_ylabel('Training Error', color='tab:blue')
    ax2.plot(episodes, errorList, color='tab:blue', label='Error per Episode')
    ax2.tick_params(axis='y', labelcolor='tab:blue')

    # The two lines live on different Axes, so merge their handles into one
    # legend; otherwise the `label=` values above are never displayed.
    rewardHandles, rewardLabels = ax1.get_legend_handles_labels()
    errorHandles, errorLabels = ax2.get_legend_handles_labels()
    ax2.legend(rewardHandles + errorHandles, rewardLabels + errorLabels,
               loc='upper right')

    fig.suptitle('Rewards and Errors Over Time', fontsize=16)
    fig.tight_layout()
    plt.show()

# Visualise the training history collected by the training cell.
plotRE(rewardsList=rewardsList, errorList=errorList)