# Import relevant libraries

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import seaborn as sns
import time
sns.set()

# Load the environment


In [5]:
taxi = gym.make('Taxi-v3')

# Writing the taxi class

In [6]:
class Taxi3():
    """
    The class takes the Taxi environmnet created by OpenAI and train the agent using SARSA algorithm.
    Args:
        env: the predefined Taxi environment
        n_epochs: total number of episodes
        lr: learning rate, default: 0.01
        df: discount factor, default: 0.99
        init_epsilon: initial probability of exploration, default 1.
        decay_epsilon: the factor by which the epsilon value dereases exponentialy, defalut 0.90.
        min_epsilon: the minimum likelihood of exploration, default 0.01.
    """
    def __init__(self, env, n_epochs, *, lr=0.3, df= 0.99, init_epsilon=1, 
                 min_epsilon=0.01, decay_epsilon=0.9):
        
        self.env = env
        self.n_epochs = n_epochs
        self.lr = lr
        self.df = df
        self.init_epsilon = init_epsilon
        self.epsilon = self.init_epsilon
        self.min_epsilon = min_epsilon
        self.decay_epsilon = decay_epsilon
        
        # the Q table is initialized with random values between -0.5 and +0.5
        self.Q = np.random.uniform(-0.5, 0.5, size=(env.observation_space.n, env.action_space.n))
        self.trained_ = False
        
    def train(self):
        for eps in range(self.n_epochs):
            
            self.epsilon = self.init_epsilon     #at the beginning of each episode, epsilon is reset to its initial value
            s = self.env.reset()
            a = self.env.action_space.sample()
            
            for t in range(self.env.spec.max_episode_steps):
                s_, r, done, __ = self.env.step(a)               #the env returns the next state, s_, and reward, r. 
                a_ = np.argmax(self.Q[s_, :])                    #the action is chosen based on the e-greedy policy
                if np.random.random() < self.epsilon:
                    a_ = self.env.action_space.sample()
                    
                #if epsilon is bigger than its minimum, it should be multiplied by the decay value
                if self.epsilon > self.min_epsilon:       
                    self.epsilon *= self.decay_epsilon
                
                #the Q table is updated based on the action chosen above and the next state returned by the env
                self.Q[s, a] += self.lr * (r + self.df * self.Q[s_, a_] - self.Q[s, a])
                
                #old state and action are replaced by the next state and action obtained above
                a = a_
                s = s_
                
                if done:
                    print(f'Episode {eps} finished after {t} time steps')
                    break
        self.trained_ = True
        
    def test(self):
        if self.trained_:
            s = self.env.reset()
            a = self.env.action_space.sample()
            for t_ in range(self.env.spec.max_episode_steps):
                clear_output(wait=True)
                s_, r, done, __ = self.env.step(a)
                self.env.render()
                a_ = np.argmax(self.Q[s_, :])
                a = a_
                s = s_
                time.sleep(0.8)
                if done:
                    break
            print(f'Finished in {t_} time steps')
        else:
            print('The model is not trained yet. First call the train method.')
                
    def reset_Q(self):
        self.Q = np.zeros((env.observation_space.n, env.action_space.n))
        self.trained_ = False
    
        

In [7]:
Taxi = Taxi3(taxi, 1000)

In [8]:
Taxi.train()

Episode 0 finished after 199 time steps
Episode 1 finished after 199 time steps
Episode 2 finished after 146 time steps
Episode 3 finished after 199 time steps
Episode 4 finished after 199 time steps
Episode 5 finished after 199 time steps
Episode 6 finished after 199 time steps
Episode 7 finished after 199 time steps
Episode 8 finished after 199 time steps
Episode 9 finished after 199 time steps
Episode 10 finished after 199 time steps
Episode 11 finished after 199 time steps
Episode 12 finished after 199 time steps
Episode 13 finished after 199 time steps
Episode 14 finished after 199 time steps
Episode 15 finished after 199 time steps
Episode 16 finished after 199 time steps
Episode 17 finished after 95 time steps
Episode 18 finished after 199 time steps
Episode 19 finished after 118 time steps
Episode 20 finished after 199 time steps
Episode 21 finished after 199 time steps
Episode 22 finished after 199 time steps
Episode 23 finished after 199 time steps
Episode 24 finished after 1

Episode 329 finished after 32 time steps
Episode 330 finished after 25 time steps
Episode 331 finished after 21 time steps
Episode 332 finished after 101 time steps
Episode 333 finished after 46 time steps
Episode 334 finished after 84 time steps
Episode 335 finished after 20 time steps
Episode 336 finished after 19 time steps
Episode 337 finished after 32 time steps
Episode 338 finished after 22 time steps
Episode 339 finished after 29 time steps
Episode 340 finished after 14 time steps
Episode 341 finished after 20 time steps
Episode 342 finished after 18 time steps
Episode 343 finished after 63 time steps
Episode 344 finished after 100 time steps
Episode 345 finished after 95 time steps
Episode 346 finished after 12 time steps
Episode 347 finished after 22 time steps
Episode 348 finished after 54 time steps
Episode 349 finished after 18 time steps
Episode 350 finished after 19 time steps
Episode 351 finished after 33 time steps
Episode 352 finished after 83 time steps
Episode 353 fi

Episode 722 finished after 199 time steps
Episode 723 finished after 18 time steps
Episode 724 finished after 28 time steps
Episode 725 finished after 26 time steps
Episode 726 finished after 28 time steps
Episode 727 finished after 24 time steps
Episode 728 finished after 35 time steps
Episode 729 finished after 39 time steps
Episode 730 finished after 34 time steps
Episode 731 finished after 26 time steps
Episode 732 finished after 24 time steps
Episode 733 finished after 37 time steps
Episode 734 finished after 18 time steps
Episode 735 finished after 42 time steps
Episode 736 finished after 23 time steps
Episode 737 finished after 23 time steps
Episode 738 finished after 34 time steps
Episode 739 finished after 69 time steps
Episode 740 finished after 19 time steps
Episode 741 finished after 18 time steps
Episode 742 finished after 46 time steps
Episode 743 finished after 30 time steps
Episode 744 finished after 37 time steps
Episode 745 finished after 88 time steps
Episode 746 fin