# Import relevant libraries

# In[1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import seaborn as sns
import time
sns.set()

# Load the environment


# In[2]:
# Build the 'Taxi-v3' environment with gym.make and keep the handle in `taxi`.
# NOTE(review): presumably this is what gets passed as `env` to Taxi3 -- confirm in later cells.
taxi = gym.make('Taxi-v3')

# Writing the taxi class

# In[3]:
class Taxi3():
    """
    Tabular SARSA agent for a discrete gym environment such as Taxi.

    The agent keeps a Q table of shape (number of states, number of actions)
    and follows an epsilon-greedy policy whose epsilon is reset at the start of
    every episode and decays after every step.

    Args:
        env: the environment; it must expose `observation_space.n`,
            `action_space.n`, `action_space.sample()`, `reset()`, `step()`
            and `spec.max_episode_steps`.
        n_epochs: total number of training episodes.
        lr: learning rate, default 0.3.
        df: discount factor, default 0.99.
        init_epsilon: initial probability of exploration, default 1.
        min_epsilon: the minimum probability of exploration, default 0.01.
        decay_epsilon: the factor by which epsilon is multiplied after each
            step (exponential decay), default 0.9.
    """
    def __init__(self, env, n_epochs, *, lr=0.3, df= 0.99, init_epsilon=1,
                 min_epsilon=0.01, decay_epsilon=0.9):

        self.env = env
        self.n_epochs = n_epochs
        self.lr = lr
        self.df = df
        self.init_epsilon = init_epsilon
        self.epsilon = self.init_epsilon
        self.min_epsilon = min_epsilon
        self.decay_epsilon = decay_epsilon

        # the Q table is initialized with random values between -0.5 and +0.5
        self.Q = np.random.uniform(-0.5, 0.5, size=(env.observation_space.n, env.action_space.n))
        self.trained_ = False

    def _reset_env(self):
        """Reset the environment and return only the initial state."""
        out = self.env.reset()
        # newer gym versions return (observation, info); older ones return the observation alone
        if isinstance(out, tuple):
            return out[0]
        return out

    def _step_env(self, action):
        """
        Apply `action` and return (next_state, reward, terminal, done).

        `done` is True whenever the episode is over; `terminal` is True only
        when it ended in a real terminal state rather than by a time limit.
        """
        out = self.env.step(action)
        if len(out) == 5:
            # newer gym API: (obs, reward, terminated, truncated, info)
            s_, r, terminated, truncated, _ = out
            return s_, r, bool(terminated), bool(terminated or truncated)

        # classic gym API: (obs, reward, done, info); the TimeLimit wrapper
        # flags a cut-off episode through info['TimeLimit.truncated']
        s_, r, done, info = out
        truncated = isinstance(info, dict) and bool(info.get('TimeLimit.truncated', False))
        return s_, r, bool(done) and not truncated, bool(done)

    def _epsilon_greedy(self, s):
        """Return a random action with probability epsilon, else the greedy action for state `s`."""
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.Q[s, :])

    def train(self):
        """
        Train the model using the predefined hyperparameters.

        Runs `n_epochs` episodes of SARSA, prints the length of every episode
        that finishes, and marks the model as trained.
        """
        for eps in range(self.n_epochs):

            # at the beginning of each episode, epsilon is reset to its initial value
            self.epsilon = self.init_epsilon
            s = self._reset_env()
            # with the default init_epsilon=1 this is always a random action
            a = self._epsilon_greedy(s)

            for t in range(self.env.spec.max_episode_steps):
                s_, r, terminal, done = self._step_env(a)
                a_ = self._epsilon_greedy(s_)

                # decay epsilon while it is above its minimum, never dropping below that minimum
                if self.epsilon > self.min_epsilon:
                    self.epsilon = max(self.min_epsilon, self.epsilon * self.decay_epsilon)

                # SARSA target: a terminal state has no future return, so its
                # (never-updated, randomly initialised) Q values must not be bootstrapped from
                target = r if terminal else r + self.df * self.Q[s_, a_]
                self.Q[s, a] += self.lr * (target - self.Q[s, a])

                # old state and action are replaced by the next state and action obtained above
                s, a = s_, a_

                if done:
                    # t is zero-based, so t + 1 steps have been taken
                    print(f'Episode {eps} finished after {t + 1} time steps')
                    break
        self.trained_ = True

    def test(self):
        """
        Run the trained model greedily for one episode, rendering every step.

        Prints a notice and returns without running if the model is not trained.
        """
        if not self.trained_:
            print('The model is not trained yet. First call the train method.')
            return

        s = self._reset_env()
        steps = 0
        for _ in range(self.env.spec.max_episode_steps):
            # always act greedily with respect to the learned Q table, including the first step
            a = np.argmax(self.Q[s, :])
            clear_output(wait=True)
            s, _, _, done = self._step_env(a)
            self.env.render()
            steps += 1
            time.sleep(0.8)
            if done:
                break
        print(f'Finished in {steps} time steps')

    def reset_Q(self):
        """
        Reset the Q table to zeros. The test method is not available after calling this method
        until the model is trained again.
        """
        # use the agent's own environment; a bare `env` is not defined in this scope
        self.Q = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        self.trained_ = False
    
        