In [1]:
import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import time
from IPython.display import clear_output
import matplotlib as mpl
import random

%matplotlib inline
# %matplotlib notebook

In [2]:
class Drive():
    """Tabular Q-learning agent for the Gymnasium ``Taxi-v3`` environment.

    Attributes:
        env: the environment instance the agent steps through.
        active_state: observation of the current state (an index into ``qtable``).
        state_space: number of discrete states (``observation_space.n``).
        action_space: number of discrete actions (``action_space.n``).
        lr: learning rate applied to the temporal-difference error.
        df: discount factor applied to the best next-state value.
        qtable: array of shape ``(state_space, action_space)`` holding Q-values.
        epoches: number of training episodes run by ``train``.
        max_move: maximum number of steps per episode.
        eps: probability of acting greedily during training (see ``select_action``).
    """

    def __init__(self, render_mode='rgb_array', lr=8e-1, df=5e-1, epoches=1000, max_move=100, eps=0.7, qtable=None):
        """Create the environment and initialise the Q-table.

        Args:
            render_mode: render mode forwarded to ``gym.make``.
            lr: learning rate.
            df: discount factor.
            epoches: number of training episodes.
            max_move: step cap per episode.
            eps: probability of picking the greedy action while training.
            qtable: optional pre-trained Q-table; a zero table is created when
                omitted.
        """
        # `.env` strips the outermost wrapper returned by gym.make.
        # NOTE(review): presumably done so that max_move alone bounds the
        # episode length -- confirm against the installed gymnasium version.
        self.env = gym.make('Taxi-v3', render_mode=render_mode).env
        self.active_state = self.env.reset()[0]
        self.state_space = self.env.observation_space.n
        self.action_space = self.env.action_space.n
        self.lr = lr
        self.df = df
        if qtable is not None:
            self.qtable = qtable
        else:
            self.qtable = np.zeros((self.state_space, self.action_space))
        self.epoches = epoches
        self.max_move = max_move
        self.eps = eps

    def update_qtable(self, state, action, reward, new_state):
        """Apply one Q-learning update for the transition (state, action) -> new_state.

        Q[s, a] <- Q[s, a] + lr * (reward + df * max_a' Q[s', a'] - Q[s, a])
        """
        td_target = reward + self.df * np.max(self.qtable[new_state, :])
        # The old estimate must be inside the term scaled by lr; otherwise the
        # update subtracts the full Q-value instead of moving towards the target.
        td_error = td_target - self.qtable[state, action]
        self.qtable[state, action] += self.lr * td_error

    def reset(self):
        """Start a new episode and store its initial observation."""
        self.active_state = self.env.reset()[0]

    def select_action(self, state, greedy=False):
        """Pick an action for ``state`` with an explore/exploit rule.

        A uniform random action is chosen with probability ``1 - eps``;
        otherwise the action with the highest Q-value is returned. Note that
        ``eps`` is therefore the *exploit* probability here, which is the
        inverse of the usual epsilon-greedy convention.

        Args:
            state: index of the state to act in.
            greedy: when True, always return the highest-valued action.
        """
        # The random draw is taken first (even when greedy=True) so the RNG
        # stream is consumed exactly as before.
        explore = (np.random.uniform() > self.eps) and (not greedy)
        if explore:
            # choose an action randomly
            return np.random.choice(self.action_space)
        # act greedily: pick the action with the max Q-value
        return np.argmax(self.qtable[state, :])

    def train(self):
        """Run ``epoches`` episodes of at most ``max_move`` steps, updating the Q-table."""
        for epoch in range(self.epoches):
            self.reset()
            for _ in range(self.max_move):
                action = self.select_action(self.active_state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                self.update_qtable(self.active_state, action, reward, next_state)
                self.active_state = next_state
                # Stop on either end-of-episode signal returned by step().
                if terminated or truncated:
                    break
            print(f'epoch {epoch}/{self.epoches}')

    def optimal_drive(self, max_ride=5):
        """Play ``max_ride`` episodes with the greedy policy, rendering every step.

        The environment is closed once all rides have been played.
        """
        self.reset()
        for i in range(max_ride):
            for _ in range(self.max_move):
                action = self.select_action(self.active_state, True)
                next_state, _, terminated, truncated, _ = self.env.step(action)
                self.active_state = next_state
                self.env.render()
                if terminated or truncated:
                    break
            print(f'{i+1} of {max_ride} ride completed!!')
            self.reset()

        self.env.close()

    def random_drive(self, n_move=10):
        """Take up to ``n_move`` uniformly random actions, rendering every step.

        Stops early if the episode ends. The environment is reset and then
        closed before returning.
        """
        self.reset()
        for i in range(n_move):
            print(f'step {i+1}/{n_move}')
            ract = self.env.action_space.sample()
            # step() returns a 5-tuple; only its first item is the new state.
            next_state, _, terminated, truncated, _ = self.env.step(ract)
            self.active_state = next_state
            print(self.active_state)
            self.env.render()
            if terminated or truncated:
                break
        # Reset while the environment is still open, then release it
        # (same order as optimal_drive).
        self.reset()
        self.env.close()

In [7]:
class Driver(Drive):
    """Thin convenience layer over ``Drive`` with train and demo entry points."""

    def __init__(self, *args, **kwargs):
        """Forward any arguments to ``Drive.__init__``.

        ``Driver()`` with no arguments still uses the base-class defaults.
        """
        super().__init__(*args, **kwargs)

    def driving_training(self):
        """Train the Q-table by delegating to ``train``."""
        self.train()

    def take_customer_ride(self, max_ride=5):
        """Replay the greedy policy in a human-rendered environment.

        Args:
            max_ride: number of rides (episodes) to play; forwarded to
                ``optimal_drive``.
        """
        # Release the environment used so far before replacing it, so it is
        # not left open once the reference is overwritten.
        self.env.close()
        self.env = gym.make('Taxi-v3', render_mode='human').env
        self.active_state = self.env.reset()[0]
        self.optimal_drive(max_ride)

In [8]:
# Build the agent; no arguments are passed, so Drive's default hyperparameters apply.
driver = Driver()

In [10]:
# Train the Q-table; one progress line is printed per epoch.
driver.driving_training()

epoch 0/1000
epoch 1/1000
epoch 2/1000
epoch 3/1000
epoch 4/1000
epoch 5/1000
epoch 6/1000
epoch 7/1000
epoch 8/1000
epoch 9/1000
epoch 10/1000
.....
epoch 705/1000
epoch 706/1000
epoch 707/1000
epoch 708/1000
epoch 709/1000
epoch 710/1000
epoch 711/1000
epoch 712/1000
epoch 713/1000
epoch 999/1000


In [11]:
# Demo the learned policy: re-creates the env with render_mode='human' and plays greedy rides.
driver.take_customer_ride()

1 of 5 ride completed!!
2 of 5 ride completed!!
3 of 5 ride completed!!
4 of 5 ride completed!!
5 of 5 ride completed!!
