In [1]:
!pip install gymnasium
!pip install moviepy



In [2]:
import random
import time

import gymnasium as gym
import numpy as np
import moviepy.editor as mpy
from tqdm import trange

In [3]:
# Create the Taxi-v3 environment. render_mode='rgb_array' is requested because
# the evaluation cell collects the result of env.render() for every step and
# turns those frames into a video.
env = gym.make('Taxi-v3', render_mode='rgb_array')

In [4]:
# Sizes of the discrete observation and action spaces; together they give the
# shape of the Q-table allocated in the training cell.
state_space, action_space = env.observation_space.n, env.action_space.n
print(f"Possible states: {state_space}")
print(f"Possible actions: {action_space}")

Possible states: 500
Possible actions: 6


In [5]:
# Hyperparameters
# Number of episodes run by the training loop.
total_episodes = 25000
# Number of greedy episodes run by the evaluation loop.
total_test_episodes = 10
# Upper bound on the number of steps taken in one episode (training and evaluation).
max_steps = 200

# Step size applied to the temporal-difference error in the Q-table update.
learning_rate = 0.01
# Discount factor multiplying the best Q-value of the next state.
gamma = 0.99

# Exploration parameters
# Current exploration threshold read by epsilon_greedy_policy: a random action
# is taken when a uniform draw in [0, 1] does not exceed it. The training loop
# overwrites this value at the start of every episode.
epsilon = 1.0
# Value of epsilon at episode 0 of the decay schedule.
max_epsilon = 1.0
# Floor that epsilon decays towards as the episode index grows.
min_epsilon = 0.001
# Rate of the exponential decay: epsilon shrinks with exp(-decay_rate * episode).
decay_rate = 0.01

In [6]:
def epsilon_greedy_policy(Q, state, eps=None):
    """Choose an action for `state` using an epsilon-greedy rule.

    Args:
        Q: Q-table; Q[state] must yield the per-action values of `state`.
        state: Index of the current state.
        eps: Exploration threshold. When omitted, the notebook-level
            `epsilon` is read at call time, so existing two-argument calls
            (such as the one in the training loop) behave exactly as before.

    Returns:
        np.argmax(Q[state]) when a uniform draw in [0, 1] exceeds the
        threshold (exploit); otherwise an action sampled from
        env.action_space (explore).
    """
    # Resolve the threshold lazily so the global decay schedule is honoured.
    explore_threshold = epsilon if eps is None else eps
    if random.uniform(0, 1) > explore_threshold:
        return np.argmax(Q[state])  # exploit
    return env.action_space.sample()  # explore

In [7]:
 # Tabular Q-learning: one row per state, one column per action, all values
 # starting at zero.
 Q = np.zeros((state_space, action_space))
 for episode in range(total_episodes):
     state, _ = env.reset()

     # Exponentially anneal exploration from max_epsilon towards min_epsilon.
     # This rebinds the notebook-level `epsilon` that epsilon_greedy_policy reads.
     epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

     for step in range(max_steps):
         action = epsilon_greedy_policy(Q, state)
         observation, reward, terminated, truncated, info = env.step(action)

         # A terminated transition has no successor to bootstrap from, so its
         # future value is zero. A truncated one (time limit) is only cut
         # short, so it still bootstraps from the next state's best Q-value.
         next_value = 0.0 if terminated else np.max(Q[observation])
         td_target = reward + gamma * next_value
         Q[state][action] += learning_rate * (td_target - Q[state][action])

         if terminated or truncated:
             break
         state = observation

In [8]:
# Evaluate the learned Q-table with a purely greedy policy, collecting the
# return of every episode and every rendered frame for the video.
rewards = []
frames = []

for episode in trange(total_test_episodes):
    state, _ = env.reset()
    total_rewards = 0

    for step in range(max_steps):
        # Render the state the agent is about to act in and store the frame.
        frames.append(env.render())

        # Take the action with the maximum expected future reward.
        action = np.argmax(Q[state])
        observation, reward, terminated, truncated, info = env.step(action)
        total_rewards += reward

        if terminated or truncated:
            break

        state = observation

    # Also store the frame reached by the last action; otherwise the video
    # stops one step short and never shows how the episode ended.
    frames.append(env.render())

    # Record the return unconditionally. Appending only when the environment
    # signals the end would drop episodes that run out of max_steps first,
    # while the average below still counts them.
    rewards.append(total_rewards)

env.close()

print("Score over time: " + str(sum(rewards) / len(rewards)))

clip = mpy.ImageSequenceClip(frames, fps=5)
clip.write_videofile("taxi_v3_test.mp4")

100%|██████████| 10/10 [00:01<00:00,  5.78it/s]


Score over time: 7.8
Moviepy - Building video taxi_v3_test.mp4.
Moviepy - Writing video taxi_v3_test.mp4





Moviepy - Done !
Moviepy - video ready taxi_v3_test.mp4
