In [10]:
!pip install gymnasium
!pip install -q matplotlib
!pip install pygame
!pip install numpy
!pip install progressbar

import gymnasium as gym
import numpy as np
import progressbar



In [11]:
# Fixed seed so that a fresh "Restart Kernel -> Run All" reproduces the same run.
SEED = 42

# Training/evaluation environment (created without a render mode).
env = gym.make('Taxi-v3')

# Seed every source of randomness the following cells draw from:
#   - NumPy's global RNG      -> the epsilon-greedy coin flips
#   - the action-space sampler -> the random exploratory actions
#   - the environment itself   -> seeded through reset(seed=...)
np.random.seed(SEED)
env.action_space.seed(SEED)
env.reset(seed=SEED)

# Parameters
alpha = 0.1  # Learning rate
gamma = 0.6  # Discount factor
epsilon = 0.1  # Exploration rate
num_episodes = 100000

# Initialize Q-table with zeros: one row per state, one column per action
q_table = np.zeros([env.observation_space.n, env.action_space.n])


print(q_table)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [23]:
# Training the agent: tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads env, q_table, alpha, gamma, epsilon and num_episodes from the setup cell
# and updates q_table in place.
bar = progressbar.ProgressBar()

# Iterating over the wrapped range lets the bar start, advance and finish itself,
# so no manual start()/update()/finish() calls are needed.
for i in bar(range(num_episodes)):
    state = env.reset()[0]  # reset() returns (observation, info)
    done = False

    while not done:
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore action space
        else:
            action = np.argmax(q_table[state])  # Exploit learned values

        next_state, reward, terminated, truncated, info = env.step(action)
        # The episode is over when the environment reports either flag;
        # looking only at the first one keeps stepping past a truncation.
        done = terminated or truncated

        old_value = q_table[state, action]
        # A terminated episode has no future return, so do not bootstrap from it.
        # A truncated episode was merely cut short, so bootstrapping stays valid.
        next_max = 0.0 if terminated else np.max(q_table[next_state])

        # Q-learning formula
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        state = next_state

print("Training finished.\n")

100% |########################################################################|

Training finished.






In [25]:
# Evaluate the agent's performance after Q-learning.
# Runs the greedy policy (argmax over q_table) with no exploration and no learning,
# counting steps and -10 rewards ("penalties") per episode.
total_epochs, total_penalties = 0, 0
episodes = 10000

# The bar is sized by the number of evaluation episodes actually run
# (not by the training episode count).
bar = progressbar.ProgressBar()

for episode in bar(range(episodes)):
    state = env.reset()[0]  # reset() returns (observation, info)
    epochs, penalties = 0, 0
    done = False

    while not done:
        action = np.argmax(q_table[state])
        next_state, reward, terminated, truncated, info = env.step(action)
        # Stop on truncation as well: a greedy policy that never reaches the
        # goal would otherwise keep this loop running forever.
        done = terminated or truncated

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    total_penalties += penalties
    total_epochs += epochs

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

100% |########################################################################|

Results after 10000 episodes:
Average timesteps per episode: 13.0658
Average penalties per episode: 0.0





In [26]:
# Visualize the agent for 10 episodes using the greedy policy from q_table.
# A separate, human-rendered environment is used so that `env` (the training
# environment) is not replaced by a rendering one that gets closed below;
# re-running earlier cells afterwards therefore still works.
render_env = gym.make('Taxi-v3', render_mode="human")

try:
    for _ in range(10):
        state = render_env.reset()[0]  # reset() returns (observation, info)
        epochs, penalties = 0, 0
        done = False

        while not done:
            action = np.argmax(q_table[state])
            next_state, reward, terminated, truncated, info = render_env.step(action)
            # End the lap on truncation too, so a non-terminating policy
            # cannot keep the render loop running forever.
            done = terminated or truncated

            if reward == -10:
                penalties += 1

            state = next_state
            epochs += 1

        print(f"Time steps: {epochs}, Penalties: {penalties}")
finally:
    # Always release the render window, even if the loop raises or is interrupted.
    render_env.close()

Time steps: 13, Penalties: 0
Time steps: 17, Penalties: 0
Time steps: 18, Penalties: 0
Time steps: 18, Penalties: 0
Time steps: 11, Penalties: 0
Time steps: 18, Penalties: 0
Time steps: 11, Penalties: 0
Time steps: 8, Penalties: 0
Time steps: 14, Penalties: 0
Time steps: 13, Penalties: 0
