**Experiment No. 6**

Reinforcement Learning:

Solve the Taxi problem using reinforcement learning (tabular Q-learning), where the agent acts as a taxi driver that picks up a passenger at one location and then drops the passenger off at their destination.

In [None]:
import numpy as np
import gym
import random

# Build the Taxi-v3 environment the agent will interact with
env = gym.make("Taxi-v3")

# Q-table: one row per discrete state, one column per action, all zeros
# to start with (500 states x 6 actions for Taxi)
table_shape = (env.observation_space.n, env.action_space.n)
q_table = np.zeros(table_shape)

# Hyperparameters
alpha = 0.1         # Learning rate: weight given to each new estimate
gamma = 0.6         # Discount factor applied to the next state's best value
epsilon = 0.1       # Probability of taking a random (exploratory) action
episodes = 10_000   # Number of training episodes

# Default "truncated" flag; the training loop reads it when env.step()
# returns the older 4-value tuple that carries no truncation flag.
truncated = 0
# NOTE(review): not referenced anywhere else in the visible code; it is not
# passed to gym.make(), so it has no effect on the environment here.
new_step_api = True

# Training the agent using tabular Q-learning with an epsilon-greedy policy
for episode in range(episodes):
  # Reset exactly once per episode. Older gym versions return the bare
  # state while newer ones return (state, info); unpack the tuple form
  # rather than calling reset() again just to inspect its return type.
  reset_result = env.reset()
  state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
  done = False

  while not done:
    # Exploration-exploitation tradeoff
    if random.uniform(0, 1) < epsilon:
      action = env.action_space.sample()  # Explore: random action
    else:
      action = np.argmax(q_table[state])  # Exploit: best known action

    # Take the action and observe the outcome
    result = env.step(action)
    if len(result) == 4:
      # Older gym versions: a single "done" flag
      next_state, reward, done, info = result
    else:
      # Newer gym versions: separate terminated / truncated flags;
      # the episode ends as soon as either one is set
      next_state, reward, terminated, truncated, info = result
      done = terminated or truncated

    # Q-learning update: blend the old estimate with the bootstrapped
    # target (reward plus discounted best value of the next state)
    old_value = q_table[state, action]
    next_max = np.max(q_table[next_state])
    q_table[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

    # Transition to the next state
    state = next_state

# Testing the trained agent: act greedily on the learned Q-table (no
# exploration) and collect per-episode step and penalty counts
total_epochs, total_penalties = 0, 0
episodes_test = 100

for _ in range(episodes_test):
  # Reset exactly once per episode. Older gym versions return the bare
  # state while newer ones return (state, info); unpack the tuple form
  # rather than calling reset() again just to inspect its return type.
  reset_result = env.reset()
  state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
  epochs, penalties = 0, 0
  done = False

  while not done:
    action = np.argmax(q_table[state])  # Exploit learned values
    result = env.step(action)

    if len(result) == 4:
      # Older gym versions: a single "done" flag
      next_state, reward, done, info = result
    else:
      # Newer gym versions: separate terminated / truncated flags;
      # the episode ends as soon as either one is set
      next_state, reward, terminated, truncated, info = result
      done = terminated or truncated

    if reward == -10:  # Penalty for illegal pick-up/drop-off
      penalties += 1

    epochs += 1
    state = next_state

  total_penalties += penalties
  total_epochs += epochs

print(f"Results after {episodes_test} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes_test}")
print(f"Average penalties per episode: {total_penalties / episodes_test}")


Results after 100 episodes:
Average timesteps per episode: 40.88
Average penalties per episode: 0.0
