# In [None]:
# To run this code, install the dependencies first:
#   pip install gymnasium numpy matplotlib
#   pip install "gymnasium[toy_text]"

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

# Tabular Q-learning on Gymnasium's Taxi-v3 with an epsilon-greedy policy.
# The script trains for a fixed number of episodes, records the total reward
# of every episode, and plots those totals as a learning curve.

# Initialize environment
env = gym.make("Taxi-v3", render_mode="ansi")
actions = env.action_space.n
states = env.observation_space.n
# One row per discrete state, one column per discrete action.
Q = np.zeros((states, actions))

# Hyperparameters
episodes = 1000
learning_rate = 0.7  # step size applied to each temporal-difference error
discount = 0.618  # weight of the bootstrapped next-state value
epsilon = 1.0  # initial probability of picking a random action
epsilon_decay = 0.01  # fraction of epsilon removed after every episode
min_epsilon = 0.01  # exploration probability never drops below this floor
rewards = []  # total (undiscounted) reward of each episode

# Training loop
# The try/finally guarantees the environment is closed even if an episode
# raises part-way through training.
try:
    for ep in range(episodes):
        state, info = env.reset()
        done = False
        total_reward = 0

        while not done:
            # Epsilon-greedy action selection
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                # Cast to a plain int so the action is not a NumPy scalar.
                action = int(np.argmax(Q[state]))

            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            # Q-Learning update rule
            # A terminated transition has no future return, so the bootstrap
            # term is dropped there. A merely truncated episode still
            # bootstraps from the next state's best value.
            old_value = Q[state, action]
            next_max = 0.0 if terminated else np.max(Q[next_state])
            td_target = reward + discount * next_max
            Q[state, action] = old_value + learning_rate * (td_target - old_value)

            state = next_state
            total_reward += reward

        # Decay exploration
        epsilon = max(min_epsilon, epsilon * (1 - epsilon_decay))
        rewards.append(total_reward)
finally:
    env.close()

# Plot learning curve
plt.plot(rewards)
plt.title("Episode Rewards Over Time")
plt.xlabel("Episodes")
plt.ylabel("Rewards")
plt.grid(True)
plt.show()
