In [None]:
!pip install gymnasium



In [None]:
import gymnasium as gym
import numpy as np

# Build the Taxi-v3 environment that the planner below works on.
env = gym.make("Taxi-v3")

# Sizes of the discrete state and action spaces.
n_states, n_actions = env.observation_space.n, env.action_space.n

# The transition table lives on the unwrapped environment; it is indexed as
# P[state][action] and yields (prob, next_state, reward, done) tuples.
P = env.unwrapped.P

# Value Iteration parameters
gamma = 0.9  # Discount factor applied to future returns
theta = 1e-6  # Convergence threshold on the largest value change per sweep


def _action_values(state, values):
    """Return the one-step lookahead value of every action in ``state``.

    Args:
        state: Index of the state to evaluate.
        values: Current state-value estimates, indexed by state.

    Returns:
        Array of length ``n_actions`` with the expected return of taking each
        action in ``state`` and following ``values`` afterwards.
    """
    q_values = np.zeros(n_actions)
    for a in range(n_actions):
        for prob, next_state, reward, done in P[state][a]:
            # A transition flagged as done ends the episode, so no further
            # reward can follow it: bootstrap only from non-terminal
            # successors. Otherwise the values count rewards collected
            # "after" the episode is over.
            future = 0.0 if done else gamma * values[next_state]
            q_values[a] += prob * (reward + future)
    return q_values


# Initialize value function
V = np.zeros(n_states)

# ----- Value Iteration -----
# In-place sweeps: states updated earlier in a sweep are already visible to
# states updated later in the same sweep.
iteration = 0
while True:
    delta = 0
    for s in range(n_states):
        v = V[s]
        V[s] = _action_values(s, V).max()
        delta = max(delta, abs(v - V[s]))
    iteration += 1
    if delta < theta:
        break

# ----- Extract Optimal Policy -----
# Act greedily with respect to the converged values.
policy = np.zeros(n_states, dtype=int)
for s in range(n_states):
    policy[s] = np.argmax(_action_values(s, V))

# ----- Display Sample Results -----
# Report the greedy action and the state value for a handful of states.
sample_states = list(range(0, 401, 100))
for s, action, value in zip(sample_states, policy[sample_states], V[sample_states]):
    print(f"State {s}: Optimal Action = {action}, V*(s) = {value:.4f}")

print(f"\nConverged in {iteration} iterations.")

State 0: Optimal Action = 4, V*(s) = 89.4737
State 100: Optimal Action = 1, V*(s) = 79.5263
State 200: Optimal Action = 1, V*(s) = 70.5737
State 300: Optimal Action = 1, V*(s) = 62.5163
State 400: Optimal Action = 1, V*(s) = 55.2647

Converged in 85 iterations.
