#### **Dynamic Programming Taxi Environment**

In this example there is an overflow doing the policy iteration (in my package combination). 
I did not clip the values for the sake of loop <br>
clarity. Sorry for the warning.


In [1]:
import gymnasium as gym
import numpy as np
import session_info

In [2]:
# Initialize the Taxi environment
env = gym.make('Taxi-v3', render_mode='human')
env = env.unwrapped  # Access the raw environment without the TimeLimit wrapper
env.reset()  # Reset the environment

# Parameters
gamma = 0.8  # Discount factor
theta = 1e-8  # Convergence threshold

# Supporting functions
def policy_evaluation(env, policy, theta=1e-8, gamma=0.9):
    V = np.zeros(env.observation_space.n)
    while True:
        delta = 0

        for s in range(env.observation_space.n):
            
            old_v = V[s]
            new_v = sum(p * (r + gamma * V[ns]) for a, action_prob in enumerate(policy[s]) if action_prob > 0 
                        for p, ns, r, _ in env.P[s][a])
            V[s] = new_v
            delta = max(delta, np.abs(old_v - V[s]))
        if delta < theta:
            break
    return V

def policy_improvement(env, V, gamma=0.9):
    policy = np.zeros([env.observation_space.n, env.action_space.n])
    for s in range(env.observation_space.n):
        q_values = np.zeros(env.action_space.n)
        for a in range(env.action_space.n):
            q_values[a] = sum([p * (r + gamma * V[ns]) for p, ns, r, _ in env.P[s][a]])
        best_action = np.argmax(q_values)
        policy[s, best_action] = 1.0
    return policy

def policy_iteration(env, theta=1e-8, gamma=0.9):
    policy = np.ones([env.observation_space.n, env.action_space.n]) / env.action_space.n
    while True:
        V = policy_evaluation(env, policy, theta, gamma)
        new_policy = policy_improvement(env, V, gamma)
        if np.array_equal(policy, new_policy):
            break
        policy = new_policy
    return policy, V

def value_iteration(env, theta=1e-8, gamma=0.9):
    V = np.zeros(env.observation_space.n)
    while True:
        delta = 0
        for s in range(env.observation_space.n):
            q_values = [sum([p * (r + gamma * V[ns]) for p, ns, r, _ in env.P[s][a]]) for a in range(env.action_space.n)]
            max_q_value = max(q_values)
            delta = max(delta, abs(V[s] - max_q_value))
            V[s] = max_q_value
        if delta < theta:
            break
    policy = policy_improvement(env, V, gamma)
    return policy, V

# Run policy iteration
pi_policy, pi_value = policy_iteration(env, theta=1e-8, gamma=0.9)
print("\nPolicy from Policy Iteration:")

# Simulate using the optimal policy
def simulate_optimal_policy_taxi(env, policy, max_steps=100):
    """
    Simulate an episode in the Taxi environment using the optimal policy.
    
    Args:
        env: The Taxi environment.
        policy: The optimal policy from policy or value iteration.
        max_steps: Maximum steps to simulate.

    Returns:
        total_reward: Total accumulated reward during simulation.
        path: List of states visited.
        actions: List of actions taken.
    """
    state = env.reset()[0]  # Reset environment and get initial state
    total_reward = 0
    path = [state]  # Track states visited
    actions = []  # Track actions taken

    for _ in range(max_steps):
        action = np.argmax(policy[state])  # Best action from the policy
        next_state, reward, done, _, _ = env.step(action)  # Step in environment

        total_reward += reward
        path.append(next_state)
        actions.append(action)

        state = next_state
        env.render()  # Optional: Render each step

        if done:
            break

    return total_reward, path, actions



  new_v = sum(p * (r + gamma * V[ns]) for a, action_prob in enumerate(policy[s]) if action_prob > 0
  delta = max(delta, np.abs(old_v - V[s]))



Policy from Policy Iteration:


In [3]:
# Using the optimal policy to simulate
total_reward, path, actions = simulate_optimal_policy_taxi(env, pi_policy)
print("Total Reward:", total_reward)
print("Path taken:", path)
print("Actions taken:", actions)

Total Reward: 14
Path taken: [308, 408, 416, 316, 216, 116, 16, 0]
Actions taken: [0, 4, 1, 1, 1, 1, 5]


In [4]:
session_info.show(html=False)

-----
gymnasium           1.0.0
numpy               1.26.4
session_info        1.0.0
-----
IPython             8.26.0
jupyter_client      8.6.2
jupyter_core        5.7.2
-----
Python 3.12.3 (main, Sep 11 2024, 14:17:37) [GCC 13.2.0]
Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.39
-----
Session information updated at 2024-11-02 19:36
