# Frozen Lake with Policy Iteration

In [None]:
import numpy as np
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map

# Hyperparameters shared by the policy-iteration routines in this file
GAMMA: float = 0.95  # Default discount factor applied to future value
THETA: float = 1e-6  # Evaluation stops once a full sweep changes every value by less than this
EPISODES: int = 100  # Default number of rollouts used when testing a policy

def policy_evaluation(policy, env, gamma=GAMMA, theta=THETA):
    """Compute the state-value function of ``policy`` by iterative sweeps.

    Args:
        policy: Indexed as ``policy[state]``, yielding one probability per action.
        env: Environment exposing ``observation_space.n`` and the transition
            table ``unwrapped.P[state][action]`` as
            ``(prob, next_state, reward, done)`` tuples.
        gamma: Discount factor.
        theta: Sweeping stops once the largest per-state change in a sweep
            falls below this value.

    Returns:
        1-D array with one value estimate per state.
    """
    n_states = env.observation_space.n
    transitions = env.unwrapped.P
    values = np.zeros(n_states)

    converged = False
    while not converged:
        largest_change = 0
        for state in range(n_states):
            # Values are updated in place, so states later in the sweep
            # already see this sweep's fresh estimates.
            backup = 0
            for action, pi_sa in enumerate(policy[state]):
                for p, successor, r, terminal in transitions[state][action]:
                    # No bootstrapping from the successor of a terminal transition.
                    backup += pi_sa * p * (r + gamma * values[successor] * (not terminal))
            largest_change = max(largest_change, abs(values[state] - backup))
            values[state] = backup
        converged = largest_change < theta
    return values

def policy_improvement(env, V, gamma=GAMMA):
    """Build the deterministic policy that is greedy with respect to ``V``.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and the transition table ``unwrapped.P[state][action]`` as
            ``(prob, next_state, reward, done)`` tuples.
        V: State values, indexable by state.
        gamma: Discount factor.

    Returns:
        Array of shape ``(n_states, n_actions)`` with a single 1.0 per row at
        the greedy action; ties go to the lowest action index.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    transitions = env.unwrapped.P

    greedy = np.zeros([n_states, n_actions])
    for state in range(n_states):
        action_values = np.zeros(n_actions)
        for action in range(n_actions):
            expected_return = 0.0
            for p, successor, r, terminal in transitions[state][action]:
                # Terminal transitions contribute their reward only.
                expected_return += p * (r + gamma * V[successor] * (not terminal))
            action_values[action] = expected_return
        greedy[state, np.argmax(action_values)] = 1.0
    return greedy

def policy_iteration(env):
    """Alternate evaluation and greedy improvement until the policy is stable.

    Starts from the uniform random policy.

    Returns:
        Tuple ``(policy, V)``: the stable policy and the value function it was
        last evaluated to.
    """
    n_actions = env.action_space.n
    current = np.ones([env.observation_space.n, n_actions]) / n_actions

    while True:
        values = policy_evaluation(current, env)
        improved = policy_improvement(env, values)
        # A policy that is already greedy for its own values cannot improve further.
        if np.all(current == improved):
            return current, values
        current = improved

def evaluate_policy(env, policy, episodes=EPISODES):
    """Roll out ``policy`` greedily and print average episode length and success rate.

    Args:
        env: Gymnasium-style environment whose ``step`` returns
            ``(obs, reward, terminated, truncated, info)`` and which exposes
            the tile map as ``unwrapped.desc``.
        policy: Indexed as ``policy[state]``, yielding one probability per
            action; the highest-probability action is always taken.
        episodes: Number of episodes to run.

    An episode ends when the environment reports termination *or* truncation
    (e.g. a time limit). Honouring truncation keeps the loop finite even when
    the policy can never reach a terminal tile, and avoids stepping an
    environment that has already asked to be reset. An episode counts as a
    success only if it ends on the goal tile.
    """
    goal_state = np.where(env.unwrapped.desc.flatten() == b'G')[0][0]
    total_steps = []
    success_count = 0
    truncated_count = 0

    for _ in range(episodes):
        state, _ = env.reset()
        steps = 0
        terminated = truncated = False
        while not (terminated or truncated):
            action = np.argmax(policy[state])
            state, _, terminated, truncated, _ = env.step(action)
            steps += 1
        total_steps.append(steps)
        success_count += 1 if state == goal_state else 0
        truncated_count += 1 if truncated and not terminated else 0

    if not total_steps:
        # Nothing was run: avoid a NaN mean and a division by zero below.
        print("\nNo evaluation episodes were run.")
        return

    print(f"\nAverage Steps per Episode: {np.mean(total_steps):.2f}")
    print(f"Success Rate: {success_count / len(total_steps):.2%}")
    if truncated_count:
        # Makes it visible when the environment's step limit, not the policy, ended episodes.
        print(f"Truncated Episodes: {truncated_count}")

# Create FrozenLake environment
map_size = 100
# FrozenLake-v1 is registered with a 100-step time limit, which is shorter than
# the at-least 2 * (map_size - 1) moves needed to cross a map whose start and
# goal sit in opposite corners. A shortest path never revisits a tile, so one
# step per tile is always a sufficient budget.
env = gym.make(
    "FrozenLake-v1",
    is_slippery=False,
    desc=generate_random_map(size=map_size, p=0.9, seed=123),
    max_episode_steps=map_size * map_size,
)

# Run Policy Iteration
optimal_policy, optimal_value = policy_iteration(env)

# Print Optimal Policy
policy_arrows = {0: "←", 1: "↓", 2: "→", 3: "↑"}
# Holes and the goal are terminal, so no action is ever taken there. Draw them
# as a dot rather than the arbitrary arrow argmax picks for their tied values.
tiles = env.unwrapped.desc.flatten()
is_terminal = (tiles == b"H") | (tiles == b"G")
policy_grid = np.array([
    "•" if is_terminal[s] or np.max(optimal_policy[s]) <= 0
    else policy_arrows[np.argmax(optimal_policy[s])]
    for s in range(env.observation_space.n)
]).reshape(map_size, map_size)
print("\nOptimal Policy:")
print("\n".join([" ".join(row) for row in policy_grid]))

# Evaluate Policy
evaluate_policy(env, optimal_policy)
