# 🧠 Unit 5.2: Dynamic Programming

**Course:** Advanced Machine Learning (AICC 303)  
**Topic:** 5.3 Dynamic Programming (Value Iteration, Policy Iteration)

**Goal:** Solve MDPs when the environment model $P(s'|s,a)$ is fully known.

---

In [None]:
import numpy as np
import gymnasium as gym

# Slippery (stochastic) FrozenLake; rendering is disabled because we only
# plan against the model and never need to draw the environment.
env = gym.make("FrozenLake-v1", render_mode=None, is_slippery=True)

# Transition model of the underlying (unwrapped) environment:
# P[state][action] = [(prob, next_state, reward, done), ...]
P = env.unwrapped.P

# Sizes of the discrete action and state spaces.
n_actions = env.action_space.n
n_states = env.observation_space.n

## 1. Value Iteration
Iteratively apply the Bellman optimality update to the value function $V(s)$ until it converges, then read off the greedy policy.

$V_{k+1}(s) = \max_a \sum_{s',r} p(s',r|s,a)[r + \gamma V_k(s')]$

In [None]:
def _action_values(P, s, V, gamma, n_actions):
    """Return the one-step lookahead value Q(s, a) for every action in state ``s``."""
    q_values = np.zeros(n_actions)
    for a in range(n_actions):
        # Expectation over all (next_state, reward) outcomes of taking ``a`` in ``s``.
        for prob, next_state, reward, _done in P[s][a]:
            q_values[a] += prob * (reward + gamma * V[next_state])
    return q_values


def value_iteration(env, gamma=0.99, theta=1e-8):
    """Solve a tabular MDP with value iteration and extract the greedy policy.

    The transition model and the state/action counts are read from ``env``
    itself rather than from module-level globals, so the function solves the
    environment it is actually given.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and ``unwrapped.P``, where ``P[state][action]`` is a list of
            ``(prob, next_state, reward, done)`` tuples.
        gamma: Discount factor applied to the value of the next state.
        theta: Convergence threshold; sweeps stop once the largest change of
            any state value within a sweep is below this number.

    Returns:
        A tuple ``(V, policy)``: ``V`` is a float array with one value per
        state, ``policy`` is an int array with the greedy action per state.
    """
    P = env.unwrapped.P
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    V = np.zeros(n_states)

    while True:
        delta = 0.0
        for s in range(n_states):
            # In-place update: later states in the same sweep already see
            # the refreshed values of earlier states.
            best = _action_values(P, s, V, gamma, n_actions).max()
            delta = max(delta, abs(best - V[s]))
            V[s] = best

        if delta < theta:
            break

    # Extract the policy that is greedy with respect to the converged values.
    policy = np.zeros(n_states, dtype=int)
    for s in range(n_states):
        policy[s] = np.argmax(_action_values(P, s, V, gamma, n_actions))

    return V, policy

# Solve the environment, then show the values and the greedy actions laid
# out on the 4x4 grid (one entry per cell).
V_opt, policy_opt = value_iteration(env)

print("Optimal Value Function:", V_opt.reshape(4, 4), sep="\n")
print(
    "\nOptimal Policy (0:Left, 1:Down, 2:Right, 3:Up):",
    policy_opt.reshape(4, 4),
    sep="\n",
)