Simple Grid:

+---+---+---+
| 0 | 1 | 2 |
+---+---+---+
| 3 | 4 | 5 |
+---+---+---+
| 6 | 7 | 8 | (Terminal)
+---+---+---+

Policy: move east if you can; otherwise move south. State 8 is the terminal state. (This is a simple deterministic policy.)

Rewards: The reward for each action is -1, except actions leading to the terminal state, which have 0 reward.



In [17]:
import numpy as np

# State 8 (bottom-right cell) ends the episode and has no outgoing move.
TERMINAL_STATE = 8

# Deterministic policy expressed as a successor map (state -> next state):
# step east when a cell exists to the east, otherwise step south.
transitions = {
    # top row
    0: 1,
    1: 2,
    2: 5,
    # middle row
    3: 4,
    4: 5,
    5: 8,
    # bottom row (8 is terminal, so it has no entry)
    6: 7,
    7: 8,
}

# Starting estimate: every one of the 9 states (terminal included) is worth 0.
V = [0 for _ in range(9)]

# Discount factor applied to the successor state's value.
# NOTE(review): the output recorded below this cell contains fractional values
# (-1.5, -1.75), which integer rewards with gamma = 1 cannot produce; it looks
# like that run used gamma = 0.5 -- confirm which value is intended.
gamma = 1

# Iteration stops once the largest per-state change falls below this value.
delta_threshold = 0.01

def update_value_function(V, gamma, transitions, terminal_state=None):
    """Run one synchronous policy-evaluation sweep and return the new values.

    For every state ``s`` listed in ``transitions`` the update is
    ``V_next[s] = reward + gamma * V[transitions[s]]``, where ``reward`` is 0
    when the move enters the terminal state and -1 otherwise. All updates read
    from the incoming ``V``, so the order in which states are visited does not
    affect the result.

    Args:
        V: Current value estimates, indexable by state and providing
            ``.copy()`` (e.g. a list).
        gamma: Discount factor applied to the successor state's value.
        transitions: Mapping of each non-terminal state to its successor
            under the policy.
        terminal_state: State whose entry yields reward 0 and whose own value
            is never updated. Defaults to the module-level ``TERMINAL_STATE``.

    Returns:
        A copy of ``V`` with every state in ``transitions`` updated; ``V``
        itself is left unmodified.
    """
    if terminal_state is None:
        terminal_state = TERMINAL_STATE

    V_next = V.copy()
    # Sweep the states the policy actually defines rather than a fixed
    # range(8), so the function is not tied to the 3x3 grid.
    for s, next_state in transitions.items():
        if s == terminal_state:
            # The terminal state's value is never updated.
            continue
        # Any move that enters the terminal state is free (in the 3x3 grid
        # that is states 5 and 7, not only 7); every other move costs 1.
        reward = 0 if next_state == terminal_state else -1
        V_next[s] = reward + gamma * V[next_state]
    return V_next

# Repeat synchronous sweeps of update_value_function until the largest
# per-state change in a sweep drops below delta_threshold.
iterations = 0
max_change = 0
while True:
    # Show the estimate going into this sweep as a 3x3 grid.
    print(np.reshape(V, (3, 3)))

    V_new = update_value_function(V, gamma, transitions)
    iterations += 1

    # Largest absolute difference between the old and new estimates.
    max_change = max(abs(after - before) for before, after in zip(V, V_new))

    # The new estimate becomes the input of the next sweep.
    V = V_new.copy()

    if max_change < delta_threshold:
        break

print(f"Converged after {iterations} iterations with max_change: {max_change}.")



[[0 0 0]
 [0 0 0]
 [0 0 0]]
[[-1. -1. -1.]
 [-1. -1.  0.]
 [-1.  0.  0.]]
[[-1.5 -1.5 -1. ]
 [-1.5 -1.   0. ]
 [-1.   0.   0. ]]
[[-1.75 -1.5  -1.  ]
 [-1.5  -1.    0.  ]
 [-1.    0.    0.  ]]
Converged after 4 iterations with max_change: 0.0.
