# Value function with predetermined policy

Simple Grid:

+---+---+---+
| 0 | 1 | 2 |
+---+---+---+
| 3 | 4 | 5 |
+---+---+---+
| 6 | 7 | 8 | (Terminal)
+---+---+---+

Policy: move east if you can; otherwise move south. State 8 is the terminal state. (A simple deterministic policy.)

Rewards: The reward for each action is -1, except actions leading to the terminal state, which have 0 reward.



In [17]:
import numpy as np

# Bottom-right cell of the 3x3 grid; episodes end here.
TERMINAL_STATE = 8

# Successor of every non-terminal state under the fixed policy
# "east if possible, otherwise south".
transitions = dict([
    (0, 1), (1, 2), (2, 5),
    (3, 4), (4, 5), (5, 8),
    (6, 7), (7, 8),
])

# Solver parameters.
gamma = 1  # discount factor
delta_threshold = 0.01  # stop once the largest per-state change is below this
# NOTE(review): the output recorded under this cell contains values such as
# -1.5 and -1.75, which gamma = 1 cannot produce; it looks like it was
# captured from a run with gamma = 0.5 -- confirm and re-run.

# One value estimate per cell, all starting at zero (state 8 stays at zero).
V = [0 for _ in range(9)]

def update_value_function(V, gamma, transitions):
    """Apply one synchronous Bellman backup for the fixed policy.

    Args:
        V: Current value estimates, indexable by state.
        gamma: Discount factor applied to the successor state's value.
        transitions: Mapping from each non-terminal state to the state the
            policy moves to from there.

    Returns:
        A copy of ``V`` in which every state listed in ``transitions`` has
        been re-estimated; all other entries are left as they were.
    """
    V_next = V.copy()
    # Iterate over the states the policy actually defines instead of a
    # hard-coded range, so the function follows whatever mapping it is given.
    for s, next_state in transitions.items():
        # Stepping into the terminal state is free; every other move costs 1.
        reward = 0 if next_state == TERMINAL_STATE else -1
        # Read from V (not V_next) so every state in this sweep is backed up
        # from the same, previous estimates.
        V_next[s] = reward + (gamma * V[next_state])
    return V_next

# Sweep until the estimates stop moving, printing the grid before each sweep.
iterations = 0
max_change = 0
while True:
    print(np.reshape(V, (3, 3)))
    iterations += 1
    V_new = update_value_function(V, gamma, transitions)
    # Largest absolute per-state change produced by this sweep.
    max_change = max(abs(new - old) for new, old in zip(V_new, V))
    V = list(V_new)  # the fresh estimates feed the next sweep
    if max_change >= delta_threshold:
        continue  # still moving: sweep again
    break

print(f"Converged after {iterations} iterations with max_change: {max_change}.")



[[0 0 0]
 [0 0 0]
 [0 0 0]]
[[-1. -1. -1.]
 [-1. -1.  0.]
 [-1.  0.  0.]]
[[-1.5 -1.5 -1. ]
 [-1.5 -1.   0. ]
 [-1.   0.   0. ]]
[[-1.75 -1.5  -1.  ]
 [-1.5  -1.    0.  ]
 [-1.    0.    0.  ]]
Converged after 4 iterations with max_change: 0.0.


# Policy Iteration

Now, instead of fixing the policy in advance, we'll have the agent learn it with policy iteration.

In [37]:
import random
random.seed(1)  # fixed seed so the random starting policy is reproducible

# Initialize environment and parameters
grid_dim = 3  # Width/height of grid
cell_count = grid_dim ** 2
states = list(range(cell_count))  # States 0 through cell_count - 1
actions = ['N', 'S', 'E', 'W']
terminal_state = cell_count - 1  # Last cell of the grid
gamma = 1  # Assuming gamma = 1 for simplicity
# Randomly assign an action to every non-terminal state to begin. Iterating
# over `states` (instead of a literal range(9)) keeps the policy in step with
# grid_dim if the grid size is ever changed.
policy = {s: random.choice(actions) for s in states if s != terminal_state}
V = {s: 0 for s in states}  # Initial value function

# Transition function: Returns the next state given a state and an action
def transition(state, action):
    """Return the state reached by taking ``action`` in ``state``.

    A move that would leave the grid keeps the agent in place, and the
    terminal state is absorbing (every action returns it unchanged).

    Args:
        state: Index of the current cell.
        action: One of 'N', 'S', 'E', 'W'.

    Returns:
        Index of the resulting cell.

    Raises:
        ValueError: If ``action`` is not one of the four moves above (and
            ``state`` is not terminal). Previously this case fell through and
            returned ``None``.
    """
    if state == terminal_state:
        return state
    if action == 'N':
        return state - grid_dim if state - grid_dim >= 0 else state
    if action == 'S':
        return state + grid_dim if state + grid_dim <= (cell_count - 1) else state
    if action == 'E':
        # Last column: stay put rather than wrap onto the next row.
        return state + 1 if (state % grid_dim) < (grid_dim - 1) else state
    if action == 'W':
        # First column: stay put rather than wrap onto the previous row.
        return state - 1 if (state % grid_dim) > 0 else state
    raise ValueError(f"Unknown action: {action!r}")

def get_reward(next_state):
    """Return the reward for a move that lands in ``next_state``.

    Landing in the terminal state yields 0; landing anywhere else yields -1.
    """
    return 0 if next_state == terminal_state else -1

def get_action_value(state, action):
    """Return the one-step lookahead value of taking ``action`` in ``state``.

    The result is the immediate reward plus ``gamma`` times the value of the
    successor state. Note that the value estimates come from the module-level
    ``V``, not from an argument.
    """
    successor = transition(state, action)
    return get_reward(successor) + (gamma * V[successor])

# Policy evaluation (simplified version for demonstration)
def evaluate_policy(V, threshold=0.01, max_sweeps=1000):
    """Iteratively evaluate the module-level ``policy``, starting from ``V``.

    Each sweep replaces the value of every non-terminal state with the reward
    of the policy's move plus ``gamma`` times the value of the state it lands
    in, using the estimates from the previous sweep.

    The backup is computed from the local estimate. Routing it through
    ``get_action_value`` would read the module-level ``V`` instead, which does
    not change while this function runs, so every sweep after the first would
    reproduce the same numbers and the loop would stop after a single backup.

    Args:
        V: Mapping from state to its current value estimate. Not modified.
        threshold: Stop once the largest per-state change in a sweep is below
            this value.
        max_sweeps: Upper bound on the number of sweeps. With ``gamma = 1`` a
            policy that never reaches the terminal state loses 1 per sweep
            forever, so its values never settle; the cap guarantees the
            function returns for such policies.

    Returns:
        A new mapping holding the value estimates after the last sweep.
    """
    for _ in range(max_sweeps):
        V_next = V.copy()
        for s in states:
            if s == terminal_state:
                continue
            next_state = transition(s, policy[s])
            V_next[s] = get_reward(next_state) + (gamma * V[next_state])

        max_change = max(abs(V_next[s] - V[s]) for s in states)
        V = V_next
        if max_change < threshold:
            break
    return V

def improve_policy(V):
    """Make the module-level ``policy`` greedy with respect to ``V``.

    For every non-terminal state, each action is scored by the reward of the
    move plus ``gamma`` times the value of the state it lands in, and the
    policy is updated in place to the best-scoring action.

    The scores are computed from the ``V`` argument. Going through
    ``get_action_value`` would silently use the module-level ``V`` and ignore
    the argument.

    Args:
        V: Mapping from state to value estimate used to score the actions.

    Returns:
        True if no state's action changed, False otherwise.
    """
    policy_stable = True
    for s in states:
        if s == terminal_state:
            continue
        old_action = policy[s]
        action_values = {}
        for a in actions:
            next_state = transition(s, a)
            action_values[a] = get_reward(next_state) + (gamma * V[next_state])
        # max() returns the first maximal key, so ties are resolved in the
        # order the actions are listed in `actions`.
        best_action = max(action_values, key=action_values.get)
        policy[s] = best_action
        if old_action != best_action:
            policy_stable = False
    return policy_stable

# Policy iteration: alternate evaluation and greedy improvement until an
# improvement pass leaves every state's action unchanged.
while True:
    # Rebind the module-level V first: get_action_value looks V up as a
    # global, so code that goes through it only sees values assigned here.
    V = evaluate_policy(V)
    if improve_policy(V):
        break

print("Optimal Policy:", policy)
print("Value Function:", V)

Optimal Policy: {0: 'S', 1: 'S', 2: 'S', 3: 'S', 4: 'S', 5: 'S', 6: 'E', 7: 'E'}
Value Function: {0: -3, 1: -2, 2: -1, 3: -2, 4: -1, 5: 0, 6: -1, 7: 0, 8: 0}


# Value iteration

Even better is value iteration, where we combine policy evaluation and policy improvement into a single step.

In [51]:

# Environment: a square grid whose last cell is the terminal state.
grid_dim = 3  # side length of the grid
cell_count = grid_dim * grid_dim
states = [*range(cell_count)]  # 0 through 8
terminal_state = states[-1]
actions = ['N', 'S', 'E', 'W']

# Solver settings.
gamma = 1  # no discounting, for simplicity
threshold = 0.01  # stop once a full sweep changes no value by this much
V = dict.fromkeys(states, 0)  # every state starts with value zero

def transition(state, action):
    """Return the state reached by taking ``action`` in ``state``.

    A move that would leave the grid keeps the agent in place, and the
    terminal state is absorbing (every action returns it unchanged).

    Args:
        state: Index of the current cell.
        action: One of 'N', 'S', 'E', 'W'.

    Returns:
        Index of the resulting cell.

    Raises:
        ValueError: If ``action`` is not one of the four moves above (and
            ``state`` is not terminal). Previously this case fell through and
            returned ``None``.
    """
    if state == terminal_state:
        return state
    if action == 'N':
        return state - grid_dim if state - grid_dim >= 0 else state
    if action == 'S':
        return state + grid_dim if state + grid_dim <= (cell_count - 1) else state
    if action == 'E':
        # Last column: stay put rather than wrap onto the next row.
        return state + 1 if (state % grid_dim) < (grid_dim - 1) else state
    if action == 'W':
        # First column: stay put rather than wrap onto the previous row.
        return state - 1 if (state % grid_dim) > 0 else state
    raise ValueError(f"Unknown action: {action!r}")

def get_reward(next_state):
    """Return 0 for a move that lands in the terminal state, -1 otherwise."""
    if next_state == terminal_state:
        return 0
    return -1

def get_action_value(state, action):
    """Score ``action`` in ``state`` by a one-step lookahead.

    Returns the reward of the move plus ``gamma`` times the current estimate
    of the state it lands in; the estimates are read from the module-level
    ``V``.
    """
    landing_state = transition(state, action)
    immediate = get_reward(landing_state)
    discounted_future = gamma * V[landing_state]
    return immediate + discounted_future

def value_iteration(V):
    """Run value iteration on ``V`` and return it with a greedy policy.

    Every sweep visits each non-terminal state, scores all actions by the
    reward of the move plus ``gamma`` times the value of the state it lands
    in, and stores the best score and action. Values are written back
    immediately, so states visited later in the same sweep already see the
    updated estimates.

    The scores are computed from the ``V`` argument. Going through
    ``get_action_value`` would read the module-level ``V`` while writing to
    the argument, which is only correct when both are the same object.

    Args:
        V: Mapping from state to value estimate. Updated in place.

    Returns:
        A ``(V, policy)`` tuple: the same mapping that was passed in, and a
        dict giving the chosen action for every non-terminal state.
    """
    policy = {s: None for s in states if s != terminal_state}
    while True:
        max_delta = 0
        for s in states:
            if s == terminal_state:
                continue

            action_values = []
            for a in actions:
                next_state = transition(s, a)
                value = get_reward(next_state) + (gamma * V[next_state])
                action_values.append((a, value))

            # max() keeps the first maximal entry, so ties are resolved in
            # the order the actions are listed in `actions`.
            best_action, best_value = max(action_values, key=lambda pair: pair[1])

            max_delta = max(max_delta, abs(best_value - V[s]))

            V[s] = best_value
            policy[s] = best_action

        # Converged once a whole sweep moved no value by `threshold` or more.
        if max_delta < threshold:
            return V, policy

# value_iteration updates the dict it is given in place and also returns it,
# so V ends up bound to the same mapping, now holding the converged values.
V, optimal_policy = value_iteration(V)

print("Optimal Policy:", optimal_policy)
print("Value Function:", V)

Optimal Policy: {0: 'S', 1: 'S', 2: 'S', 3: 'S', 4: 'S', 5: 'S', 6: 'E', 7: 'E'}
Value Function: {0: -3, 1: -2, 2: -1, 3: -2, 4: -1, 5: 0, 6: -1, 7: 0, 8: 0}
