## Value Iteration for a 4x4 Grid Navigation Problem

In this example, the agent's task is to reach the goal state (bottom-right corner of the grid) from any initial state on the grid. The agent receives a reward of +1 upon reaching the goal and -1 for each move to encourage reaching the goal quickly.

In [3]:
import numpy as np

# Define the grid size
grid_size = 4

# Define the rewards for each state
rewards = np.full((grid_size, grid_size), -1.0)  # Initialize with -1 for all moves
rewards[grid_size - 1, grid_size - 1] = 0  # Goal state reward

# Define value function (initialize to zero)
values = np.zeros((grid_size, grid_size))

# Define parameters
gamma = 0.9  # Discount factor
theta = 1e-4  # Convergence threshold
actions = ['up', 'down', 'left', 'right']
action_deltas = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}

In [4]:
# Function to check if a state is terminal
def is_terminal_state(x, y):
    return (x == grid_size - 1 and y == grid_size - 1)

# Function to get the next state given an action
def get_next_state(x, y, action):
    delta = action_deltas[action]
    new_x, new_y = x + delta[0], y + delta[1]
    if 0 <= new_x < grid_size and 0 <= new_y < grid_size:
        return new_x, new_y
    else:
        return x, y  # No change if out of bounds

# Value Iteration Algorithm
iteration = 0
while True:
    delta = 0
    new_values = np.copy(values)
    for x in range(grid_size):
        for y in range(grid_size):
            if is_terminal_state(x, y):
                continue
            value_updates = []
            for action in actions:
                new_x, new_y = get_next_state(x, y, action)
                reward = rewards[new_x, new_y]
                value_updates.append(reward + gamma * values[new_x, new_y])
            new_values[x, y] = max(value_updates)
            delta = max(delta, abs(new_values[x, y] - values[x, y]))

    values = new_values
    iteration += 1

    # Display the value function after each iteration
    print(f"\nValue Function after iteration {iteration}:")
    print(values)

    if delta < theta:
        break



Value Function after iteration 1:
[[-1. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1.  0.]
 [-1. -1.  0.  0.]]

Value Function after iteration 2:
[[-1.9 -1.9 -1.9 -1.9]
 [-1.9 -1.9 -1.9 -1. ]
 [-1.9 -1.9 -1.   0. ]
 [-1.9 -1.   0.   0. ]]

Value Function after iteration 3:
[[-2.71 -2.71 -2.71 -1.9 ]
 [-2.71 -2.71 -1.9  -1.  ]
 [-2.71 -1.9  -1.    0.  ]
 [-1.9  -1.    0.    0.  ]]

Value Function after iteration 4:
[[-3.439 -3.439 -2.71  -1.9  ]
 [-3.439 -2.71  -1.9   -1.   ]
 [-2.71  -1.9   -1.     0.   ]
 [-1.9   -1.     0.     0.   ]]

Value Function after iteration 5:
[[-4.0951 -3.439  -2.71   -1.9   ]
 [-3.439  -2.71   -1.9    -1.    ]
 [-2.71   -1.9    -1.      0.    ]
 [-1.9    -1.      0.      0.    ]]

Value Function after iteration 6:
[[-4.0951 -3.439  -2.71   -1.9   ]
 [-3.439  -2.71   -1.9    -1.    ]
 [-2.71   -1.9    -1.      0.    ]
 [-1.9    -1.      0.      0.    ]]


In [5]:
# Display the final value function
print("\nValue Function after Convergence:")
print(values)
print(f"Converged in {iteration} iterations.")


Value Function after Convergence:
[[-4.0951 -3.439  -2.71   -1.9   ]
 [-3.439  -2.71   -1.9    -1.    ]
 [-2.71   -1.9    -1.      0.    ]
 [-1.9    -1.      0.      0.    ]]
Converged in 6 iterations.


In [6]:
# Determine the optimal policy
policy = np.empty((grid_size, grid_size), dtype=str)
for x in range(grid_size):
    for y in range(grid_size):
        if is_terminal_state(x, y):
            policy[x, y] = 'Goal'
        else:
            best_action = None
            best_value = float('-inf')
            for action in actions:
                new_x, new_y = get_next_state(x, y, action)
                reward = rewards[new_x, new_y]
                value = reward + gamma * values[new_x, new_y]
                if value > best_value:
                    best_value = value
                    best_action = action
            policy[x, y] = best_action

# Display the optimal policy
print("\nOptimal Policy:")
for row in policy:
    print(row)


Optimal Policy:
['d' 'd' 'd' 'd']
['d' 'd' 'd' 'd']
['d' 'd' 'd' 'd']
['r' 'r' 'r' 'G']
