<a href="https://colab.research.google.com/github/aryanjha256/randomjupyternotebooks/blob/main/GridWorldBellman.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import numpy as np

# Define constants for grid world
GRID_SIZE = 5
NUM_STATES = GRID_SIZE * GRID_SIZE
NUM_ACTIONS = 4 # Up, Down, Left, Right
GOAL_STATE = (GRID_SIZE -1, GRID_SIZE - 1)
OBSTACLE_STATES = [(1, 1), (2, 2), (3, 1)]
DISCOUNT_FACTOR = 0.9
MAX_ITERATIONS = 1000
EPSILON = 1e-6 # Convergence threshold

In [23]:
# Actions: up, down, left, right
actions = [-GRID_SIZE, GRID_SIZE, -1, 1]

# Intialize the value function arbitrarily
value_function = np.zeros(NUM_STATES)

# Define transition dynamics and rewards
transition_probabilities = np.zeros((NUM_STATES, NUM_ACTIONS, NUM_STATES))
rewards = np.zeros(NUM_STATES)

for state in range(NUM_STATES):
  if state == GOAL_STATE[0] * GRID_SIZE + GOAL_STATE[1]:
    rewards[state] = 10 # Goal state
  elif state in [s[0] * GRID_SIZE + s[1] for s in OBSTACLE_STATES]:
    rewards[state] = -10 # Obstacle state

  row, col = state // GRID_SIZE, state % GRID_SIZE
  for action in range(NUM_ACTIONS):
    if actions == 0:
      next_state = max(0, row - 1) * GRID_SIZE + col
    elif actions == 1:
      next_state = min(GRID_SIZE - 1, row + 1) * GRID_SIZE + col
    elif actions == 2:
      next_state = row * GRID_SIZE + max(0, col -1)
    else:
      next_state = row * GRID_SIZE + min(GRID_SIZE - 1, col + 1)

    transition_probabilities[state, action, next_state] = 1.0

# Perform value iteration
for i in range(MAX_ITERATIONS):
  new_value_function = np.zeros(NUM_STATES)
  for state in range(NUM_STATES):
    expected_rewards = np.sum(
        transition_probabilities[state] * (rewards + DISCOUNT_FACTOR * value_function),
        axis = 1
    )
    new_value_function[state] = np.max(expected_rewards)
  if np.max(np.abs(new_value_function - value_function)) < EPSILON:
    break
  value_function = new_value_function

# Print the optimal value function
print("Optimal value function:")
print(value_function.reshape((GRID_SIZE, GRID_SIZE)))


Optimal value function:
[[  0.           0.           0.           0.           0.        ]
 [-10.           0.           0.           0.           0.        ]
 [ -9.         -10.           0.           0.           0.        ]
 [-10.           0.           0.           0.           0.        ]
 [ 72.89999002  80.99999002  89.99999002  99.99999002  99.99999002]]
