<a href="https://colab.research.google.com/github/adarsh-nl/Markov-Decision-Process/blob/main/Iterative_Policy_Evaluation_for_GridWorld.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

In [None]:
DEBUG = False  # set to True to enable the diagnostic output emitted via debug()


def debug(str):
  """Print ``str`` only when the module-level ``DEBUG`` flag is truthy.

  Args:
    str: the value to print. The parameter name shadows the builtin ``str``
      inside this function; it is kept as-is so existing keyword calls
      (``debug(str=...)``) keep working.

  Returns:
    None. When ``DEBUG`` is falsy the call does nothing.
  """
  if not DEBUG:
    return
  print(str)

In [None]:
# define the grid world problem
grid_size = 4            # the world is a grid_size x grid_size array of cells
num_actions = 4          # size of the last axis of `policy` (action indices 0..3)
discount_factor = 0.9
grid = np.zeros((grid_size, grid_size))  # only used for display in this cell

# policy[i, j, a] is the probability of choosing action a in cell (i, j).
# Every entry starts at 1 / num_actions, i.e. a uniform random policy.
policy = np.ones((grid_size, grid_size, num_actions)) / num_actions

# rewards[i, j] is a per-cell reward table: zero everywhere except the two
# opposite corners set below.
rewards = np.zeros((grid_size, grid_size))
rewards[0, 0] = 1
rewards[grid_size-1, grid_size-1] = 10


print("Grid Size is: {}".format(grid_size))
print("Actions in the grid world: {}".format(num_actions))
print("\n\n*****************")
print("The grid is as follows: \n")
print(grid)
print("\n*****************\n\n")

# Message typo fixed: "stochatic" -> "stochastic".
print("The stochastic policy the Agent follows currently is: \n")
print(policy)
print("\n\nThe reward is formulated as: \n")
print(rewards)

Grid Size is: 4
Actions in the grid world: 4


*****************
The grid is as follows: 

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

*****************


The stochatic policy the Agent follows currently is: 

[[[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]

 [[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]

 [[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]

 [[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]]


The reward is formulated as: 

[[ 1.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0. 10.]]


In [None]:
def get_next_state_and_reward(state, action):
    """Return the successor cell and reward for taking ``action`` in ``state``.

    Moves are deterministic and clipped to the grid: a move that would step
    outside ``[0, grid_size - 1]`` on either axis leaves that index unchanged.

    Action encoding:
        0           -> row index - 1
        1           -> column index + 1
        2           -> row index + 1
        anything else -> column index - 1

    Args:
        state: pair ``(i, j)`` of row and column indices.
        action: integer action code (see the table above).

    Returns:
        ``(next_state, reward)`` where ``next_state`` is the list
        ``[next_i, next_j]`` and ``reward`` is ``rewards[next_i, next_j]``,
        i.e. the reward stored for the cell that is entered.

    Tracing goes through ``debug()`` so it is silent unless ``DEBUG`` is
    enabled; this function is called for every state/action pair on every
    evaluation sweep, and unconditional printing floods the notebook output.
    """
    debug("\nCurrent State: {}".format(state))
    i, j = state
    if action == 0:
        next_i, next_j = max(i-1, 0), j
    elif action == 1:
        next_i, next_j = i, min(j+1, grid_size-1)
    elif action == 2:
        next_i, next_j = min(i+1, grid_size-1), j
    else:
        next_i, next_j = i, max(j-1, 0)

    next_state = [next_i, next_j]
    debug("Action performed: {}".format(action))
    debug("Next State is: {}\n".format(next_state))

    reward = rewards[next_i, next_j]
    return next_state, reward

In [None]:
# perform iterative policy evaluation
#
# Each sweep applies, for every cell (i, j):
#   new_values[i, j] = sum over actions a of
#       policy[i, j, a] * (reward + discount_factor * values[next cell])
# A fixed number of sweeps is run; there is no convergence test.
num_iterations = 50
values = np.zeros((grid_size, grid_size))
for _ in range(num_iterations):  # sweep counter is unused; `_` avoids clashing with row index i
    # Synchronous update: every backup in this sweep reads the previous
    # sweep's `values` and writes into a fresh array.
    new_values = np.zeros_like(values)
    for i in range(grid_size):
        for j in range(grid_size):
            # Routed through debug() so 50 sweeps do not flood the cell output.
            debug("Policy evaluation is calculated for {}".format([i,j]))
            state = (i, j)
            value = 0
            for action in range(num_actions):
                next_state, reward = get_next_state_and_reward(state, action)
                value += policy[i, j, action] * (reward + discount_factor * values[next_state[0], next_state[1]])
            new_values[i, j] = value
    values = new_values

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Next State is: [0, 0]

Value Iteration is calculated for [0, 2]

Current State: (0, 2)
Action performed: 0
Next State is: [0, 2]


Current State: (0, 2)
Action performed: 1
Next State is: [0, 3]


Current State: (0, 2)
Action performed: 2
Next State is: [1, 2]


Current State: (0, 2)
Action performed: 3
Next State is: [0, 1]

Value Iteration is calculated for [0, 3]

Current State: (0, 3)
Action performed: 0
Next State is: [0, 3]


Current State: (0, 3)
Action performed: 1
Next State is: [0, 3]


Current State: (0, 3)
Action performed: 2
Next State is: [1, 3]


Current State: (0, 3)
Action performed: 3
Next State is: [0, 2]

Value Iteration is calculated for [1, 0]

Current State: (1, 0)
Action performed: 0
Next State is: [0, 0]


Current State: (1, 0)
Action performed: 1
Next State is: [1, 1]


Current State: (1, 0)
Action performed: 2
Next State is: [2, 0]


Current State: (1, 0)
Action performed: 3
Next State is: [1, 0

In [None]:
# Show the value table: one text line per grid row, each entry rendered with
# two decimals and followed by a tab (including the last entry of the row).
for row_idx in range(grid_size):
    row_cells = ["{:.2f}".format(values[row_idx, col_idx]) for col_idx in range(grid_size)]
    print("\t".join(row_cells), end="\t\n")

3.93	3.69	3.76	4.22	
3.69	3.94	5.07	6.56	
3.76	5.07	8.28	13.34	
4.22	6.56	13.34	20.00	
