<a href="https://colab.research.google.com/github/anaysingh/RL_Lab_21CSU011/blob/main/policy_iteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

class GridWorld:
    def __init__(self, rows, cols):
        self.rows = rows
        self.cols = cols
        self.state = (0, 0)

    def is_terminal(self, state):
        return state == (self.rows - 1, self.cols - 1)

    def step(self, action):
        if action == 'right':
            next_state = (self.state[0], min(self.state[1] + 1, self.cols - 1))
        elif action == 'down':
            next_state = (min(self.state[0] + 1, self.rows - 1), self.state[1])
        else:
            next_state = self.state

        reward = -1 if not self.is_terminal(self.state) else 0
        self.state = next_state
        return next_state, reward

def policy_evaluation(policy, values, grid_world, gamma):
    epsilon = 1e-6
    while True:
        delta = 0
        for i in range(grid_world.rows):
            for j in range(grid_world.cols):
                if not grid_world.is_terminal((i, j)):
                    old_value = values[i, j]
                    action = policy[i, j]
                    next_state, reward = grid_world.step(action)
                    values[i, j] = reward + gamma * values[next_state]
                    delta = max(delta, abs(old_value - values[i, j]))

        if delta < epsilon:
            break

def policy_improvement(policy, values, grid_world, gamma):
    policy_stable = True
    for i in range(grid_world.rows):
        for j in range(grid_world.cols):
            if not grid_world.is_terminal((i, j)):
                old_action = policy[i, j]
                possible_actions = ['right', 'down']  # Actions for the current environment
                action_values = []

                for action in possible_actions:
                    next_state, reward = grid_world.step(action)
                    action_values.append(reward + gamma * values[next_state])

                best_action = possible_actions[np.argmax(action_values)]
                policy[i, j] = best_action

                if old_action != best_action:
                    policy_stable = False

    return policy_stable

def policy_iteration(grid_world, gamma, max_iterations=100):
    policy = np.full((grid_world.rows, grid_world.cols), 'right', dtype=np.str)
    values = np.zeros((grid_world.rows, grid_world.cols))

    for _ in range(max_iterations):
        policy_evaluation(policy, values, grid_world, gamma)
        stable = policy_improvement(policy, values, grid_world, gamma)

        if stable:
            break

    return policy, values

def print_policy(policy):
    for row in policy:
        print(row)

if __name__ == "__main__":
    grid_world = GridWorld(rows=3, cols=3)
    gamma = 0.9  # Discount factor

    optimal_policy, optimal_values = policy_iteration(grid_world, gamma)

    print("Optimal Policy:")
    print_policy(optimal_policy)

    print("\nOptimal Value Function:")
    print(optimal_values)


Optimal Policy:
['r' 'r' 'r']
['r' 'r' 'r']
['r' 'r' 'r']

Optimal Value Function:
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  policy = np.full((grid_world.rows, grid_world.cols), 'right', dtype=np.str)
