In [8]:
import numpy as np


In [9]:
def policy_evaluation(env, policy, gamma=0.9, theta=1e-8):
    """
    Evaluate a given policy for an MDP.
    Parameters:
    - env (object): An MDP environment with methods `transition_prob` and `reward`.
    - policy (dict): A mapping of states to actions.
    - gamma (float): Discount factor.
    - theta (float): Threshold for convergence.
    """
    states = env.states
    V = {s: 0 for s in states}  # Initialize state-value function
    
    while True:
        delta = 0  # Change in value for convergence check
        for s in states:
            v = V[s]
            V[s] = sum(
                env.transition_prob(s, policy[s], s_) *
                (env.reward(s, policy[s], s_) + gamma * V[s_])
                for s_ in states
            )
            delta = max(delta, abs(v - V[s]))  # Track max change
        if delta < theta:
            break  # Convergence achieved
    
    return V

In [10]:
def policy_improvement(env, V, gamma=0.9):
    """
    Improve a policy for an MDP.
    Parameters:
    - env (object): An MDP environment with methods `transition_prob` and `reward`.
    - V (dict): The state-value function.
    - gamma (float): Discount factor.
    """
    states = env.states
    actions = env.actions
    policy = {}
    
    for s in states:
        action_values = {}
        for a in actions:
            action_values[a] = sum(
                env.transition_prob(s, a, s_) *
                (env.reward(s, a, s_) + gamma * V[s_])
                for s_ in states
            )
        # Choose the action with the maximum value
        policy[s] = max(action_values, key=action_values.get)
    
    return policy

In [11]:
class GridWorldEnv:
    def __init__(self):
        self.states = [(x, y) for x in range(3) for y in range(3)]
        self.actions = ['up', 'down', 'left', 'right']
        self.goal_state = (2, 2)
        
    def transition_prob(self, state, action, next_state):
        """Returns the probability of moving to `next_state` from `state` given `action`."""
        intended_next_state = self._next_state(state, action)
        if next_state == intended_next_state:
            return 0.8  # 80% chance to go as intended
        elif next_state == state:
            return 0.2  # 20% chance of staying in place (e.g., hitting a wall)
        else:
            return 0.0  # All other transitions are impossible
    
    def reward(self, state, action, next_state):
        """Returns the reward for moving to `next_state` from `state`."""
        if next_state == self.goal_state:
            return 1  # Reward for reaching the goal
        else:
            return 0  # No reward otherwise
    
    def _next_state(self, state, action):
        """Returns the next state resulting from taking `action` in `state`."""
        x, y = state
        if action == 'up' and y < 2:
            return (x, y + 1)
        elif action == 'down' and y > 0:
            return (x, y - 1)
        elif action == 'left' and x > 0:
            return (x - 1, y)
        elif action == 'right' and x < 2:
            return (x + 1, y)
        else:
            return state  # If the action would take the agent out of bounds, it stays in place.

In [12]:
# Main script
env = GridWorldEnv()
initial_policy = {s: 'up' for s in env.states}

In [13]:

# Evaluate the initial policy
print("Evaluating initial policy...")
V = policy_evaluation(env, initial_policy)
print("Initial state-value function:")
for state, value in V.items():
    print(f"State: {state}, Value: {value:.2f}")

Evaluating initial policy...
Initial state-value function:
State: (0, 0), Value: 0.00
State: (0, 1), Value: 0.00
State: (0, 2), Value: 0.00
State: (1, 0), Value: 0.00
State: (1, 1), Value: 0.00
State: (1, 2), Value: 0.00
State: (2, 0), Value: 3.06
State: (2, 1), Value: 3.48
State: (2, 2), Value: 2.86


In [14]:
# Improve the policy
print("\nImproving policy...")
improved_policy = policy_improvement(env, V)
print("Improved policy:")
for state, action in improved_policy.items():
    print(f"State: {state}, Action: {action}")


Improving policy...
Improved policy:
State: (0, 0), Action: up
State: (0, 1), Action: up
State: (0, 2), Action: up
State: (1, 0), Action: right
State: (1, 1), Action: right
State: (1, 2), Action: right
State: (2, 0), Action: up
State: (2, 1), Action: up
State: (2, 2), Action: down
