<a href="https://colab.research.google.com/github/Codeadi01/RL/blob/main/RL_EXP7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

# A small deterministic MDP with 3 states and 2 actions (0 and 1).
#   * action 0 keeps the agent in its current state
#   * action 1 moves it one state to the right (0 -> 1 -> 2)
#   * state 2 is absorbing: both actions lead back to state 2
# Every (state, action) pair yields a reward of -1.
# NOTE: because the reward is -1 everywhere (including the absorbing state),
# every policy has the same discounted value, -1 / (1 - gamma), in every state.

num_states = 3
num_actions = 2

# transition_probs[state, action, next_state] is the probability of landing in
# next_state. Transitions are deterministic, so each (state, action) row is a
# one-hot vector; it is built by picking rows of the identity matrix with a
# table whose entry [state, action] is the successor state.
transition_probs = np.eye(num_states, dtype=int)[np.array([
    [0, 1],  # from state 0: action 0 -> state 0, action 1 -> state 1
    [1, 2],  # from state 1: action 0 -> state 1, action 1 -> state 2
    [2, 2],  # from state 2: both actions -> state 2
])]

# rewards[state, action]: a constant step cost of -1.
rewards = np.full((num_states, num_actions), -1)

# Discount factor
gamma = 0.9

# Starting policy: uniform over actions in every state.
# policy[state, action] is the probability of choosing `action` in `state`.
policy = np.full((num_states, num_actions), 1 / num_actions)

def evaluate_policy(policy, transition_probs, rewards, gamma):
    """Compute the state values of ``policy`` exactly.

    The Bellman expectation equations are linear in the state values V:

        (I - gamma * P_pi) V = R_pi

    where ``P_pi[s, s']`` is the policy-averaged transition probability and
    ``R_pi[s]`` the policy-averaged immediate reward. The system is assembled
    one action at a time and handed to ``np.linalg.solve``.

    Args:
        policy: array indexed ``[state, action]`` with action probabilities.
        transition_probs: array indexed ``[state, action, next_state]``.
        rewards: array indexed ``[state, action]``.
        gamma: discount factor.

    Returns:
        1-D array with one value per state.
    """
    n_states, n_actions = policy.shape
    lhs = np.eye(n_states)
    rhs = np.zeros(n_states)

    for action in range(n_actions):
        # Probability of taking `action`, one entry per state.
        weight = policy[:, action]
        # Subtract this action's share of gamma * P_pi from every row at once.
        lhs -= (weight * gamma)[:, np.newaxis] * transition_probs[:, action]
        # Accumulate this action's share of the expected immediate reward.
        rhs += weight * rewards[:, action]

    return np.linalg.solve(lhs, rhs)

def improve_policy(policy, values, transition_probs, rewards, gamma):
    """Return the greedy (one-hot) policy with respect to ``values``.

    For each state the one-step lookahead value of every action is

        Q(s, a) = sum_s' P(s' | s, a) * (R(s, a) + gamma * V(s'))

    and the action with the largest Q receives probability 1. Ties go to the
    lowest action index, which is what ``np.argmax`` returns.

    The ``policy`` argument is only read for its shape and dtype and is NOT
    modified: a freshly allocated array is returned. This lets a caller
    compare the old and the new policy to detect convergence; writing into
    the input would make that comparison trivially true.

    Args:
        policy: array indexed ``[state, action]``; supplies shape and dtype.
        values: 1-D array of state values, one per state.
        transition_probs: array indexed ``[state, action, next_state]``.
        rewards: array indexed ``[state, action]``.
        gamma: discount factor.

    Returns:
        New array shaped like ``policy`` with exactly one 1 per row.
    """
    num_states, num_actions = policy.shape
    improved = np.zeros_like(policy)

    for s in range(num_states):
        # One-step lookahead value of each action available in state s.
        action_values = np.array([
            np.dot(transition_probs[s, a], rewards[s, a] + gamma * values)
            for a in range(num_actions)
        ])
        improved[s, np.argmax(action_values)] = 1

    return improved

# Policy iteration: alternate exact policy evaluation with greedy improvement
# until the policy stops changing or the iteration budget is exhausted.
num_iterations = 100
for i in range(num_iterations):
    values = evaluate_policy(policy, transition_probs, rewards, gamma)
    # Improve a copy rather than `policy` itself: if improve_policy writes into
    # the array it receives, `new_policy` and `policy` would be the same object
    # and the convergence check below would succeed on the very first pass.
    new_policy = improve_policy(policy.copy(), values, transition_probs, rewards, gamma)

    # Converged once a greedy improvement step leaves the policy unchanged.
    if np.array_equal(new_policy, policy):
        print(f"Policy iteration converged after {i+1} iterations.")
        break

    policy = new_policy

print("Optimal Policy:")
print(policy)


Policy iteration converged after 1 iterations.
Optimal Policy:
[[1. 0.]
 [1. 0.]
 [1. 0.]]
