<a href="https://colab.research.google.com/github/Codeadi01/RL/blob/main/RL_EXP8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# Value iteration on a small, fully known MDP.
#
# The MDP has 3 states and 2 actions (0 and 1). Transitions are deterministic
# and every step yields a reward of -1, so maximizing the discounted return is
# the same as minimizing the discounted total cost.
#
# NOTE(review): every (state, action) pair below -- including the self-loop in
# state 2 -- yields -1, so every policy has the same discounted return,
# -1 / (1 - gamma) = -10 for gamma = 0.9. The policy printed at the end is
# therefore decided purely by argmax tie-breaking (the lowest action index
# wins), not by a real preference. If state 2 is meant to be a zero-cost goal
# state, its rewards should be 0 -- confirm the intended model.

num_states = 3
num_actions = 2

# Transition probabilities: transition_probs[state, action, next_state]
transition_probs = np.array([
    [[1, 0, 0], [0, 1, 0]],  # From state 0, action 0 goes to state 0 and action 1 goes to state 1
    [[0, 1, 0], [0, 0, 1]],  # From state 1, action 0 goes to state 1 and action 1 goes to state 2
    [[0, 0, 1], [0, 0, 1]],  # From state 2, both actions go to state 2
])

# Rewards: rewards[state, action]
rewards = np.array([
    [-1, -1],
    [-1, -1],
    [-1, -1],
])

# Discount factor
gamma = 0.9

# Fail fast on a malformed model: the arrays must agree on the state/action
# counts and every transition_probs[s, a] row must be a probability distribution.
assert transition_probs.shape == (num_states, num_actions, num_states)
assert rewards.shape == (num_states, num_actions)
assert np.allclose(transition_probs.sum(axis=-1), 1.0)


def compute_action_values(transition_probs, rewards, values, gamma):
    """Return the one-step look-ahead action values for every state.

    Computes Q[s, a] = sum over s' of P[s, a, s'] * (R[s, a] + gamma * V[s']).

    Parameters
    ----------
    transition_probs : ndarray, shape (S, A, S)
        transition_probs[s, a, s'] is the probability of moving to s'.
    rewards : ndarray, shape (S, A)
        Reward received for taking action a in state s.
    values : ndarray, shape (S,)
        Current state-value estimates.
    gamma : float
        Discount factor.

    Returns
    -------
    ndarray, shape (S, A)
        Expected discounted return of each action in each state.
    """
    # Broadcast to (S, A, S'): reward of (s, a) plus discounted value of s'.
    backed_up = rewards[:, :, np.newaxis] + gamma * values[np.newaxis, np.newaxis, :]
    return np.sum(transition_probs * backed_up, axis=-1)


# Initialize values arbitrarily
values = np.zeros(num_states)

# Value iteration algorithm: repeat the Bellman optimality backup until the
# value estimates stop changing (np.allclose default tolerances).
num_iterations = 100
for i in range(num_iterations):
    new_values = compute_action_values(transition_probs, rewards, values, gamma).max(axis=1)

    # Compare against the previous estimate, then always keep the newest one so
    # the policy below is extracted from the most recent backup.
    converged = np.allclose(new_values, values)
    values = new_values

    if converged:
        print(f"Value iteration converged after {i+1} iterations.")
        break
else:
    # Only reached when the loop finishes without hitting `break`.
    print(f"Value iteration did not converge within {num_iterations} iterations.")

# Extract the optimal policy from the computed values: a one-hot row per state
# marking the greedy action (np.argmax resolves ties to the lowest index).
action_values = compute_action_values(transition_probs, rewards, values, gamma)
best_actions = np.argmax(action_values, axis=1)

optimal_policy = np.zeros((num_states, num_actions))
optimal_policy[np.arange(num_states), best_actions] = 1

print("Optimal Policy:")
print(optimal_policy)


Value iteration converged after 89 iterations.
Optimal Policy:
[[1. 0.]
 [1. 0.]
 [1. 0.]]
