In [11]:
import numpy as np

In [12]:
class Environment:
    def __init__(self, size, goal_state, rewards):
        self.size = size
        self.goal_state = goal_state
        self.rewards = rewards
        self.actions = ['up', 'down', 'left', 'right']

    def get_next_state(self, state, action):
        row, col = divmod(state, self.size[1])
        if action == 'up':
            row = max(row - 1, 0)
        elif action == 'down':
            row = min(row + 1, self.size[0] - 1)
        elif action == 'left':
            col = max(col - 1, 0)
        elif action == 'right':
            col = min(col + 1, self.size[1] - 1)
        return row * self.size[1] + col

    def get_reward(self, state):
        return self.rewards.get(state, -1)

In [13]:
class Parameters:
    def __init__(self, size, goal_state, rewards,gamma):
        self.size = size
        self.goal_state = goal_state
        self.rewards = rewards
        self.gamma = gamma

In [14]:
class Agent:
    def __init__(self, environment, parameters, gamma=1.0):
        self.environment = environment
        self.gamma = gamma
        self.policy = {state: None for state in range(environment.size[0] * environment.size[1])}
        self.state_values = np.zeros(environment.size[0] * environment.size[1])
        self.parameters = parameters

    # Value Iteration
    def value_iteration(self, theta=1e-9, max_iterations=1000):
        delta = float('inf')
        iteration = 0
        while delta > theta and iteration < max_iterations:
            delta = 0
            for state in range(self.environment.size[0] * self.environment.size[1]):
                if state == self.environment.goal_state:
                    continue
                value = self.state_values[state]
                best_value = float('-inf')
                for action in self.environment.actions:
                    next_state = self.environment.get_next_state(state, action)
                    reward = self.environment.get_reward(next_state)
                    next_value = reward + self.gamma * self.state_values[next_state]
                    best_value = max(best_value, next_value)
                self.state_values[state] = best_value
                delta = max(delta, abs(value - self.state_values[state]))
            iteration += 1

    # Extract policy from value function
    def extract_policy(self):
        for state in range(self.environment.size[0] * self.environment.size[1]):
            if state == self.environment.goal_state:
                continue
            best_action = None 
            best_value = float('-inf') 
            for action in self.environment.actions:
                next_state = self.environment.get_next_state(state, action)
                reward = self.environment.get_reward(next_state)
                value = reward + self.gamma * self.state_values[next_state]
                if value > best_value:
                    best_value = value
                    best_action = action
            if state not in self.policy:
                self.policy[state] = best_action
            else:
                self.policy[state] = best_action

    # Run Value Iteration and extract policy
    def run_value_iteration(self):
        self.value_iteration()
        self.extract_policy()

    # Find best path for goal
    def find_best_path_for_goal(self, start_state):
        path = []
        current_state = start_state
        while current_state != self.environment.goal_state:
            path.append(current_state)
            current_action = self.policy[current_state]
            current_state = self.environment.get_next_state(current_state, current_action)
        path.append(self.environment.goal_state)
        return path

In [15]:
param = Parameters((3, 3),8,{8: 10,3:-5},1.0)
# Initialize the Environment
environment = Environment(param.size, param.goal_state, param.rewards)
# Initialize the Agent
agent = Agent(environment,param)

# Perform policy iteration
agent.value_iteration()

# Print the state values and policy
print("State Values:")
print(agent.state_values.reshape(param.size))
print("\nPolicy:")
for row in range(param.size[0]):
    for col in range(param.size[1]):
        state = row * param.size[1] + col
        if state == param.goal_state:
            print(" G ", end=" ")
        else:
            print(agent.policy[state], end=" ")
    print()

 # Find and print the best path from a starting state to the goal state
start_state = 0
best_path = agent.find_best_path_for_goal(start_state)
print("\nBest Path from state 0 to goal:")
print(best_path)

State Values:
[[ 7.  8.  9.]
 [ 8.  9. 10.]
 [ 9. 10.  0.]]

Policy:
None None None 
None None None 
None None  G  



KeyboardInterrupt

