In [33]:
import numpy as np

In [34]:
class Environment:
    """Rectangular grid world whose cells are numbered row by row.

    Attributes:
        size: indexable pair; ``size[0]`` is the number of rows and
            ``size[1]`` the number of columns.
        goal_state: id of the goal cell (stored only, not used by this class).
        rewards: mapping from state id to reward, queried through ``.get``.
        actions: the four supported move names.
    """

    def __init__(self, size, goal_state, rewards):
        self.size, self.goal_state, self.rewards = size, goal_state, rewards
        self.actions = ['up', 'down', 'left', 'right']

    def get_next_state(self, state, action):
        """Return the state id reached by taking ``action`` from ``state``.

        A move that would cross the grid border is clamped, so the agent stays
        on the border cell. An action name outside the four known moves leaves
        the position unchanged.
        """
        width = self.size[1]
        r, c = divmod(state, width)
        if action == 'up':
            return max(r - 1, 0) * width + c
        if action == 'down':
            return min(r + 1, self.size[0] - 1) * width + c
        if action == 'left':
            return r * width + max(c - 1, 0)
        if action == 'right':
            return r * width + min(c + 1, width - 1)
        # Unrecognised action: re-encode the same cell.
        return r * width + c

    def get_reward(self, state):
        """Return the reward registered for ``state``, or -1 when none is set."""
        fallback_reward = -1
        return self.rewards.get(state, fallback_reward)

In [35]:
class Parameters:
    """Plain container bundling the settings of one experiment.

    Stores the four constructor arguments unchanged as the attributes
    ``size``, ``goal_state``, ``rewards`` and ``gamma``.
    """

    def __init__(self, size, goal_state, rewards, gamma):
        (self.size,
         self.goal_state,
         self.rewards,
         self.gamma) = (size, goal_state, rewards, gamma)

In [36]:
class PolicyIteration:
    """Tabular policy iteration for a deterministic grid-like environment.

    The ``environment`` object must expose ``size`` (indexable, rows then
    columns), ``goal_state``, ``actions``, ``get_next_state(state, action)``
    and ``get_reward(state)``.

    Attributes:
        environment: the environment being solved.
        gamma: discount factor applied to the value of the successor state.
        policy: dict mapping every state id to its currently chosen action.
        state_values: flat NumPy array of value estimates, indexed by state id.
    """

    def __init__(self, environment, gamma=1.0):
        self.environment = environment
        self.gamma = gamma
        self.policy = {}
        self.state_values = np.zeros(environment.size[0] * environment.size[1])
        self.initialize_policy()

    def _num_states(self):
        """Return the number of cells in the grid (rows * columns)."""
        return self.environment.size[0] * self.environment.size[1]

    def initialize_policy(self):
        """Assign a uniformly random action to every state, goal included."""
        for state in range(self._num_states()):
            self.policy[state] = np.random.choice(self.environment.actions)

    def policy_evaluation(self, iterations=100):
        """Estimate the value of the current policy with synchronous sweeps.

        Runs at most ``iterations`` sweeps. Each sweep reads the values of the
        previous sweep and writes into a copy. The goal state is never
        updated, so it keeps its initial value.
        """
        goal = self.environment.goal_state
        n_states = self._num_states()
        for _ in range(iterations):
            # Write into a copy so every update in this sweep reads the old values.
            new_state_values = np.copy(self.state_values)
            for state in range(n_states):
                if state == goal:
                    continue
                next_state = self.environment.get_next_state(state, self.policy[state])
                reward = self.environment.get_reward(next_state)
                new_state_values[state] = reward + self.gamma * self.state_values[next_state]
            # Once a sweep reproduces its input exactly, every further sweep
            # would too, so stopping here yields the same values.
            reached_fixed_point = np.array_equal(new_state_values, self.state_values)
            self.state_values = new_state_values
            if reached_fixed_point:
                break

    def policy_improvement(self):
        """Make the policy greedy with respect to the current state values.

        Returns:
            True when no state changed its action, False otherwise.
        """
        policy_stable = True
        goal = self.environment.goal_state
        for state in range(self._num_states()):
            if state == goal:
                continue
            old_action = self.policy[state]
            best_action = None
            best_value = float('-inf')
            for action in self.environment.actions:
                next_state = self.environment.get_next_state(state, action)
                reward = self.environment.get_reward(next_state)
                value = reward + self.gamma * self.state_values[next_state]
                # Strict '>' keeps the first action in list order on ties.
                if value > best_value:
                    best_value = value
                    best_action = action
            self.policy[state] = best_action
            if best_action != old_action:
                policy_stable = False
        return policy_stable

    def policy_iteration(self, max_iterations=None):
        """Alternate evaluation and improvement until the policy is stable.

        Args:
            max_iterations: optional cap on the number of
                evaluation/improvement rounds. ``None`` (the default) means
                no cap, as before.

        Returns:
            True if the policy stabilised, False if the cap was reached first.
        """
        rounds = 0
        while max_iterations is None or rounds < max_iterations:
            self.policy_evaluation()
            if self.policy_improvement():
                return True
            rounds += 1
        return False

    def find_best_path_for_goal(self, start_state):
        """Follow the current policy from ``start_state`` to the goal.

        Returns:
            The list of visited state ids, ending with the goal state.

        Raises:
            ValueError: if following the policy revisits a state. Transitions
                are deterministic, so a revisit means the goal can never be
                reached and the walk would otherwise never terminate.
        """
        goal = self.environment.goal_state
        path = []
        visited = set()
        current_state = start_state
        while current_state != goal:
            if current_state in visited:
                raise ValueError(
                    f"Policy revisits state {current_state} before reaching goal "
                    f"state {goal}; no path from state {start_state}."
                )
            visited.add(current_state)
            path.append(current_state)
            current_action = self.policy[current_state]
            current_state = self.environment.get_next_state(current_state, current_action)
        path.append(goal)
        return path

In [40]:
print("Policy Iteration - Grid World")
# 6x6 grid: state 8 is the goal (reward 10), state 3 carries a -5 reward, gamma is 1.0.
param = Parameters((6,6),8,{8: 10,3:-5},1.0)
# Initialize the Environment
environment = Environment(param.size, param.goal_state, param.rewards)
# Initialize the Agent with the configured discount factor.
# param.gamma used to be ignored here, so changing it in Parameters had no effect.
agent = PolicyIteration(environment, param.gamma)

# Perform policy iteration
agent.policy_iteration()

# Print the state values and policy
print("State Values:")
print(agent.state_values.reshape(param.size))
print("\nPolicy:")
for row in range(param.size[0]):
    for col in range(param.size[1]):
        state = row * param.size[1] + col
        if state == param.goal_state:
            print(" G ", end=" ")
        else:
            print(agent.policy[state], end=" ")
    print()

# Find and print the best path from a starting state to the goal state
start_state = 0
best_path = agent.find_best_path_for_goal(start_state)
print("\nBest Path from state 0 to goal:")
print(best_path)

Policy Iteration - Grid World
State Values:
[[ 8.  9. 10.  9.  8.  7.]
 [ 9. 10.  0. 10.  9.  8.]
 [ 8.  9. 10.  9.  8.  7.]
 [ 7.  8.  9.  8.  7.  6.]
 [ 6.  7.  8.  7.  6.  5.]
 [ 5.  6.  7.  6.  5.  4.]]

Policy:
down down down down down down 
right right  G  left left left 
up up up up up up 
up up up up up up 
up up up up up up 
up up up up up up 

Best Path from state 0 to goal:
[0, 6, 7, 8]


In [38]:
from RL_project.secret_envs_wrapper import SecretEnv0

print("Policy Iteration - Secret Env 0")
# Initialize the Environment
environment = SecretEnv0()
# Initialize the Agent.
# `Agent` is not defined anywhere in this notebook, so use PolicyIteration, the agent class defined above.
# NOTE(review): the recorded run of this cell failed with
# "'SecretEnv0' object has no attribute 'size'". PolicyIteration reads
# size, goal_state, actions, get_next_state and get_reward from the environment,
# so SecretEnv0 presumably needs an adapter exposing them — TODO confirm against its API.
agent = PolicyIteration(environment, param.gamma)

# Perform policy iteration
agent.policy_iteration()

# Print the state values and policy
# NOTE(review): `param` still holds the Grid World settings from the earlier cell;
# verify that its size and goal_state apply to this environment.
print("State Values:")
print(agent.state_values.reshape(param.size))
print("\nPolicy:")
for row in range(param.size[0]):
    for col in range(param.size[1]):
        state = row * param.size[1] + col
        if state == param.goal_state:
            print(" G ", end=" ")
        else:
            print(agent.policy[state], end=" ")
    print()

# Find and print the best path from a starting state to the goal state
start_state = 0
best_path = agent.find_best_path_for_goal(start_state)
print("\nBest Path from state 0 to goal:")
print(best_path)

Policy Iteration - Secret Env 0


AttributeError: 'SecretEnv0' object has no attribute 'size'

In [None]:
from RL_project.secret_envs_wrapper import SecretEnv1

print("Policy Iteration - Secret Env 1")
# Initialize the Environment
environment = SecretEnv1()
# Initialize the Agent.
# `Agent` is not defined anywhere in this notebook, so use PolicyIteration, the agent class defined above.
# Its second argument is the discount factor, so pass param.gamma rather than the whole `param` object.
# NOTE(review): PolicyIteration reads size, goal_state, actions, get_next_state and
# get_reward from the environment; SecretEnv1 presumably needs an adapter exposing
# them — TODO confirm against its API.
agent = PolicyIteration(environment, param.gamma)

# Perform policy iteration
agent.policy_iteration()

# Print the state values and policy
# NOTE(review): `param` still holds the Grid World settings from the earlier cell;
# verify that its size and goal_state apply to this environment.
print("State Values:")
print(agent.state_values.reshape(param.size))
print("\nPolicy:")
for row in range(param.size[0]):
    for col in range(param.size[1]):
        state = row * param.size[1] + col
        if state == param.goal_state:
            print(" G ", end=" ")
        else:
            print(agent.policy[state], end=" ")
    print()

# Find and print the best path from a starting state to the goal state
start_state = 0
best_path = agent.find_best_path_for_goal(start_state)
print("\nBest Path from state 0 to goal:")
print(best_path)

In [None]:
from RL_project.secret_envs_wrapper import SecretEnv2

print("Policy Iteration - Secret Env 2")
# Initialize the Environment
environment = SecretEnv2()
# Initialize the Agent.
# `Agent` is not defined anywhere in this notebook, so use PolicyIteration, the agent class defined above.
# Its second argument is the discount factor, so pass param.gamma rather than the whole `param` object.
# NOTE(review): PolicyIteration reads size, goal_state, actions, get_next_state and
# get_reward from the environment; SecretEnv2 presumably needs an adapter exposing
# them — TODO confirm against its API.
agent = PolicyIteration(environment, param.gamma)

# Perform policy iteration
agent.policy_iteration()

# Print the state values and policy
# NOTE(review): `param` still holds the Grid World settings from the earlier cell;
# verify that its size and goal_state apply to this environment.
print("State Values:")
print(agent.state_values.reshape(param.size))
print("\nPolicy:")
for row in range(param.size[0]):
    for col in range(param.size[1]):
        state = row * param.size[1] + col
        if state == param.goal_state:
            print(" G ", end=" ")
        else:
            print(agent.policy[state], end=" ")
    print()

# Find and print the best path from a starting state to the goal state
start_state = 0
best_path = agent.find_best_path_for_goal(start_state)
print("\nBest Path from state 0 to goal:")
print(best_path)

In [None]:
from RL_project.secret_envs_wrapper import SecretEnv3

print("Policy Iteration - Secret Env 3")
# Initialize the Environment
environment = SecretEnv3()
# Initialize the Agent.
# `Agent` is not defined anywhere in this notebook, so use PolicyIteration, the agent class defined above.
# Its second argument is the discount factor, so pass param.gamma rather than the whole `param` object.
# NOTE(review): PolicyIteration reads size, goal_state, actions, get_next_state and
# get_reward from the environment; SecretEnv3 presumably needs an adapter exposing
# them — TODO confirm against its API.
agent = PolicyIteration(environment, param.gamma)

# Perform policy iteration
agent.policy_iteration()

# Print the state values and policy
# NOTE(review): `param` still holds the Grid World settings from the earlier cell;
# verify that its size and goal_state apply to this environment.
print("State Values:")
print(agent.state_values.reshape(param.size))
print("\nPolicy:")
for row in range(param.size[0]):
    for col in range(param.size[1]):
        state = row * param.size[1] + col
        if state == param.goal_state:
            print(" G ", end=" ")
        else:
            print(agent.policy[state], end=" ")
    print()

# Find and print the best path from a starting state to the goal state
start_state = 0
best_path = agent.find_best_path_for_goal(start_state)
print("\nBest Path from state 0 to goal:")
print(best_path)