In [21]:
import numpy as np

In [22]:
class GridWorld:
    def __init__(self):
        self.grid = np.array([
            [0, 1, 2, 3, 4],
            [5, 6, 7, 8, 9],
            [10, 11, 12, 13, 14],
            [15, 16, 17, 18, 19],
            [20, 21, 22, 23, 24]
        ])
        self.state = 0
        self.actions = np.array([0, # up
                                 1, # down
                                 2, # left
                                 3  # right
                                 ])
        self.rewards = np.array([-1, 0, 1])
        self.forbidden = np.array([1])
        self.game_over = False

    def num_states(self) -> int:
        return self.grid.size

    def num_actions(self) -> int:
        return self.actions.size

    def num_rewards(self) -> int:
        return self.rewards.size

    def reward(self, i: int) -> float:
        return self.rewards[i]

    def p(self, s: int, a: int, s_p: int, r_index: int) -> float:
        return 1 if s_p == self.next_state(s, a) and r_index == self.reward_index(s, a) else 0

    def state_id(self) -> int:
        return self.state

    def reset(self):
        self.state = 0
        self.game_over = False

    def display(self):
        print(self.grid)

    def is_forbidden(self, action: int) -> int:
        return action in self.forbidden

    def is_game_over(self) -> bool:
        return self.game_over

    def available_actions(self) -> np.ndarray:
        return self.actions

    def step(self, action: int):
        if self.is_game_over():
            return
    
        if self.is_forbidden(action):
            return
    
        next_state, reward = self.next_state(self.state, action)
        self.state = next_state
        self.game_over = self.is_terminal(self.state)
        return reward

    def score(self):
        return self.reward_index(self.state, 0)

    def next_state(self, state, action):
        if action == 0:
            next_state = state - 5
            if next_state < 0:
                return state
            else:
                return next_state
        elif action == 1:
            next_state = state + 5
            if next_state >= self.grid.size:
                return state
            else:
                return next_state
        elif action == 2:
            next_state = state - 1
            if next_state < 0 or next_state % 5 == 4:
                return state
            else:
                return next_state
        elif action == 3:
            next_state = state + 1
            if next_state >= self.grid.size or next_state % 5 == 0:
                return state
            else:
                return next_state

    def is_terminal(self, state):
        return state == 24

    def reward_index(self, state, param):
        if state < 0 or state >= self.grid.size:
            return -1
        elif state == 24:
            return 1
        else:
            return 0


In [23]:
class PolicyIteration:
    def __init__(self, env, gamma=0.9, theta=0.001, max_iter=1000):
        self.env = env
        self.gamma = gamma
        self.theta = theta 
        self.max_iter = max_iter
        self.policy = np.zeros(self.env.num_states(), dtype=int)
        self.value_function = np.zeros(self.env.num_states())

    def policy_iteration(self):
        for _ in range(self.max_iter):
            self.value_function = self.policy_evaluation()

            policy_stable = True
            for s in range(self.env.num_states()):
                old_action = self.policy[s]
                self.policy[s] = self.greedy_policy(s)
                if old_action != self.policy[s]:
                    policy_stable = False

            if policy_stable:
                break

        return self.policy, self.value_function

    def policy_evaluation(self):
        value_function = np.zeros(self.env.num_states())
        for _ in range(self.max_iter):
            delta = 0
            for s in range(self.env.num_states()):
                v = value_function[s]
                value_function[s] = self.expected_value(s)
                delta = max(delta, abs(v - value_function[s]))
            if delta < self.theta:
                break
        self.value_function = value_function
        return value_function

    def expected_value(self, s):
        action = self.policy[s]
        next_state = self.env.next_state(s, action)
        reward = self.env.reward_index(s, action)
        return reward + self.gamma * self.value_function[next_state]

    def greedy_policy(self, s):
        actions = self.env.available_actions()
        best_action = actions[0]
        best_value = self.expected_value(s)
        for action in actions[1:]:
            value = self.expected_value(s)
            if value > best_value:
                best_action = action
                best_value = value
        return best_action

In [24]:
env = GridWorld()
policy_iteration = PolicyIteration(env)
policy, value_function = policy_iteration.policy_iteration()
print("Politique optimale : ", policy)
print("Fonction de valeur optimale : ", value_function)

Politique optimale :  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Fonction de valeur optimale :  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1.]
