In [1]:
import numpy as np

In [2]:
from secret_envs_wrapper import SecretEnv1

In [3]:
class PolicyIteration:
    def __init__(self, env, gamma=0.9, theta=0.001, max_iter=1000):
        self.env = env
        self.gamma = gamma
        self.theta = theta 
        self.max_iter = max_iter
        self.policy = np.zeros(self.env.num_states(), dtype=int)
        self.value_function = np.zeros(self.env.num_states())

    def policy_iteration(self):
        for _ in range(self.max_iter):
            self.value_function = self.policy_evaluation()

            policy_stable = True
            for s in range(self.env.num_states()):
                old_action = self.policy[s]
                self.policy[s] = self.greedy_policy(s)
                if old_action != self.policy[s]:
                    policy_stable = False

            if policy_stable:
                break

        return self.policy, self.value_function

    def policy_evaluation(self):
        value_function = np.zeros(self.env.num_states())
        for _ in range(self.max_iter):
            delta = 0
            for s in range(self.env.num_states()):
                v = value_function[s]
                value_function[s] = self.expected_value(s)
                delta = max(delta, abs(v - value_function[s]))
            if delta < self.theta:
                break
        self.value_function = value_function
        return value_function

    def expected_value(self, s):
        action = self.policy[s]
        expected_value = 0
        for s_prime in range(self.env.num_states()):
            transition_prob = self.env.p(s, action, s_prime, action)
            reward = self.env.reward(s_prime)
            expected_value += transition_prob * (reward + self.gamma * self.value_function[s_prime])
        return expected_value

    def greedy_policy(self, s):
        actions = self.env.available_actions(s)
        best_action = actions[0]
        best_value = self.expected_value_for_action(s, actions[0])
        for action in actions[1:]:
            value = self.expected_value_for_action(s, action)
            if value > best_value:
                best_action = action
                best_value = value
        return best_action

    def expected_value_for_action(self, s, action):
        expected_value = 0
        for s_prime in range(self.env.num_states()):
            transition_prob = self.env.p(s, action, s_prime, action)
            reward = self.env.reward(s_prime)
            expected_value += transition_prob * (reward + self.gamma * self.value_function[s_prime])
        return expected_value

In [4]:
class ValueIteration:
    def __init__(self, env, gamma=0.9, theta=0.001, max_iter=1000):
        self.env = env
        self.gamma = gamma
        self.theta = theta
        self.max_iter = max_iter
        self.value_function = np.zeros(self.env.num_states())
        self.policy = np.zeros(self.env.num_states(), dtype=int)

    def value_iteration(self):
        for _ in range(self.max_iter):
            delta = 0
            new_value_function = np.copy(self.value_function)
            
            for s in range(self.env.num_states()):
                old_value = self.value_function[s]
                new_value_function[s] = self.compute_value_for_state(s)
                delta = max(delta, abs(old_value - new_value_function[s]))
            
            self.value_function = new_value_function
            
            if delta < self.theta:
                break

        # Compute the policy after convergence
        self.compute_policy()

        return self.policy, self.value_function

    def compute_value_for_state(self, state):
        best_value = float('-inf')
        for action in self.env.available_actions(state):  # Assurez-vous que cette méthode accepte un seul argument
            value = self.compute_value_for_action(state, action)
            if value > best_value:
                best_value = value
        return best_value

    def compute_value_for_action(self, state, action):
        expected_value = 0
        for next_state in range(self.env.num_states()):
            transition_prob = self.env.p(state, action, next_state, action)
            reward = self.env.reward(next_state)
            expected_value += transition_prob * (reward + self.gamma * self.value_function[next_state])
        return expected_value

    def compute_policy(self):
        for s in range(self.env.num_states()):
            best_action = None
            best_value = float('-inf')
            
            for action in self.env.available_actions(s):  # Assurez-vous que cette méthode accepte un seul argument
                value = self.compute_value_for_action(s, action)
                if value > best_value:
                    best_action = action
                    best_value = value
            
            self.policy[s] = best_action

In [None]:
env = SecretEnv1()
policy_iteration = PolicyIteration(env)
policy, value_function = policy_iteration.policy_iteration()
print("Politique optimale : ", policy)
print("Fonction de valeur optimale : ", value_function)

In [5]:
env = SecretEnv1()
value_iteration = ValueIteration(env)
value, value_function = value_iteration.value_iteration()
print("Politique optimale : ", policy)
print("Fonction de valeur optimale : ", value_function)

TypeError: SecretEnv1.available_actions() takes 1 positional argument but 2 were given