# Import des libs 

In [1]:
import ctypes
import platform
from collections import defaultdict
import numpy as np
import random

In [2]:
if platform.system().lower() == "windows":
    lib_path = "./libs/secret_envs.dll"
elif platform.system().lower() == "linux":
    lib_path = "./libs/libsecret_envs.so"
elif platform.system().lower() == "darwin":
    if "intel" in platform.processor().lower():
        lib_path = "./libs/libsecret_envs_intel_macos.dylib"
    else:
        lib_path = "./libs/libsecret_envs.dylib"

In [3]:
from secret_envs_wrapper import SecretEnv0Wrapper
from secret_envs_wrapper import SecretEnv1Wrapper
from secret_envs_wrapper import SecretEnv0
from secret_envs_wrapper import SecretEnv1

# Monte carlo
* ES
* on policy monte carlo control
* off policy monte carlo control

In [4]:
from collections import defaultdict

def run_episode(env):
    """Run a single episode in the given environment."""
    episode = []
    state = env.state_id()
    while not env.is_game_over():
        actions = env.available_actions()
        action = np.random.choice(actions)
        env.step(action)
        reward = env.score()
        next_state = env.state_id()
        episode.append((state, action, reward, next_state))
        state = next_state
    return episode

def monte_carlo_es(env, num_episodes):
    """Monte Carlo prediction to estimate state values."""
    # Initialize the value function
    value_table = defaultdict(float)
    returns = defaultdict(list)

    for _ in range(num_episodes):
        env.reset()
        episode = run_episode(env)

        # Calculate returns
        G = 0
        for t in reversed(range(len(episode))):
            state, action, reward, next_state = episode[t]
            G = reward + 0.99 * G  # Assuming discount factor gamma = 0.99
            if state not in [x[0] for x in episode[:t]]:  # First visit MC
                returns[state].append(G)
                value_table[state] = np.mean(returns[state])

    return value_table

In [5]:
env0 = SecretEnv0()
value_table_0 = monte_carlo_es(env0, 1000)
print("Value Function for SecretEnv0:")
for state, value in value_table_0.items():
    print(f"State {state}: Value {value:.2f}")

Value Function for SecretEnv0:
State 7984: Value -3.09
State 7856: Value -6.14
State 7728: Value -10.18
State 7584: Value -6.72
State 7456: Value -9.89
State 7400: Value -14.92
State 7265: Value -16.72
State 7093: Value -22.08
State 6960: Value -26.08
State 6840: Value -34.51
State 6709: Value -32.07
State 6576: Value -32.81
State 6520: Value -36.64
State 6369: Value -27.96
State 6241: Value -33.33
State 6113: Value -34.20
State 5941: Value -41.32
State 5808: Value -44.43
State 5680: Value -40.93
State 5624: Value -54.79
State 5493: Value -57.53
State 5368: Value -43.03
State 5233: Value -53.10
State 5113: Value -78.54
State 4978: Value -83.78
State 4850: Value -81.39
State 4726: Value -80.26
State 4601: Value -77.30
State 4466: Value -76.04
State 4342: Value -76.63
State 4181: Value -45.91
State 4056: Value -44.01
State 3925: Value -65.93
State 3800: Value -45.86
State 3673: Value -61.20
State 3542: Value -62.01
State 3409: Value -44.73
State 3289: Value -52.31
State 3154: Value -52.7

: 

In [6]:
def q_learning(env, learning_rate=0.1, discount_factor=0.99, exploration_rate=1.0, exploration_decay=0.995, min_exploration_rate=0.01, episodes=100):
    q_table = np.zeros((env.num_states(), env.num_actions()))

    for episode in range(episodes):
        state = env.state_id()
        env.reset()
        total_reward = 0
        inc = 0
        while not env.is_game_over():
            if random.uniform(0, 1) < exploration_rate:
                action = random.choice(env.available_actions())
            else:
                action = np.argmax(q_table[state, :])

            env.step(action)
            next_state = env.state_id()
            reward = env.score()

            best_next_action = np.argmax(q_table[next_state, :])
            td_target = reward + discount_factor * q_table[next_state, best_next_action]
            td_error = td_target - q_table[state, action]
            q_table[state, action] += learning_rate * td_error

            state = next_state
            total_reward += reward
            print(inc)
            inc += 1

        exploration_rate = max(min_exploration_rate, exploration_rate * exploration_decay)
        print(f"Episode {episode + 1}: Total Reward: {total_reward}, Exploration Rate: {exploration_rate}")

    return q_table

q_learning(env0, episodes=100)

In [None]:
import numpy as np
from collections import defaultdict

def run_episode_on_policy(env, policy):
    """Run a single episode using the given policy in the given environment."""
    episode = []
    state = env.state_id()
    while not env.is_game_over():
        actions = env.available_actions()
        if np.random.rand() < policy['epsilon']:
            action = np.random.choice(actions)  # Explore
        else:
            action = policy['policy'][state]  # Exploit
        env.step(action)
        reward = env.score()
        next_state = env.state_id()
        episode.append((state, action, reward, next_state))
        state = next_state
    return episode

def on_policy_monte_carlo_control(env, num_episodes, epsilon=0.1):
    """On-Policy Monte Carlo control using epsilon-greedy policy."""
    Q = defaultdict(lambda: np.zeros(env.num_actions()))
    returns = defaultdict(list)
    policy = {'policy': defaultdict(int), 'epsilon': epsilon}

    for episode_num in range(num_episodes):
        env.reset()
        episode = run_episode_on_policy(env, policy)

        G = 0
        for t in reversed(range(len(episode))):
            state, action, reward, next_state = episode[t]
            G = reward + 0.99 * G  # Assuming discount factor gamma = 0.99
            if (state, action) not in [(x[0], x[1]) for x in episode[:t]]:
                returns[(state, action)].append(G)
                Q[state][action] = np.mean(returns[(state, action)])
                best_action = np.argmax(Q[state])
                policy['policy'][state] = best_action

    return Q, policy

env0 = SecretEnv0()
Q_0, policy_0 = on_policy_monte_carlo_control(env0, 1000)
print("Q-values for SecretEnv0:")
for state, actions in Q_0.items():
    print(f"State {state}: {actions}")


# Policy iteration, politique et value function

In [None]:
import numpy as np

def policy_iteration(env_class):
    env = env_class()
    
    num_states = env.num_states()
    num_actions = env.num_actions()
    gamma = 0.99  # Discount factor

    # Step 1: Initialize policy randomly
    policy = np.random.choice(num_actions, num_states)
    
    def one_step_lookahead(state, V):
        A = np.zeros(num_actions)
        for a in range(num_actions):
            for next_state in range(num_states):
                for r_index in range(env.num_rewards()):
                    prob = env.p(state, a, next_state, r_index)
                    reward = env.reward(r_index)
                    A[a] += prob * (reward + gamma * V[next_state])
        return A

    def policy_evaluation(policy, V, theta=1e1):
        while True:
            delta = 0
            for s in range(num_states):
                v = 0
                for a in range(num_actions):
                    if policy[s] == a:
                        for next_state in range(num_states):
                            for r_index in range(env.num_rewards()):
                                prob = env.p(s, a, next_state, r_index)
                                reward = env.reward(r_index)
                                v += prob * (reward + gamma * V[next_state])
                delta = max(delta, np.abs(v - V[s]))
                V[s] = v
                print(delta)
            if delta < theta:
                break
        return V

    def policy_improvement(V):
        policy_stable = True
        for s in range(num_states):
            chosen_a = policy[s]
            action_values = one_step_lookahead(s, V)
            best_a = np.argmax(action_values)
            if chosen_a != best_a:
                policy_stable = False
            policy[s] = best_a
        return policy_stable
    print("Policy Iteration")
    # Step 2: Policy Iteration
    V = np.zeros(num_states)
    while True:
        V = policy_evaluation(policy, V)
        policy_stable = policy_improvement(V)
        if policy_stable:
            break

    return policy, V


In [None]:
# Example usage:
"""policy0, V0 = policy_iteration(SecretEnv0)

print("Optimal policy for SecretEnv0:")
print(policy0)
print("Value function for SecretEnv0:")
print(V0)
"""


'policy0, V0 = policy_iteration(SecretEnv0)\n\nprint("Optimal policy for SecretEnv0:")\nprint(policy0)\nprint("Value function for SecretEnv0:")\nprint(V0)\n'

In [None]:
policy1, V1 = policy_iteration(SecretEnv1)

print("Optimal policy for SecretEnv1:")
print(policy1)
print("Value function for SecretEnv1:")
print(V1)

Policy Iteration
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0