In [3]:
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt

def create_env():
    """Create the FrozenLake-v1 environment with slipping disabled.

    Returns:
        The environment object produced by ``gym.make``.
    """
    env_id = 'FrozenLake-v1'
    return gym.make(env_id, is_slippery=False)

def initialize_q_table(env):
    """Return an all-zero action-value table sized for ``env``.

    Args:
        env: Object exposing ``observation_space.n`` and ``action_space.n``.

    Returns:
        A NumPy array of zeros with shape
        ``(env.observation_space.n, env.action_space.n)``.
    """
    table_shape = (env.observation_space.n, env.action_space.n)
    return np.zeros(table_shape)

def decay_function(episode, total_train_episodes, min_epsilon=0.01):
    """Return the exploration rate to use for ``episode``.

    The raw value is ``1 - log10((episode + 1) / (0.1 * total_train_episodes))``.
    It is first capped at 1.0 and then floored at ``min_epsilon``.

    Args:
        episode: Zero-based index of the current training episode.
        total_train_episodes: Number of training episodes in the schedule.
        min_epsilon: Lower bound applied to the returned value.

    Returns:
        The exploration rate for this episode.
    """
    # Fraction of the first tenth of training that has elapsed so far.
    progress = (episode + 1) / (total_train_episodes * 0.1)
    raw_epsilon = 1.0 - np.log10(progress)
    capped_epsilon = min(1.0, raw_epsilon)
    return max(min_epsilon, capped_epsilon)

def choose_action(q_table, state, epsilon, env):
    """Pick an action epsilon-greedily and log the decision to stdout.

    Args:
        q_table: Action-value table indexed as ``q_table[state]``.
        state: Current state index.
        epsilon: Exploration threshold compared against one uniform draw.
        env: Environment whose ``action_space.sample()`` supplies random actions.

    Returns:
        A sampled action when the uniform draw is ``<= epsilon``, otherwise
        the index of the largest value in ``q_table[state]``.
    """
    print(f"State before accessing q_table: {state}")
    explore = np.random.random() <= epsilon
    # Exploration uses the behavior policy; exploitation uses the greedy target policy.
    action = env.action_space.sample() if explore else np.argmax(q_table[state])
    print(f"Action chosen: {action}")
    return action

def generate_episode(behavior_policy, q_table, env, max_env_steps):
    """Roll out one episode with ``behavior_policy`` and record the trajectory.

    Args:
        behavior_policy: Callable ``(q_table, state, env) -> action``.
        q_table: Action-value table forwarded unchanged to ``behavior_policy``.
        env: Environment whose ``reset()`` returns either an observation or an
            ``(observation, info)`` tuple, and whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        max_env_steps: Upper bound on the number of steps taken in the
            episode; ``None`` disables the bound.

    Returns:
        A ``(trajectory, total_reward)`` pair where ``trajectory`` is a list of
        ``(state, action, reward)`` tuples in the order they were experienced.
    """
    reset_result = env.reset()
    # reset() may return (observation, info); keep only the observation.
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    trajectory = []
    total_reward = 0
    steps = 0

    # The explicit step bound guarantees termination even if the environment
    # never reports the end of the episode.
    while max_env_steps is None or steps < max_env_steps:
        action = behavior_policy(q_table, state, env)
        new_state, reward, terminated, truncated, _info = env.step(action)
        if isinstance(new_state, tuple):
            new_state = new_state[0]  # Keep only the integer state component.
        trajectory.append((state, action, reward))
        total_reward += reward
        state = new_state
        steps += 1
        # Both natural termination and time-limit truncation end the episode.
        if terminated or truncated:
            break

    return trajectory, total_reward


def Monte_Carlo(env, total_train_episodes, gamma, max_epsilon, min_epsilon):
    """Off-policy Monte Carlo control with weighted importance sampling.

    Episodes are generated by an epsilon-greedy behavior policy built from the
    current ``q_table``; the target policy is greedy with respect to
    ``q_table``. Each episode is processed backwards, and the update for a
    state-action pair uses the ratio of the current importance weight to the
    cumulative weight seen so far for that pair, so the step size stays in
    ``(0, 1]``.

    Args:
        env: Environment exposing ``spec.max_episode_steps`` plus whatever
            ``initialize_q_table`` and ``generate_episode`` require.
        total_train_episodes: Number of episodes to train for.
        gamma: Discount factor applied to the return.
        max_epsilon: Upper bound on the exploration rate.
        min_epsilon: Lower bound on the exploration rate.

    Returns:
        A ``(q_table, rewards)`` pair where ``rewards`` holds the undiscounted
        total reward of every training episode, in order.
    """
    q_table = initialize_q_table(env)
    cumulative_weights = np.zeros_like(q_table)  # Sum of importance weights per (s, a).
    number_of_actions = q_table.shape[1]
    rewards = []
    max_env_steps = env.spec.max_episode_steps

    for episode in range(total_train_episodes):
        print('\nSTART EPISODE:', episode)
        decayed = decay_function(episode, total_train_episodes, min_epsilon)
        epsilon = max(min_epsilon, min(max_epsilon, decayed))
        print('\nEPSILON:', epsilon)

        # Bind epsilon as a default so the policy does not depend on the loop variable.
        behavior_policy = lambda q, s, e, eps=epsilon: choose_action(q, s, eps, e)
        # The behavior probabilities must be computed from the table that
        # actually generated the episode, not from the table as it is being
        # updated during the backward pass below.
        behavior_q = q_table.copy()
        trajectory, total_reward = generate_episode(behavior_policy, q_table, env, max_env_steps)

        G = 0.0  # Discounted return following the current step.
        W = 1.0  # Importance-sampling weight of the trajectory tail.

        for state, action, reward in reversed(trajectory):
            G = gamma * G + reward

            cumulative_weights[state, action] += W
            step_size = W / cumulative_weights[state, action]
            q_table[state, action] += step_size * (G - q_table[state, action])

            # The greedy target policy gives zero probability to any other
            # action, so earlier steps of this episode carry no information.
            if action != np.argmax(q_table[state]):
                break

            # Probability the epsilon-greedy behavior policy assigned to `action`:
            # uniform exploration mass, plus the greedy mass if it was the greedy pick.
            behavior_prob = epsilon / number_of_actions
            if action == np.argmax(behavior_q[state]):
                behavior_prob += 1.0 - epsilon
            W /= behavior_prob  # The target probability of the greedy action is 1.

        rewards.append(total_reward)
        if episode % 50 == 0:
            print(f"Episode {episode}, epsilon {epsilon:.4f}, reward {total_reward:.2f}")

    return q_table, rewards

def plot_rewards(rewards, episodes_per_point=1):
    """Plot the recorded rewards against the episode number and show the figure.

    Args:
        rewards: Sequence of rewards to plot, one entry per recorded point.
        episodes_per_point: Number of episodes between consecutive entries of
            ``rewards``. The default of 1 matches a list holding one reward
            per episode; pass a larger value when the rewards were sampled
            every N episodes.
    """
    # Place each reward at the episode it belongs to instead of stretching
    # the axis by a fixed factor.
    x = np.arange(len(rewards)) * episodes_per_point
    plt.plot(x, rewards, label='Monte Carlo Off-Policy')
    plt.xlabel('Episodes')
    plt.ylabel('Reward')
    plt.legend()
    plt.show()

def test_policy(env, q_table, num_episodes=10):
    """Evaluate the greedy policy derived from ``q_table``.

    Args:
        env: Environment exposing ``spec.max_episode_steps``, a ``reset()``
            returning an observation or an ``(observation, info)`` tuple, and a
            ``step(action)`` returning
            ``(observation, reward, terminated, truncated, info)``.
        q_table: Action-value table indexed as ``q_table[state]``.
        num_episodes: Number of evaluation episodes to run.

    Returns:
        The mean total reward over the evaluation episodes, or ``0.0`` when no
        episode was run.
    """
    rewards = []
    max_env_steps = env.spec.max_episode_steps

    for _ in range(num_episodes):
        reset_result = env.reset()
        # reset() may return (observation, info); keep only the observation.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        total_rewards = 0
        steps = 0

        # A greedy policy can get stuck forever (e.g. walking into a wall), so
        # the episode is bounded by the step limit as well as by the env flags.
        while max_env_steps is None or steps < max_env_steps:
            action = np.argmax(q_table[state])
            state, reward, terminated, truncated, _info = env.step(action)
            total_rewards += reward
            steps += 1
            if terminated or truncated:
                break

        rewards.append(total_rewards)

    if not rewards:
        return 0.0  # Avoid the NaN (and warning) np.mean gives for an empty list.
    return np.mean(rewards)


# Despite its name this is an evaluation helper, not a pytest test case;
# opt out of pytest collection so it is never invoked without arguments.
test_policy.__test__ = False

if __name__ == "__main__":
    # Script entry point: train on FrozenLake and release the environment.
    env = create_env()
    total_train_episodes = 500
    gamma = 0.99
    max_epsilon = 1.0
    min_epsilon = 0.01

    try:
        q_table, rewards = Monte_Carlo(env, total_train_episodes, gamma, max_epsilon, min_epsilon)
        # Optional follow-up steps, currently disabled:
        #plot_rewards(rewards)

        # average_reward = test_policy(env, q_table)
        #print(f"Average Reward: {average_reward}")
    finally:
        # Close the environment even if training raises.
        env.close()


START EPISODE: 0

EPSILON: 1.0
State before accessing q_table: 0
Action chosen: 2
State before accessing q_table: 1
Action chosen: 0
State before accessing q_table: 0
Action chosen: 2
State before accessing q_table: 1
Action chosen: 2
State before accessing q_table: 2
Action chosen: 1
State before accessing q_table: 6
Action chosen: 2
Episode 0, epsilon 1.0000, reward 0.00

START EPISODE: 1

EPSILON: 1.0
State before accessing q_table: 0
Action chosen: 0
State before accessing q_table: 0
Action chosen: 2
State before accessing q_table: 1
Action chosen: 1

START EPISODE: 2

EPSILON: 1.0
State before accessing q_table: 0
Action chosen: 1
State before accessing q_table: 4
Action chosen: 1
State before accessing q_table: 8
Action chosen: 0
State before accessing q_table: 8
Action chosen: 0
State before accessing q_table: 8
Action chosen: 0
State before accessing q_table: 8
Action chosen: 3
State before accessing q_table: 4
Action chosen: 2

START EPISODE: 3

EPSILON: 1.0
State before acce