In [1]:
import random
import numpy as np
from RL.Parameters import *
from RL.Environment import *

In [2]:
class MonteCarloESAgent:
    """Tabular Monte Carlo control agent with an epsilon-greedy policy.

    The agent keeps one state-value estimate and one action *index* per state.
    Episodes are sampled with ``generate_episode`` and the estimates are
    updated every-visit, with a constant step size ``alpha``, while walking
    each episode backwards.

    Note: despite the "ES" in the name, every episode starts from whatever
    ``environment.reset()`` returns and exploration comes from the epsilon
    branch in ``select_action``.

    The environment is expected to expose ``size``, ``actions``,
    ``goal_state``, ``reset()``, ``step(state, action)``,
    ``get_next_states(state)`` and ``get_next_state(state, action)``; those
    are the only members this class touches.

    Attributes:
        environment: The environment object the agent interacts with.
        gamma: Discount factor applied to future rewards.
        epsilon: Probability of taking a random action / random policy entry.
        alpha: Step size of the incremental value update.
        policy: Integer array, one action index per state.
        state_values: Float array, one value estimate per state.
        returns: Dict mapping each state to the list of returns observed
            for it (recorded for inspection; not read by this class).
    """

    def __init__(self, environment, gamma=0.9, epsilon=0.1, alpha=0.1):
        """Store the hyper-parameters and build the per-state tables."""
        self.environment = environment
        self.gamma = gamma
        self.epsilon = epsilon
        self.alpha = alpha
        # Number of discrete states = product of the environment dimensions.
        n_states = int(np.prod(environment.size))
        self.policy = np.zeros(n_states, dtype=int)
        self.state_values = np.zeros(n_states)
        self.returns = {state: [] for state in range(n_states)}
        self.initialize_policy()

    def initialize_policy(self):
        """Assign a uniformly random action index to every state."""
        n_actions = len(self.environment.actions)
        self.policy[:] = np.random.randint(n_actions, size=len(self.policy))

    def select_action(self, state):
        """Return an action for ``state`` using an epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the action the current policy stores for ``state``.

        The random action is drawn by index rather than with
        ``np.random.choice(actions)``: the latter raises ``ValueError`` when
        the actions are not 1-D (for example tuples) and converts elements to
        numpy scalars instead of returning the environment's own objects.
        """
        actions = self.environment.actions
        if np.random.rand() < self.epsilon:
            return actions[np.random.randint(len(actions))]
        return actions[self.policy[state]]

    def generate_episode(self, max_steps=100):
        """Roll out one episode and return its transitions.

        Args:
            max_steps: Upper bound on the number of environment steps.

        Returns:
            A list of ``(state, action, reward, next_state, done)`` tuples,
            ending at the first transition whose ``done`` flag is true or
            after ``max_steps`` transitions, whichever comes first.
        """
        state = self.environment.reset()
        episode = []
        for _ in range(max_steps):
            action = self.select_action(state)
            next_state, reward, done = self.environment.step(state, action)
            episode.append((state, action, reward, next_state, done))
            if done:
                break
            state = next_state
        return episode

    def monte_carlo_es(self, episodes=20):
        """Train the agent on ``episodes`` sampled episodes.

        Each episode is traversed from its last transition to its first so
        the discounted return ``G`` can be accumulated incrementally. For
        every visited state the return is recorded, the value estimate is
        moved towards ``G`` by ``alpha``, and the policy entry is refreshed:
        random with probability ``epsilon``, otherwise the index of the
        highest-valued successor state.

        Args:
            episodes: Number of episodes to generate and learn from.
        """
        n_actions = len(self.environment.actions)
        for i in range(episodes):
            print(f"Generating episode {i+1}...")
            episode = self.generate_episode()
            G = 0
            for state, _action, reward, _next_state, _done in reversed(episode):
                G = self.gamma * G + reward
                self.returns[state].append(G)
                self.state_values[state] += self.alpha * (G - self.state_values[state])
                if np.random.rand() < self.epsilon:
                    self.policy[state] = np.random.randint(n_actions)
                    continue
                # NOTE(review): the argmax position is stored as an action
                # index, which presumes get_next_states(state) lists the
                # successors in the same order as environment.actions -
                # confirm against the Environment implementation.
                successor_values = [
                    self.state_values[successor]
                    for successor in self.environment.get_next_states(state)
                ]
                # np.argmax raises on an empty sequence; keep the current
                # policy entry when the state has no listed successors.
                if successor_values:
                    self.policy[state] = int(np.argmax(successor_values))

    def find_best_path_for_goal(self, start_state, max_steps=None):
        """Follow the current policy from ``start_state`` towards the goal.

        Args:
            start_state: State the walk starts from.
            max_steps: Maximum number of transitions to follow. Defaults to
                the number of states. Without a bound, a policy that cycles
                or stays in place would make this method loop forever.

        Returns:
            The list of visited states. It ends with the goal state when the
            goal was reached within ``max_steps`` transitions; otherwise it
            is the partial walk and its last element is not the goal, so
            callers can detect failure with
            ``path[-1] != environment.goal_state``.
        """
        if max_steps is None:
            max_steps = len(self.policy)
        goal_state = self.environment.goal_state
        path = []
        current_state = start_state
        for _ in range(max_steps):
            if current_state == goal_state:
                break
            path.append(current_state)
            # NOTE(review): an action *index* is passed here, whereas
            # generate_episode hands the action *object* to step(); verify
            # which one Environment.get_next_state expects.
            current_action = self.policy[current_state]
            current_state = self.environment.get_next_state(current_state, current_action)
        if current_state == goal_state:
            path.append(goal_state)
        return path

In [3]:
# Available environments: LineWorld, GridWorld, TwoRoundRockPaperScissors, MontyHallLevel1, MontyHallLevel2
env_name = 'LineWorld'

# Seed both RNGs so that Restart Kernel -> Run All reproduces the same run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# The banner names the agent that is actually instantiated below.
print("Algorithme choisi : Monte Carlo ES Agent")
print("Environnement choisi :", env_name)

param = Parameters(env_name)
environment = Environment(param.size, param.goal_state, param.rewards)

monte_carlo_agent = MonteCarloESAgent(environment, gamma=0.9)
monte_carlo_agent.monte_carlo_es(episodes=1)

print("Valeurs des états (Monte Carlo ES Agent):")
print(monte_carlo_agent.state_values.reshape(param.size))
print("\nPolitique (Monte Carlo Agent):")
if len(param.size) == 1:
    # 1-D environment: print all policy entries on a single row.
    for state in range(param.size[0]):
        if state == param.goal_state:
            print(" G ", end=" ")
        else:
            print(monte_carlo_agent.policy[state], end=" ")
    # End the row once, after the loop (mirrors the 2-D branch below).
    print()
else:
    # 2-D environment: one printed row per grid row, states numbered row-major.
    for row in range(param.size[0]):
        for col in range(param.size[1]):
            state = row * param.size[1] + col
            if state == param.goal_state:
                print(" G ", end=" ")
            else:
                print(monte_carlo_agent.policy[state], end=" ")
        print()

start_state = 0
best_path = monte_carlo_agent.find_best_path_for_goal(start_state)
print("\nMeilleur chemin (Monte Carlo ES Agent) de l'état 0 à l'objectif:")
print(best_path)

Generating episode 1...


IndexError: tuple index out of range