In [1]:
import numpy as np

In [2]:
class Parameters:
    """Bundle of settings describing one grid-world experiment.

    Attributes:
        size: Grid dimensions; used elsewhere in this notebook as
            ``(n_rows, n_cols)``.
        goal_state: Index of the terminal state.
        rewards: Mapping from state index to reward.
        gamma: Discount factor.
    """

    def __init__(self, size, goal_state, rewards, gamma):
        # Plain value holder: every argument is stored unchanged.
        self.size, self.goal_state = size, goal_state
        self.rewards, self.gamma = rewards, gamma

In [3]:
class Environment:
    """Deterministic grid world whose cells are numbered row by row.

    A state is the integer ``row * n_cols + col``. A move that would leave
    the grid keeps the agent on the border cell it is already in.
    """

    def __init__(self, size, goal_state, rewards):
        """Store the grid description.

        Args:
            size: ``(n_rows, n_cols)`` pair.
            goal_state: Index of the terminal state.
            rewards: Mapping from state index to the reward received when
                that state is entered; states not listed yield -1.
        """
        self.size = size
        self.goal_state = goal_state
        self.rewards = rewards
        self.actions = ['up', 'down', 'left', 'right']
        self.initial_state = 0  # assuming the initial state is 0

    def reset(self):
        """Return the state every episode starts from."""
        return self.initial_state

    def step(self, state, action):
        """Apply ``action`` in ``state``.

        Returns:
            ``(next_state, reward, done)`` where ``reward`` is the reward of
            the state that was entered and ``done`` is True when that state
            is the goal.
        """
        next_state = self.get_next_state(state, action)
        reward = self.get_reward(next_state)
        done = next_state == self.goal_state
        return next_state, reward, done

    def get_next_state(self, state, action):
        """Return the state reached by taking ``action`` from ``state``.

        Args:
            state: Current state index.
            action: Either an action name from ``self.actions`` or an
                integer index into ``self.actions`` (the form in which the
                agent stores its policy).

        Raises:
            ValueError: If ``action`` is a name that is not a known action.
            IndexError: If ``action`` is an out-of-range index.
        """
        if not isinstance(action, str):
            # Policy tables hold action indices; previously such a value
            # matched no branch below and the state silently never changed.
            action = self.actions[action]

        n_rows, n_cols = self.size[0], self.size[1]
        row, col = divmod(state, n_cols)
        if action == 'up':
            row = max(row - 1, 0)
        elif action == 'down':
            row = min(row + 1, n_rows - 1)
        elif action == 'left':
            col = max(col - 1, 0)
        elif action == 'right':
            col = min(col + 1, n_cols - 1)
        else:
            raise ValueError(f"Unknown action: {action!r}")
        return row * n_cols + col

    def get_next_states(self, state):
        """Return the successor of ``state`` for each action, in ``self.actions`` order."""
        return [self.get_next_state(state, action) for action in self.actions]

    def get_reward(self, state):
        """Return the reward for entering ``state`` (-1 when none is configured)."""
        return self.rewards.get(state, -1)

In [4]:
class MonteCarloESAgent:
    """Monte Carlo control agent that learns state values from sampled episodes.

    The policy is a table holding, for each state, an index into
    ``environment.actions``. Every episode starts from
    ``environment.reset()``; exploration comes from the epsilon-random
    choices in ``select_action`` and ``monte_carlo_es``.
    """

    def __init__(self, environment, gamma=0.9, epsilon=0.1, alpha=0.1):
        """Create the value/policy tables and randomise the initial policy.

        Args:
            environment: Object exposing ``size``, ``actions``,
                ``goal_state``, ``reset``, ``step``, ``get_next_state`` and
                ``get_next_states``.
            gamma: Discount factor applied to future rewards.
            epsilon: Probability of picking a random action.
            alpha: Step size of the incremental value update.
        """
        self.environment = environment
        self.gamma = gamma
        self.epsilon = epsilon
        self.alpha = alpha
        n_states = environment.size[0] * environment.size[1]
        self.policy = np.zeros(n_states, dtype=int)
        self.state_values = np.zeros(n_states)
        # Every return observed per state; recorded but not read by the
        # methods of this class.
        self.returns = {state: [] for state in range(n_states)}
        self.initialize_policy()

    def initialize_policy(self):
        """Assign a uniformly random action index to every state."""
        n_actions = len(self.environment.actions)
        for state in range(self.environment.size[0] * self.environment.size[1]):
            self.policy[state] = np.random.choice(range(n_actions))

    def select_action(self, state):
        """Return an action name: random with probability epsilon, else the policy's."""
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.environment.actions)
        return self.environment.actions[self.policy[state]]

    def generate_episode(self, max_steps=100):
        """Roll out one episode from the environment's initial state.

        Args:
            max_steps: Upper bound on the number of transitions.

        Returns:
            List of ``(state, action, reward, next_state, done)`` tuples; the
            last one has ``done`` True only if the goal was reached in time.
        """
        state = self.environment.reset()
        episode = []
        for _ in range(max_steps):
            action = self.select_action(state)
            next_state, reward, done = self.environment.step(state, action)
            episode.append((state, action, reward, next_state, done))
            if done:
                break
            state = next_state
        return episode

    def monte_carlo_es(self, episodes=20):
        """Run ``episodes`` rollouts, updating values and policy after each.

        Each episode is walked backwards accumulating the discounted return
        ``G``. Every visit to a state moves its value towards ``G`` by a
        fraction ``alpha``. The state's policy entry is then set to a random
        action with probability epsilon, otherwise to the action whose
        successor state currently has the highest value (``np.argmax`` picks
        the first action on ties).
        """
        n_actions = len(self.environment.actions)
        for i in range(episodes):
            print(f"Generating episode {i+1}...")
            episode = self.generate_episode()
            G = 0
            for state, _action, reward, _next_state, _done in reversed(episode):
                G = self.gamma * G + reward
                self.returns[state].append(G)
                self.state_values[state] += self.alpha * (G - self.state_values[state])
                if np.random.rand() < self.epsilon:
                    self.policy[state] = np.random.choice(range(n_actions))
                else:
                    successor_values = [
                        self.state_values[successor]
                        for successor in self.environment.get_next_states(state)
                    ]
                    self.policy[state] = np.argmax(successor_values)

    def find_best_path_for_goal(self, start_state):
        """Follow the current policy from ``start_state`` towards the goal.

        Returns:
            The list of visited states. It ends with the goal state when the
            policy reaches it. If the policy revisits a state (so it would
            loop forever), a ``RuntimeWarning`` is issued and the states
            visited so far are returned without the goal; callers can check
            ``path[-1] == environment.goal_state``.
        """
        import warnings

        env = self.environment
        path = []
        visited = set()
        current_state = start_state
        while current_state != env.goal_state:
            if current_state in visited:
                # The transitions are followed deterministically, so a
                # repeated state means the walk can never reach the goal.
                warnings.warn(
                    f"Policy cycles at state {current_state}; goal not reached.",
                    RuntimeWarning,
                )
                return path
            visited.add(current_state)
            path.append(current_state)
            # The policy stores action *indices*; the environment's
            # transition function is keyed by action *names*.
            action = env.actions[self.policy[current_state]]
            current_state = env.get_next_state(current_state, action)
        path.append(env.goal_state)
        return path

In [None]:
# Experiment setup: 3x3 grid, goal at state 8, +10 for entering the goal and
# -5 for entering state 3.
param = Parameters((3, 3), 8, {8: 10, 3: -5}, 1.0)
environment = Environment(param.size, param.goal_state, param.rewards)

# NOTE(review): param.gamma (1.0) is not forwarded here; the agent is built
# with gamma=0.9 -- confirm which discount factor is intended.
monte_carlo_agent = MonteCarloESAgent(environment, gamma=0.9)
monte_carlo_agent.monte_carlo_es(episodes=1)

# Learned state values laid out as the grid.
print(
    "Valeurs des états (Monte Carlo ES Agent):",
    monte_carlo_agent.state_values.reshape(param.size),
    sep="\n",
)

# Policy grid: each cell shows the action index chosen for that state
# (an index into environment.actions), with the goal cell marked "G".
print("\nPolitique (Monte Carlo Agent):")
n_rows, n_cols = param.size
for row in range(n_rows):
    for col in range(n_cols):
        state = row * n_cols + col
        cell = " G " if state == param.goal_state else monte_carlo_agent.policy[state]
        print(cell, end=" ")
    print()

# Path obtained by following the learned policy from the initial state.
start_state = 0
best_path = monte_carlo_agent.find_best_path_for_goal(start_state)
print("\nMeilleur chemin (Monte Carlo ES Agent) de l'état 0 à l'objectif:", best_path, sep="\n")

Generating episode 1...
Valeurs des états (Monte Carlo ES Agent):
[[-13.76517217  -3.08894624   0.        ]
 [-15.32275925   0.           0.        ]
 [-15.91535596   0.           0.        ]]

Politique (Monte Carlo Agent):
3 1 0 
3 0 0 
3 3  G  
