In [14]:
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import argparse
import time
import gymnasium as gym
import seaborn as sns
from tqdm import tqdm

# Method d'entrainement de Q-learning

In [26]:
def train_Qlearning(env=gym.make("Taxi-v3"),
          episodes: int = 25000,
          lr: float = 0.01,
          gamma: float = 0.99,
          epsilon: float = 1,
          max_epsilon: float = 1,
          min_epsilon: float = 0.001,
          epsilon_decay: float = 0.01,
          path_table: str = "qtable") -> tuple[float, int]:
    
    q_table = np.zeros([env.observation_space.n, env.action_space.n])

    start_time = time.time()
    total_reward = []
    steps_per_episode = []
    epsilon_vec = []
    success_rate = []
    success_count = 0

    for e in range(episodes):
        state, _ = env.reset()
        state = state.item() if isinstance(state, np.ndarray) else state

        done = False
        episode_reward = 0
        episode_steps = 0
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay * e)
        epsilon_vec.append(epsilon)

        while not done:
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state])

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            next_state = next_state.item() if isinstance(next_state, np.ndarray) else next_state
            episode_reward += reward
            episode_steps += 1

            current_value = q_table[state, action]
            next_max = np.max(q_table[next_state])

            q_table[state, action] = (1 - lr) * current_value + lr * (reward + gamma * next_max)
            state = next_state

        total_reward.append(episode_reward)
        steps_per_episode.append(episode_steps)

        if episode_reward > 0:
            success_count += 1
        success_rate.append(success_count / (e + 1))

    execution_time = (time.time() - start_time)

    np.save(path_table, q_table)

    return np.round(execution_time, 2), np.mean(total_reward)


# Methode pour trouver les meilleurs parametres de Q-learning

In [25]:
import itertools

def grid_search():
    learning_rates = [0.001, 0.01, 0.1]
    gammas = [0.9, 0.95, 0.99]
    epsilons = [0.1, 0.2, 0.3]
    epsilon_decays = [0.001, 0.01, 0.1]

    best_params = None
    best_reward = float('-inf')

    for lr, gamma, epsilon, epsilon_decay in itertools.product(learning_rates, gammas, epsilons, epsilon_decays):
        print(f"Training with lr={lr}, gamma={gamma}, epsilon={epsilon}, epsilon_decay={epsilon_decay}")
        _, mean_reward = train_Qlearning(env=gym.make("Taxi-v3"),
                               episodes=1000,
                               lr=lr,
                               gamma=gamma,
                               epsilon=epsilon,
                               epsilon_decay=epsilon_decay)
        
        if mean_reward > best_reward:
            best_reward = mean_reward
            best_params = (lr, gamma, epsilon, epsilon_decay)
    
    print(f"Best parameters: lr={best_params[0]}, gamma={best_params[1]}, epsilon={best_params[2]}, epsilon_decay={best_params[3]}")
    print(f"Best mean reward: {best_reward}")

grid_search()

Training with lr=0.001, gamma=0.9, epsilon=0.1, epsilon_decay=0.001
Training with lr=0.001, gamma=0.9, epsilon=0.1, epsilon_decay=0.01
Training with lr=0.001, gamma=0.9, epsilon=0.1, epsilon_decay=0.1
Training with lr=0.001, gamma=0.9, epsilon=0.2, epsilon_decay=0.001
Training with lr=0.001, gamma=0.9, epsilon=0.2, epsilon_decay=0.01
Training with lr=0.001, gamma=0.9, epsilon=0.2, epsilon_decay=0.1
Training with lr=0.001, gamma=0.9, epsilon=0.3, epsilon_decay=0.001
Training with lr=0.001, gamma=0.9, epsilon=0.3, epsilon_decay=0.01
Training with lr=0.001, gamma=0.9, epsilon=0.3, epsilon_decay=0.1
Training with lr=0.001, gamma=0.95, epsilon=0.1, epsilon_decay=0.001
Training with lr=0.001, gamma=0.95, epsilon=0.1, epsilon_decay=0.01
Training with lr=0.001, gamma=0.95, epsilon=0.1, epsilon_decay=0.1
Training with lr=0.001, gamma=0.95, epsilon=0.2, epsilon_decay=0.001
Training with lr=0.001, gamma=0.95, epsilon=0.2, epsilon_decay=0.01
Training with lr=0.001, gamma=0.95, epsilon=0.2, epsilon

## Meilleur parametres pour Q-learning
Best parameters: lr=0.1, gamma=0.99, epsilon=0.3, epsilon_decay=0.1

Best mean reward: -98.766

# Methode d'entrainement de Sarsa

In [30]:
def train_sarsa(env=gym.make("Taxi-v3"),
          episodes=2000,
          gamma=0.95,
          epsilon=1,
          max_epsilon=1,
          min_epsilon=0.001,
          epsilon_decay=0.01,
          alpha=0.85,
          path_table: str = "sarsa_qtable"):
    
    Q = np.zeros((env.observation_space.n, env.action_space.n))

    start_time = time.time()
    total_reward = []
    steps_per_episode = []
    epsilon_vec = []
    success_rate = []
    success_count = 0

    for e in range(episodes):
        state, _ = env.reset()
        state = state.item() if isinstance(state, np.ndarray) else state

        done = False
        episode_reward = 0
        episode_steps = 0

        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay * e)
        epsilon_vec.append(epsilon)

        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state, :])

        while not done:
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            next_state = next_state.item() if isinstance(next_state, np.ndarray) else next_state
            
            episode_reward += reward
            episode_steps += 1

            if np.random.uniform(0, 1) < epsilon:
                next_action = env.action_space.sample()
            else:
                next_action = np.argmax(Q[next_state, :])

            predict = Q[state, action]
            target = reward + gamma * Q[next_state, next_action]
            Q[state, action] = Q[state, action] + alpha * (target - predict)

            state = next_state
            action = next_action

        total_reward.append(episode_reward)
        steps_per_episode.append(episode_steps)

        if episode_reward > 0:
            success_count += 1
        success_rate.append(success_count / (e + 1))

    execution_time = (time.time() - start_time)

    return np.round(execution_time, 2), np.mean(total_reward)

# Methode pour trouver les meilleurs parametres de Sarsa

In [31]:
import itertools
from tqdm import tqdm

def find_best_params_sarsa(env_name="Taxi-v3", num_episodes=2000, num_runs=3):
    gamma_range = [0.9, 0.95, 0.99]
    epsilon_decay_range = [0.001, 0.01, 0.1]
    alpha_range = [0.1, 0.5, 0.9]

    best_params = None
    best_mean_reward = float('-inf')

    param_combinations = list(itertools.product(gamma_range, epsilon_decay_range, alpha_range))

    for gamma, epsilon_decay, alpha in tqdm(param_combinations, desc="Testing parameter combinations"):
        total_mean_reward = 0

        for _ in range(num_runs):
            env = gym.make(env_name)
            _, mean_reward = train_sarsa(
                env=env,
                episodes=num_episodes,
                gamma=gamma,
                epsilon_decay=epsilon_decay,
                alpha=alpha
            )
            total_mean_reward += mean_reward

        avg_mean_reward = total_mean_reward / num_runs

        if avg_mean_reward > best_mean_reward:
            best_mean_reward = avg_mean_reward
            best_params = (gamma, epsilon_decay, alpha)

    print(f"Best parameters: gamma={best_params[0]}, epsilon={best_params[1]}, alpha={best_params[2]}")
    print(f"Best mean reward: {best_reward}")

    return best_params, best_mean_reward

best_params, best_reward = find_best_params_sarsa()

Testing parameter combinations:   0%|          | 0/27 [00:00<?, ?it/s]

Testing parameter combinations: 100%|██████████| 27/27 [02:07<00:00,  4.71s/it]

Best parameters: gamma=0.9, epsilon=0.1, alpha=0.5
Best mean reward: -11.929333333333332





## Meilleur parametres pour SARSA
Best parameters: gamma=0.9, epsilon=0.1, alpha=0.5

Best mean reward: -11.929333333333332


# Comparaison graph entrainement Q-learning et Sarsa

### Q-learning
![Q-Learning graph](q-learning/QLearning_graph.png)

### Sarsa
![Sarsa graph](sarsa/sarsa_graph.png)


# Analyse des graphiques de Q-learning et SARSA

***Graphique 1:*** Q-learning

***Graphique 2:*** SARSA

En analysant ces deux graphiques, nous pouvons faire les déductions suivantes:

### Convergence:
Le graphique 1 montre une convergence beaucoup plus stable et rapide que le graphique 2. Les courbes de récompenses (vert) et d'étapes (orange) se stabilisent rapidement dans le graphique 1, tandis qu'elles restent très volatiles dans le graphique 2.

### Stabilité:
Le premier algorithme (graphique 1) semble beaucoup plus stable. Les fluctuations dans les récompenses et les étapes diminuent considérablement au fil du temps. En revanche, le deuxième algorithme (graphique 2) montre des variations importantes tout au long de l'entraînement.

### Performance:
L'algorithme du graphique 1 semble atteindre de meilleures performances moyennes. Les récompenses se stabilisent à un niveau plus élevé et les étapes à un niveau plus bas, ce qui est généralement souhaitable dans l'apprentissage par renforcement.

### Exploration (Epsilon):
Dans les deux cas, l'epsilon (ligne rouge) décroît de manière similaire, mais son impact sur la performance semble très différent. L'algorithme 1 semble mieux exploiter la réduction de l'exploration.

### Apprentissage:
L'algorithme 1 semble apprendre plus efficacement. Il montre une amélioration constante et maintient ses performances, tandis que l'algorithme 2 semble avoir du mal à maintenir des performances stables.

### Robustesse:
L'algorithme 1 paraît plus robuste aux variations de l'environnement ou aux aléas de l'apprentissage, tandis que l'algorithme 2 semble très sensible à ces facteurs.

### Conclusion:
En conclusion, l'algorithme Q-learning semble nettement supérieur en termes de stabilité, de convergence et de performance globale pour cette tâche particulière. Malgré le fait que SARSA ait une meilleure récompense moyenne, Q-learning démontre une plus grande stabilité et une convergence plus rapide, ce qui en fait un choix préférable pour cette application.mble nettement supérieur en termes de stabilité, de convergence et de performance globale pour cette tâche particulière.

### Données complémentaires:
**SARSA:**
- Temps d'entraînement: 12.33s (0.21min, 0.0h)
- Temps moyen par épisode: 0.000123s
- Temps d'exécution: 12.33 secondes
- Récompense moyenne: 7.09538

**Q-learning:**
- Temps d'entraînement: 27.09s (0.45min, 0.01h)
- Temps moyen par épisode: 0.000271s
- Temps d'exécution: 27.09 secondes
- Récompense moyenne: -1.69379

# Graphiques complementaire

## Brute force
![bruteForce_reward_distribution](brute-force/bruteforce_reward_distribution.png)

![bruteForce_reward_and_steps](brute-force/bruteforce_rewards_and_steps.png)

![bruteforce_steps_distribution](brute-force/bruteforce_steps_distribution.png)

![bruteforce_success_rate](brute-force/bruteforce_success_rate.png)


## Monte Carlo

![monte_carlo_graph.png](monte-carlo/monte_carlo_graph.png)
![monte_carlo_reward_distribution.png](monte-carlo/monte_carlo_reward_distribution.png)
![monte_carlo_rewards_and_steps.png](monte-carlo/monte_carlo_rewards_and_steps.png)
![monte_carlo_step_distribution.png](monte-carlo/monte_carlo_steps_distribution.png)
![monte_carlo_success_rate.png](monte-carlo/monte_carlo_success_rate.png)


## Q-learning

![QLearning_graph.png](q-learning/QLearning_graph.png)
![QLearning_reward_distribution.png](q-learning/QLearning_reward_distribution.png)
![QLearning_rewards_and_steps.png](q-learning/QLearning_rewards_and_steps.png)
![QLearning_step_distribution.png](q-learning/monte_carlo_steps_distribution.png)
![QLearning_success_rate.png](q-learning/QLearning_success_rate.png)

## SARSA

![SARSA_graph.png](sarsa/sarsa_graph.png)
![SARSA_reward_distribution.png](sarsa/sarsa_reward_distribution.png)
![SARSA_rewards_and_steps.png](sarsa/sarsa_rewards_and_steps.png)
![SARSA_step_distribution.png](sarsa/sarsa_steps_distribution.png)
![SARSA_success_rate.png](sarsa/sarsa_success_rate.png)