In [20]:
import time
from typing import Dict

import numpy as np
import scipy.stats as stats
import pandas as pd
import gymnasium as gym
from bettermdptools.utils.test_env import TestEnv
from bettermdptools.algorithms.planner import Planner
from bettermdptools.algorithms.rl import RL

from grid_search2 import set_seed, check_convergence, check_ql_convergence

# Fallback iteration/episode budget, used only when the best grid-search row
# carries neither an "n_episodes" nor an "n_iters" column.
N_ITERATIONS = 10_000
# Value handed to set_seed() before any experiment is run.
SEED = 666
# Number of repeated runs per (environment, algorithm, configuration); the
# per-run scores collected this way are the samples fed to the paired t-tests.
# NOTE(review): with only 2 repeats each t-test has a single degree of freedom.
CV_JOBS = 2


def run_algorithm(
    env, algo_name: str, params: Dict[str, float] = None, n_episodes: int = 100
):
    """Solve ``env`` with one algorithm, then score the resulting policy.

    Args:
        env: Environment handed to ``RL`` (Q-Learning) or whose ``P``
            attribute is handed to ``Planner`` (Value/Policy Iteration).
        algo_name: One of ``"Q-Learning"``/``"QL"``,
            ``"Value Iteration"``/``"VI"`` or ``"Policy Iteration"``/``"PI"``.
        params: Keyword arguments forwarded verbatim to the solver. ``None``
            (the default) or an empty mapping runs the solver with its own
            defaults.
        n_episodes: Number of evaluation episodes, passed to
            ``TestEnv.test_env`` as ``n_iters``.

    Returns:
        Tuple ``(cumulative_score, runtime, iterations_to_converge)`` where
        ``cumulative_score`` is the sum of the evaluation episode rewards,
        ``runtime`` is the duration in seconds of the solver call alone, and
        ``iterations_to_converge`` is whatever ``check_ql_convergence``
        (Q-Learning) or ``check_convergence`` (VI/PI) reports for the
        solver's tracking array.

    Raises:
        ValueError: If ``algo_name`` is not one of the supported names.
    """
    # The signature advertises ``params=None``; unpacking None with ``**``
    # would raise TypeError, so normalise it to an empty kwargs dict.
    solver_kwargs = dict(params) if params else {}

    if algo_name in ("Q-Learning", "QL"):
        agent = RL(env)
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        _, _, pi, track, _ = agent.q_learning(**solver_kwargs)
        runtime = time.perf_counter() - start_time
        convergence_check = check_ql_convergence
    elif algo_name in ("Value Iteration", "VI"):
        planner = Planner(env.P)
        start_time = time.perf_counter()
        _, track, pi = planner.value_iteration(**solver_kwargs)
        runtime = time.perf_counter() - start_time
        convergence_check = check_convergence
    elif algo_name in ("Policy Iteration", "PI"):
        planner = Planner(env.P)
        start_time = time.perf_counter()
        _, track, pi = planner.policy_iteration(**solver_kwargs)
        runtime = time.perf_counter() - start_time
        convergence_check = check_convergence
    else:
        # Previously an unknown name fell through to the return statement and
        # surfaced as an opaque UnboundLocalError.
        raise ValueError(
            f"Unknown algorithm {algo_name!r}; expected one of "
            "'Q-Learning'/'QL', 'Value Iteration'/'VI', 'Policy Iteration'/'PI'"
        )

    # Evaluation is identical for every algorithm: roll out the policy, sum
    # the per-episode rewards, then inspect the solver's tracking array.
    episode_rewards = TestEnv.test_env(env=env, n_iters=n_episodes, pi=pi)
    cumulative_score = np.sum(episode_rewards)
    iterations_to_converge = convergence_check(track)
    return cumulative_score, runtime, iterations_to_converge


def perform_comparisons(config_results, alpha: float = 0.05):
    """Paired t-tests of the base environment against its easy/hard variants.

    For every algorithm ("VI", "PI", "QL") and configuration ("baseline",
    "grid search") the per-run scores of "FrozenLake16x16-v1" are compared,
    via ``scipy.stats.ttest_rel``, with the scores of
    "FrozenLake16x16-v1-easy" and of "FrozenLake16x16-v1-hard".

    Args:
        config_results: Mapping ``environment name -> {"<algo>_<config>":
            sequence of scores}``. All three environments and all six
            ``"<algo>_<config>"`` keys must be present, and the score
            sequences being compared must have equal length (the test is
            paired).
        alpha: Significance level for the ``*_valid`` flags. Defaults to
            0.05, the threshold that used to be hard-coded.

    Returns:
        A list of dicts (one per algorithm/configuration, in iteration order)
        with keys ``algorithm``, ``configuration``, ``t_stat_easy``,
        ``p_value_easy``, ``t_stat_hard``, ``p_value_hard``,
        ``base_easy_valid`` and ``base_hard_valid``. A ``*_valid`` flag is
        True when the corresponding p-value is strictly below ``alpha``.

    Raises:
        KeyError: If an environment or ``"<algo>_<config>"`` key is missing.
    """
    comparisons = []
    for algo_name in ["VI", "PI", "QL"]:
        for config_name in ["baseline", "grid search"]:
            # Same key in all three environments; build it once.
            result_key = f"{algo_name}_{config_name}"
            base_scores = config_results["FrozenLake16x16-v1"][result_key]
            easy_scores = config_results["FrozenLake16x16-v1-easy"][result_key]
            hard_scores = config_results["FrozenLake16x16-v1-hard"][result_key]

            t_stat_easy, p_value_easy = stats.ttest_rel(base_scores, easy_scores)
            t_stat_hard, p_value_hard = stats.ttest_rel(base_scores, hard_scores)

            comparisons.append(
                {
                    "algorithm": algo_name,
                    "configuration": config_name,
                    "t_stat_easy": t_stat_easy,
                    "p_value_easy": p_value_easy,
                    "t_stat_hard": t_stat_hard,
                    "p_value_hard": p_value_hard,
                    # A NaN p-value (e.g. when every paired difference is
                    # zero) compares False, so the flag is False in that case.
                    # bool() keeps the flags plain Python booleans.
                    "base_easy_valid": bool(p_value_easy < alpha),
                    "base_hard_valid": bool(p_value_hard < alpha),
                }
            )
    return comparisons


if __name__ == "__main__":
    # Experiment driver: for every environment variant and algorithm, run the
    # "baseline" configuration (budget only) and the best grid-search
    # configuration CV_JOBS times each, recording mean metrics in `results`
    # and the raw per-run scores in `config_results`.
    set_seed(SEED)
    results = []

    env_configs = [
        "FrozenLake16x16-v1",
        "FrozenLake16x16-v1-easy",
        "FrozenLake16x16-v1-hard",
    ]
    algo_names = ["VI", "PI", "QL"]
    # Columns of the grid-search CSV that are outcomes, not solver arguments.
    result_columns = ("cumulative_score", "runtime", "iterations_to_converge")
    config_results = {name: {} for name in env_configs}

    # The grid-search results are always read from the base environment's
    # folder, so the best row per algorithm does not depend on the environment
    # being evaluated: load and rank each CSV once instead of once per env.
    best_configs = {}
    for algo_name in algo_names:
        df = pd.read_csv(
            f"results2/FrozenLake16x16-v1/{algo_name}_grid_search_results.csv"
        ).sort_values(by="cumulative_score", ascending=False)
        best_configs[algo_name] = df.iloc[0].to_dict()

    for env_name in env_configs:
        env = gym.make(env_name)
        try:
            for algo_name in algo_names:
                best_config = best_configs[algo_name]
                # The budget argument is "n_episodes" when the CSV row has
                # that column, otherwise "n_iters".
                budget_key = (
                    "n_episodes" if "n_episodes" in best_config else "n_iters"
                )
                # Values coming out of the CSV row may be floats; the budget
                # is cast to int before being passed on.
                n_iters = int(
                    best_config.get(
                        "n_episodes", best_config.get("n_iters", N_ITERATIONS)
                    )
                )

                tuned_config = {
                    key: value
                    for key, value in best_config.items()
                    if key not in result_columns
                }
                tuned_config[budget_key] = n_iters

                # Both configurations are fixed for this algorithm, so they
                # are built once rather than on every repetition.
                candidate_configs = {
                    "baseline": {budget_key: n_iters},
                    "grid search": tuned_config,
                }

                for config_name, config in candidate_configs.items():
                    scores = []
                    runtimes = []
                    iterations = []

                    for _ in range(CV_JOBS):
                        # Pass a copy so every run receives its own dict.
                        score, runtime, iters = run_algorithm(
                            env, algo_name, dict(config)
                        )
                        scores.append(score)
                        runtimes.append(runtime)
                        iterations.append(iters)

                    results.append(
                        {
                            "environment": env_name,
                            "algorithm": algo_name,
                            "configuration": config_name,
                            "score": np.mean(scores),
                            "runtime": np.mean(runtimes),
                            "iterations_to_converge": np.mean(iterations),
                        }
                    )

                    # Raw per-run scores feed the paired t-tests later on.
                    config_results[env_name][f"{algo_name}_{config_name}"] = scores
        finally:
            # Release the environment even if a run fails part-way through.
            env.close()

runtime = 4.01 seconds


  if not isinstance(terminated, (bool, np.bool8)):


runtime = 4.04 seconds
runtime = 0.30 seconds
runtime = 0.30 seconds
runtime = 6.27 seconds
runtime = 3.53 seconds
runtime = 0.47 seconds
runtime = 1.36 seconds


                                                      

runtime = 4.96 seconds


                                                      

runtime = 4.90 seconds


                                                      

runtime = 5.76 seconds


                                                      

runtime = 5.84 seconds
runtime = 0.76 seconds


  if not isinstance(terminated, (bool, np.bool8)):


runtime = 0.77 seconds
runtime = 0.27 seconds
runtime = 0.27 seconds
runtime = 1.32 seconds
runtime = 1.00 seconds
runtime = 0.37 seconds
runtime = 0.46 seconds


                                                      

runtime = 7.08 seconds


                                                      

runtime = 7.04 seconds


                                                      

runtime = 10.21 seconds


                                                      

runtime = 10.01 seconds
runtime = 4.71 seconds


  if not isinstance(terminated, (bool, np.bool8)):


runtime = 4.76 seconds
runtime = 0.27 seconds
runtime = 0.27 seconds
runtime = 4.20 seconds
runtime = 3.35 seconds
runtime = 1.36 seconds
runtime = 1.09 seconds


                                                      

runtime = 4.74 seconds


                                                      

runtime = 4.72 seconds


                                                      

runtime = 5.63 seconds


                                                      

runtime = 5.64 seconds




In [21]:

# Tabulate the averaged metrics gathered in `results`.
results_df = pd.DataFrame(data=results)
display(results_df)

# Compare the base environment with its variants and tabulate the outcome.
comparisons = perform_comparisons(config_results)
comparison_df = pd.DataFrame(data=comparisons)
display(comparison_df)

Unnamed: 0,environment,algorithm,configuration,score,runtime,iterations_to_converge
0,FrozenLake16x16-v1,VI,baseline,14.0,4.027508,749.0
1,FrozenLake16x16-v1,VI,grid search,38.0,0.296763,145.0
2,FrozenLake16x16-v1,PI,baseline,22.5,4.900278,6.5
3,FrozenLake16x16-v1,PI,grid search,38.0,0.912331,13.0
4,FrozenLake16x16-v1,QL,baseline,0.0,4.932331,1.0
5,FrozenLake16x16-v1,QL,grid search,0.0,5.802924,1.0
6,FrozenLake16x16-v1-easy,VI,baseline,97.0,0.767266,278.0
7,FrozenLake16x16-v1-easy,VI,grid search,97.0,0.268095,152.0
8,FrozenLake16x16-v1-easy,PI,baseline,92.5,1.159251,5.0
9,FrozenLake16x16-v1-easy,PI,grid search,92.0,0.416291,7.0


Unnamed: 0,algorithm,configuration,t_stat_easy,p_value_easy,t_stat_hard,p_value_hard,base_easy_valid,base_hard_valid
0,VI,baseline,-41.5,0.015337,4.666667,0.134386,True,False
1,VI,grid search,-59.0,0.010789,24.333333,0.026148,True,True
2,PI,baseline,-14.0,0.045396,11.0,0.057716,True,False
3,PI,grid search,-54.0,0.011788,19.0,0.033475,True,True
4,QL,baseline,,,,,False,False
5,QL,grid search,,,,,False,False
