In [5]:
import time
import random
from typing import Dict

import numpy as np
import scipy.stats as stats
import pandas as pd
import gymnasium as gym
from bettermdptools.utils.blackjack_wrapper import BlackjackWrapper
from bettermdptools.utils.test_env import TestEnv
from bettermdptools.algorithms.planner import Planner
from bettermdptools.algorithms.rl import RL

from grid_search2 import set_seed, check_convergence, check_ql_convergence

# Fallback training budget, used when the best grid-search row carries
# neither an "n_episodes" nor an "n_iters" entry.
N_ITERATIONS = 10_000
# Passed to set_seed() once, before any environment or algorithm is run.
SEED = 666
# How many times each (environment, algorithm, configuration) is re-run;
# the per-run scores are averaged and fed to the paired t-test.
CV_JOBS = 2


def run_algorithm(
    env, algo_name: str, params: Dict[str, float] = None, n_episodes: int = 100
):
    """Solve ``env`` with one algorithm, then score the resulting policy.

    Args:
        env: Environment handed to ``RL`` (Q-learning), or whose ``P``
            attribute is handed to ``Planner`` (value / policy iteration).
            It is also the environment the learned policy is evaluated on.
        algo_name: One of "Q-Learning"/"QL", "Value Iteration"/"VI" or
            "Policy Iteration"/"PI".
        params: Keyword arguments forwarded unchanged to the solver method.
            ``None`` (the default) forwards nothing, so the solver runs with
            its own defaults.
        n_episodes: Number of evaluation episodes, passed to
            ``TestEnv.test_env`` as ``n_iters``.

    Returns:
        A tuple ``(cumulative_score, runtime, iterations_to_converge)``:
        the sum of the evaluation episode rewards, the seconds spent inside
        the solver call only (evaluation is not timed), and whatever
        ``check_ql_convergence`` / ``check_convergence`` reports for the
        solver's tracking array.

    Raises:
        ValueError: If ``algo_name`` is not one of the supported names.
    """
    # Copy so the caller's dict is never shared with the solver, and so the
    # documented default of ``None`` does not blow up on ``**``.
    solver_kwargs = dict(params) if params else {}

    if algo_name in ("Q-Learning", "QL"):
        solve = RL(env).q_learning
        uses_q_table = True
    elif algo_name in ("Value Iteration", "VI"):
        solve = Planner(env.P).value_iteration
        uses_q_table = False
    elif algo_name in ("Policy Iteration", "PI"):
        solve = Planner(env.P).policy_iteration
        uses_q_table = False
    else:
        # Fail loudly instead of reaching the return with unbound locals.
        raise ValueError(
            f"Unknown algorithm {algo_name!r}; expected one of "
            "'Q-Learning'/'QL', 'Value Iteration'/'VI', 'Policy Iteration'/'PI'."
        )

    # Only the solver call is timed; perf_counter is monotonic, so the
    # measured interval cannot be distorted by system clock adjustments.
    start_time = time.perf_counter()
    outcome = solve(**solver_kwargs)
    runtime = time.perf_counter() - start_time

    if uses_q_table:
        # q_learning returns (Q, V, pi, Q_track, pi_track).
        _, _, pi, track, _ = outcome
        count_iterations = check_ql_convergence
    else:
        # value_iteration / policy_iteration return (V, V_track, pi).
        _, track, pi = outcome
        count_iterations = check_convergence

    episode_rewards = TestEnv.test_env(env=env, n_iters=n_episodes, pi=pi)
    cumulative_score = np.sum(episode_rewards)
    iterations_to_converge = count_iterations(track)

    return cumulative_score, runtime, iterations_to_converge


def _repeat_runs(env, algo_name, config, n_runs):
    """Call run_algorithm n_runs times; return (scores, runtimes, iterations) lists."""
    scores, runtimes, convergence = [], [], []
    for _ in range(n_runs):
        score, runtime, iters = run_algorithm(
            env=env, algo_name=algo_name, params=config
        )
        scores.append(score)
        runtimes.append(runtime)
        convergence.append(iters)
    return scores, runtimes, convergence


def _mean_record(env_name, algo_name, configuration, scores, runtimes, convergence):
    """Build one summary row holding the mean of each per-run metric list."""
    return {
        "environment": env_name,
        "algorithm": algo_name,
        "configuration": configuration,
        "cumulative_score": np.mean(scores),
        "runtime": np.mean(runtimes),
        "iterations_to_converge": np.mean(convergence),
    }


if __name__ == "__main__":
    # Seeded once; the individual runs below are not re-seeded.
    set_seed(SEED)

    # Filled here and turned into DataFrames by the cells that follow.
    results = []
    t_test_results = []

    # Columns of the grid-search CSV that are measurements, not
    # hyper-parameters, and so must not be forwarded to the solver.
    metric_columns = ("cumulative_score", "runtime", "iterations_to_converge")

    # set environment
    for env_name in ["Blackjack-v1", "FrozenLake8x8-v1", "FrozenLake16x16-v1"]:
        if env_name == "Blackjack-v1":
            base_env = gym.make("Blackjack-v1", render_mode=None)
            env = BlackjackWrapper(base_env)
        else:
            env = gym.make(env_name)

        for algo_name in ["VI", "PI", "QL"]:
            # load best configuration: the grid-search row with the highest score
            df = pd.read_csv(f"results2/{env_name}/{algo_name}_grid_search_results.csv")
            df = df.sort_values(by="cumulative_score", ascending=False)
            best_config = df.to_dict(orient="records")[0]
            for col in metric_columns:
                best_config.pop(col, None)

            # The training budget is stored under "n_episodes" when that key
            # is present and under "n_iters" otherwise; fall back to
            # N_ITERATIONS when the CSV provides neither.
            budget_key = "n_episodes" if "n_episodes" in best_config else "n_iters"
            n_iters = best_config.get(budget_key, N_ITERATIONS)
            best_config[budget_key] = n_iters

            # The baseline gets the same budget and no other overrides.
            baseline_config = {budget_key: n_iters}

            # get baseline performance
            baseline_scores, baseline_runtimes, baseline_convergence = _repeat_runs(
                env, algo_name, baseline_config, CV_JOBS
            )
            results.append(
                _mean_record(
                    env_name,
                    algo_name,
                    "baseline",
                    baseline_scores,
                    baseline_runtimes,
                    baseline_convergence,
                )
            )

            # get performance for best configuration
            optimized_scores, optimized_runtimes, optimized_convergence = _repeat_runs(
                env, algo_name, best_config, CV_JOBS
            )
            results.append(
                _mean_record(
                    env_name,
                    algo_name,
                    "grid search",
                    optimized_scores,
                    optimized_runtimes,
                    optimized_convergence,
                )
            )

            # T-test for scores (paired by run index). With only CV_JOBS
            # pairs the test has very few degrees of freedom, and a NaN
            # p-value compares False against 0.05.
            t_stat, p_value = stats.ttest_rel(baseline_scores, optimized_scores)
            isValid = p_value < 0.05
            t_test_results.append(
                {
                    "environment": env_name,
                    "algorithm": algo_name,
                    "t_stat": t_stat,
                    "p_value": p_value,
                    "isValid": isValid,
                }
            )

  if not isinstance(terminated, (bool, np.bool8)):


runtime = 0.01 seconds
runtime = 0.01 seconds
runtime = 0.01 seconds
runtime = 0.01 seconds
runtime = 0.03 seconds
runtime = 0.03 seconds
runtime = 0.01 seconds
runtime = 0.02 seconds


                                                       

runtime = 0.51 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.50 seconds


                                                       

runtime = 0.50 seconds
runtime = 0.42 seconds


  if not isinstance(terminated, (bool, np.bool8)):


runtime = 0.41 seconds
runtime = 0.32 seconds
runtime = 0.31 seconds
runtime = 0.35 seconds
runtime = 0.34 seconds
runtime = 0.14 seconds
runtime = 0.17 seconds


                                                      

runtime = 6.46 seconds


                                                      

runtime = 6.38 seconds


                                                      

runtime = 8.38 seconds


                                                      

runtime = 8.50 seconds
runtime = 3.87 seconds


  if not isinstance(terminated, (bool, np.bool8)):


runtime = 3.87 seconds
runtime = 0.30 seconds
runtime = 0.32 seconds
runtime = 4.48 seconds
runtime = 4.52 seconds
runtime = 0.50 seconds
runtime = 0.42 seconds


                                                      

runtime = 5.09 seconds


                                                      

runtime = 4.91 seconds


                                                      

runtime = 6.07 seconds


                                                      

runtime = 5.86 seconds




In [8]:
# One row per (environment, algorithm, configuration), holding the metric
# means appended to `results` by the experiment cell.
results_df = pd.DataFrame(data=results)
display(results_df)

Unnamed: 0,environment,algorithm,configuration,cumulative_score,runtime,iterations_to_converge
0,Blackjack-v1,VI,baseline,0.0,0.014716,8.0
1,Blackjack-v1,VI,grid search,2.5,0.011255,7.0
2,Blackjack-v1,PI,baseline,10.0,0.027079,6.0
3,Blackjack-v1,PI,grid search,-22.0,0.016395,5.5
4,Blackjack-v1,QL,baseline,-15.5,0.49687,16.0
5,Blackjack-v1,QL,grid search,4.0,0.501511,19.5
6,FrozenLake8x8-v1,VI,baseline,93.0,0.417584,467.0
7,FrozenLake8x8-v1,VI,grid search,88.0,0.316876,467.0
8,FrozenLake8x8-v1,PI,baseline,90.0,0.343897,7.0
9,FrozenLake8x8-v1,PI,grid search,89.0,0.155575,7.5


In [7]:
# One row per (environment, algorithm): paired t-test of baseline scores
# against grid-search scores, as collected in `t_test_results`.
t_test_results_df = pd.DataFrame(data=t_test_results)
display(t_test_results_df)

Unnamed: 0,environment,algorithm,t_stat,p_value,isValid
0,Blackjack-v1,VI,-5.0,0.125666,False
1,Blackjack-v1,PI,4.0,0.155958,False
2,Blackjack-v1,QL,-4.333333,0.144385,False
3,FrozenLake8x8-v1,VI,1.25,0.429553,False
4,FrozenLake8x8-v1,PI,0.333333,0.795167,False
5,FrozenLake8x8-v1,QL,,,False
6,FrozenLake16x16-v1,VI,-22.0,0.028917,True
7,FrozenLake16x16-v1,PI,-4.272727,0.146362,False
8,FrozenLake16x16-v1,QL,,,False
