# Exploration vs. Exploitation

In [8]:
import time

import numpy as np
import pandas as pd
import gymnasium as gym
from bettermdptools.utils.blackjack_wrapper import BlackjackWrapper
from bettermdptools.algorithms.rl import RL
from bettermdptools.utils.test_env import TestEnv

from grid_search2 import set_seed, check_ql_convergence

N_ITERATIONS = 10000
SEED = 666
CV_JOBS = 20


class RL2:
    def __init__(self, env):
        self.env = env

    def q_learning(
        self, num_episodes=1000, strategy="epsilon_greedy", strategy_params=None
    ):
        Q = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        alpha = 0.5
        gamma = 0.99

        # Initialize Q_track and pi_track to log the progress
        Q_track = np.zeros(
            (num_episodes, self.env.observation_space.n, self.env.action_space.n)
        )
        pi_track = []

        for i in range(num_episodes):
            initial_state_info = self.env.reset()
            state = self.extract_state(initial_state_info)
            done = False

            while not done:
                action = self.select_action(state, Q, strategy, strategy_params, i)
                next_state_info, reward, terminated, truncated, _ = self.env.step(
                    action
                )
                next_state = self.extract_state(
                    next_state_info
                )  # Extract correct next state
                done = terminated or truncated

                # Q-learning update
                best_next_action = np.argmax(Q[next_state])
                Q[state][action] += alpha * (
                    reward + gamma * Q[next_state][best_next_action] - Q[state][action]
                )
                state = next_state  # Update state for the next iteration

            # Log the Q-values and policy after each episode
            Q_track[i] = Q.copy()
            pi_track.append(np.argmax(Q, axis=1))

        pi = np.argmax(Q, axis=1)
        V = np.max(Q, axis=1)
        return Q, V, pi, Q_track, pi_track

    def extract_state(self, state_info):
        if isinstance(state_info, tuple) and isinstance(state_info[0], int):
            return state_info[0]
        return state_info

    # def select_action(self, state, Q, strategy, strategy_params, episode):
    #     if strategy == "epsilon_greedy":
    #         epsilon = strategy_params["initial_epsilon"] * (
    #             strategy_params["decay"] ** episode
    #         )
    #         if np.random.rand() < epsilon:
    #             return self.env.action_space.sample()
    #         else:
    #             return np.argmax(Q[state])
    #     elif strategy == "softmax":
    #         tau = strategy_params["initial_tau"] / (
    #             1 + episode * strategy_params["tau_decay"]
    #         )
    #         exp_q = np.exp(Q[state] / tau)
    #         probabilities = exp_q / np.sum(exp_q)
    #         return np.random.choice(np.arange(len(Q[state])), p=probabilities)

    def select_action(self, state, Q, strategy, strategy_params, episode):
        if strategy == "epsilon_greedy":
            epsilon = strategy_params["initial_epsilon"] * (
                strategy_params["decay"] ** episode
            )
            if np.random.rand() < epsilon:
                return self.env.action_space.sample()
            else:
                return np.argmax(Q[state])
        elif strategy == "softmax":
            tau = strategy_params["initial_tau"] / (
                1 + episode * strategy_params["tau_decay"]
            )
            q_values = Q[state]

            q_values_adjusted = q_values - np.max(q_values)
            exp_q = np.exp(q_values_adjusted / tau)
            sum_exp_q = np.sum(exp_q)
            if sum_exp_q > 0:
                probabilities = exp_q / sum_exp_q
            else:
                
                probabilities = np.ones_like(exp_q) / len(exp_q)
            return np.random.choice(np.arange(len(Q[state])), p=probabilities)


if __name__ == "__main__":
    strategy_params = {
        "initial_epsilon": 0.9,
        "decay": 0.99,
        "initial_tau": 1.0,
        "tau_decay": 0.01,
    }

    set_seed(SEED)

    results = []
    t_test_results = []

    for env_name in ["FrozenLake8x8-v1", "FrozenLake16x16-v1", "Blackjack-v1"]:
        # environment setup
        if env_name == "Blackjack-v1":
            base_env = gym.make("Blackjack-v1", render_mode=None)
            env = BlackjackWrapper(base_env)
        else:
            env = gym.make(env_name)

        # Q-Learning setup
        agent1 = RL(env)

        # load best configuration
        df = pd.read_csv(f"results2/{env_name}/QL_grid_search_results.csv")
        df = df.sort_values(by="cumulative_score", ascending=False)
        best_config = df.to_dict(orient="records")[0]
        [
            best_config.pop(col)
            for col in ("cumulative_score", "runtime", "iterations_to_converge")
        ]

        # baseline and optimized performance
        for strat in ["baseline", "optimized"]:
            if strat == "baseline":
                params = {"n_episodes": best_config.get("n_episodes", N_ITERATIONS)}
            else:
                params = best_config.copy()

            for _ in range(CV_JOBS):
                start_time = time.time()
                Q, V, pi, Q_track, pi_track = agent1.q_learning(**params)
                runtime = time.time() - start_time

                # Evaluate policy
                episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
                average_reward = np.mean(episode_rewards)

                iteration_converged = check_ql_convergence(Q_track)

                results.append(
                    {
                        "environment": env_name,
                        "strategy": strat,
                        "average_reward": average_reward,
                        "runtime": runtime,
                        "iterations_to_converge": iteration_converged,
                    }
                )

            scores_mean = np.mean(average_reward)
            scores_std = np.std(average_reward)
            runtimes_mean = np.mean(runtime)
            runtimes_std = np.std(runtime)
            iterations_mean = np.mean(iteration_converged)
            iterations_std = np.std(iteration_converged)

            print(
                f"{env_name} {strat} Average reward: {np.round(scores_mean, 4)} +/- {np.round(scores_std, 4)}"
            )
            print(
                f"{env_name} {strat} Average iterations to converge: {np.round(iterations_mean, 4)} +/- {np.round(iterations_std, 4)}"
            )
            print(
                f"{env_name} {strat} Average runtime: {np.round(runtimes_mean, 4)} +/- {np.round(runtimes_std, 4)} seconds"
            )

        # t-test baseline vs optimized scores
        # t_stat, p_value = stats.ttest_rel(
        #     results[env_name]["baseline"]["scores"],
        #     results[env_name]["optimized"]["scores"],
        # )
        # isSignificant = p_value < 0.05
        # print(
        #     f"{env_name} optimized t-statistic: {t_stat}, p-value: {p_value}, Significance: {isSignificant}"
        # )

        # t_test_results.append({
        #     "environment": env_name,
        #     "strategy": "optimized",
        #     "t_stat": t_stat,
        #     "p_value": p_value,
        #     "isSignificant": isSignificant,
        # })

        # epsilon-greedy and softmax performance
        n_episodes = best_config.get("n_episodes", N_ITERATIONS)

        # Q-Learning setup
        agent2 = RL2(env)

        for strat in ["epsilon_greedy", "softmax"]:
            for _ in range(CV_JOBS):
                start_time = time.time()
                Q, V, pi, Q_track, pi_track = agent2.q_learning(
                    num_episodes=n_episodes,
                    strategy=strat,
                    strategy_params=strategy_params,
                )
                runtime = time.time() - start_time

                # Evaluate policy
                episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
                average_reward = np.mean(episode_rewards)

                iteration_converged = check_ql_convergence(Q_track)

                results.append(
                    {
                        "environment": env_name,
                        "strategy": strat,
                        "average_reward": average_reward,
                        "runtime": runtime,
                        "iterations_to_converge": iteration_converged,
                    }
                )

            scores_mean = np.mean(average_reward)
            scores_std = np.std(average_reward)
            runtimes_mean = np.mean(runtime)
            runtimes_std = np.std(runtime)
            iterations_mean = np.mean(iteration_converged)
            iterations_std = np.std(iteration_converged)

            print(
                f"{env_name} {strat} Average reward: {np.round(scores_mean, 4)} +/- {np.round(scores_std, 4)}"
            )
            print(
                f"{env_name} {strat} Average iterations to converge: {np.round(iterations_mean, 4)} +/- {np.round(iterations_std, 4)}"
            )
            print(
                f"{env_name} {strat} Average runtime: {np.round(runtimes_mean, 4)} +/- {np.round(runtimes_std, 4)} seconds"
            )

            # t-test baseline vs strategy scores
            # t_stat, p_value = stats.ttest_rel(
            #     results[env_name]["baseline"]["scores"],
            #     results[env_name][strat]["scores"],
            # )
            # isSignificant = p_value < 0.05
            # print(
            #     f"{env_name} {strat} t-statistic: {t_stat}, p-value: {p_value}, Significance: {isSignificant}"
            # )

            # t_test_results.append({
            #     "environment": env_name,
            #     "strategy": strat,
            #     "t_stat": t_stat,
            #     "p_value": p_value,
            #     "isSignificant": isSignificant,
            # })

  0%|          | 0/10000 [00:00<?, ?it/s]

  if not isinstance(terminated, (bool, np.bool8)):
                                                      

runtime = 5.00 seconds


                                                      

runtime = 6.46 seconds


                                                      

runtime = 6.62 seconds


                                                      

runtime = 6.47 seconds


                                                      

runtime = 6.34 seconds


                                                      

runtime = 6.40 seconds


                                                      

runtime = 6.43 seconds


                                                      

runtime = 4.96 seconds


                                                      

runtime = 6.34 seconds


                                                      

runtime = 6.32 seconds


                                                      

runtime = 5.09 seconds


                                                      

runtime = 6.36 seconds


                                                      

runtime = 6.28 seconds


                                                      

runtime = 6.26 seconds


                                                      

runtime = 6.33 seconds


                                                      

runtime = 6.30 seconds


                                                      

runtime = 6.36 seconds


                                                      

runtime = 6.30 seconds


                                                      

runtime = 6.22 seconds


                                                      

runtime = 4.99 seconds
FrozenLake8x8-v1 baseline Average reward: 0.84 +/- 0.0
FrozenLake8x8-v1 baseline Average iterations to converge: 1.0 +/- 0.0
FrozenLake8x8-v1 baseline Average runtime: 4.9939 +/- 0.0 seconds


                                                      

runtime = 4.84 seconds


                                                      

runtime = 8.42 seconds


                                                      

runtime = 8.41 seconds


                                                      

runtime = 8.27 seconds


                                                      

runtime = 8.31 seconds


                                                      

runtime = 8.43 seconds


                                                      

runtime = 8.29 seconds


                                                      

runtime = 8.37 seconds


                                                      

runtime = 4.77 seconds


                                                      

runtime = 8.47 seconds


                                                      

runtime = 8.33 seconds


                                                      

runtime = 8.36 seconds


                                                      

runtime = 8.41 seconds


                                                      

runtime = 8.19 seconds


                                                      

runtime = 8.16 seconds


                                                      

runtime = 8.21 seconds


                                                      

runtime = 8.16 seconds


                                                      

runtime = 8.18 seconds


                                                      

runtime = 8.18 seconds


                                                      

runtime = 8.24 seconds
FrozenLake8x8-v1 optimized Average reward: 0.0 +/- 0.0
FrozenLake8x8-v1 optimized Average iterations to converge: 1.0 +/- 0.0
FrozenLake8x8-v1 optimized Average runtime: 8.2429 +/- 0.0 seconds
FrozenLake8x8-v1 epsilon_greedy Average reward: 0.0 +/- 0.0
FrozenLake8x8-v1 epsilon_greedy Average iterations to converge: 1.0 +/- 0.0
FrozenLake8x8-v1 epsilon_greedy Average runtime: 11.9759 +/- 0.0 seconds
FrozenLake8x8-v1 softmax Average reward: 0.85 +/- 0.0
FrozenLake8x8-v1 softmax Average iterations to converge: 1.0 +/- 0.0
FrozenLake8x8-v1 softmax Average runtime: 16.3602 +/- 0.0 seconds


  if not isinstance(terminated, (bool, np.bool8)):
                                                      

runtime = 5.11 seconds


                                                      

runtime = 5.11 seconds


                                                      

runtime = 4.87 seconds


                                                      

runtime = 4.91 seconds


                                                      

runtime = 4.88 seconds


                                                      

runtime = 4.98 seconds


                                                      

runtime = 5.01 seconds


                                                      

runtime = 5.02 seconds


                                                      

runtime = 5.00 seconds


                                                      

runtime = 5.00 seconds


                                                      

runtime = 4.89 seconds


                                                      

runtime = 5.04 seconds


                                                      

runtime = 4.95 seconds


                                                      

runtime = 5.00 seconds


                                                      

runtime = 4.92 seconds


                                                      

runtime = 5.05 seconds


                                                      

runtime = 4.90 seconds


                                                      

runtime = 4.99 seconds


                                                      

runtime = 4.91 seconds


                                                      

runtime = 4.86 seconds
FrozenLake16x16-v1 baseline Average reward: 0.0 +/- 0.0
FrozenLake16x16-v1 baseline Average iterations to converge: 1.0 +/- 0.0
FrozenLake16x16-v1 baseline Average runtime: 4.8582 +/- 0.0 seconds


                                                      

runtime = 5.94 seconds


                                                      

runtime = 5.88 seconds


                                                      

runtime = 5.82 seconds


                                                      

runtime = 5.99 seconds


                                                      

runtime = 5.75 seconds


                                                      

runtime = 5.90 seconds


                                                      

runtime = 5.88 seconds


                                                      

runtime = 5.96 seconds


                                                      

runtime = 5.73 seconds


                                                      

runtime = 5.85 seconds


                                                      

runtime = 5.97 seconds


                                                      

runtime = 5.96 seconds


                                                      

runtime = 5.77 seconds


                                                      

runtime = 5.79 seconds


                                                      

runtime = 5.91 seconds


                                                      

runtime = 5.94 seconds


                                                      

runtime = 5.93 seconds


                                                      

runtime = 5.92 seconds


                                                      

runtime = 5.83 seconds


                                                      

runtime = 5.87 seconds
FrozenLake16x16-v1 optimized Average reward: 0.0 +/- 0.0
FrozenLake16x16-v1 optimized Average iterations to converge: 1.0 +/- 0.0
FrozenLake16x16-v1 optimized Average runtime: 5.8705 +/- 0.0 seconds
FrozenLake16x16-v1 epsilon_greedy Average reward: 0.0 +/- 0.0
FrozenLake16x16-v1 epsilon_greedy Average iterations to converge: 1.0 +/- 0.0
FrozenLake16x16-v1 epsilon_greedy Average runtime: 5.9298 +/- 0.0 seconds
FrozenLake16x16-v1 softmax Average reward: 0.0 +/- 0.0
FrozenLake16x16-v1 softmax Average iterations to converge: 1.0 +/- 0.0
FrozenLake16x16-v1 softmax Average runtime: 5.4023 +/- 0.0 seconds


  if not isinstance(terminated, (bool, np.bool8)):
                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.47 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.47 seconds


                                                       

runtime = 0.51 seconds


                                                       

runtime = 0.47 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.49 seconds


                                                       

runtime = 0.47 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.47 seconds


                                                       

runtime = 0.49 seconds


                                                       

runtime = 0.47 seconds


                                                       

runtime = 0.47 seconds


                                                       

runtime = 0.47 seconds


                                                       

runtime = 0.47 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.47 seconds
Blackjack-v1 baseline Average reward: -0.15 +/- 0.0
Blackjack-v1 baseline Average iterations to converge: 2.0 +/- 0.0
Blackjack-v1 baseline Average runtime: 0.4681 +/- 0.0 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.53 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.49 seconds


                                                       

runtime = 0.47 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.47 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.49 seconds


                                                       

runtime = 0.47 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.50 seconds


                                                       

runtime = 0.48 seconds


                                                       

runtime = 0.50 seconds


                                                       

runtime = 0.68 seconds
Blackjack-v1 optimized Average reward: -0.11 +/- 0.0
Blackjack-v1 optimized Average iterations to converge: 72.0 +/- 0.0
Blackjack-v1 optimized Average runtime: 0.6835 +/- 0.0 seconds
Blackjack-v1 epsilon_greedy Average reward: -0.21 +/- 0.0
Blackjack-v1 epsilon_greedy Average iterations to converge: 127.0 +/- 0.0
Blackjack-v1 epsilon_greedy Average runtime: 0.4636 +/- 0.0 seconds
Blackjack-v1 softmax Average reward: -0.16 +/- 0.0
Blackjack-v1 softmax Average iterations to converge: 11.0 +/- 0.0
Blackjack-v1 softmax Average runtime: 0.6832 +/- 0.0 seconds


In [10]:
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,environment,strategy,average_reward,runtime,iterations_to_converge
0,FrozenLake8x8-v1,baseline,0.84,5.007597,1
1,FrozenLake8x8-v1,baseline,0.00,6.464121,1
2,FrozenLake8x8-v1,baseline,0.00,6.624337,1
3,FrozenLake8x8-v1,baseline,0.00,6.474760,1
4,FrozenLake8x8-v1,baseline,0.00,6.346132,1
...,...,...,...,...,...
235,Blackjack-v1,softmax,-0.31,0.678787,56
236,Blackjack-v1,softmax,-0.21,0.676793,8
237,Blackjack-v1,softmax,-0.15,0.671025,3
238,Blackjack-v1,softmax,-0.31,0.685600,69


In [11]:
# group by environment and strategy and calculate mean and std
df_results_grouped = df_results.groupby(["environment", "strategy"]).agg(
    {"average_reward": ["mean", "std"], "runtime": ["mean", "std"]}
)

df_results_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,average_reward,average_reward,runtime,runtime
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
environment,strategy,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Blackjack-v1,baseline,-0.0905,0.054914,0.480789,0.009095
Blackjack-v1,epsilon_greedy,-0.222,0.087696,0.487068,0.016052
Blackjack-v1,optimized,-0.0805,0.068247,0.4963,0.046215
Blackjack-v1,softmax,-0.228,0.081538,0.703292,0.036097
FrozenLake16x16-v1,baseline,0.0,0.0,4.972148,0.075668
FrozenLake16x16-v1,epsilon_greedy,0.0,0.0,5.937748,0.045355
FrozenLake16x16-v1,optimized,0.0,0.0,5.883211,0.076739
FrozenLake16x16-v1,softmax,0.0,0.0,5.633026,0.079705
FrozenLake8x8-v1,baseline,0.164,0.336646,6.093571,0.561879
FrozenLake8x8-v1,epsilon_greedy,0.0,0.0,11.988782,0.045822
