In [None]:
import os
import sys
# Make the project's local packages (e.g. `algorithms`) importable from this notebook.
# Default: the parent of the current working directory.
module_path = os.path.abspath(os.path.join('..'))

# Prefer the local checkout in the home directory when it exists.
# The "~" has to be expanded explicitly: sys.path entries are used verbatim,
# so a literal "~/..." entry would never resolve to the home directory.
local_checkout = os.path.expanduser("~/github/qhack_2023/")
if os.path.isdir(local_checkout):
    module_path = local_checkout

if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
import gym
from gym.envs.toy_text.frozen_lake import generate_random_map

from plotly.subplots import make_subplots
import plotly.graph_objects as go


import numpy as np

from algorithms.q_learning import QLearning
from algorithms.qrl_classic import QRLClassic
from algorithms.custom_eval_callback import CustomEvalCallback

In [None]:
def run_experiment(
    model,
    model_name,
    model_parameters: dict,
    random_seed: int = 542,
    env_map_size: int = 10,
    env_non_deterministic: bool = False,
    eval_freq: int = 100,
    n_eval_episodes: int = 20,
    total_timesteps: int = 1_000_000
):
    """Train one agent on a randomly generated FrozenLake map and return its learning curve.

    The function seeds NumPy's global RNG, generates a square random map,
    builds a training and an evaluation environment from that same map,
    trains the model with a ``CustomEvalCallback`` attached, and returns what
    the callback recorded. Both environments are closed before returning,
    including when training raises.

    Args:
        model: Model class (or any callable) invoked as
            ``model(policy=None, env=env, **model_parameters)``; the result
            must provide ``learn(total_timesteps=..., callback=...)``.
        model_name: Label for the run. Accepted for the callers' convenience;
            not used inside this function.
        model_parameters: Extra keyword arguments forwarded to ``model``.
        random_seed: Seed applied to NumPy's global RNG before the map is
            generated. It is not forwarded to the model or the environments.
            NOTE(review): map reproducibility assumes ``generate_random_map``
            draws from NumPy's global RNG; confirm for the installed gym version.
        env_map_size: Side length passed to ``generate_random_map``.
        env_non_deterministic: Passed to the environments as ``is_slippery``.
        eval_freq: Evaluation frequency handed to the callback.
        n_eval_episodes: Number of evaluation episodes handed to the callback.
        total_timesteps: Training budget passed to ``model.learn``.

    Returns:
        Tuple ``(eval_freq, learning_curve)`` as read back from the callback
        after training.
    """
    # Function-scope import keeps this cell self-contained.
    from contextlib import closing

    np.random.seed(seed=random_seed)

    random_map = generate_random_map(size=env_map_size, p=0.8)

    # `closing` guarantees env.close() runs for both environments even if
    # model construction or training fails part-way through.
    with closing(
        gym.make("FrozenLake-v1", desc=random_map, is_slippery=env_non_deterministic)
    ) as env, closing(
        gym.make("FrozenLake-v1", desc=random_map, is_slippery=env_non_deterministic)
    ) as eval_env:
        callback = CustomEvalCallback(
            eval_env=eval_env,
            eval_freq=eval_freq,
            n_eval_episodes=n_eval_episodes,
            verbose=0
        )

        # Use a separate name for the instance so the `model` argument
        # (the class) is not shadowed.
        agent = model(
            policy=None,
            env=env,
            **model_parameters,
        )

        agent.learn(
            total_timesteps=total_timesteps,
            callback=callback
        )

        return callback.eval_freq, callback.learning_curve
    
    

In [None]:
# Master seed: fixes NumPy's global RNG so the per-run seeds below are reproducible.
random_seed = 542
np.random.seed(random_seed)

# Ten per-run seeds, each drawn from [0, 1000); one experiment repetition per seed.
random_seeds = [np.random.randint(low=0, high=1000) for _ in range(10)]


In [None]:
# Environment configuration shared by every experiment below.
env_map_size = 5              # Side length of the generated square map
env_non_deterministic = True  # Forwarded to the environment as `is_slippery`

In [None]:
# Training parameters
total_timesteps = 100_000  # Total training steps
learning_rate = 0.7          # Learning rate

# Evaluation parameters
n_eval_episodes = 100        # Total number of test episodes
eval_freq = 100

# Environment parameters
max_steps = 200              # Max steps per episode
gamma = 0.95                 # Discounting rate

# Exploration parameters
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.01              # Exponential decay rate for exploration prob


# Keyword arguments forwarded unchanged to the QLearning constructor.
model_parameters = {
    "learning_rate": learning_rate,
    "gamma": gamma,
    "max_steps": max_steps,
    "max_epsilon": max_epsilon,
    "min_epsilon": min_epsilon,
    "decay_rate": decay_rate,
    "verbose": 0,
    "seed": None,
    "device": "auto",
    "_init_setup_model": False,    
}


# One training run per seed; each run contributes one learning curve.
learning_curves_ql = list()
for rs in random_seeds:
    _, lc = run_experiment(
        model=QLearning,
        model_name="ql_deterministic",
        model_parameters=model_parameters,
        env_non_deterministic=env_non_deterministic,
        env_map_size=env_map_size,
        eval_freq=eval_freq,
        # Pass the configured value explicitly; otherwise run_experiment
        # silently falls back to its own default of 20 episodes.
        n_eval_episodes=n_eval_episodes,
        total_timesteps = total_timesteps,
        random_seed=rs,
    )
    learning_curves_ql.append(lc)


In [None]:
# Training parameters
total_timesteps = 100_000  # Total training steps
learning_rate = 0.7          # Learning rate

# Evaluation parameters
n_eval_episodes = 100        # Total number of test episodes
# NOTE: `eval_freq` is not redefined here; the value set in the Q-learning
# cell is reused so both sets of curves share the same evaluation spacing.

# Environment parameters
max_steps = 200               # Max steps per episode
gamma = 0.95                 # Discounting rate
eval_seed = []               # The evaluation seed of the environment
# NOTE(review): `eval_seed` is not read by any code visible in this notebook.


# NOTE(review): `model` is not read below; the loop passes QRLClassic directly.
model = QRLClassic

# Keyword arguments forwarded unchanged to the QRLClassic constructor.
model_parameters = {
    "learning_rate": learning_rate,
    "gamma": gamma,
    "max_steps": max_steps,
    "verbose": 0,
    "seed": None,
    "device": "auto",
    "_init_setup_model": False,    
}


# One training run per seed; each run contributes one learning curve.
learning_curves_qrl = list()
for rs in random_seeds:
    _, lc = run_experiment(
        model=QRLClassic,
        model_name="qrl",
        model_parameters=model_parameters,
        env_non_deterministic=env_non_deterministic,
        env_map_size=env_map_size,
        eval_freq=eval_freq,
        # Pass the configured value explicitly; otherwise run_experiment
        # silently falls back to its own default of 20 episodes.
        n_eval_episodes=n_eval_episodes,
        total_timesteps = total_timesteps,
        random_seed=rs,
    )
    learning_curves_qrl.append(lc)


In [None]:
# Side-by-side comparison: one line per training run, Q-learning on the left,
# QRL on the right, with a shared y axis.
fig = make_subplots(rows=1, cols=2, shared_yaxes=True, subplot_titles=("Q-learning", "QRL"))

# Only the first `cutoff` evaluation points of each run are plotted.
cutoff = 100

# (learning curves, trace-name prefix, subplot column); a single loop replaces
# the two previously duplicated plotting loops.
panels = [
    (learning_curves_ql, "ql", 1),
    (learning_curves_qrl, "qrl", 2),
]

for curves, prefix, col in panels:
    for i, lc in enumerate(curves):
        lc = lc[:cutoff]
        # The first element of every curve entry is plotted as the mean reward.
        mean_reward = [c[0] for c in lc]
        # x axis: evaluation index scaled by the evaluation frequency.
        x = [e*eval_freq for e in range(len(mean_reward))]

        fig.add_trace(
            go.Scatter(
                x=x,
                y=mean_reward,
                mode="lines",
                name=f"{prefix}_mean_reward_{i}",
            ),
            row=1,
            col=col,
        )

fig.update_layout(width=1200, height=600, showlegend=True, title_text=f"QL vs QRL learning curves (map size {env_map_size} random {env_non_deterministic})")
fig.show()    

plot_name = f"../images/ql_vs_qrl_size_{env_map_size}_random_{env_non_deterministic}"
# Create the output directory up front so the exports below do not fail
# after all the training has already been done.
os.makedirs(os.path.dirname(plot_name), exist_ok=True)
fig.write_html(f"{plot_name}.html")
# NOTE(review): static PNG export needs plotly's image backend (kaleido) installed.
fig.write_image(f"{plot_name}.png")