In [None]:
import os
import sys
# Make the project root importable (presumably so the `algorithms` package
# imported further down resolves -- confirm against the repository layout).
# The root defaults to the parent of the current working directory; set the
# QHACK_MODULE_PATH environment variable to point somewhere else instead of
# hard-coding a machine-specific absolute path in the notebook.
module_path = os.environ.get("QHACK_MODULE_PATH") or os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
import gym
from gym.envs.toy_text.frozen_lake import generate_random_map

from plotly.subplots import make_subplots
import plotly.graph_objects as go


import numpy as np

from algorithms.q_learning import QLearning
from algorithms.qrl_classic import QRLClassic
from algorithms.custom_eval_callback import CustomEvalCallback

In [None]:
from typing import Dict, Optional, Tuple, Union

import torch as th
from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, Schedule

from algorithms.custom_eval_callback import CustomEvalCallback


class TDLambda(BaseAlgorithm):
    """
    Tabular Q-learning with accumulating eligibility traces (TD-Lambda).

    The agent keeps one Q-value and one eligibility value per (state, action)
    pair, so both the observation space and the action space must expose ``.n``.
    The TD target uses ``max_a Q(s', a)`` and traces are NOT cut after
    exploratory actions (i.e. the "naive" Q(lambda) variant).

    States, actions, rewards and done flags are read at index 0 of the batches
    returned by ``self.env``, so only the first environment of the vectorized
    env is learned from.

    NOTE: this implementation has some unused parameters because we inherit from the SB3 BaseAlgorithm class.
    """

    def _setup_model(self) -> None:
        # Nothing to build: the Q-table and trace table are created in __init__.
        pass

    def __init__(
        self,
        policy: str,
        env: Union[GymEnv, str],
        learning_rate: Union[float, Schedule] = 7e-4,
        gamma: float = 0.95,  # Discounting rate
        lamb: float = 0.95,  # Lambda Discounting rate
        max_steps: int = 99,  # Max steps per episode
        max_epsilon: float = 1.0,  # Exploration probability at start
        min_epsilon: float = 0.05,  # Minimum exploration probability
        decay_rate: float = 0.0005,  # Exponential decay rate for exploration prob
        verbose: int = 0,
        seed: Optional[int] = None,
        device: Union[th.device, str] = "auto",
        _init_setup_model: bool = True,
    ):
        """
        :param policy: forwarded to ``BaseAlgorithm``; not used by the tabular agent itself
        :param env: environment (or id) forwarded to ``BaseAlgorithm``
        :param learning_rate: step size applied to every Q-table update
            (used directly as a multiplier, so pass a float)
        :param gamma: discount factor of the TD target and of the trace decay
        :param lamb: trace-decay parameter; traces are scaled by ``lamb * gamma`` each step
        :param max_steps: maximum number of steps per training episode
        :param max_epsilon: exploration probability at the start of training
        :param min_epsilon: lower bound of the exploration probability
        :param decay_rate: exponential decay rate of epsilon, applied per episode
        :param verbose: forwarded to ``BaseAlgorithm``
        :param seed: forwarded to ``BaseAlgorithm``
        :param device: forwarded to ``BaseAlgorithm``
        :param _init_setup_model: accepted for SB3 signature compatibility; unused
        """
        super().__init__(
            policy,
            env,
            learning_rate=learning_rate,
            verbose=verbose,
            device=device,
            seed=seed,
        )

        # Agent parameters
        self.gamma = gamma
        self.lamb = lamb
        self.max_steps = max_steps

        # Exploration schedule parameters
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.decay_rate = decay_rate

        # One Q-value and one eligibility value per (state, action) pair
        state_space_n = self.observation_space.n
        action_space_n = self.action_space.n
        self.q_table = np.zeros((state_space_n, action_space_n))
        self.eligibility_trace = np.zeros((state_space_n, action_space_n))

    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 100,
        tb_log_name: str = "TDLambda",
        reset_num_timesteps: bool = True,
        progress_bar: bool = False,
    ):
        """
        Train the Q-table for at most ``total_timesteps`` environment steps.

        Each episode ends when the environment reports done, when ``max_steps``
        is reached, or when the step budget is exhausted. Epsilon is decayed
        exponentially with the episode count.

        :param total_timesteps: total number of environment steps to train for
        :param callback: if it is a ``CustomEvalCallback``, its ``on_step`` is
            invoked every ``callback.eval_freq`` training steps
        :param log_interval: unused (kept for SB3 signature compatibility)
        :param tb_log_name: forwarded to ``_setup_learn``
        :param reset_num_timesteps: forwarded to ``_setup_learn``
        :param progress_bar: forwarded to ``_setup_learn``
        :return: the trained agent (``self``)
        """
        total_timesteps, callback = self._setup_learn(
            total_timesteps,
            callback,
            reset_num_timesteps,
            tb_log_name,
            progress_bar,
        )

        step = 0
        episode_n = 0
        while step < total_timesteps:
            episode_n += 1
            episode_step = 0
            episode_done = False
            # Reduce epsilon (because we need less and less exploration)
            epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.decay_rate * episode_n)
            state = self.env.reset()
            # Traces belong to a single trajectory: clear them so that credit
            # is never propagated across an episode boundary.
            self.eligibility_trace.fill(0.0)

            # Run an episode: stop on termination, on the per-episode step
            # limit, or when the global step budget is used up.
            while not episode_done and episode_step < self.max_steps and step < total_timesteps:
                step += 1
                episode_step += 1

                action = self._epsilon_greedy_policy(state, epsilon)

                # Take the action (a) and observe the outcome state (s') and reward (r)
                new_state, reward, dones, info = self.env.step(action)
                episode_done = bool(np.ravel(dones)[0])

                self._update_eligibility_table(state, action)
                # On done the vectorized env may already have been reset, in
                # which case new_state belongs to the NEXT episode; the done
                # flag tells the update not to bootstrap from it. Time-limit
                # truncations are treated like terminations here.
                self._update_q_table(state, action, new_state, reward, done=episode_done)

                # Our next state is the new state
                state = new_state

                if isinstance(callback, CustomEvalCallback):
                    if step % callback.eval_freq == 0:
                        callback.on_step()

        return self

    def _greedy_policy(self, state):
        """Return, for every state in the batch, the action with the highest Q-value."""
        # Exploitation: take the action with the highest state, action value
        action = np.array([np.argmax(self.q_table[s]) for s in state])
        return action

    def _epsilon_greedy_policy(self, state, epsilon):
        """
        Pick actions for a batch of states: greedy with probability
        ``1 - epsilon``, uniformly sampled from the action space otherwise.
        A single random draw decides for the whole batch.

        :return: array with one action per state in the batch
        """
        # Randomly generate a number between 0 and 1
        random_draw = np.random.uniform(0, 1)
        # if the draw is greater than epsilon --> exploitation
        if random_draw > epsilon:
            action = self._greedy_policy(state)
        # else --> exploration
        else:
            # Same container type as the greedy branch
            action = np.array([self.action_space.sample() for _ in state])
        return action

    def _update_q_table(self, state, action, new_state, reward, done=False):
        """
        Apply one TD(lambda) update to the whole Q-table.

        Q := Q + lr * td_error * E, with
        td_error = r + gamma * max_a' Q(s', a') - Q(s, a).
        When ``done`` is true the bootstrap term is dropped, so the target is
        just the reward.

        :param state: batch of current states (index 0 is used)
        :param action: batch of actions taken (index 0 is used)
        :param new_state: batch of successor states (index 0 is used)
        :param reward: batch of rewards (index 0 is used)
        :param done: whether the transition ended the episode
        """
        state_idx = state[0]
        new_state_idx = new_state[0]
        action_idx = action[0]
        reward_value = reward[0]
        if done:
            bootstrap = 0.0
        else:
            bootstrap = self.gamma * np.max(self.q_table[new_state_idx])
        td_error = reward_value + bootstrap - self.q_table[state_idx][action_idx]
        self.q_table = self.q_table + self.learning_rate * td_error * self.eligibility_trace

    def _update_eligibility_table(self, state, action):
        """
        Decay every trace by ``lamb * gamma`` and add 1 to the trace of the
        visited (state, action) pair (accumulating traces).
        """
        state_idx = state[0]
        action_idx = action[0]
        self.eligibility_trace *= self.lamb * self.gamma
        self.eligibility_trace[state_idx][action_idx] += 1.0

    def predict(
        self,
        observation: Union[np.ndarray, Dict[str, np.ndarray]],
        state: Optional[Tuple[np.ndarray, ...]] = None,
        episode_start: Optional[np.ndarray] = None,
        deterministic: bool = False,
    ) -> Tuple[np.ndarray, Optional[Tuple[np.ndarray, ...]]]:
        """
        Return the greedy action for each observation in the batch.

        :param observation: batch of state indices
        :param state: passed through unchanged
        :param episode_start: unused
        :param deterministic: must be True; stochastic prediction is not implemented
        :return: ``(actions, state)``
        :raises NotImplementedError: if ``deterministic`` is False
        """
        if deterministic:
            action = self._greedy_policy(observation)
            return action, state

        else:
            raise NotImplementedError


In [None]:
def run_experiment(
    model,
    model_name,
    model_parameters: dict,
    random_seed: int = 542,
    env_map_size: int = 10,
    env_non_deterministic: bool = False,
    eval_freq: int = 100,
    n_eval_episodes: int = 20,
    total_timesteps: int = 1_000_000
):
    """
    Train one agent on a randomly generated FrozenLake map and return its learning curve.

    NumPy's global RNG is seeded with ``random_seed`` before the map is
    generated. The same map is used for a training environment and a separate
    evaluation environment; evaluation is delegated to ``CustomEvalCallback``.

    :param model: agent class (or factory), called as
        ``model(policy=None, env=env, **model_parameters)``
    :param model_name: not used inside this function
    :param model_parameters: extra keyword arguments for the agent constructor
    :param random_seed: seed for NumPy's global RNG
    :param env_map_size: side length passed to ``generate_random_map``
    :param env_non_deterministic: passed as ``is_slippery`` to the environments
    :param eval_freq: evaluation frequency handed to the callback
    :param n_eval_episodes: number of evaluation episodes handed to the callback
    :param total_timesteps: training budget passed to ``learn``
    :return: ``(eval_freq, learning_curve)`` as stored on the callback after training
    """
    np.random.seed(seed=random_seed)

    # Both environments share one layout so evaluation matches training.
    lake_layout = generate_random_map(size=env_map_size, p=0.8)
    lake_kwargs = {"desc": lake_layout, "is_slippery": env_non_deterministic}
    train_env = gym.make("FrozenLake-v1", **lake_kwargs)
    eval_env = gym.make("FrozenLake-v1", **lake_kwargs)

    eval_callback = CustomEvalCallback(
        eval_env=eval_env,
        eval_freq=eval_freq,
        n_eval_episodes=n_eval_episodes,
        verbose=0,
    )

    agent = model(policy=None, env=train_env, **model_parameters)
    agent.learn(total_timesteps=total_timesteps, callback=eval_callback)

    # Read the results back from the callback once training has finished.
    recorded_curve = eval_callback.learning_curve
    used_eval_freq = eval_callback.eval_freq
    return used_eval_freq, recorded_curve
    
    

In [None]:
# Environment configuration shared by the experiment cells.
env_map_size = 5
env_non_deterministic = False

# Seed NumPy's global RNG with a master seed, then draw ten per-run seeds
# (integers in [0, 1000)) from it, one draw at a time.
random_seed = 542
np.random.seed(random_seed)
random_seeds = []
for _ in range(10):
    random_seeds.append(np.random.randint(1000))

In [None]:
# TD-Lambda experiment: train one agent per seed and collect the learning curves.

# Training parameters
total_timesteps = 10000  # Total training steps
learning_rate = 0.7          # Learning rate

# Evaluation parameters
n_eval_episodes = 100        # Total number of test episodes
eval_freq = 100

# Environment parameters
max_steps = 200              # Max steps per episode
gamma = 0.95                 # Discounting rate
# NOTE(review): with lamb this small the traces are scaled by
# gamma * lamb = 0.000475 every step, so they vanish almost immediately --
# confirm this value is intended and not copied from a decay rate.
lamb = 0.0005                  # Lambda rate

# Exploration parameters
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.00005              # Exponential decay rate for exploration prob



model_parameters = {
    "learning_rate": learning_rate,
    "gamma": gamma,
    "lamb": lamb,
    "max_steps": max_steps,
    "max_epsilon": max_epsilon,
    "min_epsilon": min_epsilon,
    "decay_rate": decay_rate,
    "verbose": 0,
    "seed": None,
    "device": "auto",
    "_init_setup_model": False,    
}



learning_curves_tdl = list()
for rs in random_seeds:
    _, lc = run_experiment(
        model=TDLambda,
        model_name="TDLambda_deterministic",
        model_parameters=model_parameters,
        env_non_deterministic=env_non_deterministic,
        env_map_size=env_map_size,
        eval_freq=eval_freq,
        # Forward the configured value; otherwise run_experiment silently
        # falls back to its own default number of evaluation episodes.
        n_eval_episodes=n_eval_episodes,
        total_timesteps = total_timesteps,
        random_seed=rs,
    )
    learning_curves_tdl.append(lc)


In [None]:
# Q-learning baseline: train one agent per seed and collect the learning curves.

# Training parameters
total_timesteps = 10000  # Total training steps
learning_rate = 0.7          # Learning rate

# Evaluation parameters
n_eval_episodes = 100        # Total number of test episodes
eval_freq = 100

# Environment parameters
max_steps = 200              # Max steps per episode
gamma = 0.95                 # Discounting rate

# Exploration parameters
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.01              # Exponential decay rate for exploration prob



model_parameters = {
    "learning_rate": learning_rate,
    "gamma": gamma,
    "max_steps": max_steps,
    "max_epsilon": max_epsilon,
    "min_epsilon": min_epsilon,
    "decay_rate": decay_rate,
    "verbose": 0,
    "seed": None,
    "device": "auto",
    "_init_setup_model": False,    
}



learning_curves_ql = list()
for rs in random_seeds:
    _, lc = run_experiment(
        model=QLearning,
        model_name="ql_deterministic",
        model_parameters=model_parameters,
        env_non_deterministic=env_non_deterministic,
        env_map_size=env_map_size,
        eval_freq=eval_freq,
        # Forward the configured value; otherwise run_experiment silently
        # falls back to its own default number of evaluation episodes.
        n_eval_episodes=n_eval_episodes,
        total_timesteps = total_timesteps,
        random_seed=rs,
    )
    learning_curves_ql.append(lc)


In [None]:
# Side-by-side learning curves: TD-Lambda (left) vs Q-learning (right), one line per seed.
fig = make_subplots(rows=1, cols=2, shared_yaxes=True, subplot_titles=("TD-Lambda", "QL"))

# Only the first `cutoff` evaluation points of every run are plotted.
cutoff = 100

# (learning curves, legend prefix, subplot column) for each panel.
panels = (
    (learning_curves_tdl, "tdl", 1),
    (learning_curves_ql, "ql", 2),
)

for curves, prefix, col in panels:
    for i, lc in enumerate(curves):
        lc = lc[:cutoff]
        # The first entry of every evaluation record is plotted as the mean reward.
        mean_reward = [c[0] for c in lc]
        x = [e*eval_freq for e in range(len(mean_reward))]

        fig.add_trace(
            go.Scatter(
                x=x,
                y=mean_reward,
                mode="lines",
                # Legend entries carry the algorithm shown in this panel.
                name=f"{prefix}_mean_reward_{i}",
            ),
            row=1,
            col=col,
        )

fig.update_xaxes(title_text="Training timesteps")
fig.update_yaxes(title_text="Mean reward", row=1, col=1)
fig.update_layout(width=1200, height=600, showlegend=True, title_text=f"TD-Lambda vs QL learning curves (map size {env_map_size} random {env_non_deterministic})")
fig.show()
# File name reflects the algorithms actually compared, so this plot does not
# overwrite the output of a different comparison.
plot_name = f"../images/tdl_vs_ql_size_{env_map_size}_random_{env_non_deterministic}"
fig.write_html(f"{plot_name}.html")
fig.write_image(f"{plot_name}.png")