In [1]:
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map

from lib.params import Params
from lib.agent import QLearningAgent

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

from pathlib import Path

# Switch on seaborn's default theme for the figures drawn after this call.
sns.set_theme()

## Parameters

In [5]:
# Values shared by several fields of the configuration below.
total_episodes = 10_000
initial_epsilon = 1

params = Params(
    # -- experiment bookkeeping --
    seed=1,
    n_runs=20,
    total_episodes=total_episodes,
    savefig_folder=Path("fig/frozenlake-v1/"),
    # -- environment --
    is_slippery=False,
    proba_frozen=0.9,
    # Left unset here; presumably filled in once an environment has been
    # created -- TODO confirm against the cell that builds the env.
    state_size=None,
    action_size=None,
    # -- Q-learning update --
    learning_rate=0.8,
    discount_factor=0.95,
    # -- exploration schedule --
    initial_epsilon=initial_epsilon,
    # Decay amount: the initial epsilon spread evenly over half of the
    # episodes, so exploration shrinks as training progresses.
    epsilon_decay=initial_epsilon / (total_episodes / 2),
    final_epsilon=0.1,
)

# Make sure the figure folder exists (parents included; no error if it is
# already there).
params.savefig_folder.mkdir(parents=True, exist_ok=True)

## Training loop

In [6]:
def train(env: gym.Env, agent: QLearningAgent):
    """Train ``agent`` on ``env`` over several independent runs.

    The run/episode counts, the Q-table dimensions and the environment seed
    are read from the module-level ``params`` object. The agent's Q-table is
    reset at the start of every run and a copy of it is stored when the run
    ends.

    Args:
        env: Environment following the Gymnasium ``reset``/``step`` API and
            exposing an ``action_space`` attribute.
        agent: Agent providing ``reset_qtable()``, ``get_action(state=...,
            action_space=...)``, ``update(state=..., action=..., reward=...,
            terminated=..., new_state=...)`` and a ``qtable`` attribute.

    Returns:
        A tuple ``(rewards, steps, episodes, qtables, all_states, all_actions)``:

        * ``rewards`` -- array of shape ``(total_episodes, n_runs)`` holding
          the summed reward of each episode.
        * ``steps`` -- array of the same shape holding the number of steps
          taken in each episode.
        * ``episodes`` -- ``np.arange(total_episodes)``.
        * ``qtables`` -- array of shape ``(n_runs, state_size, action_size)``
          holding the agent's Q-table at the end of each run.
        * ``all_states`` / ``all_actions`` -- flat lists of every state the
          agent acted from and every action it chose, in order, across all
          runs.
    """
    rewards = np.zeros((params.total_episodes, params.n_runs))
    steps = np.zeros((params.total_episodes, params.n_runs))
    episodes = np.arange(params.total_episodes)
    qtables = np.zeros((params.n_runs, params.state_size, params.action_size))
    all_states = []
    all_actions = []

    # Seed the environment only on the very first reset. Passing the same seed
    # to every reset re-creates the environment RNG each episode, so a
    # stochastic environment would replay identical random draws in every
    # episode and every run.
    reset_seed = params.seed

    # NOTE(review): nothing in this loop decays the agent's epsilon, although
    # ``params`` defines ``epsilon_decay``/``final_epsilon`` -- presumably the
    # agent handles this inside ``update``/``get_action``; confirm.
    for run in range(params.n_runs):  # run several times to account for stochasticity
        agent.reset_qtable()  # reset the Q-table between runs

        for episode in tqdm(
            # ``run`` is 0-based; show a 1-based counter so the last run reads
            # "n_runs/n_runs" instead of stopping one short.
            episodes, desc=f"Run {run + 1}/{params.n_runs} - Episodes", leave=False
        ):
            state = env.reset(seed=reset_seed)[0]  # reset returns (observation, info)
            reset_seed = None  # later resets continue the same RNG stream
            step = 0
            done = False
            total_rewards = 0

            # play one episode until the environment terminates or truncates it
            while not done:
                action = agent.get_action(
                    state=state,
                    action_space=env.action_space,
                )

                # log the state the agent acted from and the action it chose
                all_states.append(state)
                all_actions.append(action)

                # take the action (a) and observe the outcome state (s') and reward (r)
                new_state, reward, terminated, truncated, info = env.step(action)

                done = terminated or truncated

                # update the agent; only ``terminated`` is forwarded, so the
                # agent can tell a true terminal state from a truncation
                agent.update(
                    state=state,
                    action=action,
                    reward=reward,
                    terminated=terminated,
                    new_state=new_state,
                )

                total_rewards += reward
                step += 1

                # the new state becomes the current state
                state = new_state

            # log the episode's total reward and length
            rewards[episode, run] = total_rewards
            steps[episode, run] = step

        # snapshot the Q-table learned during this run
        qtables[run, :, :] = agent.qtable

    return rewards, steps, episodes, qtables, all_states, all_actions