# Exploration of "Callbacks" and "Hyperparameter optimization"

For the "callbacks" stage, and "hyperparameter optimization" (HPO), I am going to be guided by what is exposed in the "colab" tutorial proposed on the SB3 page:

https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/sb3/4_callbacks_hyperparameter_tuning.ipynb

As a training environment I will continue with the __"RetroMtpoNesReducedRL()"__ class, the action wrapper, and additionally I will add some additional "wrappers" that we will discuss later.

In [60]:
# Imports for "gym" and "retro" features
import os
from gym import Env
import gym
from gym.spaces import MultiDiscrete, Box, MultiBinary
import retro
from retro import RetroEnv
import time

# To help with image preprocessing
import numpy as np
import cv2

# Todo lo concerniente a SB3
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv, VecTransposeImage
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy 
from stable_baselines3.common.callbacks import BaseCallback


# Para la optimización de hiperparametros
import optuna

In [5]:
class RetroMtpoNesReducedRL(Env):
    """
    Class that creates a retro "Gym" object, and allows me to manipulate its observation space.
     With this I seek to reduce the observations space, to speed up the training stage.

     This class creates a "focus area", removing the outter most two thirds of the screen (vertically), 
     leaving in "focus" a area where the action of the game takes place. Additionally I reduce the number
     of color channels, from three to one, which gives the feeling that the game is in black and white (also
     called "grayscaling").

     In this class, additionally, the "viewing" area is reduced, going from an observation space of 196x80x1
     to one of 84x84x1.
     
     The main inspiration for this class comes from a Youtube tutorial from Nickolas Renotte.
     
     https://www.youtube.com/watch?v=rzbFhu6So5U&t=6248s
     
    """
    def __init__(self, state='GlassJoe.state',
                 scenario='scenario_king_hippo',
                 inttype=retro.data.Integrations.STABLE,
                 points_as_rewards=True):
        super(RetroEnv).__init__()
        # Most of these lines comes from GYM RETRO library.
        self.img = None
        rom_path = retro.data.get_romfile_path('Mtpo-Nes', inttype)
        self.system = retro.get_romfile_system(rom_path)
        core = retro.get_system_info(self.system)
        self.buttons = core['buttons']
        self.observation_space = Box(low=0, high=255, shape=(84,84,1), dtype=np.uint8)
        self.action_space = MultiBinary(9)
        self.state = state
        self.scenario = scenario
        self.game = retro.make(game='Mtpo-Nes',
                               state=self.state,
                               scenario=self.scenario,
                              )
        self.points_as_rewards = points_as_rewards
        self.picture = None
        

    def preprocess(self, observation):
        """ 
        Method to preprocess the images that the "RetroEnv" object uses during training.
         The idea is to deliver a reduced observation, which helps streamline the training processes of the
         agent. The derivation of the reduced observation can be seen in the notebook:
        
         - '1_CV_Preprocessing.ipynb'
        
         which is part of this 'Notebooks' section
        """
        # Cropping
        xlen = observation.shape[0]
        ylen = observation.shape[1]
        focus_zone = observation[int(xlen*(1/8)):int(xlen*(3/2)),int(ylen/3):-int(ylen/3)]
        # Grayscale
        gray = cv2.cvtColor(focus_zone, cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (84,84), interpolation=cv2.INTER_CUBIC)
        
        # We must fit the output to a tensor with three dimensions, since
        # it is the data structure that the gym object expects.
        # values between 0 and 1.
        channels = np.reshape(resize, (84,84,1))

        return channels

    def reset(self):
        # Returns the fist "frame"
        obs = self.game.reset()
        processed_obs = self.preprocess(obs)
        self.score = 0
        self.picture = processed_obs
        return processed_obs
    
    def step(self, action):
        # Go one step further in the emulation of the game
        # Integrate the modification to the observation using the "preprocessed()" method
        obs, reward, done, info = self.game.step(action)
        processed_obs = self.preprocess(obs)
        
        # This is to return the points of the game as the reward if we want it.
        if self.points_as_rewards:
            reward_as_points = info['POINTS'] - self.score
            self.score = info['POINTS']
            return processed_obs, reward_as_points, done, info
        else:  
            return processed_obs, reward, done, info
    
    # The rest of the methods are not used much, yet might come in
    # handy in some cases
    def render(self, *args, **kwargs):
        self.game.render()
        
    def close(self):
        self.game.close()

    def get_image(self):
        return self.picture
    
    def get_buttons(self):
        return self.buttons
    
    def get_action_meaning(self, act):
        return self.game.get_action_meaning(act)
    
    def get_in_game_score(self):
        return self.score

    def get_in_game_reward(self):
        return self.in_game_reward

In [6]:
class Discretizer(gym.ActionWrapper):
    """
    Wraps an "Env" object and turn it into an environment with discrete actions.
     args:
         combos: ordered list of lists of valid button combinations.
    """

    def __init__(self, env, combos):
        super().__init__(env)
        assert isinstance(env.action_space, gym.spaces.MultiBinary)
        buttons = env.unwrapped.buttons
        self._decode_discrete_action = []
        for combo in combos:
            arr = np.array([False] * env.action_space.n)
            for button in combo:
                arr[buttons.index(button)] = True
            self._decode_discrete_action.append(arr)

        self.action_space = gym.spaces.Discrete(len(self._decode_discrete_action))

    def action(self, act):
        return self._decode_discrete_action[act].copy()


class MtpoDiscretizer(Discretizer):
    """
    We use discrete actions specific to the Punch-Out game
    """

# Actions to use the star during the fight (super power)
    def __init__(self, env):
        USE_STAR = [
        [], # Motionless
        ['RIGHT'], # Dodge right
        ['LEFT'], # Dodge left
        ['DOWN'], # Cover
        ['UP', 'A'], # Hit the face with a right hand
        ['UP', 'B'], # Hit the face with a left hand
        ['A'], # Punch to the body with a right hand
        ['B'], # Punch to the body with a left hand
        ['START'], # Use super power
        ]

# Actions to not use the star during the fight (super power)
        NO_STAR = [
        [], # Motionless
        ['RIGHT'], # Dodge right
        ['LEFT'], # Dodge left
        ['DOWN'], # Cover
        ['UP', 'A'], # Hit the face with a right hand
        ['UP', 'B'], # Hit the face with a left hand
        ['A'], # Punch to the body with a right hand
        ['B'], # Punch to the body with a left hand
        ]

# Actions to not use the star during the fight (super power) and only dodge blows, not cover
        DODGE = [
        [],
        ['RIGHT'], # Dodge right
        ['LEFT'], # Dodge left
        ['DOWN'], # Cover
        ['UP', 'A'], # Hit the face with a right hand
        ['UP', 'B'], # Hit the face with a left hand
        ['A'], # Punch to the body with a right hand
        ['B'], # Punch to the body with a left hand
        ['START'], # Use super power
        ]
        super().__init__(env=env, combos=DODGE)

## Enter Optuna

To perform the hyperparameter optimization stage, we are going to use the __"Optuna"__ library, as suggested by the SB3 tutorials page, and the tutorial by youtuber Nickolas Renotte.

The official page of Optuna is:

https://optuna.org/

__Takuya Akiba, Shotaro Sano, Toshihiko Yanase, Takeru Ohta, and Masanori Koyama. 2019.
Optuna: A Next-generation Hyperparameter Optimization Framework. In KDD.__

And the commented Youtube tutorial is:

https://www.youtube.com/watch?v=rzbFhu6So5U&t=6224s

In [8]:
LOG_DIR = os.path.join('..\models', 'logs', 'kinghippo')
OPT_DIR = os.path.join('..\models', 'opt', 'kinghippo')
MODELS_PATH = os.path.join('..\models', 'kinghippo')

To carry out the hyperparameter optimization stage, we are going to define a function that returns a dictionary, whose keys are the names of the parameters that we want to optimize, and their values are the __"suggestions"__ that I want to make to the system, all according to the __Optuna__ documentation.

In [9]:
def optimize_ppo(trial): 
    return {
        'n_steps':trial.suggest_int('n_steps', 2048, 8192),
        'gamma':trial.suggest_float('gamma', 0.8, 0.9999),
        'learning_rate':trial.suggest_float('learning_rate', 3e-5, 3e-2),
        'clip_range':trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda':trial.suggest_float('gae_lambda', 0.8, 0.99)
    }

Now we define a function that optimizes the parameters by running a loop that performs a short training, and then evaluates the model, this loop is called study, or __"experiment"__:

In [21]:
# Run a training loop and return mean reward

# Read recommendations of these parameters below
TOTAL_TIME_STEPS = 2000
N_EVAL_EPISODES = 3

LEVEL_STATE = 'KingHippo.state'

def optimize_agent(trial):
    try:
        model_params = optimize_ppo(trial) 

        # Create environment 
        env = RetroMtpoNesReducedRL(state=LEVEL_STATE)
        env = MtpoDiscretizer(env)
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 12, channels_order='last')

        # Create algo
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        # The total timesteps and eval episodes variables should be set accordingly:
        # At least 50.000 for total timesteps, and
        # at least 10 evaluation episodes
        model.learn(total_timesteps=TOTAL_TIME_STEPS)

        # Evaluate model
        # Same as before here, with the evaluation variable
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=N_EVAL_EPISODES)
        
        # Change the name of the best model accordingly to the level you're training it.
        SAVE_PATH = os.path.join(OPT_DIR, 'trial_KH_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        print(e)
        return -1000

Como paso final, definimos la dirección de optimización (por defecto es minimizar) que sea __"maximizar"__ para que el agente siempre busque obtener una recompenza mayor con cada intento, y definimos el número de intentos a que sean 100:

In [22]:
# Creamos el experimento
study = optuna.create_study(direction='maximize')
# As previously mentioned, here, the number of trials, should be over 100.
study.optimize(optimize_agent, n_trials=3, n_jobs=1)

[32m[I 2022-12-21 18:02:32,059][0m A new study created in memory with name: no-name-e33b5128-9ff9-4cb4-a5e8-f613b0962b5c[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=7184 and n_envs=1)
[32m[I 2022-12-21 18:06:37,420][0m Trial 0 finished with value: 0.0 and parameters: {'n_steps': 7184, 'gamma': 0.8980419105445048, 'learning_rate': 0.013743209385947532, 'clip_range': 0.355394173701559, 'gae_lambda': 0.889373365703623}. Best is trial 0 with value: 0.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=3995 and n_envs=1)
[32m[I 2022-12-21 18:08:44,289][0m Trial 1 finished with value: 0.0 and parameters: {'n_steps': 3995, 'gamma': 0.9587801305324833, 'learning_rate': 0.02946527349611525, 'clip_range': 0.28095540738158253, 'gae_lambda': 0.8491697036789049}. Best is trial 0 with value: 0.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=6456 and n_env

At the end of the optimization we obtain a set of suggested parameters, which we are going to assign to the model that will be trained, and additionally we are going to use the model obtained from the best "experiment" as a basis for training the final model.

In [26]:
# The number of steps should be a multiple of 64, so we change it accordingly:

study.best_params
study.best_params['n_steps'] = (study.best_params['n_steps']//64) * study.best_params['n_steps']

In [27]:
study.best_params

{'n_steps': 7184,
 'gamma': 0.8980419105445048,
 'learning_rate': 0.013743209385947532,
 'clip_range': 0.355394173701559,
 'gae_lambda': 0.889373365703623}

In [28]:
study.best_trial

FrozenTrial(number=0, values=[0.0], datetime_start=datetime.datetime(2022, 12, 21, 18, 2, 32, 62171), datetime_complete=datetime.datetime(2022, 12, 21, 18, 6, 37, 419004), params={'n_steps': 7184, 'gamma': 0.8980419105445048, 'learning_rate': 0.013743209385947532, 'clip_range': 0.355394173701559, 'gae_lambda': 0.889373365703623}, distributions={'n_steps': IntDistribution(high=8192, log=False, low=2048, step=1), 'gamma': FloatDistribution(high=0.9999, log=False, low=0.8, step=None), 'learning_rate': FloatDistribution(high=0.03, log=False, low=3e-05, step=None), 'clip_range': FloatDistribution(high=0.4, log=False, low=0.1, step=None), 'gae_lambda': FloatDistribution(high=0.99, log=False, low=0.8, step=None)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=0, state=TrialState.COMPLETE, value=None)

## Callbacks

The "callbacks" are __"warnings"__, and __"modifications"__ that we can configure during the training phase, and that help us, among other things, to:

* Modify the behavior of the training loop.
* Modify the behavior of the parameters during training.
* Save intermediate models during the training process.
* Stop training if certain conditions that we impose are met.
* Create, fill, and save a record file, or log, with which we can actively evaluate the training of the model in real time.

## Tensorboard

It is a tool created by Google as part of the Tensorflow library for the creation of machine learning models that allows recording the behavior in real time of the training of said models. When used together with the "callbacks" they are a powerful tool to know the development of the training of a model, allowing us to know visually if the training phase is going according to expectations, or if on the contrary it is deviating from that goal, which would help us cancel said training, among many other things.

Next we will create a class called __"TrainAndLoggingCallback()"__ that inherits from the callbacks base class, and that will allow us to obtain all the features mentioned above:

In [29]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'PPO_checkpoint_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

# Training

Using the parameters obtained through the "HPO" phase with Optuna, we are going to train the RL model using a PPO agent:

In [47]:
env.close()

In [48]:
LEVEL_STATE = 'KingHippo.state'

env = RetroMtpoNesReducedRL(state=LEVEL_STATE)
env = MtpoDiscretizer(env)
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecTransposeImage(env)
env = VecFrameStack(env, 12, channels_order='last')

In [49]:
best_model_params_optuna = study.best_params

We are going to save an intermediate model every 500 thousand frames (in the real project, this frequency should be a lot higher, around 500k or 1M steps), to periodically evaluate the behavior:

In [50]:
CHECKPOINT_DIR = os.path.join('..', 'models', 'train', 'kinghippo')
CHECK_FREQ = 10000
callback = TrainAndLoggingCallback(check_freq=CHECK_FREQ, save_path=CHECKPOINT_DIR)

The PPO agenet will be trained using a CNN policy (which works best for visual observations, like the screen of our "env" object), and the parameters of the better Optuna experiment:

In [51]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **best_model_params_optuna)

Using cpu device


In [38]:
# We run this cell only if we have a previously saved best model

# MODEL_NAME = 'trial_VK_10_best_model.zip'
# MODEL_PATH = os.path.join('..', 'models', 'opt', 'kinghippo', MODEL_NAME)
# model.load(MODEL_PATH)

Let's train the model for 3 million frames:

In [52]:
# Comenzamos a entrenar
model.learn(total_timesteps=20000, callback=callback)

Logging to ..\models\logs\kinghippo\PPO_12
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 2.63e+03 |
|    ep_rew_mean     | 0        |
| time/              |          |
|    fps             | 97       |
|    iterations      | 1        |
|    time_elapsed    | 73       |
|    total_timesteps | 7184     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.79e+03    |
|    ep_rew_mean          | 2           |
| time/                   |             |
|    fps                  | 8           |
|    iterations           | 2           |
|    time_elapsed         | 1734        |
|    total_timesteps      | 14368       |
| train/                  |             |
|    approx_kl            | 0.042621728 |
|    clip_fraction        | 0.0421      |
|    clip_range           | 0.355       |
|    entropy_loss         | -2.15       |
|    explained_variance   | -

<stable_baselines3.ppo.ppo.PPO at 0x25abbc10670>

In [53]:
# Save the model with an appropiate name
model.save('CnnPolicy-20k-HPO-ReducedActions-KingHippo')

# Test the model

In [54]:
env.close()

In [55]:
LEVEL_STATE = 'KingHippo.state'

env = RetroMtpoNesReducedRL(state=LEVEL_STATE)
env = MtpoDiscretizer(env)
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecTransposeImage(env)
env = VecFrameStack(env, 12, channels_order='last')

In [6]:
custom_objects = {
    "lr_schedule": lambda x: .003,
    "clip_range": lambda x: .02
}

In [56]:
model = PPO.load('CnnPolicy-20k-HPO-ReducedActions-KingHippo.zip',
#                  custom_objects=custom_objects
                )

Now lets test the model in some episodes:

In [63]:
from tqdm import tqdm

In [64]:
VELOCIDAD = 0.0001
GAMES = 3

episodes_rewards = np.zeros(shape=(GAMES,))
for n in tqdm(range(GAMES)):
    episode_rew = 0
    obs = env.reset()
    done = False
    while not done: 
        if done:
            obs = env.reset()
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        episode_rew += reward
        env.render()
        time.sleep(VELOCIDAD)
    episodes_rewards[n] = float(episode_rew)
    
env.close()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:59<00:00, 39.81s/it]


Finally we print the rewards of each episode:

In [66]:
episodes_rewards

array([30.,  0., 10.])