# BYO Planner

This example shows you how to **Bring Your Own** (BYO) planner to work with the `a2rl.Simulator` API.

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import my_nb_path  # isort: skip
import os
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from IPython.display import Markdown
from tqdm.autonotebook import tqdm

import a2rl as wi
from a2rl.nbtools import print  # Enable color outputs when rich is installed.

import seaborn as sns  # isort: skip  # After a2rl, sns's suprious deprecation warnings are gone.

# Misc. settings: default figure size plus one fixed seed shared by every RNG
# this notebook touches (Python's `random`, NumPy, and PyTorch).
plt.rcParams["figure.figsize"] = [10, 6]
RAN_SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(RAN_SEED)

## Load Dataset

In [None]:
# block_size (measured by # of rows) as the context to train GPT
BLOCK_SIZE_ROW = 2

# Read the bundled "chiller" sample dataset.
chiller_path = wi.sample_dataset_path("chiller")
wi_df = wi.read_csv_dataset(chiller_path)
wi_df.add_value()  # NOTE(review): presumably adds a value column in place -- confirm against a2rl docs

# Speed up training for demo purpose: keep only the first 1000 rows.
wi_df = wi_df.iloc[:1000]

# Instantiate a tokenizer given the selected dataset.
tokenizer = wi.AutoTokenizer(wi_df, block_size_row=BLOCK_SIZE_ROW)
tokenizer.df.head(2)

In [None]:
# Show the first two rows of the tokenized frame, for comparison with `tokenizer.df` displayed above.
tokenizer.df_tokenized.head(2)

## Create the Simulator

### Load or Train the GPT model

In [None]:
model_dir = "model-byo"

################################################################################
# To run in fast mode, set env var NOTEBOOK_FAST_RUN=1 prior to starting Jupyter
################################################################################
# Any value other than "0" switches fast mode on.
fast_run = os.environ.get("NOTEBOOK_FAST_RUN", "0") != "0"

# `None` means "use the default training configuration"; fast mode overrides it
# with a single-epoch setup.
config = (
    {
        "train_config": dict(
            epochs=1,
            batch_size=512,
            embedding_dim=512,
            gpt_n_layer=1,
            gpt_n_head=1,
            learning_rate=6e-4,
            num_workers=0,
            lr_decay=True,
        )
    }
    if fast_run
    else None
)

if fast_run:
    # Make it obvious in the rendered notebook that results come from a fast run.
    fast_run_banner = Markdown(
        '<p style="color:firebrick; background-color:yellow; font-weight:bold">'
        "NOTE: notebook runs in fast mode. Use only 1 epoch. Results may differ."
    )
    display(fast_run_banner)
################################################################################

builder = wi.GPTBuilder(tokenizer, model_dir, config)

Start GPT model training.

The default hyperparameters are located in `src/a2rl/config.yaml`.

In [None]:
#%%time
# Reuse a previously saved model when one exists under `model_dir`; otherwise train from scratch.
model_fname = os.path.join(model_dir, builder.model_name)
if not os.path.exists(model_fname):
    print("Training the GPT model")
    builder.fit()
else:
    print(f"Will load the GPT model from {model_fname}")
    builder.load_model()

### Instantiate the Simulator Instance
To create a simulator, we need to pass in the tokenizer and the GPT model wrapped inside `a2rl.GPTBuilder`.

In [None]:
# The simulator pairs the tokenizer with the GPT model held by the builder (loaded or trained above).
simulator = wi.Simulator(tokenizer, builder.model)

## Build Your Own Planner

### Preparation
First, we find out the column names of the states, actions, and rewards (SAR).

In [None]:
# `sar_d` is indexed by "rewards" and "actions" in the cells below to get the respective column names.
col_names = tokenizer.df.sar_d
col_names

In [None]:
# Display the state dimension; the planners below use it to slice the initial context
# (the first `state_dim` columns of the first tokenized row).
tokenizer.state_dim

In [None]:
# Names of the reward columns.
rewards_cols = tokenizer.df.sar_d["rewards"]
rewards_cols

In [None]:
# Names of the action columns; used later to pull the sampled actions out of the simulator's output.
action_cols = tokenizer.df.sar_d["actions"]
action_cols

In [None]:
# Count the distinct tokens observed in the first action column; this caps how many
# candidate actions the Q-value planner requests per step.
first_action_tokens = tokenizer.df_tokenized[action_cols[0]]
nb_actions = first_action_tokens.nunique(dropna=False)
nb_actions

Calculate the total number of dataframe tokens per SAR

In [None]:
# One SAR row contributes one token per state, reward, and action dimension.
sar_row_len = sum((tokenizer.state_dim, tokenizer.reward_dim, tokenizer.action_dim))
sar_row_len

In [None]:
# Context window in tokens: BLOCK_SIZE_ROW rows of `sar_row_len` tokens each.
# The planners below trim their context to the last `block_size` tokens before sampling.
block_size = BLOCK_SIZE_ROW * sar_row_len
block_size

### Behavioural Clone Planner
First we use the `a2rl.Simulator.sample` API to obtain some random actions.
By "random", we mean the actions (behaviour) are sampled from the probability distribution learned by the GPT model from the CSV dataset. It is expected these actions are similar to actions reflected in the data, hence the "clone" part.
We then apply those random actions to roll out the next step.
We repeat this rollout for each step throughout the entire trajectory.

In [None]:
horizon = 20  # set the planning horizon to 20 steps
step_size = sum((tokenizer.state_dim, tokenizer.action_dim, tokenizer.reward_dim))

################################################################################
# To run in fast mode, set env var NOTEBOOK_FAST_RUN=1 prior to starting Jupyter
################################################################################
# Any value other than "0" switches fast mode on.
fast_run = os.environ.get("NOTEBOOK_FAST_RUN", "0") != "0"

# Number of independent rollouts per planner; fewer in fast mode.
nb_runs = 2 if fast_run else 5

if fast_run:
    fast_run_banner = Markdown(
        '<p style="color:firebrick; background-color:yellow; font-weight:bold">'
        "NOTE: notebook runs in fast mode. Use less samples. Results may differ."
    )
    display(fast_run_banner)
################################################################################

In [None]:
# Behaviour-clone rollout: run `nb_runs` trajectories in parallel for `horizon` steps.
# At every step each run samples one action from the simulator, looks ahead to get
# the reward and next state for that action, and appends (action, reward, next state)
# to its own context.
non_accum_cost_list = []
batch_size = nb_runs
per_ctx_max_size = 1  # sample exactly one action per context row

# Initial context: the state tokens of the first row of the tokenized dataset.
custom_context = tokenizer.df_tokenized.iloc[0, : tokenizer.state_dim].values

for i in tqdm(range(horizon)):
    # On the first step the context is 1-D; replicate it so every run starts identically.
    if custom_context.ndim == 1:
        batch_custom_context = np.tile(custom_context, (batch_size, 1))
    else:
        batch_custom_context = custom_context

    # Keep at most the last `block_size` tokens for sampling. A negative slice start
    # beyond the array length is clipped, so shorter contexts pass through unchanged.
    truncated_custom_context = batch_custom_context[:, -block_size:]

    # obtain a valid "random" action for every run
    recommendation_df = simulator.sample(
        truncated_custom_context, max_size=per_ctx_max_size, as_token=True
    )
    my_actions = recommendation_df[action_cols].values
    reward, next_states = simulator.lookahead(batch_custom_context, my_actions)

    # `lookahead` yields one row per (context, action) combination. The Q-value planner
    # in this notebook relies on those rows being context-major (row = ctx * n_actions
    # + action), so the row matching run k with its own sampled action k is
    # k * n_actions + k. The previous `[::batch_size]` stride picked k * n_actions,
    # i.e. action 0 for every run, which mismatched the actions stored in `samples`.
    # NOTE(review): confirm the row ordering against the a2rl `Simulator.lookahead` docs.
    n_actions = len(my_actions)
    matching_rows = np.arange(batch_size) * n_actions + np.arange(batch_size)
    reward = reward[matching_rows]
    next_states = next_states[matching_rows]

    samples = np.hstack([my_actions, reward, next_states])

    # Label the stacked tokens so the field tokenizer can map them back to original values.
    df_ars = wi.WiDataFrame(
        samples,
        **tokenizer.df_tokenized.sar_d,
        columns=[
            *tokenizer.df_tokenized.actions,
            *tokenizer.df_tokenized.rewards,
            *tokenizer.df_tokenized.states,
        ],
    )
    df_sar = df_ars[df_ars.sar]
    df_sar = tokenizer.field_tokenizer.inverse_transform(df_sar)

    # Every reward column except the last one is recorded as the step cost.
    immediate_cost = df_sar[tokenizer.df_tokenized.rewards[:-1]].values
    non_accum_cost_list.append(immediate_cost.flatten().tolist())

    # Grow each run's context with what just happened (still tokenized).
    custom_context = np.hstack([batch_custom_context, samples])

# Rows index steps here; the running total down the step axis is the accumulated cost.
non_accum_cost_list = np.array(non_accum_cost_list)
accum_cost_list = np.cumsum(non_accum_cost_list, axis=0)

# Transpose so both arrays are indexed as [run, step].
accum_cost_list = accum_cost_list.transpose()
non_accum_cost_list = non_accum_cost_list.transpose()

### Q-value Maximisation Planner
Second, we use `a2rl.Simulator.get_valid_actions` to obtain all the valid actions.
Then we use `a2rl.Simulator.lookahead` to "explore" each action by obtaining both the immediate reward and the reward-to-go.
Next, we choose the action with the best sum_reward (immediate_reward + reward-to-go) and take that action to the next step. Because the reward in this dataset is a cost, "best" means the lowest sum, so the code uses `np.argmin`. We repeat this rollout for the entire trajectory.

In [None]:
# Q-value rollout: run `nb_runs` trajectories in parallel for `horizon` steps.
# At every step all candidate actions are evaluated with `lookahead`, and each run
# takes the action whose (immediate cost + cost-to-go) is smallest.
q_non_accum_cost_list = []

batch_size = nb_runs
run_idx = np.arange(batch_size)  # row selector used to pick one action per run

# Initial context: the state tokens of the first row of the tokenized dataset.
custom_context = tokenizer.df_tokenized.iloc[0, : tokenizer.state_dim].values

for i in tqdm(range(horizon)):
    # On the first step the context is 1-D; replicate it so every run starts identically.
    if custom_context.ndim == 1:
        batch_custom_context = np.tile(custom_context, (batch_size, 1))
    else:
        batch_custom_context = custom_context

    # Keep at most the last `block_size` tokens. A negative slice start beyond the
    # array length is clipped, so shorter contexts pass through unchanged.
    truncated_custom_context = batch_custom_context[:, -block_size:]

    # NOTE(review): candidate actions are derived from run 0's context only and then
    # applied to every run; runs whose contexts have diverged may have other valid actions.
    all_valid_actions = simulator.get_valid_actions(
        truncated_custom_context[0], max_size=nb_actions
    ).values

    reward, next_states = simulator.lookahead(batch_custom_context, all_valid_actions)

    # The tiling below treats the lookahead rows as context-major: for each run,
    # one row per candidate action, in the order of `all_valid_actions`.
    tiled_actions = np.tile(all_valid_actions, (batch_size, 1))
    samples = np.hstack([tiled_actions, reward, next_states])
    df_ars = wi.WiDataFrame(
        samples,
        **tokenizer.df_tokenized.sar_d,
        columns=[
            *tokenizer.df_tokenized.actions,
            *tokenizer.df_tokenized.rewards,
            *tokenizer.df_tokenized.states,
        ],
    )

    df_sar = df_ars[df_ars.sar]
    # need untokenized (original) reward values
    df_sar = tokenizer.field_tokenizer.inverse_transform(df_sar)
    reward = df_sar[tokenizer.df_tokenized.rewards].values

    both_cost = reward.sum(axis=1)  # sum of immediate cost + cost_to_go
    both_cost = both_cost.reshape([batch_size, -1])
    action_idx = np.argmin(both_cost, axis=1)  # for each run gets its min-cost index

    # [nb_runs, nb_actions, reward_dim]
    rs_reward = reward.reshape([batch_size, -1, tokenizer.reward_dim])
    # pick the reward as per the min-cost action (one row per run)
    sel_reward = rs_reward[run_idx, action_idx]
    immediate_cost = sel_reward[:, 0]
    q_non_accum_cost_list.append(immediate_cost.flatten().tolist())

    # use the tokenized dataframe to select the new context as per the min-cost action
    df_ars_reshape = df_ars.values.reshape([batch_size, -1, len(df_ars.columns)])
    new_context = df_ars_reshape[run_idx, action_idx]
    custom_context = np.hstack([batch_custom_context, new_context])

# Rows index steps here; the running total down the step axis is the accumulated cost.
q_non_accum_cost_list = np.array(q_non_accum_cost_list)
q_accum_cost_list = np.cumsum(q_non_accum_cost_list, axis=0)

# Transpose so both arrays are indexed as [run, step].
q_accum_cost_list = q_accum_cost_list.transpose()
q_non_accum_cost_list = q_non_accum_cost_list.transpose()


### Compare the costs (`system_power_consumption`) between two planners

On average (in the sense of **expected** outcome), the `Q-value Maximisation` planner produces relatively lower `system_power_consumption`. However, the `Behaviour Clone` actions may occasionally perform equally well. This is due to the non-deterministic nature of both the *Simulator* when performing `simulator.lookahead()` and the randomness associated with `simulator.sample()`. Moreover, the GPT model associated with the *Simulator* in this example was not trained sufficiently in terms of both the number of epochs and the size of the training data.

In [None]:
# Column buffers for the long-format results table (one entry per run, step, and policy).
step_list, policy_list, acc_cost, inst_cost = [], [], [], []

In [None]:
# Flatten the [run, step] cost arrays of both planners into long-format columns.
# The buffers are reset here so that re-running this cell does not append duplicate rows.
step_list, policy_list, acc_cost, inst_cost = [], [], [], []

# (policy label, accumulated cost [run, step], per-step cost [run, step])
planner_costs = (
    ("behaviour", accum_cost_list, non_accum_cost_list),
    ("q-value", q_accum_cost_list, q_non_accum_cost_list),
)

for i in range(nb_runs):
    for j in range(horizon):
        for policy, accumulated, stepwise in planner_costs:
            step_list.append(j)
            acc_cost.append(accumulated[i][j])
            inst_cost.append(stepwise[i][j])
            policy_list.append(policy)

In [None]:
# Long-format table: one row per (run, step, policy), ready for seaborn.
df_result = pd.DataFrame(
    dict(
        step=step_list,
        acc_cost=acc_cost,
        step_cost=inst_cost,
        policy=policy_list,
    )
)

In [None]:
# Per-step cost of each planner. Several runs share each step value, so each line
# summarises the runs at that step.
for policy, label in (("behaviour", "Behaviour clone"), ("q-value", "Q-value optimal")):
    sns.lineplot(
        data=df_result[df_result.policy == policy], x="step", y="step_cost", label=label
    )
plt.legend(fontsize=14)
plt.grid(ls="--")
plt.xlabel("Step", fontsize=16)
plt.ylabel("Step Cost", fontsize=16)
_ = plt.title("Stepwise system_power_consumption")

In [None]:
# Accumulated cost of each planner over the planning horizon.
data1 = df_result[(df_result.policy == "behaviour")]
data2 = df_result[(df_result.policy == "q-value")]

sns.lineplot(data=data1, x="step", y="acc_cost", label="Behaviour clone")
sns.lineplot(data=data2, x="step", y="acc_cost", label="Q-value optimal")
plt.legend(fontsize=14)
plt.grid(ls="--")
plt.xlabel("Step", fontsize=16)
plt.ylabel("Accumulative Cost", fontsize=16)  # label typo fixed (was "Accumutive")
_ = plt.title("Accumulative system_power_consumption")