In [None]:
%%capture
# Bootstrap cell: %%capture swallows everything this cell would print or display.
%load_ext autoreload
%autoreload 2
%matplotlib inline
# Project extension; presumably it registers the custom magics used in this
# notebook (%set_random_seed, %presentation_style, %load_latex_macros) — TODO confirm.
%load_ext training_rl
# Presumably seeds the random number generators with 12 for reproducibility — TODO confirm.
%set_random_seed 12

In [None]:
# Custom (non built-in) line magic; presumably provided by the training_rl extension — TODO confirm.
%presentation_style

In [None]:
# Custom (non built-in) line magic; presumably provided by the training_rl extension — TODO confirm.
%load_latex_macros

In [None]:
# Reload all (non-excluded) modules right now, before the imports below.
%autoreload

from training_rl.offline_rl.load_env_variables import load_env_variables
# Called before the remaining imports; presumably they rely on the environment
# variables set here — TODO confirm before moving this call.
load_env_variables()

import os
import warnings

import gymnasium as gym
import torch

from training_rl.offline_rl.behavior_policies.behavior_policy_registry import \
    BehaviorPolicyType
from training_rl.offline_rl.custom_envs.custom_2d_grid_env.obstacles_2D_grid_register import \
    ObstacleTypes
from training_rl.offline_rl.custom_envs.custom_envs_registration import (
    CustomEnv, RenderMode, register_grid_envs)
from training_rl.offline_rl.custom_envs.utils import (
    Grid2DInitialConfig, InitialConfigCustom2DGridEnvWrapper)
from training_rl.offline_rl.generate_custom_minari_datasets.generate_minari_dataset_grid_envs import \
    create_combined_minari_dataset
from training_rl.offline_rl.offline_policies.offpolicy_rendering import \
    offpolicy_rendering
from training_rl.offline_rl.offline_policies.policy_registry import PolicyName
from training_rl.offline_rl.offline_trainings.offline_training import \
    offline_training
from training_rl.offline_rl.offline_trainings.policy_config_data_class import (
    TrainedPolicyConfig, get_trained_policy_path)
from training_rl.offline_rl.offline_trainings.restore_policy_model import \
    restore_trained_offline_policy
from training_rl.offline_rl.utils import (compare_state_action_histograms,
                                          load_buffer_minari,
                                          state_action_histogram)
from training_rl.offline_rl.visualizations.utils import (
    get_state_action_data_and_policy_grid_distributions, snapshot_env)

# Silence all warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")
# Make the custom grid environments available (presumably registers them so
# that gym.make can resolve the CustomEnv names — TODO confirm).
register_grid_envs()

# Only request rendering when DISPLAY is set to a non-empty value; otherwise
# run headless with render_mode=None.
if os.environ.get("DISPLAY"):
    render_mode = RenderMode.RGB_ARRAY_LIST
else:
    render_mode = None

<img src="_static/images/aai-institute-cover.svg" alt="Snow" style="width:100%;">
<div class="md-slide title"> Offline RL distributional shift exercises </div>

# Off-Policy Distributional Shift - I

## Exercise I:

In this exercise we have two datasets, each collected by a policy that tries to bring the agent from (3,0) to (7,7).

<img src="_static/images/nb_95_env_image_1.png" alt="grid environment configuration" style="height:400px;">

One policy is suboptimal and the other is optimal.

I  - **Suboptimal expert policy**:  collect ~ 8000 steps

II - **expert policy**: collect ~ 1000 steps

(This could be a realistic situation where you could have only a few human expert data available and the rest of the data is collected through a far from optimal policy.)

**In this example we will use the Deep Q-Network (DQN) algorithm, which we introduced in the online RL section, as our off-policy RL algorithm.** Since the DQN agent cannot interact with the environment, we will feed it the collected data through a ReplayBuffer, as we did before in the imitation learning example.


Exercise: (Have a look at the imitation learning exercise)

1 - **We will use the 8x8 grid environment with a vertical obstacle : ObstacleTypes.dqn_obstacle_8x8**.

2 - **The policies: behavior_8x8_eps_greedy_4_0_to_7_7 and behavior_8x8_deterministic_4_0_to_7_7**

3 - **For the dataset collection have a look at the solution**, specifically at the points:

- Configure the two datasets
- Create Minari combined datasets 

as we will fuse the two datasets coming from different policies into a single one (as you should do in a real problem)

4 - **Run the training using PolicyName.dqn as the policy to get a feeling for how well (or how badly) an off-policy algorithm deals with collected data.**

### **SOLUTION:**

#### Create the environment

In [None]:
# Which environment to instantiate.
ENV_NAME = CustomEnv.Grid_2D_8x8_discrete

# Grid layout: obstacle set plus the start and goal cells.
OBSTACLE = ObstacleTypes.dqn_obstacle_8x8
INITIAL_STATE = (3, 0)
FINAL_STATE = (7, 7)

env_2D_grid_initial_config = Grid2DInitialConfig(
    target_state=FINAL_STATE,
    initial_state=INITIAL_STATE,
    obstacles=OBSTACLE,
)

# Wrap the raw gym environment together with the grid configuration above.
env = InitialConfigCustom2DGridEnvWrapper(
    gym.make(ENV_NAME, render_mode=render_mode),
    env_config=env_2D_grid_initial_config,
)
snapshot_env(env)

#### Configure and render policies

In [None]:
# Dataset I: suboptimal (epsilon-greedy) behavior policy.
# For Exercise II, replace the policy below with BehaviorPolicyType.random.
DATA_SET_IDENTIFIER_I = "_suboptimal"
BEHAVIOR_POLICY_I = BehaviorPolicyType.behavior_8x8_eps_greedy_4_0_to_7_7
# TODO(review): the exercise text above asks for ~8000 suboptimal steps, but
# 100000 are collected here — confirm which value is intended.
NUM_STEPS_I = 100_000

# Dataset II: deterministic expert behavior policy.
DATA_SET_IDENTIFIER_II = "_expert"
BEHAVIOR_POLICY_II = BehaviorPolicyType.behavior_8x8_deterministic_4_0_to_7_7
NUM_STEPS_II = 1000


In [None]:
# Roll out the suboptimal behavior policy (dataset I) in the configured grid.
offpolicy_rendering(
    env_or_env_name=ENV_NAME,
    behavior_policy_name=BEHAVIOR_POLICY_I,
    env_2d_grid_initial_config=env_2D_grid_initial_config,
    render_mode=render_mode,
    num_frames=1000,
)

In [None]:
# Roll out the expert behavior policy (dataset II) in the configured grid.
offpolicy_rendering(
    env_or_env_name=ENV_NAME,
    behavior_policy_name=BEHAVIOR_POLICY_II,
    env_2d_grid_initial_config=env_2D_grid_initial_config,
    render_mode=render_mode,
    num_frames=1000,
)

#### Create single Minari dataset

(Have a look at **create_combined_minari_dataset(...)**)

In [None]:
# Fuse the data of both behavior policies into one Minari dataset.
# The three tuples are presumably matched position by position
# (suboptimal first, expert second) — TODO confirm.
config_combined_data = create_combined_minari_dataset(
    env_name=ENV_NAME,
    env_2d_grid_initial_config=env_2D_grid_initial_config,
    behavior_policy_names=(BEHAVIOR_POLICY_I, BEHAVIOR_POLICY_II),
    dataset_identifiers=(DATA_SET_IDENTIFIER_I, DATA_SET_IDENTIFIER_II),
    num_collected_points=(NUM_STEPS_I, NUM_STEPS_II),
    combined_dataset_identifier="combined_data_sets_random_walk",
)

#### check state-action distribution

In [None]:
# Load the combined Minari dataset into a buffer and remember its size
# (used later to derive the number of training steps per epoch).
name_combined_dataset = config_combined_data.data_set_name
buffer_data = load_buffer_minari(name_combined_dataset)
data_size = len(buffer_data)

# State-action counts of the collected data; the second return value is not
# needed here because no policy is passed.
state_action_count_data, _ = get_state_action_data_and_policy_grid_distributions(
    buffer_data,
    env,
)
state_action_histogram(
    state_action_count_data,
    inset_pos_xy=(-0.1, -0.3),
    title="State-Action data distribution",
)

snapshot_env(env)

#### offline policy training

In [None]:
POLICY_NAME = PolicyName.dqn
NAME_EXPERT_DATA = name_combined_dataset

# TrainedPolicyConfig bundles the policy configuration data (dataset name,
# policy, render mode, device) that the training/restore helpers consume.
offline_policy_config = TrainedPolicyConfig(
    policy_name=POLICY_NAME,
    name_expert_data=NAME_EXPERT_DATA,
    device="cpu",
    render_mode=render_mode,
)


In [None]:
# Training hyper-parameters.
NUM_EPOCHS = 4
# NOTE(review): BATCH_SIZE is defined but never passed to offline_training
# below — confirm whether it should be forwarded.
BATCH_SIZE = 128
# One tenth of the dataset size per epoch (a float, because 0.1 is a float).
STEP_PER_EPOCH = 0.1 * data_size

# After every epoch, test statistics are collected from the policy on
# NUMBER_TEST_ENVS independent environments.
NUMBER_TEST_ENVS = 1

offline_training(
    offline_policy_config=offline_policy_config,
    step_per_epoch=STEP_PER_EPOCH,
    num_epochs=NUM_EPOCHS,
    number_test_envs=NUMBER_TEST_ENVS,
    restore_training=False,
)

In [None]:
# TODO: decide whether this cell stays in the notebook.
# It runs one epoch of online training with the same policy configuration.
# NOTE(review): it sits between the offline training and the policy restore
# below — confirm it does not overwrite the offline checkpoint.

from training_rl.offline_rl.online_trainings.online_training import online_training
from training_rl.offline_rl.offline_policies.policy_registry import PolicyType

online_training(
    trained_policy_config=offline_policy_config,
    policy_type=PolicyType.offpolicy,
    batch_size=64,
    num_epochs=1,
    restore_training=False,
)

### Results

#### Restore policy

In [None]:
POLICY_FILE = "policy_best_reward.pth"

# Step 1: build a policy with the same configuration as the trained one.
policy = restore_trained_offline_policy(offline_policy_config)

# Step 2: locate the checkpoint (<dataset name>/<policy name> under the
# trained-policy path) and load its weights onto the CPU.
name_expert_data = offline_policy_config.name_expert_data
log_name = os.path.join(name_expert_data, POLICY_NAME)
log_path = get_trained_policy_path(log_name)
weights_path = os.path.join(log_path, POLICY_FILE)
policy.load_state_dict(torch.load(weights_path, map_location="cpu"))


#### Policy visualization

In [None]:
# Roll out the trained policy in the environment created above.
# (An obstacle-free variant of this rendering is shown in Exercise II below.)
offpolicy_rendering(
    env_or_env_name=env,
    policy_model=policy,
    env_2d_grid_initial_config=env_2D_grid_initial_config,
    render_mode=render_mode,
    num_frames=1000,
    imitation_policy_sampling=False,
)

#### State-action policy distribution

In [None]:
NUM_EPISODES = 100  # the more episodes, the better the statistics

# State-action counts of the dataset and of rollouts of the trained policy.
(
    state_action_count_data,
    state_action_count_policy,
) = get_state_action_data_and_policy_grid_distributions(
    buffer_data,
    env,
    policy,
    logits_sampling=True,
    num_episodes=NUM_EPISODES,
)

# Plot each distribution, then their comparison, then the grid itself.
state_action_histogram(state_action_count_data)
state_action_histogram(state_action_count_policy)
compare_state_action_histograms(state_action_count_data, state_action_count_policy)
snapshot_env(env)

## Exercise II

**1 - Train the DQN again, but now with random data (i.e. use a random policy as the suboptimal policy) and increase the expert data too (expert data ~ 30000).**

Is the policy better? Why?

Remove the walls and see what happens, i.e:

In [None]:
# Build a *separate* obstacle-free configuration and environment instead of
# mutating env_2D_grid_initial_config and rebinding env. This keeps the cell
# idempotent: re-running any earlier cell (rendering, histograms, ...) still
# uses the original grid with walls.
env_2D_grid_obstacle_free_config = Grid2DInitialConfig(
    obstacles=ObstacleTypes.obst_free_8x8,
    initial_state=INITIAL_STATE,
    target_state=FINAL_STATE,
)
env_obstacle_free = InitialConfigCustom2DGridEnvWrapper(
    gym.make(ENV_NAME, render_mode=render_mode),
    env_config=env_2D_grid_obstacle_free_config,
)

# Roll out the policy trained on the grid with walls in the wall-free grid.
offpolicy_rendering(
    env_or_env_name=env_obstacle_free,
    render_mode=render_mode,
    policy_model=policy,
    env_2d_grid_initial_config=env_2D_grid_obstacle_free_config,
    num_frames=1000,
    imitation_policy_sampling=False,
)

**2 - Try now with the original problem and try one of the offline RL algorithms that we will introduce later:  BCQ.**

What happens now?

BCQ, like all offline RL algorithms, has tuning parameters to control out-of-distribution (o.o.d.) data. They are typically tuned to empirical values that work for a wide range of tasks, but in general these are important parameters that you will need to tune in your experiments.

# Off-Policy Distributional Shift - II

## Exercise I

**In this notebook we will deal with another important property that a robust offline RL algorithm should have: stitching, i.e. the reuse of different trajectories contained in the data to obtain the best trajectory in our dataset.**


The goal will be to reach a target at (7,7) starting from (0,0). We will again use the 8x8 grid environment. Our dataset contains trajectories covering our space of interest but generated for different tasks (note that before we collected data for the same task). One comes from a suboptimal policy that moves the agent from (0,0) to (7,0), and the other from a deterministic, optimal one (human expert) that brings the agent from (4,0) to (7,7). We obviously have much more data coming from the suboptimal policy than from the expert one, as it is cheaper.

So we will create the two policies:

I  - **Suboptimal expert policy** (behavior_8x8_moves_downwards_within_strip): moves the agent downwards in a suboptimal way starting from (0,0) (collect 8000 steps)

II - **Optimal expert policy** (behavior_8x8_deterministic_4_0_to_7_7): moves the agent along the optimal path from (4,0) to (7,7) (collect 1000 steps)


In this example we will use again as off-policy RL algorithm, the Deep Q-Network (DQN) algorithm.

**Let's set up our configuration and create the environment**

### Environment

In [None]:
# Which environment to instantiate.
ENV_NAME = CustomEnv.Grid_2D_8x8_discrete

# Grid layout: obstacle set plus the start and goal cells.
OBSTACLE = ObstacleTypes.vertical_object_8x8
INITIAL_STATE = (0, 0)
FINAL_STATE = (7, 7)

env_2D_grid_initial_config = Grid2DInitialConfig(
    target_state=FINAL_STATE,
    initial_state=INITIAL_STATE,
    obstacles=OBSTACLE,
)

# Wrap the raw gym environment together with the grid configuration above.
env = InitialConfigCustom2DGridEnvWrapper(
    gym.make(ENV_NAME, render_mode=render_mode),
    env_config=env_2D_grid_initial_config,
)
snapshot_env(env)

### Configure the two datasets

In [None]:
# Dataset I: suboptimal policy that moves downwards within a strip.
BEHAVIOR_POLICY_I = BehaviorPolicyType.behavior_8x8_moves_downwards_within_strip
DATA_SET_IDENTIFIER_I = "_downwards_"
NUM_STEPS_I = 8000

# Dataset II: deterministic expert policy.
BEHAVIOR_POLICY_II = BehaviorPolicyType.behavior_8x8_deterministic_4_0_to_7_7
DATA_SET_IDENTIFIER_II = "_optimal_"
NUM_STEPS_II = 1000

### Create combined Minari dataset

In [None]:
# Fuse the data of both behavior policies into one Minari dataset.
# The three tuples are presumably matched position by position
# (downwards policy first, expert second) — TODO confirm.
config_combined_data = create_combined_minari_dataset(
    env_name=ENV_NAME,
    env_2d_grid_initial_config=env_2D_grid_initial_config,
    behavior_policy_names=(BEHAVIOR_POLICY_I, BEHAVIOR_POLICY_II),
    dataset_identifiers=(DATA_SET_IDENTIFIER_I, DATA_SET_IDENTIFIER_II),
    num_collected_points=(NUM_STEPS_I, NUM_STEPS_II),
    # Identifier spelling ("_stiching") kept as-is: it is part of the dataset name.
    combined_dataset_identifier="_stiching",
)

### Rendering behavioral policy

In [None]:
# Roll out the suboptimal behavior policy (dataset I) in the configured grid.
offpolicy_rendering(
    env_or_env_name=ENV_NAME,
    behavior_policy_name=BEHAVIOR_POLICY_I,
    env_2d_grid_initial_config=env_2D_grid_initial_config,
    render_mode=render_mode,
    num_frames=1000,
)

In [None]:
# Roll out the expert behavior policy (dataset II) in the configured grid.
offpolicy_rendering(
    env_or_env_name=ENV_NAME,
    behavior_policy_name=BEHAVIOR_POLICY_II,
    env_2d_grid_initial_config=env_2D_grid_initial_config,
    render_mode=render_mode,
    num_frames=1000,
)

### State-action distribution

In [None]:
# Load the combined Minari dataset into a buffer and remember its size
# (used later to derive the number of training steps per epoch).
name_combined_dataset = config_combined_data.data_set_name
buffer_data = load_buffer_minari(name_combined_dataset)
data_size = len(buffer_data)

# State-action counts of the collected data; the second return value is not
# needed here because no policy is passed.
state_action_count_data, _ = get_state_action_data_and_policy_grid_distributions(
    buffer_data,
    env,
)
state_action_histogram(
    state_action_count_data,
    inset_pos_xy=(-0.1, -0.012),
    title="State-Action data distribution",
)

snapshot_env(env)

### Policy to train

In [None]:
POLICY_NAME = PolicyName.dqn
NAME_EXPERT_DATA = name_combined_dataset

# TrainedPolicyConfig bundles the policy configuration data (dataset name,
# policy, render mode, device) that the training/restore helpers consume.
offline_policy_config = TrainedPolicyConfig(
    policy_name=POLICY_NAME,
    name_expert_data=NAME_EXPERT_DATA,
    device="cpu",
    render_mode=render_mode,
)

### Training

In [None]:
# Training hyper-parameters.
NUM_EPOCHS = 20
# NOTE(review): BATCH_SIZE is defined but never passed to offline_training
# below — confirm whether it should be forwarded.
BATCH_SIZE = 128
# One tenth of the dataset size per epoch (a float, because 0.1 is a float).
STEP_PER_EPOCH = 0.1 * data_size

# After every epoch, test statistics are collected from the policy on
# NUMBER_TEST_ENVS independent environments.
NUMBER_TEST_ENVS = 1

offline_training(
    offline_policy_config=offline_policy_config,
    number_test_envs=NUMBER_TEST_ENVS,
    num_epochs=NUM_EPOCHS,
    step_per_epoch=STEP_PER_EPOCH,
    restore_training=False,
)

### Restore policy

In [None]:
POLICY_FILE = "policy_best_reward.pth"

# Step 1: build a policy with the same configuration as the trained one.
policy = restore_trained_offline_policy(offline_policy_config)

# Step 2: locate the checkpoint (<dataset name>/<policy name> under the
# trained-policy path) and load its weights onto the CPU.
name_expert_data = offline_policy_config.name_expert_data
log_name = os.path.join(name_expert_data, POLICY_NAME)
log_path = get_trained_policy_path(log_name)
weights_path = os.path.join(log_path, POLICY_FILE)
policy.load_state_dict(torch.load(weights_path, map_location="cpu"))

### Let's visualize the policy

In [None]:
# Roll out the trained policy in the environment created above.
offpolicy_rendering(
    env_or_env_name=env,
    policy_model=policy,
    env_2d_grid_initial_config=env_2D_grid_initial_config,
    render_mode=render_mode,
    num_frames=1000,
    imitation_policy_sampling=False,
)

### Questions:

1 - What do you notice? What happens if you increase the expert data? Is it better?

2 - Try again with the BCQ algorithm.