In [2]:
import warnings
from training_rl.offline_rl import load_env_variables
import minari
from training_rl.offline_rl.behavior_policies.behavior_policy_registry import BehaviorPolicyType
from training_rl.offline_rl.custom_envs.custom_2d_grid_env.obstacles_2D_grid_register import ObstacleTypes
from training_rl.offline_rl.custom_envs.utils import Grid2DInitialConfig
from training_rl.offline_rl.generate_custom_minari_datasets.generate_minari_dataset_grid_envs import \
    create_minari_datasets, MinariDatasetConfig


from training_rl.offline_rl.offline_trainings.offline_training import offline_training


from training_rl.offline_rl.offline_trainings.policy_config_data_class import TrainedPolicyConfig
from training_rl.offline_rl.utils import state_action_histogram
from training_rl.offline_rl.custom_envs.utils import InitialConfigCustom2DGridEnvWrapper
from training_rl.offline_rl.custom_envs.custom_envs_registration import register_grid_envs

import gymnasium as gym

import torch
from training_rl.offline_rl.offline_trainings.policy_config_data_class import get_trained_policy_path
import os
from training_rl.offline_rl.offline_trainings.restore_policy_model import restore_trained_offline_policy
from tianshou.data import Collector
from training_rl.offline_rl.offline_policies.offpolicy_rendering import offpolicy_rendering

from training_rl.offline_rl.utils import delete_minari_data_if_exists
from minari import combine_datasets
from copy import copy

from training_rl.offline_rl.load_env_variables import load_env_variables

# Presumably loads the project's environment variables (project-local helper) — confirm what it sets.
load_env_variables()
# Suppress every Python warning from here on in this session.
warnings.filterwarnings("ignore")
# ToDo: this should be loaded automatically
# Presumably registers the custom grid environments so that gym.make can resolve them — confirm.
register_grid_envs()


ModuleNotFoundError: No module named 'training_rl'

In this exercise we will see what happens when we apply off-policy RL to a fixed, previously collected dataset. Our behavior policy will be a simple random walk in one dimension that, starting from (0,0), tries to reach the target at (3,7). Since we only move in one dimension, the highest-reward state is not present in our dataset.

As our off-policy RL algorithm we will use the Deep Q-Network (DQN), which we introduced in the online RL section. Since the DQN agent cannot interact with the environment, we will feed it the collected data through a ReplayBuffer, as we did before in the imitation learning example.

Again we will use the 8x8 grid environment but without any obstacle.

Let's set up our configuration and create the environment.

In [None]:
from training_rl.offline_rl.scripts.visualizations.utils import snapshot_env
from training_rl.offline_rl.offline_policies.policy_registry import PolicyName
from training_rl.offline_rl.custom_envs.custom_envs_registration import CustomEnv, RenderMode

# --- Environment and grid layout ---
ENV_NAME = CustomEnv.Grid_2D_8x8_discrete
INITIAL_STATE, FINAL_STATE = (0, 0), (3, 7)
# NOTE(review): the accompanying text says the grid has no obstacle, but an
# obstacle type is selected here — confirm which one is intended.
OBSTACLE = ObstacleTypes.vertical_object_8x8

# --- Data collection and offline learner ---
BEHAVIOR_POLICY = BehaviorPolicyType.random
NUM_COLLECTED_POINTS = 6000
OFFLINE_POLICY = PolicyName.dqn

# Bundle the grid layout into the configuration object used by the env wrapper.
env_2D_grid_initial_config = Grid2DInitialConfig(
    target_state=FINAL_STATE,
    initial_state=INITIAL_STATE,
    obstacles=OBSTACLE,
)

# Build the wrapped environment and show a snapshot of it.
env = InitialConfigCustom2DGridEnvWrapper(
    gym.make(ENV_NAME, render_mode=RenderMode.RGB_ARRAY_LIST),
    env_config=env_2D_grid_initial_config,
)
snapshot_env(env)

Let's take a look at our environment and policy configuration.

In [None]:
# Render the (suboptimal) behavior policy in the configured grid.
offpolicy_rendering(
    behavior_policy_name=BEHAVIOR_POLICY,
    env_or_env_name=ENV_NAME,
    env_2d_grid_initial_config=env_2D_grid_initial_config,
    render_mode=RenderMode.RGB_ARRAY_LIST,
    num_frames=1000,
)

### EXERCISE 1:

As we did for imitation learning before:

    1 - Collect a minari dataset
    2 - Create a Tianshou replay buffer
    3 - Train a DQN policy on the generated dataset

### SOLUTION:

In [None]:
# Naming pieces passed to the dataset configuration below.
DATA_SET_NAME = "data"
DATA_SET_IDENTIFIER = "_exercise_6_a"
VERSION_DATA_SET = "v0"

# Build the dataset configuration from the environment, behavior policy and
# grid settings defined in the earlier cells.
# NOTE(review): create_minari_config_from_dict is not imported in any cell
# visible here — confirm it is in scope on a fresh kernel.
minari_dataset_config = create_minari_config_from_dict(
    env_name=ENV_NAME,
    dataset_name=DATA_SET_NAME,
    data_set_identifier=DATA_SET_IDENTIFIER,
    version_dataset=VERSION_DATA_SET,
    num_steps=NUM_COLLECTED_POINTS,
    behavior_policy_name=BEHAVIOR_POLICY,
    env_2d_grid_initial_config=env_2D_grid_initial_config,
)

# Generate the dataset described by the configuration.
create_minari_datasets(minari_dataset_config)

Let's take a look at the state-action distribution.

In [None]:
# Identifier of the minari dataset to load.
# NOTE(review): this name contains "horizontal_line_object_8x8", while the
# configuration cell sets OBSTACLE = ObstacleTypes.vertical_object_8x8 —
# confirm this is really the dataset generated in this notebook.
NAME_EXPERT_DATA = "Grid_2D_8x8_discrete-data_horizontal_line_object_8x8_start_0_0_target_3_7_exercise_6_a-v0" 

# Create a buffer from the minari dataset.
# NOTE(review): load_buffer_minari and
# get_state_action_data_and_policy_grid_distributions are not imported in any
# cell visible here — confirm they are in scope on a fresh kernel.
buffer_data = load_buffer_minari(NAME_EXPERT_DATA)

# Compute the state-action data distribution; the second return value is discarded here.
state_action_count_data, _ = get_state_action_data_and_policy_grid_distributions(buffer_data, env)
state_action_histogram(state_action_count_data, title="State-Action data distribution", 
                       inset_pos_xy=(0.0, -0.03))

# Show a snapshot of the environment next to the histogram.
snapshot_env(env)

Let's configure the training of the DQN policy!

In [None]:
# The dataset to train on is NAME_EXPERT_DATA, defined in an earlier cell.

# The model policy to be trained.
POLICY_NAME = PolicyName.dqn

# Training schedule.
NUM_EPOCHS = 200
BATCH_SIZE = 256
UPDATE_PER_EPOCH = 100

# After every epoch we will collect some test statistics from the policy from NUMBER_TEST_ENVS independent envs.
NUMBER_TEST_ENVS = 1
EXPLORATION_NOISE = True
SEED = None  # 1626

# NOTE(review): BATCH_SIZE, EXPLORATION_NOISE and SEED are not passed to any
# call in the cells visible here — confirm whether they should be wired into
# the training call.

# TrainedPolicyConfig is a handy object that will help us to deal with the policy configuration data.
offline_policy_config = TrainedPolicyConfig(
    name_expert_data=NAME_EXPERT_DATA,
    policy_name=POLICY_NAME,
    render_mode=RenderMode.RGB_ARRAY_LIST,
    device="cpu",
)


Let's train the model.

In [None]:
# Train the offline policy from scratch (restore_training=False) with the
# schedule defined in the configuration cell.
offline_training(
    offline_policy_config=offline_policy_config,
    num_epochs=NUM_EPOCHS,
    update_per_epoch=UPDATE_PER_EPOCH,
    number_test_envs=NUMBER_TEST_ENVS,
    restore_training=False,
)

Let's take a look at the policy state-action distribution.

### EXERCISE 2:

Analyze the state-action policy distribution and try to make sense of the results. 

In [None]:
# File name of the saved weights, looked up inside the policy's log directory below.
POLICY_FILE = "policy_final.pth"
NUM_EPISODES = 10 # the more episodes, the better

# Restore a policy with the same configuration as the one we trained.
policy = restore_trained_offline_policy(offline_policy_config)
# Load the weights: the log directory is derived from the dataset name joined with the policy name.
name_expert_data = offline_policy_config.name_expert_data
# assumes POLICY_NAME can be used as a path component by os.path.join — TODO confirm
log_name = os.path.join(name_expert_data, POLICY_NAME)
log_path = get_trained_policy_path(log_name)
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
policy.load_state_dict(torch.load(os.path.join(log_path, POLICY_FILE), map_location="cpu"))



In [None]:
# Compute the state-action statistics of the dataset buffer and of the
# restored policy, running the policy for NUM_EPISODES episodes.
state_action_count_data, state_action_count_policy = get_state_action_data_and_policy_grid_distributions(
    buffer_data,
    env,
    policy,
    logits_sampling=False,
    num_episodes=NUM_EPISODES,
)

# Plot only the policy distribution; alternative views are kept for reference:
# state_action_histogram(state_action_count_data)
# compare_state_action_histograms(state_action_count_data, state_action_count_policy)
state_action_histogram(state_action_count_policy, inset_pos_xy=None)
snapshot_env(env)

Let's visualize the policy

In [None]:
# Render the trained policy in an obstacle-free grid, starting from (0, 0).
#
# The shared grid configuration is copied instead of being modified in place,
# and the environment is bound to a new name, so that re-running earlier cells
# still uses the original grid and environment.
#
# TODO: solve the error in the DQN visualization (the Collector-based
# rendering that used policy.set_eps and Collector.collect is disabled).
render_config = copy(env_2D_grid_initial_config)
render_config.initial_state = (0, 0)
render_config.obstacles = ObstacleTypes.obst_free_8x8

render_env = InitialConfigCustom2DGridEnvWrapper(
    gym.make(ENV_NAME, render_mode=RenderMode.RGB_ARRAY_LIST),
    env_config=render_config,
)

offpolicy_rendering(
    env_or_env_name=render_env,
    render_mode=RenderMode.RGB_ARRAY_LIST,
    policy_model=policy,
    env_2d_grid_initial_config=render_config,
    num_frames=1000,
    imitation_policy_sampling=False,
)


Conclusion:

1 - DQN goes totally out of distribution: it starts to generate actions that bring the system to states not included in our dataset. Remember that, in general, off-policy algorithms minimize:

$$ J(\phi) = \mathbb{E}_{(s,a,s')\sim D} \left[ \left( r(s, a) + \gamma \, \mathbb{E}_{a'\sim \pi_{\text{off}}(\cdot|s')} \left[ Q_{\pi_{\phi}}(s', a') \right] - Q_{\pi_{\phi}}(s, a) \right)^2 \right] $$

Since $a'$ is chosen by the policy being learned, the pair $(s', a')$ may not be in the dataset, and so this can move the agent to states not observed in the data, with possibly negative consequences!
