In [25]:
import d3rlpy
from d3rlpy.algos import COMBO
from sklearn.model_selection import train_test_split
#import gymnasium as gym
import gym
from gym.wrappers import TransformObservation
import numpy as np
import encoders
import os
import json
import torch
import environments

In [26]:
# Record the gym and torch versions this notebook was executed with.
print(gym.version.VERSION, torch.__version__, sep="\n")

0.23.1
2.2.0+cu121


## Params

In [27]:
# Global experiment settings shared by the cells below.
seed = 1
d3rlpy.seed(seed)
# Request the GPU only when CUDA is actually usable, so the notebook also runs
# on CPU-only machines instead of failing when the models are built.
use_gpu = torch.cuda.is_available()
# prepare environment
#env = gym.make("InvertedPendulum-v2")
#eval_env = gym.make("InvertedPendulum-v2")
# env = gym.make("Reacher-v2")
# eval_env = gym.make("Reacher-v2")
# Separate instances for data collection and for evaluation.
env = environments.CarEnv()
eval_env = environments.CarEnv()
# Both environments are reset with the same seed; the last reset is the cell's
# displayed value (the initial observation).
env.reset(seed=seed)
eval_env.reset(seed=seed)

array([  0.23643249,   9.00927393, -22.35811053,   8.97298894,
        -3.76337096,  -4.81754121])

In [28]:
# Disabled observation wrapper from an earlier experiment: it replaces the raw
# observation with a 6-element vector (two arctan2 angles followed by a slice
# of the original observation) via gym's TransformObservation.
# NOTE(review): this commented-out cell is the only place where `env1` and
# `eval_env1` are defined; `env1` is still used by the policy-loading and
# rendering cells at the end of the notebook, which therefore fail on a fresh
# kernel unless this cell is re-enabled or those cells are updated.
# def observation_edit1(obs):
#     new_obs = np.zeros(6)
#     new_obs[0] = np.arctan2(obs[2], obs[0])
#     new_obs[1] = np.arctan2(obs[3], obs[1])
#     new_obs[2:] = obs[4:-3]
#     return new_obs

# env1 = TransformObservation(env, observation_edit1)
# env1.observation_space = gym.spaces.Box(-np.inf, np.inf, shape=(6,), dtype= np.float64 )
# print(env1.reset(seed=seed))

# eval_env1 = TransformObservation(eval_env, observation_edit1)
# eval_env1.observation_space = gym.spaces.Box(-np.inf, np.inf, shape=(6,), dtype= np.float64 )
# NOTE(review): the next line resets env1 again; it probably meant eval_env1.
# print(env1.reset(seed=seed))

## Create dataset

In [29]:
# --- SAC agent that collects the replay data -------------------------------
# Only the actor encoder is overridden (dropout 0.2); no critic encoder factory
# is passed, so the critic uses d3rlpy's default.
actor_encoder = d3rlpy.models.encoders.DefaultEncoderFactory(dropout_rate=0.2)

# The actor, critic and temperature optimisers all use the same step size.
sac_learning_rate = 3e-4
sac = d3rlpy.algos.SAC(
    use_gpu=use_gpu,
    batch_size=256,
    actor_learning_rate=sac_learning_rate,
    critic_learning_rate=sac_learning_rate,
    temp_learning_rate=sac_learning_rate,
    actor_encoder_factory=actor_encoder,
)

# Replay buffer filled during online training; exported as a dataset afterwards.
buffer = d3rlpy.online.buffers.ReplayBuffer(env=env, maxlen=10000)

# Online training: 10 epochs of 1000 environment steps, with gradient updates
# starting after the first 1000 steps.
sac.fit_online(
    env,
    buffer,
    n_steps=10000,
    n_steps_per_epoch=1000,
    update_start_step=1000,
    update_interval=1,
    eval_env=eval_env,
    save_interval=10,
    experiment_name='exp_test_car_SAC_2024-02-15',
    tensorboard_dir='tensorboard_logs',
)

[2m2024-02-15 15:49:51[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/exp_test_car_SAC_2024-02-15_20240215154951[0m
[2m2024-02-15 15:49:51[0m [[32m[1mdebug    [0m] [1mBuilding model...             [0m
[2m2024-02-15 15:49:51[0m [[32m[1mdebug    [0m] [1mModel has been built.         [0m
[2m2024-02-15 15:49:51[0m [[32m[1minfo     [0m] [1mParameters are saved to d3rlpy_logs/exp_test_car_SAC_2024-02-15_20240215154951/params.json[0m [36mparams[0m=[35m{'action_scaler': None, 'actor_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': 0.2}}, 'actor_learning_rate': 0.0003, 'actor_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'batch_size': 256, 'critic_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'critic_learning_rate': 0.0003, 'critic_optim_factory':

  0%|          | 0/10000 [00:00<?, ?it/s]

[2m2024-02-15 15:49:52[0m [[32m[1minfo     [0m] [1mexp_test_car_SAC_2024-02-15_20240215154951: epoch=1 step=1000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0011125824451446533, 'time_environment_step': 6.556248664855956e-05, 'rollout_return': 1.7298613990474887e-30, 'time_step': 0.0012606935501098633, 'evaluation': 2.1809515445946591e-72}[0m [36mstep[0m=[35m1000[0m


KeyboardInterrupt: 

In [None]:
# export replay buffer as MDPDataset
dataset = buffer.to_mdp_dataset()

# save MDPDataset
dataset.dump('d3rlpy_data/car_test_2024-02-15.h5')

## Load the dataset

In [None]:
# Reload the dataset from the same HDF5 path the export cell writes to, so the
# cells below can start from the saved file.
dataset = d3rlpy.dataset.MDPDataset.load('d3rlpy_data/car_test_2024-02-15.h5')

In [None]:
# Split the dataset into train and test parts with scikit-learn; passing
# random_state=seed makes the split reproducible across runs.
train_episodes, test_episodes = train_test_split(dataset, random_state=seed)

## Dynamics learning

In [None]:
# Projection functions from earlier experiments on other environments (disabled).
# def inverted_pendulum_project(x):
#     return x[:, 1:]
# projection_size = 3

# def reacher_project(x):
#     # return x[:, [1,4,5]]

# Projection passed to SymmetryEncoderFactory as `project` in the next cell.
# NOTE(review): `env.preprocess_fn` comes from the local `environments` module,
# which is not shown here; presumably it maps observations to `projection_size`
# features — confirm that 3 matches its output width.
projector = env.preprocess_fn
projection_size = 3

In [None]:
# Probabilistic ensemble dynamics model whose state encoder is built by the
# project-local SymmetryEncoderFactory from `projector` / `projection_size`.
encoder_factory = encoders.SymmetryEncoderFactory(project=projector, projection_size=projection_size)
# Honour the notebook-level `use_gpu` setting instead of hard-coding True, so
# the device choice stays consistent with the SAC and COMBO cells.
dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics(learning_rate=1e-4, use_gpu=use_gpu, state_encoder_factory=encoder_factory)
#dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics(learning_rate=1e-4, use_gpu=use_gpu) # Baseline

Using SymmetryEncoderFactory


In [30]:
# same as algorithms
dynamics.fit(train_episodes,
             eval_episodes=test_episodes,
            #  n_epochs=100,
            n_steps=100000,
             n_steps_per_epoch=1000,
             scorers={
               #  'observation_error': d3rlpy.metrics.scorer.dynamics_observation_prediction_error_scorer,
               #  'reward_error': d3rlpy.metrics.scorer .dynamics_reward_prediction_error_scorer,
               #  'variance': d3rlpy.metrics.scorer.dynamics_prediction_variance_scorer,
             },
            tensorboard_dir='tensorboard_logs/test',
            experiment_name='car_2024-02-15')

[2m2024-02-15 15:50:08[0m [[32m[1mdebug    [0m] [1mRoundIterator is selected.    [0m
[2m2024-02-15 15:50:08[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/car_2024-02-15_20240215155008[0m
[2m2024-02-15 15:50:08[0m [[32m[1minfo     [0m] [1mParameters are saved to d3rlpy_logs/car_2024-02-15_20240215155008/params.json[0m [36mparams[0m=[35m{'action_scaler': None, 'augmentation': None, 'batch_size': 100, 'discrete_action': False, 'gamma': 1.0, 'generated_maxlen': 100000, 'learning_rate': 0.0001, 'n_ensembles': 5, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0.0001, 'amsgrad': False}, 'permutation_indices': None, 'real_ratio': 1.0, 'reduction': None, 'reward_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'reward_scaler': None, 'scaler': None, 'state_encoder_factory': {'type': 'symmetry', 'params': {'hidde

Epoch 1/100:   0%|          | 0/75 [00:00<?, ?it/s]

[2m2024-02-15 15:50:10[0m [[32m[1minfo     [0m] [1mcar_2024-02-15_20240215155008: epoch=1 step=75[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00026103019714355467, 'time_algorithm_update': 0.024373235702514647, 'loss': 34.58080952962239, 'time_step': 0.024705158869425457}[0m [36mstep[0m=[35m75[0m
[2m2024-02-15 15:50:10[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/car_2024-02-15_20240215155008/model_75.pt[0m


Epoch 2/100:   0%|          | 0/75 [00:00<?, ?it/s]

[2m2024-02-15 15:50:12[0m [[32m[1minfo     [0m] [1mcar_2024-02-15_20240215155008: epoch=2 step=150[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00023705482482910156, 'time_algorithm_update': 0.022897698084513345, 'loss': 33.87072738647461, 'time_step': 0.023211046854654947}[0m [36mstep[0m=[35m150[0m
[2m2024-02-15 15:50:12[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/car_2024-02-15_20240215155008/model_150.pt[0m


Epoch 3/100:   0%|          | 0/75 [00:00<?, ?it/s]

[2m2024-02-15 15:50:14[0m [[32m[1minfo     [0m] [1mcar_2024-02-15_20240215155008: epoch=3 step=225[0m [36mepoch[0m=[35m3[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002469762166341146, 'time_algorithm_update': 0.023074671427408853, 'loss': 34.09314628601074, 'time_step': 0.02339872678120931}[0m [36mstep[0m=[35m225[0m
[2m2024-02-15 15:50:14[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/car_2024-02-15_20240215155008/model_225.pt[0m


Epoch 4/100:   0%|          | 0/75 [00:00<?, ?it/s]

[2m2024-02-15 15:50:16[0m [[32m[1minfo     [0m] [1mcar_2024-02-15_20240215155008: epoch=4 step=300[0m [36mepoch[0m=[35m4[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00023389180501302083, 'time_algorithm_update': 0.023121477762858073, 'loss': 34.50152109781901, 'time_step': 0.023424596786499025}[0m [36mstep[0m=[35m300[0m
[2m2024-02-15 15:50:16[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/car_2024-02-15_20240215155008/model_300.pt[0m


Epoch 5/100:   0%|          | 0/75 [00:00<?, ?it/s]

[2m2024-02-15 15:50:18[0m [[32m[1minfo     [0m] [1mcar_2024-02-15_20240215155008: epoch=5 step=375[0m [36mepoch[0m=[35m5[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00023855845133463542, 'time_algorithm_update': 0.023003495534261068, 'loss': 34.22541206359863, 'time_step': 0.023321520487467447}[0m [36mstep[0m=[35m375[0m
[2m2024-02-15 15:50:18[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/car_2024-02-15_20240215155008/model_375.pt[0m


Epoch 6/100:   0%|          | 0/75 [00:00<?, ?it/s]

[2m2024-02-15 15:50:19[0m [[32m[1minfo     [0m] [1mcar_2024-02-15_20240215155008: epoch=6 step=450[0m [36mepoch[0m=[35m6[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002520593007405599, 'time_algorithm_update': 0.02361097017923991, 'loss': 34.089635365804035, 'time_step': 0.023941415150960287}[0m [36mstep[0m=[35m450[0m
[2m2024-02-15 15:50:19[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/car_2024-02-15_20240215155008/model_450.pt[0m


Epoch 7/100:   0%|          | 0/75 [00:00<?, ?it/s]

[2m2024-02-15 15:50:21[0m [[32m[1minfo     [0m] [1mcar_2024-02-15_20240215155008: epoch=7 step=525[0m [36mepoch[0m=[35m7[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00025441805521647133, 'time_algorithm_update': 0.02392187436421712, 'loss': 33.74408271789551, 'time_step': 0.024257097244262695}[0m [36mstep[0m=[35m525[0m
[2m2024-02-15 15:50:21[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/car_2024-02-15_20240215155008/model_525.pt[0m


Epoch 8/100:   0%|          | 0/75 [00:00<?, ?it/s]

[2m2024-02-15 15:50:23[0m [[32m[1minfo     [0m] [1mcar_2024-02-15_20240215155008: epoch=8 step=600[0m [36mepoch[0m=[35m8[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002584743499755859, 'time_algorithm_update': 0.02519740104675293, 'loss': 33.40949851989746, 'time_step': 0.025526192982991535}[0m [36mstep[0m=[35m600[0m
[2m2024-02-15 15:50:23[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/car_2024-02-15_20240215155008/model_600.pt[0m


Epoch 9/100:   0%|          | 0/75 [00:00<?, ?it/s]

[2m2024-02-15 15:50:25[0m [[32m[1minfo     [0m] [1mcar_2024-02-15_20240215155008: epoch=9 step=675[0m [36mepoch[0m=[35m9[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00024956385294596354, 'time_algorithm_update': 0.023643817901611328, 'loss': 33.373305079142256, 'time_step': 0.023970244725545247}[0m [36mstep[0m=[35m675[0m
[2m2024-02-15 15:50:25[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/car_2024-02-15_20240215155008/model_675.pt[0m


Epoch 10/100:   0%|          | 0/75 [00:00<?, ?it/s]

[2m2024-02-15 15:50:27[0m [[32m[1minfo     [0m] [1mcar_2024-02-15_20240215155008: epoch=10 step=750[0m [36mepoch[0m=[35m10[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00025678316752115885, 'time_algorithm_update': 0.02393456776936849, 'loss': 33.99833318074544, 'time_step': 0.024277162551879884}[0m [36mstep[0m=[35m750[0m
[2m2024-02-15 15:50:27[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/car_2024-02-15_20240215155008/model_750.pt[0m


Epoch 11/100:   0%|          | 0/75 [00:00<?, ?it/s]

In [None]:
def experiment_dynamics_training(dataset, symmetry_project, projection_size, n_runs, experiment_name, seed=1, use_gpu=True):
    """Train paired baseline and symmetry-encoder dynamics models for comparison.

    For every run the dataset is split once with ``random_state=seed + run``
    and two ``ProbabilisticEnsembleDynamics`` models are fitted on that *same*
    split: a baseline with the default state encoder and one that uses
    ``encoders.SymmetryEncoderFactory``. Sharing the split keeps the two
    variants directly comparable.

    Args:
        dataset: Data passed to ``train_test_split``.
        symmetry_project: Projection callable forwarded to
            ``SymmetryEncoderFactory`` as ``project``.
        projection_size: Forwarded to ``SymmetryEncoderFactory``.
        n_runs: Number of (baseline, symmetry) pairs to train.
        experiment_name: Prefix of the d3rlpy experiment name; ``'_default'``
            or ``'_symmetry'`` is appended per variant.
        seed: Base random state; run ``k`` uses ``seed + k``.
        use_gpu: Forwarded to ``ProbabilisticEnsembleDynamics``.

    Returns:
        None. Metrics are logged under ``tensorboard_logs/dynamics``.
    """
    for run in range(n_runs):
        # One split per run, shared by both variants. The symmetry run used to
        # be re-split with a different seed, so the two models were compared
        # on different data.
        train_episodes, test_episodes = train_test_split(dataset, random_state=seed + run)
        for exp_type in ['default', 'symmetry']:
            dynamics_kwargs = {'learning_rate': 1e-4, 'use_gpu': use_gpu}
            if exp_type == 'symmetry':
                dynamics_kwargs['state_encoder_factory'] = encoders.SymmetryEncoderFactory(
                    project=symmetry_project, projection_size=projection_size)
            dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics(**dynamics_kwargs)
            dynamics.fit(
                train_episodes,
                eval_episodes=test_episodes,
                n_epochs=100,
                scorers={
                    'observation_error': d3rlpy.metrics.scorer.dynamics_observation_prediction_error_scorer,
                    'reward_error': d3rlpy.metrics.scorer.dynamics_reward_prediction_error_scorer,
                    'variance': d3rlpy.metrics.scorer.dynamics_prediction_variance_scorer,
                },
                tensorboard_dir='tensorboard_logs/dynamics',
                experiment_name=experiment_name + '_' + exp_type)

In [None]:
# Train 5 baseline/symmetry dynamics pairs on the dataset loaded above.
# `reacher_project` is not defined by any active cell, so use the `projector` /
# `projection_size` chosen in the "Dynamics learning" section and the
# notebook-level `use_gpu` setting.
# NOTE(review): the experiment name still carries the old "reacher" label; it is
# kept unchanged because the COMBO experiment looks runs up by the
# 'exp_5_dynamics' prefix.
experiment_dynamics_training(
    dataset=dataset,
    symmetry_project=projector,
    projection_size=projection_size,
    n_runs=5,
    experiment_name="exp_5_dynamics_reacher",
    use_gpu=use_gpu,
)

## Load Dynamics

In [None]:
# Restore a previously trained dynamics model from its d3rlpy log directory:
# the configuration comes from params.json, the weights from a saved checkpoint.
dynamics_model_path = "d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632"
dynamics_params_file = f"{dynamics_model_path}/params.json"
dynamics_checkpoint_file = f"{dynamics_model_path}/model_31542.pt"
dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics.from_json(dynamics_params_file)
dynamics.load_model(dynamics_checkpoint_file)

## Train Offline RL Algorithm

In [None]:
# Actor and critic share one encoder factory with dropout 0.2.
encoder = d3rlpy.models.encoders.DefaultEncoderFactory(dropout_rate=0.2)
# Hand the dynamics model to COMBO.
combo = COMBO(
    dynamics=dynamics,
    actor_encoder_factory=encoder,
    critic_encoder_factory=encoder,
    use_gpu=use_gpu,
)

In [None]:
# Offline COMBO training on the train split; each epoch (1000 steps) the policy
# is also scored by running it in `eval_env`.
environment_scorer = d3rlpy.metrics.scorer.evaluate_on_environment(eval_env)
combo.fit(
    dataset=train_episodes,
    eval_episodes=test_episodes,
    n_steps=100000,
    n_steps_per_epoch=1000,
    tensorboard_dir="tensorboard_logs",
    scorers={'environment': environment_scorer},
)

In [None]:
def _latest_checkpoint(run_dir):
    """Return the path of the ``model_<step>.pt`` file with the highest step in ``run_dir``."""
    prefix, suffix = 'model_', '.pt'
    steps = []
    for filename in os.listdir(run_dir):
        if not (filename.startswith(prefix) and filename.endswith(suffix)):
            continue
        step = filename[len(prefix):-len(suffix)]
        # Skip other .pt files instead of crashing in int().
        if step.isdigit():
            steps.append(int(step))
    if not steps:
        raise ValueError("No model_<step>.pt checkpoint found in " + run_dir)
    return os.path.join(run_dir, prefix + str(max(steps)) + suffix)


def experiment_COMBO_training(dataset, eval_env, experiment_name, save_name, models_dir, symmetry_project, projection_size, seed=1, use_gpu=True):
    """Train COMBO on top of previously trained symmetry and baseline dynamics models.

    Run directories in ``models_dir`` whose names start with
    ``experiment_name + '_dynamics'`` are sorted by name and classified as
    'symmetry' or 'default' from ``state_encoder_factory.type`` in their
    ``params.json``. The i-th symmetry run is paired with the i-th default run;
    for each pair the dataset is split with ``random_state=seed + i`` and one
    COMBO agent per variant is trained from the run's latest checkpoint.

    Args:
        dataset: Data passed to ``train_test_split`` and, for symmetry runs,
            to ``build_with_dataset``.
        eval_env: Environment used by the ``evaluate_on_environment`` scorer.
        experiment_name: Prefix identifying the dynamics runs to load.
        save_name: Prefix of the COMBO experiment name; ``'_symmetry'`` or
            ``'_default'`` is appended.
        models_dir: Directory containing the dynamics run folders (with or
            without a trailing slash).
        symmetry_project: Projection callable for ``SymmetryEncoderFactory``.
        projection_size: Forwarded to ``SymmetryEncoderFactory``.
        seed: Base random state; pair ``i`` uses ``seed + i``.
        use_gpu: Forwarded to the dynamics model and to COMBO.

    Raises:
        ValueError: If there are fewer symmetry runs than default runs, or a
            run directory has no ``model_<step>.pt`` checkpoint.
    """
    run_prefix = experiment_name + '_dynamics'
    # os.path.join keeps this working whether or not models_dir ends with '/'.
    model_paths = sorted(
        os.path.join(models_dir, dirname)
        for dirname in os.listdir(models_dir)
        if dirname.startswith(run_prefix)
    )
    print(model_paths)

    symmetry_reduced_paths = []
    default_paths = []
    for model_path_i in model_paths:
        # Context manager so the params file handle is always closed.
        with open(os.path.join(model_path_i, 'params.json')) as f:
            model_path_i_params = json.load(f)
        encoder_type = model_path_i_params["state_encoder_factory"]['type']
        if encoder_type == 'symmetry':
            symmetry_reduced_paths.append(model_path_i)
        elif encoder_type == 'default':
            default_paths.append(model_path_i)
    print("Default_paths:", default_paths, "Symmetry reduced paths: ", symmetry_reduced_paths)

    # Fail before any training starts instead of hitting an IndexError after
    # some pairs have already been trained.
    if len(symmetry_reduced_paths) < len(default_paths):
        raise ValueError(
            "Found %d symmetry dynamics runs but %d default runs; every default run needs a symmetry counterpart."
            % (len(symmetry_reduced_paths), len(default_paths)))

    # load trained dynamics model
    for i, (symmetry_path, default_path) in enumerate(zip(symmetry_reduced_paths, default_paths)):
        # use the same seeds for default and symmetric runs
        train_episodes, test_episodes = train_test_split(dataset, random_state=seed+i)
        for exp_type, dynamics_model_path in (('symmetry', symmetry_path), ('default', default_path)):
            if exp_type == 'symmetry':
                # Rebuilt in code rather than via from_json; presumably params.json
                # cannot reconstruct the custom encoder factory — TODO confirm.
                state_encoder_factory = encoders.SymmetryEncoderFactory(project=symmetry_project, projection_size=projection_size)
                dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics(learning_rate=1e-4, use_gpu=use_gpu, state_encoder_factory=state_encoder_factory)
                dynamics.build_with_dataset(dataset)
            else:
                dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics.from_json(os.path.join(dynamics_model_path, 'params.json'))

            latest_model_path = _latest_checkpoint(dynamics_model_path)
            dynamics.load_model(latest_model_path)
            print("Loaded model: ", latest_model_path)

            encoder = d3rlpy.models.encoders.DefaultEncoderFactory(dropout_rate=0.2)
            # Hand the loaded dynamics model to COMBO.
            combo = COMBO(dynamics=dynamics, critic_encoder_factory=encoder, actor_encoder_factory=encoder, use_gpu=use_gpu)
            combo.fit(dataset=train_episodes, eval_episodes=test_episodes, n_steps=1000000, n_steps_per_epoch=1000,
                      tensorboard_dir="tensorboard_logs",
                      scorers={
                          'environment': d3rlpy.metrics.scorer.evaluate_on_environment(eval_env)
                      },
                      experiment_name=save_name + "_" + exp_type,
                      save_interval=50)


In [None]:
# Train COMBO on every saved 'exp_5_dynamics*' run.
# `eval_env1` and `reacher_project` are not defined by any active cell, so use
# the notebook's `eval_env`, `projector`, `projection_size`, `seed` and `use_gpu`.
# NOTE(review): save_name still carries the old "reacher" label; kept unchanged.
experiment_COMBO_training(
    dataset,
    eval_env,
    'exp_5',
    save_name='exp_5_COMBO_reacher',
    models_dir='d3rlpy_logs/',
    symmetry_project=projector,
    projection_size=projection_size,
    seed=seed,
    use_gpu=use_gpu,
)

## Load the Policy

In [None]:
#actor_encoder = d3rlpy.models.encoders.DefaultEncoderFactory(dropout_rate=0.2)
# setup algorithm
# Create an SAC with default settings, build its networks from the environment,
# then load saved weights from a d3rlpy log directory.
# NOTE(review): `env1` is not defined by any active cell in this notebook (it
# only appears in the commented-out TransformObservation cell), so this cell
# raises NameError on a fresh kernel.
# NOTE(review): the checkpoint path points at a "reacher" SAC run; confirm it
# matches the environment the policy is built with.
trained_policy = d3rlpy.algos.SAC()
trained_policy.build_with_env(env1)
trained_policy.load_model('d3rlpy_logs/exp_6_SAC_reacher_20231024124119/model_100000.pt')

# initialize with dataset
#trained_policy.build_with_dataset(dataset)
# Load entire model parameters.
#trained_policy.load_model('d3rlpy_logs/COMBO_20230929153035/model_53000.pt')

## See the policy running

In [None]:
# Build an environment scorer with rendering enabled and apply it to the loaded
# policy; the result is stored as the mean episode return.
# NOTE(review): `env1` is not defined by any active cell in this notebook (only
# in the commented-out TransformObservation cell) — confirm which environment
# should be used here.
scorer = d3rlpy.metrics.scorer.evaluate_on_environment(env1, render=True)
mean_episode_return = scorer(trained_policy)