In [3]:
import os

import d3rlpy
import gymnasium as gym
import numpy as np
import torch
from d3rlpy.algos import COMBO
from sklearn.model_selection import train_test_split

import encoders

## Parameters and environment setup

In [5]:
# One seed shared by d3rlpy and both environments so runs are repeatable.
seed = 1
d3rlpy.seed(seed)

# Use the GPU only when CUDA is actually available; a hard-coded True breaks
# the notebook on CPU-only machines.
use_gpu = torch.cuda.is_available()

# Separate environment instances: one for data collection, one for evaluation.
env = gym.make("InvertedPendulum-v4")
eval_env = gym.make("InvertedPendulum-v4")

# Seed each environment through its first reset. The last expression stays the
# evaluation-env reset so the cell still displays its (observation, info) tuple.
env.reset(seed=seed)
eval_env.reset(seed=seed)

(array([ 0.00023643,  0.00900927, -0.00711681,  0.00897299]), {})

## Create dataset

In [14]:
# Actor network: default encoder with dropout enabled. No critic encoder is
# passed, so the critic keeps the library defaults.
actor_encoder = d3rlpy.models.encoders.DefaultEncoderFactory(dropout_rate=0.2)

# Soft Actor-Critic; actor, critic and temperature all use the same learning rate.
sac = d3rlpy.algos.SAC(
    actor_encoder_factory=actor_encoder,
    batch_size=256,
    actor_learning_rate=3e-4,
    critic_learning_rate=3e-4,
    temp_learning_rate=3e-4,
    use_gpu=use_gpu,
)

# Replay buffer large enough to keep every transition collected below.
buffer = d3rlpy.online.buffers.ReplayBuffer(maxlen=1_000_000, env=env)

# Online training: 100 epochs of 1000 environment steps each. Gradient updates
# run every step once the first 1000 steps have been collected.
sac.fit_online(
    env,
    buffer,
    eval_env=eval_env,
    n_steps=100_000,
    n_steps_per_epoch=1_000,
    update_start_step=1_000,
    update_interval=1,
    tensorboard_dir='tensorboard_logs',
)

[2m2023-10-01 15:22:21[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/SAC_online_20231001152221[0m
[2m2023-10-01 15:22:21[0m [[32m[1mdebug    [0m] [1mBuilding model...[0m
[2m2023-10-01 15:22:21[0m [[32m[1mdebug    [0m] [1mModel has been built.[0m
[2m2023-10-01 15:22:21[0m [[32m[1minfo     [0m] [1mParameters are saved to d3rlpy_logs/SAC_online_20231001152221/params.json[0m [36mparams[0m=[35m{'action_scaler': None, 'actor_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': 0.2}}, 'actor_learning_rate': 0.0003, 'actor_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'batch_size': 256, 'critic_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'critic_learning_rate': 0.0003, 'critic_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-

  0%|          | 0/100000 [00:00<?, ?it/s]

[2m2023-10-01 15:22:23[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SAC_online_20231001152221/model_1000.pt[0m
[2m2023-10-01 15:22:23[0m [[32m[1minfo     [0m] [1mSAC_online_20231001152221: epoch=1 step=1000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0011901423931121827, 'time_environment_step': 6.751775741577148e-05, 'time_step': 0.0013019063472747804, 'rollout_return': 10.06060606060606, 'evaluation': 14.7}[0m [36mstep[0m=[35m1000[0m
[2m2023-10-01 15:22:37[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SAC_online_20231001152221/model_2000.pt[0m
[2m2023-10-01 15:22:37[0m [[32m[1minfo     [0m] [1mSAC_online_20231001152221: epoch=2 step=2000[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.0012940735816955567, 'time_environment_step': 0.0001036219596862793, 'time_sample_batch': 0.000309492826461792, 'time_algorithm_update': 0.011871881484985351, 'temp_loss'

In [15]:
# Export everything collected in the replay buffer as an MDPDataset.
dataset = buffer.to_mdp_dataset()

# Create the output directory first: without it the dump fails on a fresh
# checkout, and only after the long online training run has already finished.
os.makedirs('d3rlpy_data', exist_ok=True)

# Persist the dataset so later cells can reload it without retraining.
dataset.dump('d3rlpy_data/inverted_pendulum2.h5')

## Load the dataset

In [6]:
# Reload the dataset written by the export cell, so the cells below work from
# the on-disk copy rather than from a live replay buffer.
dataset = d3rlpy.dataset.MDPDataset.load(
    'd3rlpy_data/inverted_pendulum2.h5',
)

In [7]:
# Hold out part of the dataset for evaluation using scikit-learn's default
# split sizes; the fixed random_state keeps the partition reproducible.
train_episodes, test_episodes = train_test_split(
    dataset,
    random_state=seed,
)

## Dynamics learning

In [17]:
# Ensemble dynamics model. The encoder is selected by its registered name;
# presumably the local `encoders` module registers "inverted_pendulum" with
# d3rlpy on import (the object form would be
# encoders.InvertedPendulumEncoderFactory()) — TODO confirm.
dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics(
    learning_rate=1e-4,
    # Reuse the notebook-wide device flag rather than a second hard-coded True,
    # so the dynamics model follows the same device choice as the SAC agent.
    use_gpu=use_gpu,
    encoder_factory="inverted_pendulum",
)

Using InvertedPendulumEncoderFactory


In [18]:
# Metrics reported for the held-out episodes in each epoch's log entry.
dynamics_scorers = {
    'observation_error': d3rlpy.metrics.scorer.dynamics_observation_prediction_error_scorer,
    'reward_error': d3rlpy.metrics.scorer.dynamics_reward_prediction_error_scorer,
    'variance': d3rlpy.metrics.scorer.dynamics_prediction_variance_scorer,
}

# Train the dynamics ensemble on the training episodes; metrics are also
# written to a dedicated TensorBoard directory.
dynamics.fit(
    train_episodes,
    eval_episodes=test_episodes,
    n_epochs=100,
    scorers=dynamics_scorers,
    tensorboard_dir='tensorboard_logs/dynamics',
)

[2m2023-10-02 23:06:32[0m [[32m[1mdebug    [0m] [1mRoundIterator is selected.[0m
[2m2023-10-02 23:06:32[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632[0m
[2m2023-10-02 23:06:32[0m [[32m[1mdebug    [0m] [1mBuilding models...[0m
[2m2023-10-02 23:06:32[0m [[32m[1mdebug    [0m] [1mModels have been built.[0m
[2m2023-10-02 23:06:32[0m [[32m[1minfo     [0m] [1mParameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/params.json[0m [36mparams[0m=[35m{'action_scaler': None, 'batch_size': 100, 'discrete_action': False, 'encoder_factory': {'type': 'inverted_pendulum', 'params': {'hidden_units': [256, 256], 'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None, 'use_dense': False}}, 'gamma': 1.0, 'generated_maxlen': 100000, 'learning_rate': 0.0001, 'n_ensembles': 5, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps':

Epoch 1/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:06:43[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=1 step=751[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00022852055718514637, 'time_algorithm_update': 0.013406936719160423, 'loss': -21.364473361799742, 'time_step': 0.013703783406081117, 'observation_error': 0.05243828512231935, 'reward_error': 0.0017379385478908265, 'variance': 0.06518403561763936}[0m [36mstep[0m=[35m751[0m
[2m2023-10-02 23:06:43[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_751.pt[0m


Epoch 2/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:06:55[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=2 step=1502[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00022441315428712238, 'time_algorithm_update': 0.01318661001487356, 'loss': -31.360013313204565, 'time_step': 0.013474410129450609, 'observation_error': 0.030806009419821413, 'reward_error': 0.0008161839346589291, 'variance': 0.02939949919303506}[0m [36mstep[0m=[35m1502[0m
[2m2023-10-02 23:06:55[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_1502.pt[0m


Epoch 3/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:07:07[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=3 step=2253[0m [36mepoch[0m=[35m3[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00024834739542832864, 'time_algorithm_update': 0.013635611883333615, 'loss': -37.95246935303456, 'time_step': 0.013954735310195131, 'observation_error': 0.019543565701348038, 'reward_error': 0.00010574237357376756, 'variance': 0.014075724658158869}[0m [36mstep[0m=[35m2253[0m
[2m2023-10-02 23:07:07[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_2253.pt[0m


Epoch 4/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:07:18[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=4 step=3004[0m [36mepoch[0m=[35m4[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002418024720904354, 'time_algorithm_update': 0.013314464914497141, 'loss': -44.787614279199694, 'time_step': 0.01362347475856979, 'observation_error': 0.014953815019147616, 'reward_error': 4.4406497667748545e-05, 'variance': 0.008680161103078956}[0m [36mstep[0m=[35m3004[0m
[2m2023-10-02 23:07:18[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_3004.pt[0m


Epoch 5/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:07:29[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=5 step=3755[0m [36mepoch[0m=[35m5[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00023311273394507193, 'time_algorithm_update': 0.013149672278392806, 'loss': -51.14260700190274, 'time_step': 0.013446972944765052, 'observation_error': 0.017394232104416485, 'reward_error': 3.087846773370062e-05, 'variance': 0.007074993983950143}[0m [36mstep[0m=[35m3755[0m
[2m2023-10-02 23:07:30[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_3755.pt[0m


Epoch 6/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:07:41[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=6 step=4506[0m [36mepoch[0m=[35m6[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00025479739578998837, 'time_algorithm_update': 0.013749021347289715, 'loss': -55.46442060051523, 'time_step': 0.01407709515364605, 'observation_error': 0.019263532296463277, 'reward_error': 1.8265105546206563e-05, 'variance': 0.014443879057905734}[0m [36mstep[0m=[35m4506[0m
[2m2023-10-02 23:07:41[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_4506.pt[0m


Epoch 7/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:07:53[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=7 step=5257[0m [36mepoch[0m=[35m7[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002697647490926811, 'time_algorithm_update': 0.01416107849496976, 'loss': -59.34817462691296, 'time_step': 0.014503739645255706, 'observation_error': 0.032411348095376775, 'reward_error': 1.1612423782347725e-05, 'variance': 0.0333428284042204}[0m [36mstep[0m=[35m5257[0m
[2m2023-10-02 23:07:53[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_5257.pt[0m


Epoch 8/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:08:06[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=8 step=6008[0m [36mepoch[0m=[35m8[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00026558591586137105, 'time_algorithm_update': 0.013977731750427328, 'loss': -62.0152760767905, 'time_step': 0.014317528385614429, 'observation_error': 0.0673252108828156, 'reward_error': 1.767245480982782e-05, 'variance': 0.04918076488398031}[0m [36mstep[0m=[35m6008[0m
[2m2023-10-02 23:08:06[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_6008.pt[0m


Epoch 9/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:08:17[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=9 step=6759[0m [36mepoch[0m=[35m9[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002598260912533289, 'time_algorithm_update': 0.013923051989030901, 'loss': -63.507225387105926, 'time_step': 0.014254970969595064, 'observation_error': 0.04915927840190379, 'reward_error': 1.2033055282351252e-05, 'variance': 0.06219917623601189}[0m [36mstep[0m=[35m6759[0m
[2m2023-10-02 23:08:18[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_6759.pt[0m


Epoch 10/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:08:29[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=10 step=7510[0m [36mepoch[0m=[35m10[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002320111194717265, 'time_algorithm_update': 0.013087690114657508, 'loss': -65.16352085108446, 'time_step': 0.013385960963689852, 'observation_error': 0.04591474336085644, 'reward_error': 1.9795817991960435e-05, 'variance': 0.07341947022650049}[0m [36mstep[0m=[35m7510[0m
[2m2023-10-02 23:08:29[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_7510.pt[0m


Epoch 11/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:08:40[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=11 step=8261[0m [36mepoch[0m=[35m11[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00023698425800917786, 'time_algorithm_update': 0.013145448682152637, 'loss': -66.80545106740512, 'time_step': 0.013448992994590384, 'observation_error': 0.0805442244730728, 'reward_error': 1.6165564559020737e-05, 'variance': 0.08080553263930496}[0m [36mstep[0m=[35m8261[0m
[2m2023-10-02 23:08:40[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_8261.pt[0m


Epoch 12/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:08:52[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=12 step=9012[0m [36mepoch[0m=[35m12[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002685771007512444, 'time_algorithm_update': 0.013778526519490939, 'loss': -68.19117315567921, 'time_step': 0.014116086274107667, 'observation_error': 0.08669712326070216, 'reward_error': 1.082562268590026e-05, 'variance': 0.10529988083834133}[0m [36mstep[0m=[35m9012[0m
[2m2023-10-02 23:08:52[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_9012.pt[0m


Epoch 13/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:09:04[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=13 step=9763[0m [36mepoch[0m=[35m13[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002624769502886126, 'time_algorithm_update': 0.013673598217107008, 'loss': -68.08264416257805, 'time_step': 0.014006759450534053, 'observation_error': 0.08957194758546132, 'reward_error': 1.1794945360027843e-05, 'variance': 0.11430120413516677}[0m [36mstep[0m=[35m9763[0m
[2m2023-10-02 23:09:04[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_9763.pt[0m


Epoch 14/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:09:16[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=14 step=10514[0m [36mepoch[0m=[35m14[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002590149601353469, 'time_algorithm_update': 0.013809438393055679, 'loss': -69.51583102285942, 'time_step': 0.014137639504139338, 'observation_error': 0.08291350117052367, 'reward_error': 9.270348433388266e-06, 'variance': 0.1087923990052778}[0m [36mstep[0m=[35m10514[0m
[2m2023-10-02 23:09:16[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_10514.pt[0m


Epoch 15/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:09:28[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=15 step=11265[0m [36mepoch[0m=[35m15[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002726276768507875, 'time_algorithm_update': 0.013813515636479649, 'loss': -69.93400368455565, 'time_step': 0.014157967624588114, 'observation_error': 0.10814884753161752, 'reward_error': 1.0036446801411084e-05, 'variance': 0.12933254499097158}[0m [36mstep[0m=[35m11265[0m
[2m2023-10-02 23:09:28[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_11265.pt[0m


Epoch 16/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:09:40[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=16 step=12016[0m [36mepoch[0m=[35m16[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002828657071536454, 'time_algorithm_update': 0.014573545811496943, 'loss': -70.32310646859689, 'time_step': 0.01493031080172319, 'observation_error': 0.10254627817366879, 'reward_error': 1.7276800205630933e-05, 'variance': 0.11239423467688812}[0m [36mstep[0m=[35m12016[0m
[2m2023-10-02 23:09:41[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_12016.pt[0m


Epoch 17/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:09:54[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=17 step=12767[0m [36mepoch[0m=[35m17[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0003031338261224617, 'time_algorithm_update': 0.015351193881384066, 'loss': -71.60659224945759, 'time_step': 0.01573525732271522, 'observation_error': 0.09985968799495948, 'reward_error': 6.824707558734197e-06, 'variance': 0.10370214001777088}[0m [36mstep[0m=[35m12767[0m
[2m2023-10-02 23:09:54[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_12767.pt[0m


Epoch 18/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:10:07[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=18 step=13518[0m [36mepoch[0m=[35m18[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.000322298108340897, 'time_algorithm_update': 0.015944201524025908, 'loss': -71.37373577580153, 'time_step': 0.016352293812006355, 'observation_error': 0.09360046069934361, 'reward_error': 1.0666291658526499e-05, 'variance': 0.10596251167824813}[0m [36mstep[0m=[35m13518[0m
[2m2023-10-02 23:10:07[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_13518.pt[0m


Epoch 19/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:10:21[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=19 step=14269[0m [36mepoch[0m=[35m19[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0003217419041457094, 'time_algorithm_update': 0.015881514898470332, 'loss': -71.83253711517578, 'time_step': 0.016288465888458944, 'observation_error': 0.08991161217268115, 'reward_error': 1.800241414490849e-05, 'variance': 0.10276285794382962}[0m [36mstep[0m=[35m14269[0m
[2m2023-10-02 23:10:21[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_14269.pt[0m


Epoch 20/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:10:34[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=20 step=15020[0m [36mepoch[0m=[35m20[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.000302023322540974, 'time_algorithm_update': 0.014947113438389115, 'loss': -72.25845793560246, 'time_step': 0.015328389826849519, 'observation_error': 0.08515148240606984, 'reward_error': 6.608235405363918e-06, 'variance': 0.09758177180535409}[0m [36mstep[0m=[35m15020[0m
[2m2023-10-02 23:10:34[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_15020.pt[0m


Epoch 21/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:10:46[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=21 step=15771[0m [36mepoch[0m=[35m21[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00026937013618479397, 'time_algorithm_update': 0.01403417949193962, 'loss': -72.20712048267715, 'time_step': 0.01437464471504628, 'observation_error': 0.0923139561885392, 'reward_error': 7.39015883008406e-06, 'variance': 0.08863630885275245}[0m [36mstep[0m=[35m15771[0m
[2m2023-10-02 23:10:46[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_15771.pt[0m


Epoch 22/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:10:58[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=22 step=16522[0m [36mepoch[0m=[35m22[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002712378012991142, 'time_algorithm_update': 0.014203738277348952, 'loss': -72.15048953791592, 'time_step': 0.014552154490220721, 'observation_error': 0.07692866129697623, 'reward_error': 3.7418223277055845e-06, 'variance': 0.08352273033010696}[0m [36mstep[0m=[35m16522[0m
[2m2023-10-02 23:10:58[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_16522.pt[0m


Epoch 23/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:11:11[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=23 step=17273[0m [36mepoch[0m=[35m23[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002914935390101609, 'time_algorithm_update': 0.014964285607979237, 'loss': -73.5151220155302, 'time_step': 0.015335636354794357, 'observation_error': 0.08490095318600292, 'reward_error': 5.7925253569306555e-06, 'variance': 0.08597295201042826}[0m [36mstep[0m=[35m17273[0m
[2m2023-10-02 23:11:11[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_17273.pt[0m


Epoch 24/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:11:23[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=24 step=18024[0m [36mepoch[0m=[35m24[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00026505288684098286, 'time_algorithm_update': 0.014082019719556867, 'loss': -73.30664074182828, 'time_step': 0.014418937236110317, 'observation_error': 0.07048793214635636, 'reward_error': 7.857433239333278e-06, 'variance': 0.08834296548307845}[0m [36mstep[0m=[35m18024[0m
[2m2023-10-02 23:11:23[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_18024.pt[0m


Epoch 25/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:11:35[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=25 step=18775[0m [36mepoch[0m=[35m25[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00025293417522973926, 'time_algorithm_update': 0.013547784955142817, 'loss': -73.29887431471072, 'time_step': 0.013871751676069277, 'observation_error': 0.07257815352286272, 'reward_error': 7.392556208188534e-06, 'variance': 0.08042947262612472}[0m [36mstep[0m=[35m18775[0m
[2m2023-10-02 23:11:35[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_18775.pt[0m


Epoch 26/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:11:47[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=26 step=19526[0m [36mepoch[0m=[35m26[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00025631235379194926, 'time_algorithm_update': 0.013820679940174168, 'loss': -73.62813315029626, 'time_step': 0.014146876874839894, 'observation_error': 0.07616823330036541, 'reward_error': 6.414180107053621e-06, 'variance': 0.07744455395938815}[0m [36mstep[0m=[35m19526[0m
[2m2023-10-02 23:11:47[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_19526.pt[0m


Epoch 27/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:11:59[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=27 step=20277[0m [36mepoch[0m=[35m27[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002491947179151756, 'time_algorithm_update': 0.013641683461662932, 'loss': -74.23307303264836, 'time_step': 0.01395949161798754, 'observation_error': 0.06399799687599951, 'reward_error': 4.543127282023815e-06, 'variance': 0.07252397987450558}[0m [36mstep[0m=[35m20277[0m
[2m2023-10-02 23:11:59[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_20277.pt[0m


Epoch 28/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:12:10[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=28 step=21028[0m [36mepoch[0m=[35m28[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002370287035498892, 'time_algorithm_update': 0.013247945813459658, 'loss': -74.08671614523735, 'time_step': 0.013549734527039305, 'observation_error': 0.06149779478598632, 'reward_error': 4.899795897796485e-06, 'variance': 0.07558409604096705}[0m [36mstep[0m=[35m21028[0m
[2m2023-10-02 23:12:10[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_21028.pt[0m


Epoch 29/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:12:22[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=29 step=21779[0m [36mepoch[0m=[35m29[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00025502216323872856, 'time_algorithm_update': 0.013606262905461176, 'loss': -74.7928370794507, 'time_step': 0.013928279419554851, 'observation_error': 0.05100920214037398, 'reward_error': 3.929330562832949e-06, 'variance': 0.06407551833312887}[0m [36mstep[0m=[35m21779[0m
[2m2023-10-02 23:12:22[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_21779.pt[0m


Epoch 30/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:12:33[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=30 step=22530[0m [36mepoch[0m=[35m30[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00024615242065348413, 'time_algorithm_update': 0.01362888409834251, 'loss': -73.98152317473796, 'time_step': 0.013943553129620305, 'observation_error': 0.0533179769625825, 'reward_error': 4.726412796005333e-06, 'variance': 0.06148351277670768}[0m [36mstep[0m=[35m22530[0m
[2m2023-10-02 23:12:33[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_22530.pt[0m


Epoch 31/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:12:45[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=31 step=23281[0m [36mepoch[0m=[35m31[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00022664146321750036, 'time_algorithm_update': 0.012914411872426933, 'loss': -73.88663739307266, 'time_step': 0.013206700986615827, 'observation_error': 0.06358232452572908, 'reward_error': 7.853196294103082e-06, 'variance': 0.06410237869850802}[0m [36mstep[0m=[35m23281[0m
[2m2023-10-02 23:12:45[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_23281.pt[0m


Epoch 32/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:12:57[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=32 step=24032[0m [36mepoch[0m=[35m32[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002831796831519562, 'time_algorithm_update': 0.015043997097904292, 'loss': -74.90890298734175, 'time_step': 0.015402141487233331, 'observation_error': 0.04751395395068934, 'reward_error': 5.799251285860668e-06, 'variance': 0.05192492300743397}[0m [36mstep[0m=[35m24032[0m
[2m2023-10-02 23:12:57[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_24032.pt[0m


Epoch 33/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:13:09[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=33 step=24783[0m [36mepoch[0m=[35m33[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002430917102392123, 'time_algorithm_update': 0.013386143825343064, 'loss': -75.16071557427215, 'time_step': 0.013696844504771632, 'observation_error': 0.05531357239308855, 'reward_error': 4.254540021720875e-06, 'variance': 0.049879465197805326}[0m [36mstep[0m=[35m24783[0m
[2m2023-10-02 23:13:09[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_24783.pt[0m


Epoch 34/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:13:20[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=34 step=25534[0m [36mepoch[0m=[35m34[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00023987956751837393, 'time_algorithm_update': 0.013241003420000546, 'loss': -74.69948457115976, 'time_step': 0.013546632228297655, 'observation_error': 0.052255064282763296, 'reward_error': 4.963180528435599e-06, 'variance': 0.04696049721321378}[0m [36mstep[0m=[35m25534[0m
[2m2023-10-02 23:13:20[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_25534.pt[0m


Epoch 35/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:13:33[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=35 step=26285[0m [36mepoch[0m=[35m35[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002629039449475894, 'time_algorithm_update': 0.014172997836584099, 'loss': -75.31516193867364, 'time_step': 0.014506865119172159, 'observation_error': 0.054663445094133775, 'reward_error': 2.5963155488189387e-06, 'variance': 0.04371133901049977}[0m [36mstep[0m=[35m26285[0m
[2m2023-10-02 23:13:33[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_26285.pt[0m


Epoch 36/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:13:45[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=36 step=27036[0m [36mepoch[0m=[35m36[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00027859988764980027, 'time_algorithm_update': 0.014527274193998659, 'loss': -75.47154206886749, 'time_step': 0.014879176207134791, 'observation_error': 0.04686058882004584, 'reward_error': 2.5989444368410788e-06, 'variance': 0.0397429265163855}[0m [36mstep[0m=[35m27036[0m
[2m2023-10-02 23:13:45[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_27036.pt[0m


Epoch 37/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:13:58[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=37 step=27787[0m [36mepoch[0m=[35m37[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00028144344985088244, 'time_algorithm_update': 0.014573962647175186, 'loss': -75.6010141867931, 'time_step': 0.014935152508447395, 'observation_error': 0.042192943397502555, 'reward_error': 3.1682179460812386e-06, 'variance': 0.03719918547095691}[0m [36mstep[0m=[35m27787[0m
[2m2023-10-02 23:13:58[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_27787.pt[0m


Epoch 38/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:14:10[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=38 step=28538[0m [36mepoch[0m=[35m38[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002713758999434673, 'time_algorithm_update': 0.014216705898787782, 'loss': -75.04107126192787, 'time_step': 0.014561913778556488, 'observation_error': 0.03870929347477867, 'reward_error': 6.718273745446784e-06, 'variance': 0.03966919672974678}[0m [36mstep[0m=[35m28538[0m
[2m2023-10-02 23:14:10[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_28538.pt[0m


Epoch 39/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:14:22[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=39 step=29289[0m [36mepoch[0m=[35m39[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002510671450517149, 'time_algorithm_update': 0.013460947892637291, 'loss': -76.0655787413352, 'time_step': 0.013779073834577984, 'observation_error': 0.04096306077003024, 'reward_error': 3.3785957601783466e-06, 'variance': 0.032915818024660363}[0m [36mstep[0m=[35m29289[0m
[2m2023-10-02 23:14:22[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_29289.pt[0m


Epoch 40/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:14:34[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=40 step=30040[0m [36mepoch[0m=[35m40[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00026059277683059955, 'time_algorithm_update': 0.013908443692203527, 'loss': -76.47718694460852, 'time_step': 0.014238392782909734, 'observation_error': 0.034260976255870504, 'reward_error': 2.6624428200327548e-06, 'variance': 0.03111042078633205}[0m [36mstep[0m=[35m30040[0m
[2m2023-10-02 23:14:34[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_30040.pt[0m


Epoch 41/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:14:45[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=41 step=30791[0m [36mepoch[0m=[35m41[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00025066840505790454, 'time_algorithm_update': 0.013568961033014738, 'loss': -75.9969762631961, 'time_step': 0.013890091176039369, 'observation_error': 0.02967821012231883, 'reward_error': 2.973742936621061e-06, 'variance': 0.027824332189573146}[0m [36mstep[0m=[35m30791[0m
[2m2023-10-02 23:14:45[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_30791.pt[0m


Epoch 42/100:   0%|          | 0/751 [00:00<?, ?it/s]

[2m2023-10-02 23:14:57[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics_20231002230632: epoch=42 step=31542[0m [36mepoch[0m=[35m42[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00026208741687109243, 'time_algorithm_update': 0.013898081849322973, 'loss': -76.14587828259017, 'time_step': 0.0142314999144817, 'observation_error': 0.035268183058480035, 'reward_error': 1.1854234170584247e-05, 'variance': 0.027841275058009225}[0m [36mstep[0m=[35m31542[0m
[2m2023-10-02 23:14:57[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632/model_31542.pt[0m


Epoch 43/100:   0%|          | 0/751 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Load Dynamics

In [19]:
# Restore the ensemble dynamics model from an earlier training run:
# first rebuild the object from the run's saved params.json, then load the
# weights of the checkpoint written at step 31542.
dynamics_model_path = "d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632"
dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics.from_json(
    f"{dynamics_model_path}/params.json"
)
dynamics.load_model(f"{dynamics_model_path}/model_31542.pt")

Using InvertedPendulumEncoderFactory


## Train Offline RL Algorithm

In [20]:
# Encoder factory shared by the actor and the critic (default network with
# dropout 0.2). It is deliberately NOT called `encoders`: that name is the
# module imported in the first cell, and rebinding it would break any later
# (or re-run) cell that accesses the module.
encoder_factory = d3rlpy.models.encoders.DefaultEncoderFactory(dropout_rate=0.2)

# Hand the loaded dynamics model to COMBO.
combo = COMBO(
    dynamics=dynamics,
    critic_encoder_factory=encoder_factory,
    actor_encoder_factory=encoder_factory,
    use_gpu=use_gpu,
)

In [21]:
# Train COMBO on the training episodes for 100 epochs of 1000 steps each.
# Besides the held-out episodes, each evaluation also runs the
# 'environment' scorer built from `eval_env`.
combo.fit(
    dataset=train_episodes,
    eval_episodes=test_episodes,
    n_steps=100_000,
    n_steps_per_epoch=1_000,
    tensorboard_dir="tensorboard_logs",
    scorers={
        'environment': d3rlpy.metrics.scorer.evaluate_on_environment(eval_env),
    },
)

[2m2023-10-02 23:16:08[0m [[32m[1mdebug    [0m] [1mRandomIterator is selected.[0m
[2m2023-10-02 23:16:08[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/COMBO_20231002231608[0m
[2m2023-10-02 23:16:08[0m [[32m[1mdebug    [0m] [1mBuilding models...[0m
[2m2023-10-02 23:16:08[0m [[32m[1mdebug    [0m] [1mModels have been built.[0m
[2m2023-10-02 23:16:08[0m [[32m[1minfo     [0m] [1mParameters are saved to d3rlpy_logs/COMBO_20231002231608/params.json[0m [36mparams[0m=[35m{'action_scaler': None, 'actor_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': 0.2}}, 'actor_learning_rate': 0.0001, 'actor_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'batch_size': 256, 'conservative_weight': 1.0, 'critic_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': 0.2}}, 'c

Epoch 1/100:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2023-10-02 23:16:21[0m [[32m[1mdebug    [0m] [1m250000 transitions are generated.[0m [36mfake_transitions[0m=[35m250000[0m [36mreal_transitions[0m=[35m75144[0m


KeyboardInterrupt: 

## Load the Policy

In [9]:
# Restore a previously trained COMBO policy.
# The algorithm object is rebuilt from the params.json stored next to the
# checkpoint (same pattern as the dynamics model), so the networks are created
# with the configuration that was actually used for that run instead of
# COMBO's defaults, and the cell no longer needs a `dataset` variable to be
# present in the kernel just to size the networks.
policy_run_dir = 'd3rlpy_logs/COMBO_20230929153035'
trained_policy = COMBO.from_json(policy_run_dir + '/params.json')
# Load entire model parameters from the chosen checkpoint.
trained_policy.load_model(policy_run_dir + '/model_53000.pt')

## See the policy running

In [10]:
# Build an environment scorer on `eval_env` with rendering switched on so the
# rollouts can be watched.
scorer = d3rlpy.metrics.scorer.evaluate_on_environment(
    eval_env,
    render=True,
)
# Applying the scorer to the restored policy runs the evaluation and yields
# its score (presumably the mean episode return, as the name suggests).
mean_episode_return = scorer(trained_policy)