In [1]:
import os

import d3rlpy
import gymnasium as gym
import numpy as np
import torch
from d3rlpy.algos import COMBO
from sklearn.model_selection import train_test_split

import encoders



## Params

In [2]:
# Single seed shared by d3rlpy and both environment resets below.
seed = 1
d3rlpy.seed(seed)

# Use the GPU only when CUDA is actually available, so the notebook also runs
# on CPU-only machines instead of failing when the models are moved to a GPU.
use_gpu = torch.cuda.is_available()

# prepare environment: one instance for data collection, one for evaluation
ENV_ID = "InvertedPendulum-v4"
env = gym.make(ENV_ID)
eval_env = gym.make(ENV_ID)
env.reset(seed=seed)
# trailing ';' keeps the (observation, info) tuple out of the cell output
eval_env.reset(seed=seed);

(array([ 0.00023643,  0.00900927, -0.00711681,  0.00897299]), {})

## Create dataset

In [None]:
# Only the actor encoder is overridden here (default encoder with dropout).
actor_encoder = d3rlpy.models.encoders.DefaultEncoderFactory(dropout_rate=0.2)

# setup algorithm: actor, critic and temperature all use the same learning rate
sac_learning_rate = 3e-4
sac = d3rlpy.algos.SAC(
    batch_size=256,
    actor_encoder_factory=actor_encoder,
    actor_learning_rate=sac_learning_rate,
    critic_learning_rate=sac_learning_rate,
    temp_learning_rate=sac_learning_rate,
    use_gpu=use_gpu,
)

# prepare utilities: replay buffer that is later exported as the offline dataset
buffer = d3rlpy.online.buffers.ReplayBuffer(maxlen=1_000_000, env=env)

# start training
online_fit_options = dict(
    eval_env=eval_env,
    n_steps=100_000,
    n_steps_per_epoch=1_000,
    update_interval=1,
    update_start_step=1_000,
    tensorboard_dir='tensorboard_logs',
)
sac.fit_online(env, buffer, **online_fit_options)

In [None]:
# export replay buffer as MDPDataset
dataset = buffer.to_mdp_dataset()

# save MDPDataset
dataset.dump('d3rlpy_data/inverted_pendulum2.h5')

## Load the dataset

In [3]:
# Reload the dataset from disk so the cells below do not depend on the
# online-training cells having been run in this kernel session.
dataset_path = 'd3rlpy_data/inverted_pendulum2.h5'
dataset = d3rlpy.dataset.MDPDataset.load(dataset_path)

In [4]:
# Hold out part of the dataset for evaluating the dynamics model. No test_size
# is passed, so scikit-learn's default split proportion applies; the split is
# made reproducible through the notebook seed.
episode_split = train_test_split(dataset, random_state=seed)
train_episodes, test_episodes = episode_split

## Dynamics learning

In [5]:
def inverted_pendulum_project(x):
    """Drop the first feature column of a batch of observations.

    Args:
        x: 2-D batch indexed as ``x[row, feature]``.

    Returns:
        ``x`` without its first column (all rows, features 1 onward).
        NOTE(review): presumably column 0 is the cart position of
        InvertedPendulum-v4 -- confirm against the environment docs.
    """
    kept_features = slice(1, None)  # everything except column 0
    return x[:, kept_features]
# Size handed to the symmetry encoder for the projected input.
# NOTE(review): presumably this must equal the number of columns returned by
# inverted_pendulum_project (3 of the 4 observation features) -- confirm.
projection_size = 3
encoder_factory = encoders.SymmetryEncoderFactory(
    project=inverted_pendulum_project,
    projection_size=projection_size,
)

# Use the notebook-wide `use_gpu` flag from the Params cell rather than a
# hard-coded True, so the dynamics model follows the same device setting as SAC.
dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics(
    learning_rate=1e-4,
    use_gpu=use_gpu,
    state_encoder_factory=encoder_factory,
)
# Baseline without the custom state encoder (swap in to compare):
#dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics(learning_rate=1e-4, use_gpu=use_gpu) # Baseline

Using SymmetryEncoderFactory


In [7]:
# Metrics reported for the evaluation episodes alongside the training loss.
scorer_module = d3rlpy.metrics.scorer
dynamics_scorers = {
    'observation_error': scorer_module.dynamics_observation_prediction_error_scorer,
    'reward_error': scorer_module.dynamics_reward_prediction_error_scorer,
    'variance': scorer_module.dynamics_prediction_variance_scorer,
}

# same as algorithms
dynamics.fit(
    train_episodes,
    eval_episodes=test_episodes,
    n_epochs=100,
    scorers=dynamics_scorers,
    tensorboard_dir='tensorboard_logs/dynamics',
    experiment_name='SeparateStateAndRewardEncoders',
)

[2m2023-10-11 16:01:13[0m [[32m[1mdebug    [0m] [1mRoundIterator is selected.[0m
[2m2023-10-11 16:01:13[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113[0m
[2m2023-10-11 16:01:13[0m [[32m[1minfo     [0m] [1mParameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/params.json[0m [36mparams[0m=[35m{'action_scaler': None, 'batch_size': 100, 'discrete_action': False, 'gamma': 1.0, 'generated_maxlen': 100000, 'learning_rate': 0.0001, 'n_ensembles': 5, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0.0001, 'amsgrad': False}, 'real_ratio': 1.0, 'reward_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'reward_scaler': None, 'scaler': None, 'state_encoder_factory': {'type': 'symmetry', 'params': {'hidden_units': [256, 256], 'activation': 'relu'

Epoch 1/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:01:42[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=1 step=714[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005488235409520253, 'time_algorithm_update': 0.0357434352238973, 'loss': -32.93809812075617, 'time_step': 0.03645618723220184, 'observation_error': 0.016063637931700202, 'reward_error': 0.0016804140969261861, 'variance': 0.01215117613131028}[0m [36mstep[0m=[35m714[0m
[2m2023-10-11 16:01:42[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_714.pt[0m


Epoch 2/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:02:12[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=2 step=1428[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005533832127974481, 'time_algorithm_update': 0.0370452494180503, 'loss': -41.99179640804686, 'time_step': 0.03776338420996145, 'observation_error': 0.01028619149651415, 'reward_error': 0.0006694518923929904, 'variance': 0.004676957080329532}[0m [36mstep[0m=[35m1428[0m
[2m2023-10-11 16:02:12[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_1428.pt[0m


Epoch 3/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:02:47[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=3 step=2142[0m [36mepoch[0m=[35m3[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005800857597372445, 'time_algorithm_update': 0.041098010974103996, 'loss': -49.00794673266531, 'time_step': 0.0418581545185976, 'observation_error': 0.012040702134374115, 'reward_error': 0.00039915916356946386, 'variance': 0.0093081758324502}[0m [36mstep[0m=[35m2142[0m
[2m2023-10-11 16:02:47[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_2142.pt[0m


Epoch 4/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:03:10[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=4 step=2856[0m [36mepoch[0m=[35m4[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0003020255839457365, 'time_algorithm_update': 0.027733295571570303, 'loss': -55.49056655686109, 'time_step': 0.028121929876610677, 'observation_error': 0.012499884110986357, 'reward_error': 0.0002242284766495115, 'variance': 0.01795299056737318}[0m [36mstep[0m=[35m2856[0m
[2m2023-10-11 16:03:10[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_2856.pt[0m


Epoch 5/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:03:32[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=5 step=3570[0m [36mepoch[0m=[35m5[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002748114722115653, 'time_algorithm_update': 0.026892453682522813, 'loss': -59.63024784336571, 'time_step': 0.02724430574422457, 'observation_error': 0.02593868816331802, 'reward_error': 0.00020959566739003855, 'variance': 0.028549895649231136}[0m [36mstep[0m=[35m3570[0m
[2m2023-10-11 16:03:32[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_3570.pt[0m


Epoch 6/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:03:56[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=6 step=4284[0m [36mepoch[0m=[35m6[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0003071298786238128, 'time_algorithm_update': 0.029435155772361436, 'loss': -62.5907353085964, 'time_step': 0.02982819581232151, 'observation_error': 0.0352631327635547, 'reward_error': 5.913273457840564e-05, 'variance': 0.04466193600213559}[0m [36mstep[0m=[35m4284[0m
[2m2023-10-11 16:03:56[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_4284.pt[0m


Epoch 7/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:04:20[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=7 step=4998[0m [36mepoch[0m=[35m7[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002860315039712174, 'time_algorithm_update': 0.029342776587029464, 'loss': -64.54988837843182, 'time_step': 0.029707822145200243, 'observation_error': 0.07959647836077188, 'reward_error': 7.576506914797682e-05, 'variance': 0.07718235905538445}[0m [36mstep[0m=[35m4998[0m
[2m2023-10-11 16:04:20[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_4998.pt[0m


Epoch 8/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:04:43[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=8 step=5712[0m [36mepoch[0m=[35m8[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002635299992494556, 'time_algorithm_update': 0.027883324636464694, 'loss': -66.8159948677576, 'time_step': 0.028218350824521704, 'observation_error': 0.06598608904128779, 'reward_error': 3.1240549454581794e-05, 'variance': 0.08229025061837277}[0m [36mstep[0m=[35m5712[0m
[2m2023-10-11 16:04:43[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_5712.pt[0m


Epoch 9/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:05:05[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=9 step=6426[0m [36mepoch[0m=[35m9[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002669553462864638, 'time_algorithm_update': 0.027892351150512695, 'loss': -67.98284289616497, 'time_step': 0.028229115389976182, 'observation_error': 0.07279709196346495, 'reward_error': 1.9327542722844794e-05, 'variance': 0.08355996130921885}[0m [36mstep[0m=[35m6426[0m
[2m2023-10-11 16:05:05[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_6426.pt[0m


Epoch 10/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:05:28[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=10 step=7140[0m [36mepoch[0m=[35m10[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002694931350836233, 'time_algorithm_update': 0.02798398502734529, 'loss': -69.49162844115612, 'time_step': 0.028327676762385862, 'observation_error': 0.07283078453936066, 'reward_error': 1.2177851885506182e-05, 'variance': 0.10398370912911659}[0m [36mstep[0m=[35m7140[0m
[2m2023-10-11 16:05:28[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_7140.pt[0m


Epoch 11/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:05:52[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=11 step=7854[0m [36mepoch[0m=[35m11[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00026920763384394287, 'time_algorithm_update': 0.02853827349612025, 'loss': -70.72018104307458, 'time_step': 0.028880503665165407, 'observation_error': 0.08261175267442493, 'reward_error': 8.42658654284104e-06, 'variance': 0.08459742627140697}[0m [36mstep[0m=[35m7854[0m
[2m2023-10-11 16:05:52[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_7854.pt[0m


Epoch 12/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:06:15[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=12 step=8568[0m [36mepoch[0m=[35m12[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00026392970098500825, 'time_algorithm_update': 0.028242904932892957, 'loss': -71.85331416731121, 'time_step': 0.02857557298088608, 'observation_error': 0.07821686286733326, 'reward_error': 8.127057496526309e-06, 'variance': 0.08100600219900284}[0m [36mstep[0m=[35m8568[0m
[2m2023-10-11 16:06:15[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_8568.pt[0m


Epoch 13/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:06:39[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=13 step=9282[0m [36mepoch[0m=[35m13[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002645177333628764, 'time_algorithm_update': 0.02868460373384278, 'loss': -72.7339353708326, 'time_step': 0.02902225018883286, 'observation_error': 0.07101217299522311, 'reward_error': 7.054472255326367e-06, 'variance': 0.08766420931142052}[0m [36mstep[0m=[35m9282[0m
[2m2023-10-11 16:06:39[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_9282.pt[0m


Epoch 14/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:07:03[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=14 step=9996[0m [36mepoch[0m=[35m14[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002683755062541374, 'time_algorithm_update': 0.02902599075595204, 'loss': -72.85033555591808, 'time_step': 0.029369880171383127, 'observation_error': 0.09024285228451698, 'reward_error': 9.258893871044738e-06, 'variance': 0.08673457444343503}[0m [36mstep[0m=[35m9996[0m
[2m2023-10-11 16:07:03[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_9996.pt[0m


Epoch 15/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:07:28[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=15 step=10710[0m [36mepoch[0m=[35m15[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.000267836226134741, 'time_algorithm_update': 0.02905042565503374, 'loss': -73.71083363231156, 'time_step': 0.029392140252249583, 'observation_error': 0.07191846972008979, 'reward_error': 6.80804858040102e-06, 'variance': 0.0826738396330889}[0m [36mstep[0m=[35m10710[0m
[2m2023-10-11 16:07:28[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_10710.pt[0m


Epoch 16/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:07:52[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=16 step=11424[0m [36mepoch[0m=[35m16[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002654530421024611, 'time_algorithm_update': 0.028646148553415508, 'loss': -74.32843611420704, 'time_step': 0.028982575200185056, 'observation_error': 0.0727589084181637, 'reward_error': 5.376773028146061e-06, 'variance': 0.08366453285136155}[0m [36mstep[0m=[35m11424[0m
[2m2023-10-11 16:07:52[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_11424.pt[0m


Epoch 17/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:08:15[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=17 step=12138[0m [36mepoch[0m=[35m17[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00026712731486942923, 'time_algorithm_update': 0.027662879946519014, 'loss': -75.01022903057708, 'time_step': 0.02800061288667994, 'observation_error': 0.06319164701810565, 'reward_error': 8.348477831786902e-06, 'variance': 0.07418344688508777}[0m [36mstep[0m=[35m12138[0m
[2m2023-10-11 16:08:15[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_12138.pt[0m


Epoch 18/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:08:37[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=18 step=12852[0m [36mepoch[0m=[35m18[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00028229961876107863, 'time_algorithm_update': 0.02667390098090933, 'loss': -74.88982217318538, 'time_step': 0.02702823907387357, 'observation_error': 0.05122103341570176, 'reward_error': 6.715269744926453e-06, 'variance': 0.06030827052741422}[0m [36mstep[0m=[35m12852[0m
[2m2023-10-11 16:08:37[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_12852.pt[0m


Epoch 19/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:09:00[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=19 step=13566[0m [36mepoch[0m=[35m19[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00027028118528905703, 'time_algorithm_update': 0.02664151345314432, 'loss': -75.90786019660511, 'time_step': 0.026988395789758163, 'observation_error': 0.05814140938035148, 'reward_error': 1.0582233100350777e-05, 'variance': 0.06042807269206035}[0m [36mstep[0m=[35m13566[0m
[2m2023-10-11 16:09:00[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_13566.pt[0m


Epoch 20/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:09:25[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=20 step=14280[0m [36mepoch[0m=[35m20[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002836830475751091, 'time_algorithm_update': 0.02786560352442979, 'loss': -76.01860527097344, 'time_step': 0.02822520151859572, 'observation_error': 0.05381588387876356, 'reward_error': 6.203574376925544e-06, 'variance': 0.05712321908310585}[0m [36mstep[0m=[35m14280[0m
[2m2023-10-11 16:09:25[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_14280.pt[0m


Epoch 21/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:09:50[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=21 step=14994[0m [36mepoch[0m=[35m21[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00031195134341883724, 'time_algorithm_update': 0.029524042826740683, 'loss': -76.15675635197583, 'time_step': 0.029920159601697735, 'observation_error': 0.054176032864033954, 'reward_error': 5.83681408161328e-06, 'variance': 0.056528824787859665}[0m [36mstep[0m=[35m14994[0m
[2m2023-10-11 16:09:50[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_14994.pt[0m


Epoch 22/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:10:13[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=22 step=15708[0m [36mepoch[0m=[35m22[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00026576091595390596, 'time_algorithm_update': 0.026803791356019947, 'loss': -77.24458712088962, 'time_step': 0.027137501900937378, 'observation_error': 0.05019693149219176, 'reward_error': 6.027386436882101e-06, 'variance': 0.04588402651152553}[0m [36mstep[0m=[35m15708[0m
[2m2023-10-11 16:10:13[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_15708.pt[0m


Epoch 23/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:10:36[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=23 step=16422[0m [36mepoch[0m=[35m23[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002678679484947055, 'time_algorithm_update': 0.027553397066452923, 'loss': -76.27150901292218, 'time_step': 0.027895345407373765, 'observation_error': 0.047019793146334025, 'reward_error': 4.119048071877392e-06, 'variance': 0.044560108004023144}[0m [36mstep[0m=[35m16422[0m
[2m2023-10-11 16:10:36[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_16422.pt[0m


Epoch 24/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:11:00[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=24 step=17136[0m [36mepoch[0m=[35m24[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00027278925524372346, 'time_algorithm_update': 0.028071282290610948, 'loss': -77.37493081186332, 'time_step': 0.028419137001037598, 'observation_error': 0.05478922580812391, 'reward_error': 8.536133370875475e-06, 'variance': 0.04322080005910254}[0m [36mstep[0m=[35m17136[0m
[2m2023-10-11 16:11:00[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_17136.pt[0m


Epoch 25/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:11:24[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=25 step=17850[0m [36mepoch[0m=[35m25[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002664805126457321, 'time_algorithm_update': 0.028459115522582325, 'loss': -77.15871362392308, 'time_step': 0.028798521733751484, 'observation_error': 0.049331784441540794, 'reward_error': 5.202337601934514e-06, 'variance': 0.041804855220275224}[0m [36mstep[0m=[35m17850[0m
[2m2023-10-11 16:11:24[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_17850.pt[0m


Epoch 26/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:11:48[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=26 step=18564[0m [36mepoch[0m=[35m26[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00025644222227465204, 'time_algorithm_update': 0.02868655115282502, 'loss': -77.46443978077224, 'time_step': 0.029010880895021584, 'observation_error': 0.0416893146834523, 'reward_error': 1.272563056886849e-05, 'variance': 0.03832973771797907}[0m [36mstep[0m=[35m18564[0m
[2m2023-10-11 16:11:48[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_18564.pt[0m


Epoch 27/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:12:12[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=27 step=19278[0m [36mepoch[0m=[35m27[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002668668575981418, 'time_algorithm_update': 0.02837608508369168, 'loss': -77.9178146394361, 'time_step': 0.028716801929206743, 'observation_error': 0.05030161891218812, 'reward_error': 6.676089352620778e-06, 'variance': 0.03765952237216602}[0m [36mstep[0m=[35m19278[0m
[2m2023-10-11 16:12:12[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_19278.pt[0m


Epoch 28/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:12:36[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=28 step=19992[0m [36mepoch[0m=[35m28[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002860488677893032, 'time_algorithm_update': 0.029162906465076265, 'loss': -77.37744206607509, 'time_step': 0.02952440312596596, 'observation_error': 0.03297123173199638, 'reward_error': 1.8873392512282002e-05, 'variance': 0.03619601391406776}[0m [36mstep[0m=[35m19992[0m
[2m2023-10-11 16:12:36[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_19992.pt[0m


Epoch 29/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:13:00[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=29 step=20706[0m [36mepoch[0m=[35m29[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002691572119875782, 'time_algorithm_update': 0.02792070960464264, 'loss': -78.51942420339718, 'time_step': 0.028262248226240568, 'observation_error': 0.03453809876229121, 'reward_error': 1.514783918821821e-05, 'variance': 0.035126873921009834}[0m [36mstep[0m=[35m20706[0m
[2m2023-10-11 16:13:00[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_20706.pt[0m


Epoch 30/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:13:39[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=30 step=21420[0m [36mepoch[0m=[35m30[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0006017207431526077, 'time_algorithm_update': 0.04797992485911906, 'loss': -78.88330051144298, 'time_step': 0.04878089107385203, 'observation_error': 0.03715907800822005, 'reward_error': 4.053780895902161e-06, 'variance': 0.034499472030521665}[0m [36mstep[0m=[35m21420[0m
[2m2023-10-11 16:13:39[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_21420.pt[0m


Epoch 31/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:14:16[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=31 step=22134[0m [36mepoch[0m=[35m31[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005516568485762225, 'time_algorithm_update': 0.04407665575919699, 'loss': -77.95981201471066, 'time_step': 0.04479920997673056, 'observation_error': 0.034536206660390915, 'reward_error': 4.634540308391525e-06, 'variance': 0.02914444518165094}[0m [36mstep[0m=[35m22134[0m
[2m2023-10-11 16:14:17[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_22134.pt[0m


Epoch 32/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:14:52[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=32 step=22848[0m [36mepoch[0m=[35m32[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.000508778569411163, 'time_algorithm_update': 0.04314671143764207, 'loss': -79.16124798002697, 'time_step': 0.04381124178568522, 'observation_error': 0.02940685172400477, 'reward_error': 5.042256148722327e-06, 'variance': 0.02672819085370568}[0m [36mstep[0m=[35m22848[0m
[2m2023-10-11 16:14:52[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_22848.pt[0m


Epoch 33/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:15:21[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=33 step=23562[0m [36mepoch[0m=[35m33[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00037025136440074075, 'time_algorithm_update': 0.034326914979630155, 'loss': -78.69275724053048, 'time_step': 0.034803725424267, 'observation_error': 0.03409830950313699, 'reward_error': 6.181279594700322e-06, 'variance': 0.027149672709130556}[0m [36mstep[0m=[35m23562[0m
[2m2023-10-11 16:15:21[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_23562.pt[0m


Epoch 34/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:15:57[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=34 step=24276[0m [36mepoch[0m=[35m34[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005218685508108273, 'time_algorithm_update': 0.04354527083431639, 'loss': -78.82215552503655, 'time_step': 0.04422280234115131, 'observation_error': 0.02886928450456361, 'reward_error': 5.108550123520058e-06, 'variance': 0.02645634787088616}[0m [36mstep[0m=[35m24276[0m
[2m2023-10-11 16:15:57[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_24276.pt[0m


Epoch 35/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:16:32[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=35 step=24990[0m [36mepoch[0m=[35m35[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004935922408972134, 'time_algorithm_update': 0.04230437759591752, 'loss': -79.22568486585003, 'time_step': 0.042942015730700236, 'observation_error': 0.027291970905697415, 'reward_error': 1.1562652519398613e-05, 'variance': 0.023663024133733845}[0m [36mstep[0m=[35m24990[0m
[2m2023-10-11 16:16:32[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_24990.pt[0m


Epoch 36/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:17:00[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=36 step=25704[0m [36mepoch[0m=[35m36[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00036127794356573197, 'time_algorithm_update': 0.03417142666354567, 'loss': -79.45777052657611, 'time_step': 0.03465371539278859, 'observation_error': 0.027708821920332548, 'reward_error': 4.702045926392268e-06, 'variance': 0.022300928927597368}[0m [36mstep[0m=[35m25704[0m
[2m2023-10-11 16:17:00[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_25704.pt[0m


Epoch 37/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:17:25[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=37 step=26418[0m [36mepoch[0m=[35m37[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00030151201563389027, 'time_algorithm_update': 0.030993534737274425, 'loss': -80.21153719685658, 'time_step': 0.03137853900257613, 'observation_error': 0.026951387908764366, 'reward_error': 4.369890892361099e-06, 'variance': 0.02053690517436888}[0m [36mstep[0m=[35m26418[0m
[2m2023-10-11 16:17:25[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_26418.pt[0m


Epoch 38/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:17:50[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=38 step=27132[0m [36mepoch[0m=[35m38[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002969022558516815, 'time_algorithm_update': 0.03014738553044509, 'loss': -80.0239588907119, 'time_step': 0.03052353157716639, 'observation_error': 0.031072697627830125, 'reward_error': 3.97495602333563e-06, 'variance': 0.022706129107332444}[0m [36mstep[0m=[35m27132[0m
[2m2023-10-11 16:17:50[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_27132.pt[0m


Epoch 39/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:18:16[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=39 step=27846[0m [36mepoch[0m=[35m39[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0003051160096454353, 'time_algorithm_update': 0.030429923901704847, 'loss': -80.28760922055285, 'time_step': 0.030818075025115025, 'observation_error': 0.023312361508781947, 'reward_error': 4.1042375402914884e-06, 'variance': 0.019981544668422113}[0m [36mstep[0m=[35m27846[0m
[2m2023-10-11 16:18:16[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_27846.pt[0m


Epoch 40/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:18:40[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=40 step=28560[0m [36mepoch[0m=[35m40[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002827297071782815, 'time_algorithm_update': 0.029362126892688228, 'loss': -79.85268249298011, 'time_step': 0.02972174959690297, 'observation_error': 0.023187914986421535, 'reward_error': 4.393567446202786e-06, 'variance': 0.019260564122541513}[0m [36mstep[0m=[35m28560[0m
[2m2023-10-11 16:18:40[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_28560.pt[0m


Epoch 41/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:19:03[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=41 step=29274[0m [36mepoch[0m=[35m41[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002654827609449542, 'time_algorithm_update': 0.028316502811528053, 'loss': -80.2375579578202, 'time_step': 0.02865489924989161, 'observation_error': 0.023586039345311026, 'reward_error': 4.66639229602513e-06, 'variance': 0.018237489018015397}[0m [36mstep[0m=[35m29274[0m
[2m2023-10-11 16:19:03[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_29274.pt[0m


Epoch 42/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:19:27[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=42 step=29988[0m [36mepoch[0m=[35m42[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002761060784176952, 'time_algorithm_update': 0.028533778270753492, 'loss': -80.58065332685199, 'time_step': 0.02888230516129181, 'observation_error': 0.021782848647135717, 'reward_error': 3.836896337399115e-06, 'variance': 0.017278981603537896}[0m [36mstep[0m=[35m29988[0m
[2m2023-10-11 16:19:27[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_29988.pt[0m


Epoch 43/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:19:59[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=43 step=30702[0m [36mepoch[0m=[35m43[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004427259375735157, 'time_algorithm_update': 0.03940820426834064, 'loss': -79.9376709988805, 'time_step': 0.03997565918609876, 'observation_error': 0.022542394061086674, 'reward_error': 1.2767225957362633e-05, 'variance': 0.016718689407726712}[0m [36mstep[0m=[35m30702[0m
[2m2023-10-11 16:19:59[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_30702.pt[0m


Epoch 44/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:20:31[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=44 step=31416[0m [36mepoch[0m=[35m44[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00042895409239440403, 'time_algorithm_update': 0.03813651362720992, 'loss': -79.75429475340857, 'time_step': 0.03868182500203451, 'observation_error': 0.02281993599851113, 'reward_error': 3.624094943742227e-06, 'variance': 0.01570795112701311}[0m [36mstep[0m=[35m31416[0m
[2m2023-10-11 16:20:31[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_31416.pt[0m


Epoch 45/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:21:07[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=45 step=32130[0m [36mepoch[0m=[35m45[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.000537439220759715, 'time_algorithm_update': 0.042958160408404694, 'loss': -81.17683072985054, 'time_step': 0.043636413849368484, 'observation_error': 0.01917555776260205, 'reward_error': 3.339657073260015e-06, 'variance': 0.014572427736110059}[0m [36mstep[0m=[35m32130[0m
[2m2023-10-11 16:21:07[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_32130.pt[0m


Epoch 46/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:21:45[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=46 step=32844[0m [36mepoch[0m=[35m46[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005676629496555702, 'time_algorithm_update': 0.04576293260109525, 'loss': -80.66207380321514, 'time_step': 0.04649978325146587, 'observation_error': 0.020848629467595844, 'reward_error': 3.003544278583763e-06, 'variance': 0.01374013132031655}[0m [36mstep[0m=[35m32844[0m
[2m2023-10-11 16:21:45[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_32844.pt[0m


Epoch 47/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:22:22[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=47 step=33558[0m [36mepoch[0m=[35m47[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.000574769426126774, 'time_algorithm_update': 0.04427294671034612, 'loss': -81.72468439070117, 'time_step': 0.045004623944685906, 'observation_error': 0.022200483680366267, 'reward_error': 4.883020578810794e-06, 'variance': 0.013894735464953777}[0m [36mstep[0m=[35m33558[0m
[2m2023-10-11 16:22:22[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_33558.pt[0m


Epoch 48/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:22:58[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=48 step=34272[0m [36mepoch[0m=[35m48[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005590818843253854, 'time_algorithm_update': 0.044602059516586176, 'loss': -81.26155036904899, 'time_step': 0.04532777595252884, 'observation_error': 0.022188829801390163, 'reward_error': 6.466133380181609e-06, 'variance': 0.013311825763484822}[0m [36mstep[0m=[35m34272[0m
[2m2023-10-11 16:22:58[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_34272.pt[0m


Epoch 49/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:23:33[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=49 step=34986[0m [36mepoch[0m=[35m49[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005323012002042028, 'time_algorithm_update': 0.041902289337136835, 'loss': -80.88673056006766, 'time_step': 0.04260573734422358, 'observation_error': 0.021307029513213883, 'reward_error': 4.394884322647032e-06, 'variance': 0.013475305284165823}[0m [36mstep[0m=[35m34986[0m
[2m2023-10-11 16:23:33[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_34986.pt[0m


Epoch 50/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:24:04[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=50 step=35700[0m [36mepoch[0m=[35m50[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004538591502427387, 'time_algorithm_update': 0.03878985800329043, 'loss': -81.63311327338553, 'time_step': 0.039375832768715396, 'observation_error': 0.018807358095784606, 'reward_error': 2.9673021301292876e-06, 'variance': 0.012403515718383513}[0m [36mstep[0m=[35m35700[0m
[2m2023-10-11 16:24:04[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_35700.pt[0m


Epoch 51/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:24:36[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=51 step=36414[0m [36mepoch[0m=[35m51[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004834985198760901, 'time_algorithm_update': 0.03882662054537391, 'loss': -81.57586403186916, 'time_step': 0.03944986040184812, 'observation_error': 0.019734688144467268, 'reward_error': 3.4522006654801007e-06, 'variance': 0.012681928836373589}[0m [36mstep[0m=[35m36414[0m
[2m2023-10-11 16:24:36[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_36414.pt[0m


Epoch 52/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:25:05[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=52 step=37128[0m [36mepoch[0m=[35m52[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00041687221420245345, 'time_algorithm_update': 0.035589202111508664, 'loss': -81.70678867478999, 'time_step': 0.036134745226520784, 'observation_error': 0.019651802406707115, 'reward_error': 4.201648716383661e-06, 'variance': 0.01232545005660732}[0m [36mstep[0m=[35m37128[0m
[2m2023-10-11 16:25:05[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_37128.pt[0m


Epoch 53/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:25:32[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=53 step=37842[0m [36mepoch[0m=[35m53[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00040061099856507544, 'time_algorithm_update': 0.03398974870099407, 'loss': -82.3489609218779, 'time_step': 0.034501134514474736, 'observation_error': 0.02044353635734806, 'reward_error': 4.157707748217008e-06, 'variance': 0.01159401209670947}[0m [36mstep[0m=[35m37842[0m
[2m2023-10-11 16:25:33[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_37842.pt[0m


Epoch 54/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:26:06[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=54 step=38556[0m [36mepoch[0m=[35m54[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005090867771821864, 'time_algorithm_update': 0.040470067526446, 'loss': -82.84689677796779, 'time_step': 0.041134428243343235, 'observation_error': 0.018748553674222606, 'reward_error': 4.45255434746054e-06, 'variance': 0.011015604780435399}[0m [36mstep[0m=[35m38556[0m
[2m2023-10-11 16:26:06[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_38556.pt[0m


Epoch 55/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:26:38[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=55 step=39270[0m [36mepoch[0m=[35m55[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00047514652337680676, 'time_algorithm_update': 0.039072419748920686, 'loss': -80.60870648231827, 'time_step': 0.03968911945652895, 'observation_error': 0.02280262520621532, 'reward_error': 6.828771326472039e-06, 'variance': 0.01282564367465327}[0m [36mstep[0m=[35m39270[0m
[2m2023-10-11 16:26:38[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_39270.pt[0m


Epoch 56/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:27:12[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=56 step=39984[0m [36mepoch[0m=[35m56[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005311214623331022, 'time_algorithm_update': 0.041672227429408654, 'loss': -82.7783345348027, 'time_step': 0.042366036180020716, 'observation_error': 0.01917708586001866, 'reward_error': 3.580543799019659e-06, 'variance': 0.01043618020588826}[0m [36mstep[0m=[35m39984[0m
[2m2023-10-11 16:27:12[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_39984.pt[0m


Epoch 57/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:27:43[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=57 step=40698[0m [36mepoch[0m=[35m57[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00046360759841961686, 'time_algorithm_update': 0.0388637524025113, 'loss': -82.91348008548512, 'time_step': 0.03947734265100388, 'observation_error': 0.020159360660450446, 'reward_error': 3.5016823324708196e-06, 'variance': 0.010095075476883708}[0m [36mstep[0m=[35m40698[0m
[2m2023-10-11 16:27:43[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_40698.pt[0m


Epoch 58/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:28:16[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=58 step=41412[0m [36mepoch[0m=[35m58[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00048469762508274796, 'time_algorithm_update': 0.0399107038139963, 'loss': -81.52942722651805, 'time_step': 0.04054202151899578, 'observation_error': 0.019371351789565266, 'reward_error': 3.5396250811011863e-06, 'variance': 0.00985641742054154}[0m [36mstep[0m=[35m41412[0m
[2m2023-10-11 16:28:16[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_41412.pt[0m


Epoch 59/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:28:49[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=59 step=42126[0m [36mepoch[0m=[35m59[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00048619992926675066, 'time_algorithm_update': 0.04102560285092736, 'loss': -81.14000523324107, 'time_step': 0.04165593015045679, 'observation_error': 0.018900732641539384, 'reward_error': 1.2771485146743034e-05, 'variance': 0.011473967552451123}[0m [36mstep[0m=[35m42126[0m
[2m2023-10-11 16:28:49[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_42126.pt[0m


Epoch 60/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:29:14[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=60 step=42840[0m [36mepoch[0m=[35m60[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00031686630569586235, 'time_algorithm_update': 0.030413695744105747, 'loss': -83.30349615508435, 'time_step': 0.030814421277086276, 'observation_error': 0.01893814998241631, 'reward_error': 2.659328149956781e-06, 'variance': 0.00960920060754477}[0m [36mstep[0m=[35m42840[0m
[2m2023-10-11 16:29:14[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_42840.pt[0m


Epoch 61/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:29:38[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=61 step=43554[0m [36mepoch[0m=[35m61[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00029282476387771907, 'time_algorithm_update': 0.02896586419487534, 'loss': -83.61640770962926, 'time_step': 0.02933753204612839, 'observation_error': 0.017601060831410116, 'reward_error': 3.5589070221663443e-06, 'variance': 0.010041168371136314}[0m [36mstep[0m=[35m43554[0m
[2m2023-10-11 16:29:38[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_43554.pt[0m


Epoch 62/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:30:00[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=62 step=44268[0m [36mepoch[0m=[35m62[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.000273803035084273, 'time_algorithm_update': 0.027988218125842866, 'loss': -82.11825900518593, 'time_step': 0.028338132118310582, 'observation_error': 0.01876697190605906, 'reward_error': 3.826909808330311e-06, 'variance': 0.009397098749053038}[0m [36mstep[0m=[35m44268[0m
[2m2023-10-11 16:30:00[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_44268.pt[0m


Epoch 63/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:30:24[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=63 step=44982[0m [36mepoch[0m=[35m63[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0002734507499288778, 'time_algorithm_update': 0.028508590383022104, 'loss': -83.53061570506804, 'time_step': 0.028859060685507722, 'observation_error': 0.020677537879502928, 'reward_error': 3.5803644488745474e-06, 'variance': 0.008613673756421993}[0m [36mstep[0m=[35m44982[0m
[2m2023-10-11 16:30:24[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_44982.pt[0m


Epoch 64/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:30:48[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=64 step=45696[0m [36mepoch[0m=[35m64[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00029668587596476577, 'time_algorithm_update': 0.030167175274269254, 'loss': -82.55528586251395, 'time_step': 0.03054780252173501, 'observation_error': 0.019444775910352118, 'reward_error': 1.5123900278872297e-05, 'variance': 0.009637590576139107}[0m [36mstep[0m=[35m45696[0m
[2m2023-10-11 16:30:48[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_45696.pt[0m


Epoch 65/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:31:12[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=65 step=46410[0m [36mepoch[0m=[35m65[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00029435010851264334, 'time_algorithm_update': 0.029161221172963204, 'loss': -83.39281118483771, 'time_step': 0.02953894398793453, 'observation_error': 0.019808265908819116, 'reward_error': 6.907334142790937e-06, 'variance': 0.009124331380083502}[0m [36mstep[0m=[35m46410[0m
[2m2023-10-11 16:31:12[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_46410.pt[0m


Epoch 66/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:31:34[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=66 step=47124[0m [36mepoch[0m=[35m66[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00027301364920052495, 'time_algorithm_update': 0.02814870788937523, 'loss': -83.47771471755512, 'time_step': 0.02849504359963895, 'observation_error': 0.018219062578555503, 'reward_error': 5.823189005841414e-06, 'variance': 0.0095371551560211}[0m [36mstep[0m=[35m47124[0m
[2m2023-10-11 16:31:34[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_47124.pt[0m


Epoch 67/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:32:06[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=67 step=47838[0m [36mepoch[0m=[35m67[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00044384723951836595, 'time_algorithm_update': 0.03923159086403726, 'loss': -82.18948255667165, 'time_step': 0.03981178648331586, 'observation_error': 0.018351264684812445, 'reward_error': 2.747431738522859e-06, 'variance': 0.009317538942303024}[0m [36mstep[0m=[35m47838[0m
[2m2023-10-11 16:32:06[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_47838.pt[0m


Epoch 68/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:32:39[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=68 step=48552[0m [36mepoch[0m=[35m68[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00048524825846781586, 'time_algorithm_update': 0.040795605055758265, 'loss': -84.39263427157361, 'time_step': 0.041427050651956336, 'observation_error': 0.01798807329293541, 'reward_error': 5.572373785866355e-06, 'variance': 0.009082506607400493}[0m [36mstep[0m=[35m48552[0m
[2m2023-10-11 16:32:39[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_48552.pt[0m


Epoch 69/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:33:14[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=69 step=49266[0m [36mepoch[0m=[35m69[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005335917993753898, 'time_algorithm_update': 0.04183447294208516, 'loss': -83.9429427315207, 'time_step': 0.04253928267321333, 'observation_error': 0.018997037600291183, 'reward_error': 2.2931732848222106e-06, 'variance': 0.008427047030137972}[0m [36mstep[0m=[35m49266[0m
[2m2023-10-11 16:33:14[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_49266.pt[0m


Epoch 70/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:33:48[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=70 step=49980[0m [36mepoch[0m=[35m70[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004924592517671131, 'time_algorithm_update': 0.041611824049001316, 'loss': -83.47884010934696, 'time_step': 0.04225606076857623, 'observation_error': 0.018696850271322823, 'reward_error': 8.404580434488172e-06, 'variance': 0.009828245469365003}[0m [36mstep[0m=[35m49980[0m
[2m2023-10-11 16:33:48[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_49980.pt[0m


Epoch 71/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:34:22[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=71 step=50694[0m [36mepoch[0m=[35m71[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.000516717173472172, 'time_algorithm_update': 0.04201172713805981, 'loss': -83.28063187746106, 'time_step': 0.04267964075927307, 'observation_error': 0.018410752602647764, 'reward_error': 5.609950330287334e-06, 'variance': 0.010892596144294355}[0m [36mstep[0m=[35m50694[0m
[2m2023-10-11 16:34:22[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_50694.pt[0m


Epoch 72/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:34:56[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=72 step=51408[0m [36mepoch[0m=[35m72[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005229564607978201, 'time_algorithm_update': 0.041944632677137016, 'loss': -83.68249530151111, 'time_step': 0.042618763213064154, 'observation_error': 0.01887437765087096, 'reward_error': 2.6989600910609873e-06, 'variance': 0.007675929806760356}[0m [36mstep[0m=[35m51408[0m
[2m2023-10-11 16:34:56[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_51408.pt[0m


Epoch 73/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:35:30[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=73 step=52122[0m [36mepoch[0m=[35m73[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005013588763752571, 'time_algorithm_update': 0.04101110205930822, 'loss': -84.10099656641984, 'time_step': 0.04167364324842181, 'observation_error': 0.016420762972113815, 'reward_error': 3.987016289871897e-06, 'variance': 0.008195910949900291}[0m [36mstep[0m=[35m52122[0m
[2m2023-10-11 16:35:30[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_52122.pt[0m


Epoch 74/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:36:03[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=74 step=52836[0m [36mepoch[0m=[35m74[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005145790864105652, 'time_algorithm_update': 0.040900808088585774, 'loss': -84.17150985493379, 'time_step': 0.04157400565321038, 'observation_error': 0.016604824234205107, 'reward_error': 5.629404230315243e-06, 'variance': 0.010142816833024877}[0m [36mstep[0m=[35m52836[0m
[2m2023-10-11 16:36:03[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_52836.pt[0m


Epoch 75/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:36:35[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=75 step=53550[0m [36mepoch[0m=[35m75[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.000503691972470751, 'time_algorithm_update': 0.040288687754078074, 'loss': -84.43958365817029, 'time_step': 0.04094687630148495, 'observation_error': 0.017680384257064974, 'reward_error': 8.094908783291446e-06, 'variance': 0.008842178810337576}[0m [36mstep[0m=[35m53550[0m
[2m2023-10-11 16:36:36[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_53550.pt[0m


Epoch 76/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:37:09[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=76 step=54264[0m [36mepoch[0m=[35m76[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005276627233382367, 'time_algorithm_update': 0.0416178289247828, 'loss': -83.55297973643498, 'time_step': 0.04229995316150142, 'observation_error': 0.0192883158911782, 'reward_error': 4.703222534231512e-06, 'variance': 0.008786378555535237}[0m [36mstep[0m=[35m54264[0m
[2m2023-10-11 16:37:09[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_54264.pt[0m


Epoch 77/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:37:42[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=77 step=54978[0m [36mepoch[0m=[35m77[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005011618638238987, 'time_algorithm_update': 0.04059468163829558, 'loss': -84.70983488953748, 'time_step': 0.04125015107857413, 'observation_error': 0.016787260428445774, 'reward_error': 4.77717984826179e-06, 'variance': 0.008122651821448347}[0m [36mstep[0m=[35m54978[0m
[2m2023-10-11 16:37:42[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_54978.pt[0m


Epoch 78/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:38:16[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=78 step=55692[0m [36mepoch[0m=[35m78[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005089565485465427, 'time_algorithm_update': 0.04123480046162752, 'loss': -84.15607547492874, 'time_step': 0.041904026386784574, 'observation_error': 0.015802451542780503, 'reward_error': 3.2201079465719142e-06, 'variance': 0.007285563016078921}[0m [36mstep[0m=[35m55692[0m
[2m2023-10-11 16:38:16[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_55692.pt[0m


Epoch 79/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:38:44[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=79 step=56406[0m [36mepoch[0m=[35m79[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00040828380264154, 'time_algorithm_update': 0.03613395751023493, 'loss': -84.72234459617893, 'time_step': 0.036666977973211376, 'observation_error': 0.016583759390474233, 'reward_error': 5.527671087412465e-06, 'variance': 0.0076918716103711975}[0m [36mstep[0m=[35m56406[0m
[2m2023-10-11 16:38:44[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_56406.pt[0m


Epoch 80/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:39:16[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=80 step=57120[0m [36mepoch[0m=[35m80[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004528286744232605, 'time_algorithm_update': 0.039286317945528434, 'loss': -84.45935203381279, 'time_step': 0.03987863317591136, 'observation_error': 0.01705104717879565, 'reward_error': 4.693105074000412e-06, 'variance': 0.007929689519758192}[0m [36mstep[0m=[35m57120[0m
[2m2023-10-11 16:39:16[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_57120.pt[0m


Epoch 81/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:39:47[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=81 step=57834[0m [36mepoch[0m=[35m81[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00042891101676876804, 'time_algorithm_update': 0.037631660616364464, 'loss': -85.32332361045005, 'time_step': 0.03818406277344007, 'observation_error': 0.01782033293614169, 'reward_error': 2.8060035910354842e-06, 'variance': 0.007128678959522979}[0m [36mstep[0m=[35m57834[0m
[2m2023-10-11 16:39:47[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_57834.pt[0m


Epoch 82/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:40:18[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=82 step=58548[0m [36mepoch[0m=[35m82[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004540070766160468, 'time_algorithm_update': 0.03879248394685633, 'loss': -85.51477203048577, 'time_step': 0.039374564208236394, 'observation_error': 0.015890706744902294, 'reward_error': 3.850960545544194e-06, 'variance': 0.007756328706452901}[0m [36mstep[0m=[35m58548[0m
[2m2023-10-11 16:40:18[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_58548.pt[0m


Epoch 83/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:40:48[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=83 step=59262[0m [36mepoch[0m=[35m83[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004217865086403214, 'time_algorithm_update': 0.037147782763847115, 'loss': -85.89397691411465, 'time_step': 0.037701444131653515, 'observation_error': 0.01584994735965218, 'reward_error': 3.1608465737673006e-06, 'variance': 0.0064293394195474775}[0m [36mstep[0m=[35m59262[0m
[2m2023-10-11 16:40:48[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_59262.pt[0m


Epoch 84/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:41:18[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=84 step=59976[0m [36mepoch[0m=[35m84[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004475500737251688, 'time_algorithm_update': 0.03718172902820491, 'loss': -84.3186003073257, 'time_step': 0.03775471265242547, 'observation_error': 0.017288418076104404, 'reward_error': 5.802426097488414e-06, 'variance': 0.00834619214845138}[0m [36mstep[0m=[35m59976[0m
[2m2023-10-11 16:41:18[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_59976.pt[0m


Epoch 85/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:41:48[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=85 step=60690[0m [36mepoch[0m=[35m85[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00044357776641845703, 'time_algorithm_update': 0.03743406070046732, 'loss': -85.25742914429566, 'time_step': 0.038002759802575205, 'observation_error': 0.015551940279715551, 'reward_error': 4.5339145670757255e-06, 'variance': 0.006780843355233836}[0m [36mstep[0m=[35m60690[0m
[2m2023-10-11 16:41:49[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_60690.pt[0m


Epoch 86/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:42:19[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=86 step=61404[0m [36mepoch[0m=[35m86[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00045134072878113646, 'time_algorithm_update': 0.037531678082228376, 'loss': -85.10525956407649, 'time_step': 0.038107538757537925, 'observation_error': 0.017245176991455287, 'reward_error': 1.1857091674294615e-05, 'variance': 0.007228420543766096}[0m [36mstep[0m=[35m61404[0m
[2m2023-10-11 16:42:19[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_61404.pt[0m


Epoch 87/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:42:50[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=87 step=62118[0m [36mepoch[0m=[35m87[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00045861516680036275, 'time_algorithm_update': 0.03803903315247608, 'loss': -85.21683878324279, 'time_step': 0.03863671914536078, 'observation_error': 0.015272362958843834, 'reward_error': 2.6666651683927705e-06, 'variance': 0.0067244435456722575}[0m [36mstep[0m=[35m62118[0m
[2m2023-10-11 16:42:50[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_62118.pt[0m


Epoch 88/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:43:22[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=88 step=62832[0m [36mepoch[0m=[35m88[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00048427488289627376, 'time_algorithm_update': 0.039505410929020046, 'loss': -85.17489622420624, 'time_step': 0.04013240571115531, 'observation_error': 0.01640689016214412, 'reward_error': 5.620667252562379e-06, 'variance': 0.00704788159850248}[0m [36mstep[0m=[35m62832[0m
[2m2023-10-11 16:43:22[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_62832.pt[0m


Epoch 89/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:43:55[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=89 step=63546[0m [36mepoch[0m=[35m89[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00048676224983706864, 'time_algorithm_update': 0.04017375129945472, 'loss': -86.0148660996381, 'time_step': 0.040805240639117585, 'observation_error': 0.014562621026195592, 'reward_error': 2.5073676613273196e-06, 'variance': 0.006782828124273373}[0m [36mstep[0m=[35m63546[0m
[2m2023-10-11 16:43:55[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_63546.pt[0m


Epoch 90/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:44:26[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=90 step=64260[0m [36mepoch[0m=[35m90[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00042818040073084896, 'time_algorithm_update': 0.03801772541024772, 'loss': -85.17031541925853, 'time_step': 0.03856424791138379, 'observation_error': 0.014570377447710316, 'reward_error': 3.8725139648277935e-06, 'variance': 0.006397743556084688}[0m [36mstep[0m=[35m64260[0m
[2m2023-10-11 16:44:26[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_64260.pt[0m


Epoch 91/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:44:56[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=91 step=64974[0m [36mepoch[0m=[35m91[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004207764019151362, 'time_algorithm_update': 0.03746064303635883, 'loss': -86.30247555259896, 'time_step': 0.0380053593664944, 'observation_error': 0.014050345812738133, 'reward_error': 4.441793514932537e-06, 'variance': 0.007038437875391445}[0m [36mstep[0m=[35m64974[0m
[2m2023-10-11 16:44:56[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_64974.pt[0m


Epoch 92/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:45:28[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=92 step=65688[0m [36mepoch[0m=[35m92[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004908941707023385, 'time_algorithm_update': 0.03968228378883597, 'loss': -84.29094134792894, 'time_step': 0.040308200010732445, 'observation_error': 0.014216599042095868, 'reward_error': 3.3387525994743937e-06, 'variance': 0.007291049825388232}[0m [36mstep[0m=[35m65688[0m
[2m2023-10-11 16:45:28[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_65688.pt[0m


Epoch 93/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:45:58[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=93 step=66402[0m [36mepoch[0m=[35m93[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00043333845646107566, 'time_algorithm_update': 0.03743885912481142, 'loss': -86.04639410237972, 'time_step': 0.037997501904890985, 'observation_error': 0.015295580028483196, 'reward_error': 4.476113814413308e-06, 'variance': 0.006192965465197072}[0m [36mstep[0m=[35m66402[0m
[2m2023-10-11 16:45:59[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_66402.pt[0m


Epoch 94/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:46:30[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=94 step=67116[0m [36mepoch[0m=[35m94[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00047028031335825347, 'time_algorithm_update': 0.03899181089481386, 'loss': -85.82760954571037, 'time_step': 0.03959856948264841, 'observation_error': 0.014363343865746543, 'reward_error': 3.975460381226627e-06, 'variance': 0.006299911156804411}[0m [36mstep[0m=[35m67116[0m
[2m2023-10-11 16:46:30[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_67116.pt[0m


Epoch 95/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:47:01[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=95 step=67830[0m [36mepoch[0m=[35m95[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00044498523744214484, 'time_algorithm_update': 0.038294371770543546, 'loss': -86.14321686774075, 'time_step': 0.038875431573691485, 'observation_error': 0.015557257215468141, 'reward_error': 4.108670667968917e-06, 'variance': 0.006150716252376151}[0m [36mstep[0m=[35m67830[0m
[2m2023-10-11 16:47:01[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_67830.pt[0m


Epoch 96/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:47:34[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=96 step=68544[0m [36mepoch[0m=[35m96[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00045139181847665824, 'time_algorithm_update': 0.04023081815543295, 'loss': -85.64318241525432, 'time_step': 0.04081304166831222, 'observation_error': 0.015168525471019987, 'reward_error': 7.322407847279608e-06, 'variance': 0.006397175015959136}[0m [36mstep[0m=[35m68544[0m
[2m2023-10-11 16:47:34[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_68544.pt[0m


Epoch 97/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:48:07[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=97 step=69258[0m [36mepoch[0m=[35m97[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005024401079706785, 'time_algorithm_update': 0.04212675909368264, 'loss': -85.41123670155929, 'time_step': 0.042782035862364357, 'observation_error': 0.014939372027605754, 'reward_error': 4.560460196481388e-06, 'variance': 0.007079962730012064}[0m [36mstep[0m=[35m69258[0m
[2m2023-10-11 16:48:07[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_69258.pt[0m


Epoch 98/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:48:40[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=98 step=69972[0m [36mepoch[0m=[35m98[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00048981227126776, 'time_algorithm_update': 0.04052655069100089, 'loss': -86.10941684947295, 'time_step': 0.04116782120295933, 'observation_error': 0.014430448917987107, 'reward_error': 2.9670767929387717e-06, 'variance': 0.007091523904537046}[0m [36mstep[0m=[35m69972[0m
[2m2023-10-11 16:48:40[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_69972.pt[0m


Epoch 99/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:49:13[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=99 step=70686[0m [36mepoch[0m=[35m99[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004933885499542835, 'time_algorithm_update': 0.040260149317295275, 'loss': -86.04313476947175, 'time_step': 0.040914698141295704, 'observation_error': 0.01615742872760815, 'reward_error': 8.8927554587419e-06, 'variance': 0.007027707547621536}[0m [36mstep[0m=[35m70686[0m
[2m2023-10-11 16:49:13[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_70686.pt[0m


Epoch 100/100:   0%|          | 0/714 [00:00<?, ?it/s]

[2m2023-10-11 16:49:47[0m [[32m[1minfo     [0m] [1mSeparateStateAndRewardEncoders_20231011160113: epoch=100 step=71400[0m [36mepoch[0m=[35m100[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005305554686474199, 'time_algorithm_update': 0.04220393391884342, 'loss': -86.33843726203555, 'time_step': 0.04288713838539872, 'observation_error': 0.016058845319678435, 'reward_error': 2.7430494585063786e-06, 'variance': 0.006247049604075671}[0m [36mstep[0m=[35m71400[0m
[2m2023-10-11 16:49:47[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/SeparateStateAndRewardEncoders_20231011160113/model_71400.pt[0m


[(1,
  {'time_sample_batch': 0.0005488235409520253,
   'time_algorithm_update': 0.0357434352238973,
   'loss': -32.93809812075617,
   'time_step': 0.03645618723220184,
   'observation_error': 0.016063637931700202,
   'reward_error': 0.0016804140969261861,
   'variance': 0.01215117613131028}),
 (2,
  {'time_sample_batch': 0.0005533832127974481,
   'time_algorithm_update': 0.0370452494180503,
   'loss': -41.99179640804686,
   'time_step': 0.03776338420996145,
   'observation_error': 0.01028619149651415,
   'reward_error': 0.0006694518923929904,
   'variance': 0.004676957080329532}),
 (3,
  {'time_sample_batch': 0.0005800857597372445,
   'time_algorithm_update': 0.041098010974103996,
   'loss': -49.00794673266531,
   'time_step': 0.0418581545185976,
   'observation_error': 0.012040702134374115,
   'reward_error': 0.00039915916356946386,
   'variance': 0.0093081758324502}),
 (4,
  {'time_sample_batch': 0.0003020255839457365,
   'time_algorithm_update': 0.027733295571570303,
   'loss': -55.

In [None]:
def experiment_dynamics_training(dataset, n_runs, experiment_name, seed=1, use_gpu=True,
                                 n_epochs=100, learning_rate=1e-4,
                                 encoder_factories=('default', 'inverted_pendulum'),
                                 tensorboard_dir='tensorboard_logs/dynamics'):
    """Train one dynamics ensemble per encoder variant, repeated over several seeds.

    For run ``i`` the dataset is split with ``random_state=seed + i`` and every
    encoder variant is trained on that same split, so the variants of one run
    are directly comparable.

    Args:
        dataset: episodes accepted by ``train_test_split`` and ``dynamics.fit``.
        n_runs: number of seeded repetitions.
        experiment_name: prefix of the log directory names; the encoder label
            and run index are appended so each run can be told apart.
        seed: base seed; run ``i`` uses ``seed + i``.
        use_gpu: forwarded to ``ProbabilisticEnsembleDynamics``.
        n_epochs: training epochs per model.
        learning_rate: learning rate of the dynamics model.
        encoder_factories: encoder identifiers (or factory objects) to compare.
        tensorboard_dir: directory that receives the TensorBoard logs.

    Returns:
        None. Metrics and checkpoints are written by ``dynamics.fit``.

    Note:
        ``d3rlpy.seed`` is called before each model is built, which resets the
        global random state as a side effect.
    """
    scorers = {
        'observation_error': d3rlpy.metrics.scorer.dynamics_observation_prediction_error_scorer,
        'reward_error': d3rlpy.metrics.scorer.dynamics_reward_prediction_error_scorer,
        'variance': d3rlpy.metrics.scorer.dynamics_prediction_variance_scorer,
    }
    for i in range(n_runs):
        run_seed = seed + i
        # The split only depends on the run seed, so compute it once per run
        # and share it between all encoder variants.
        train_episodes, test_episodes = train_test_split(dataset, random_state=run_seed)
        for encoder_factory in encoder_factories:
            # Re-seed so every variant of this run starts from the same random
            # state (weight initialisation, batch sampling).
            d3rlpy.seed(run_seed)
            # NOTE(review): string identifiers must be known to d3rlpy's encoder
            # registry; 'inverted_pendulum' is presumably registered by the
            # project's `encoders` module -- confirm.
            dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics(
                learning_rate=learning_rate,
                use_gpu=use_gpu,
                encoder_factory=encoder_factory,
            )
            if isinstance(encoder_factory, str):
                label = encoder_factory
            else:
                label = type(encoder_factory).__name__
            dynamics.fit(
                train_episodes,
                eval_episodes=test_episodes,
                n_epochs=n_epochs,
                scorers=scorers,
                tensorboard_dir=tensorboard_dir,
                # Distinct name per variant/run; otherwise the log directories
                # differ only by their timestamp.
                experiment_name=f"{experiment_name}_{label}_run{i}",
            )

In [None]:
# Three seeded repetitions per encoder variant, with GPU usage switched off.
experiment_dynamics_training(
    dataset=dataset,
    n_runs=3,
    experiment_name="exp_0",
    use_gpu=False,
)

## Load Dynamics

In [None]:
# Restore a previously trained dynamics model: rebuild it from the saved
# params.json of that run, then load the weights of one checkpoint.
dynamics_model_path = "d3rlpy_logs/ProbabilisticEnsembleDynamics_20231002230632"
dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics.from_json(f"{dynamics_model_path}/params.json")
dynamics.load_model(f"{dynamics_model_path}/model_31542.pt")

## Train Offline RL Algorithm

In [None]:
# Encoder factory shared by COMBO's actor and critic.
# Deliberately NOT called `encoders`: that name is the project module imported
# at the top of the notebook (used for `encoders.SymmetryEncoderFactory`), and
# rebinding it would break re-running the dynamics-learning cell.
policy_encoder_factory = d3rlpy.models.encoders.DefaultEncoderFactory(dropout_rate=0.2)
# Hand the dynamics model to COMBO, which uses it as its data generator.
combo = COMBO(dynamics=dynamics,
              critic_encoder_factory=policy_encoder_factory,
              actor_encoder_factory=policy_encoder_factory,
              use_gpu=use_gpu)

In [None]:
# Offline training on the train split. The 'environment' scorer evaluates the
# policy in the real evaluation environment.
combo.fit(
    dataset=train_episodes,
    eval_episodes=test_episodes,
    n_steps=100000,
    n_steps_per_epoch=1000,
    tensorboard_dir="tensorboard_logs",
    scorers={"environment": d3rlpy.metrics.scorer.evaluate_on_environment(eval_env)},
)

## Load the Policy

In [None]:
# Restore a trained COMBO policy from a checkpoint. The three steps are order
# dependent: construct -> build networks -> load weights.
# COMBO() is created with library defaults (no dynamics model, no custom encoders).
# NOTE(review): the checkpoint presumably has to match this default
# architecture -- confirm against the params.json of that run.
trained_policy = COMBO()
# initialize with dataset
trained_policy.build_with_dataset(dataset)
# Load entire model parameters.
trained_policy.load_model('d3rlpy_logs/COMBO_20230929153035/model_53000.pt')

## See the policy running

In [None]:
# `eval_env` was created without a render mode, so Gymnasium would not draw
# anything when asked to render. Use a dedicated environment that opens a
# viewer window instead.
render_env = gym.make("InvertedPendulum-v4", render_mode="human")
try:
    scorer = d3rlpy.metrics.scorer.evaluate_on_environment(render_env, render=True)
    mean_episode_return = scorer(trained_policy)
finally:
    # Always release the viewer window, even if the rollout fails.
    render_env.close()
# Show the result as the cell output.
mean_episode_return