#### Import Libraries

In [1]:
import gymnasium as gym
import minari
import d3rlpy
from d3rlpy.datasets import MDPDataset

from d3rlpy.algos import CQLConfig, BEARConfig, AWACConfig, BCQConfig
from d3rlpy.metrics.evaluators import EnvironmentEvaluator

import pickle as pk
import numpy as np

#### Define Wrapper for certain environments with obs['observation']

In [79]:
class EnvWrapper(gym.Wrapper):
    """Expose only the ``"observation"`` entry of a dict-observation environment.

    ``reset`` and ``step`` index the wrapped environment's observation with
    the ``"observation"`` key and return that value in place of the full dict.
    All other return values are passed through untouched.
    """

    def __init__(self, env):
        """Wrap ``env`` and advertise the observation space that is actually returned.

        Args:
            env: Environment whose observations are mappings containing an
                ``"observation"`` key.
        """
        super().__init__(env)
        # Without this override the wrapper would keep reporting the wrapped
        # Dict space even though reset()/step() return only one entry of it,
        # so anything inspecting `observation_space` would see the wrong space.
        wrapped_space = env.observation_space
        if isinstance(wrapped_space, gym.spaces.Dict) and "observation" in wrapped_space.spaces:
            self.observation_space = wrapped_space.spaces["observation"]

    def reset(self, **kwargs):
        """Reset the wrapped environment; return ``(obs["observation"], info)``."""
        obs, info = self.env.reset(**kwargs)
        return obs["observation"], info

    def step(self, action):
        """Step the wrapped environment, replacing the dict observation by its ``"observation"`` entry."""
        obs, reward, terminated, truncated, info = self.env.step(action)
        return obs["observation"], reward, terminated, truncated, info



#### Load dataset from Minari

In [2]:
# Load the expert dataset from Minari by its id; download=True is passed so
# Minari may fetch it (presumably only when it is not already available locally).
minari_dataset_id = "mujoco/inverteddoublependulum/expert-v0"
dataset = minari.load_dataset(minari_dataset_id, download=True)

#### Convert and save dataset in d3rlpy format

In [16]:
# Convert Minari to D3RLpy Dataset Format.
# Materialise the episodes once so the dataset is traversed a single time
# instead of once per field.
episodes = list(dataset.iterate_episodes())

# Drop each episode's final observation so that observations line up
# one-to-one with the actions/rewards/flags of the same episode.
observations = np.concatenate([ep.observations[:-1] for ep in episodes])
actions = np.concatenate([ep.actions for ep in episodes])
rewards = np.concatenate([ep.rewards for ep in episodes])
terminals = np.concatenate([ep.terminations for ep in episodes])
timeouts = np.concatenate([ep.truncations for ep in episodes])

# Fail early if the per-transition arrays are misaligned.
assert (
    len(observations) == len(actions) == len(rewards) == len(terminals) == len(timeouts)
), "converted arrays must all contain the same number of transitions"

mdp_dataset = MDPDataset(observations, actions, rewards, terminals, timeouts)

# Save the converted dataset.
with open('doublependulum_expert_v0.pkl', 'wb') as f:
    pk.dump(mdp_dataset, f)

[2m2025-04-18 15:14.13[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(1,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(9,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(1,)])[0m
[2m2025-04-18 15:14.13[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-04-18 15:14.13[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m1[0m


In [17]:
# Sanity check: display the number of entries in each converted array.
tuple(len(array) for array in (observations, actions, rewards, terminals, timeouts))

(100000, 100000, 100000, 100000, 100000)

#### Load converted dataset

In [4]:
# Load a previously pickled dataset and rebind `dataset` to it.
# NOTE(review): this reads 'pointmaze_umaze_dense_v2.pkl', whereas the conversion
# cell writes 'doublependulum_expert_v0.pkl' — confirm this is the intended file.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
with open('pointmaze_umaze_dense_v2.pkl', 'rb') as pickle_file:
    dataset = pk.load(pickle_file)

#### Define environment (required by the evaluation loop and by the training evaluators below)

In [None]:
# NOTE(review): `env` is used by the evaluation loop and `wrapped_env` by the
# EnvironmentEvaluator in every fit() call, but in the visible notebook their
# only definitions are the commented-out lines below, so on a fresh kernel those
# cells would raise NameError. Uncomment and adapt to the environment that
# matches the loaded dataset/model before running them.
# env = gym.make('InvertedPendulum-v5')
# Create the wrapped environment
# wrapped_env = EnvWrapper(env)

#### Load saved model

In [31]:
# Restore a saved d3rlpy learnable from its checkpoint file.
model_checkpoint = 'd3rlpy_logs/bear_hopper_simple_v0_batch_1024_steps_100000_20250416203710/model_40000.d3'
model = d3rlpy.load_learnable(model_checkpoint)

#### Evaluate trained model

In [32]:
def evaluate_policy(policy, environment, n_episodes=10):
    """Roll out ``policy`` in ``environment`` and return each episode's total reward.

    Args:
        policy: Object exposing ``predict(batch_of_observations)`` that returns
            one action per observation in the batch.
        environment: Environment whose ``reset()`` returns ``(obs, info)`` and
            whose ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        n_episodes: Number of episodes to run.

    Returns:
        List with the summed reward of every episode, in execution order.
    """
    episode_returns = []

    for episode in range(n_episodes):
        obs, _ = environment.reset()
        total_reward = 0.0
        done = False

        while not done:
            # predict takes a batch: add a leading batch axis, then unpack the single action.
            action = policy.predict(np.expand_dims(obs, axis=0))[0]
            obs, reward, terminated, truncated, _ = environment.step(action)
            total_reward += reward
            # An episode ends on either termination or truncation.
            done = terminated or truncated

        print(f"Episode {episode + 1}: Reward = {total_reward}")
        episode_returns.append(total_reward)

    return episode_returns


# Evaluation loop
n_episodes = 10
episode_returns = evaluate_policy(model, env, n_episodes)

Episode 1: Reward = 66.85510679398618
Episode 2: Reward = 112.65848230983802
Episode 3: Reward = 104.13716182496448
Episode 4: Reward = 90.42100719906452
Episode 5: Reward = 171.83542111522274
Episode 6: Reward = 137.08998799975765
Episode 7: Reward = 129.84512177452945
Episode 8: Reward = 90.72568261717194
Episode 9: Reward = 118.70784036963096
Episode 10: Reward = 78.89335858457541


#### CQL

In [5]:
# Build the CQL learner with a batch size of 1024 on device 'cuda:0'.
# CPU alternative: cql = CQLConfig().create(device='cpu')
cql_config = CQLConfig(batch_size=1024)
cql = cql_config.create(device='cuda:0')

In [None]:
# Train CQL: 3000 gradient updates in epochs of 100 steps (30 epochs in total).
cql.fit(
    dataset=dataset,
    n_steps=3000,
    n_steps_per_epoch=100,
    # NOTE(review): save_interval=50 is larger than the 30 epochs of this run, so
    # presumably no checkpoint is ever written — confirm against d3rlpy's
    # save_interval semantics.
    save_interval=50,
    evaluators={"environment": EnvironmentEvaluator(wrapped_env)},
    experiment_name="cql_pointmaze_umaze_dense_v2_batch_1024",
    with_timestamp=True,
    show_progress=True,
)

#### BEAR

In [18]:
# Build the BEAR learner with a batch size of 1024 on device 'cuda:0'.
# CPU alternative: bear = BEARConfig().create(device='cpu')
bear_config = BEARConfig(batch_size=1024)
bear = bear_config.create(device='cuda:0')

In [None]:
# Train BEAR: 3000 gradient updates in epochs of 1000 steps (3 epochs in total).
bear.fit(
    dataset=dataset,
    n_steps=3000,
    n_steps_per_epoch=1000,
    save_interval=1,  # save every epoch
    evaluators={"environment": EnvironmentEvaluator(wrapped_env)},
    experiment_name="bear_pointmaze_umaze_dense_v2_batch_1024",
    with_timestamp=True,
    show_progress=True,
)

#### AWAC

In [21]:
# Build the AWAC learner with a batch size of 1024 on device 'cuda:0'.
# CPU alternative: awac = AWACConfig().create(device='cpu')
awac_config = AWACConfig(batch_size=1024)
awac = awac_config.create(device='cuda:0')

In [None]:
# Train AWAC: 3000 gradient updates in epochs of 1000 steps (3 epochs in total).
awac.fit(
    dataset=dataset,
    n_steps=3000,
    n_steps_per_epoch=1000,
    save_interval=1,  # save every epoch
    evaluators={"environment": EnvironmentEvaluator(wrapped_env)},
    experiment_name="awac_pointmaze_umaze_dense_v2_batch_1024",
    with_timestamp=True,
    show_progress=True,
)

#### BCQ

In [23]:
# Build the BCQ learner with a batch size of 1024 on device 'cuda:0'.
# CPU alternative: bcq = BCQConfig().create(device='cpu')
bcq_config = BCQConfig(batch_size=1024)
bcq = bcq_config.create(device='cuda:0')

In [None]:
# Train BCQ: 3000 gradient updates in epochs of 1000 steps (3 epochs in total).
bcq.fit(
    dataset=dataset,
    n_steps=3000,
    n_steps_per_epoch=1000,
    save_interval=1,  # save every epoch
    evaluators={"environment": EnvironmentEvaluator(wrapped_env)},
    experiment_name="bcq_pointmaze_umaze_dense_v2_batch_1024",
    with_timestamp=True,
    show_progress=True,
)