In [10]:
import d3rlpy
import os
import pandas as pd
import numpy as np
from d3rlpy.dataset import MDPDataset

from d3rlpy.preprocessing import StandardObservationScaler
import d3rlpy
from d3rlpy.models.encoders import VectorEncoderFactory
from d3rlpy.metrics import TDErrorEvaluator
from d3rlpy.metrics import InitialStateValueEstimationEvaluator

In [None]:
# Folder holding the per-patient CSV inputs; the generated .h5 datasets are
# written here as well. Relative to the notebook's working directory.
# (No "~" prefix is involved, so the literal is used without expanduser.)
base_dir = "../Data"

def build_dataset_from_csv(path, episode_len=48):
    """Build an ``MDPDataset`` from one CSV file.

    Columns are addressed by 0-based position:

    * columns 3 and 4 are used as float32 state features,
    * column 6 is binarised (0 stays 0, anything else becomes 1) and used as
      the third state feature,
    * column 5 is binarised the same way and used as the int64 action.

    The reward of a row depends only on column 3 (``x``)::

        penalty = [x > 140] * |x - 140| ** 1.10 + [x < 80] * (x - 80) ** 2
        reward  = (-penalty / 30 + 40.833332) / 40.833332

    so a row without penalty gets reward 1.0.

    Every ``episode_len``-th row is flagged as terminal; rows after the last
    complete block of ``episode_len`` rows receive no terminal flag.

    Args:
        path: CSV location, passed straight to ``pd.read_csv``.
        episode_len: Number of consecutive rows that form one episode.

    Returns:
        The ``MDPDataset`` created from the observations, actions, rewards
        and terminal flags described above.
    """
    frame = pd.read_csv(path)

    # --- observations -----------------------------------------------------
    level = frame.iloc[:, 3].values.astype(np.float32)
    second_feature = frame.iloc[:, 4].values.astype(np.float32)
    flag_feature = np.where(frame.iloc[:, 6].values == 0, 0, 1).astype(np.float32)
    observations = np.column_stack((level, second_feature, flag_feature))

    # --- actions (binary) ---------------------------------------------------
    binary_actions = np.where(frame.iloc[:, 5].values == 0, 0, 1).astype(np.int64)

    # --- rewards ------------------------------------------------------------
    # Mild (power 1.10) penalty above 140, quadratic penalty below 80.
    above_penalty = (level > 140) * np.abs(level - 140) ** 1.10
    below_penalty = (level < 80) * (level - 80) ** 2
    raw_rewards = (-(above_penalty + below_penalty) / 30.0).astype(np.float32)

    # 40.833332 ~= 35**2 / 30, i.e. the penalty at x = 45 -- presumably the
    # lowest expected reading, which would map to reward 0; TODO confirm.
    shift = 40.833332
    scaled_rewards = (raw_rewards + shift) / shift

    # --- episode boundaries ---------------------------------------------------
    terminal_flags = np.zeros(len(frame), dtype=np.float32)
    terminal_flags[episode_len - 1 :: episode_len] = 1.0

    return MDPDataset(
        observations=observations,
        actions=binary_actions,
        rewards=scaled_rewards,
        terminals=terminal_flags,
    )


# Build one MDPDataset per patient CSV (pat1 ... pat6), report its size and
# store it next to the CSV as "<name>_binary_meal.h5".
datasets = {}
for i in range(1, 7):
    name = f"pat{i}"
    csv_path = os.path.join(base_dir, f"{name}.csv")
    dataset = build_dataset_from_csv(csv_path)
    datasets[name] = dataset

    # In d3rlpy 2.x `size()` is the number of episodes; the number of
    # transitions is exposed as `transition_count`.
    print(f"{name} -> {dataset.transition_count} transitions, "
          f"{len(dataset.episodes)} episodes")

    h5_path = os.path.join(base_dir, f"{name}_binary_meal.h5")
    # `dump` is documented to take an open binary file object, so the file is
    # opened (and reliably closed) here.
    with open(h5_path, "w+b") as h5_file:
        dataset.dump(h5_file)


In [None]:
# The OPE cell looks policies up by the module-level names `sac_beh_1` ...
# `sac_beh_6`, so all six are loaded here (previously only `sac_beh_1` was
# defined and the lookup of `sac_beh_2` raised KeyError).
# NOTE(review): assumes every patient's model is saved as
# "../Models/sac_model_pat<k>.d3", like the pat1 file -- confirm. A missing
# file fails right here instead of after the first FQE training run.
for _pat_idx in range(1, 7):
    globals()[f"sac_beh_{_pat_idx}"] = d3rlpy.load_learnable(
        f"../Models/sac_model_pat{_pat_idx}.d3", device="cuda:0"
    )

In [None]:
# 1st person data
# `names` lists the patient identifiers pat1 ... pat6; `result_mat_1` is the
# square result table filled by the OPE cell as result_mat_1[policy, dataset].
names = ["pat" + str(idx) for idx in range(1, 7)]
result_mat_1 = np.zeros((len(names), len(names)), dtype=np.float64)

In [None]:
d3rlpy.seed(123)

# Off-policy evaluation: for every (policy, dataset) pair a fresh DiscreteFQE
# model is fitted on the dataset, and the mean estimated value of the episode
# start states (under the policy's own actions) is stored in
# result_mat_1[policy_row, dataset_column].
for i, pol_name in enumerate(names):

    # Policies are resolved through their module-level names `sac_beh_<k>`;
    # a policy that has not been loaded raises KeyError here.
    policy = globals()[f"sac_beh_{i+1}"]

    # Only the first dataset is evaluated in this run; widen the slice to
    # fill further columns of the result matrix.
    for ds_name in names[0:1]:
        # The column is derived from the dataset's position in `names`, so the
        # value always lands in the column of the dataset that was actually
        # evaluated (a hard-coded start offset of 4 used to put the pat1
        # result into the fifth column).
        j = names.index(ds_name)
        ds = datasets[ds_name]
        print(f"[OPE] policy={pol_name}  on  dataset={ds_name}")

        # NOTE(review): a new StandardObservationScaler is created for every
        # fit -- confirm this matches the preprocessing the policy was
        # trained with.
        fqe = d3rlpy.ope.DiscreteFQE(
            algo=policy,
            config=d3rlpy.ope.FQEConfig(
                observation_scaler=StandardObservationScaler(),
                batch_size=64,
                gamma=0.9,
                target_update_interval=500,
            ),
            device="cuda:0",
        )

        # 30 epochs of 1000 steps; both evaluators run on the training data.
        fqe.fit(
            ds,
            n_steps=30000,
            n_steps_per_epoch=1000,
            show_progress=False,
            evaluators={
                'TD': TDErrorEvaluator(),
                'init_value': InitialStateValueEstimationEvaluator(),
            },
        )

        # First observation of every episode in the dataset.
        start_states = np.array([ep.observations[0] for ep in ds.episodes])

        # Actions the evaluated policy would take in those start states.
        init_actions = policy.predict(start_states)

        # FQE value estimate for (start state, policy action), averaged over
        # all episodes.
        vhat = fqe.predict_value(start_states, init_actions)
        result_mat_1[i, j] = float(np.mean(vhat))

In [None]:
# Persist the OPE result matrix as CSV (comma separated, six decimals).
result_dir = os.path.expanduser("../OPE_Results")
# Create the output folder on first use; np.savetxt does not create missing
# directories, and failing here would discard the results of the FQE runs.
os.makedirs(result_dir, exist_ok=True)
save_path = os.path.join(result_dir, "result_mat_1.csv")

np.savetxt(save_path, np.asarray(result_mat_1), delimiter=",", fmt="%.6f")