In [3]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Kaggle notebook = https://www.kaggle.com/code/aryamanbansal/single-objective-offline-rl-1

## Using `d3rlpy` for offline deep reinforcement learning

- Read [this](https://d3rlpy.readthedocs.io/en/v2.8.1/tutorials/getting_started.html) for basic workflow of `d3rlpy`.

## Offline deep reinforcement learning on healthcare datasets

- Datasets used:
    - [Diabetes](https://www.kaggle.com/datasets/brandao/diabetes/data)
    - [Sepsis prediction](https://www.kaggle.com/datasets/salikhussaini49/prediction-of-sepsis/data)
- MDP (Markov Decision Process) design:
    - Diabetes dataset:
        - States =
        - Actions =
        - Rewards =
    - Sepsis dataset:
        - States
        - Actions =
        - Rewards =   
- Algorithms used:
    - CQL = Conservative Q-Learning
    - BC = Behavior Cloning
    - NFQ = Neural Fitted Q-Iteration
    - DQN = Deep Q-Network

In [4]:
# !pip install d3rlpy

In [5]:
import d3rlpy
from d3rlpy.dataset import MDPDataset
# from d3rlpy.algos import BC, NFQ, DQN, CQL
# from d3rlpy.metrics.scorer import evaluate_on_environment # For simulated env if we had one
from d3rlpy.ope import DiscreteFQE, FQE   # For Offline Policy Evaluation
from d3rlpy.metrics import TDErrorEvaluator, SoftOPCEvaluator, InitialStateValueEstimationEvaluator
# import gymnasium as gym

import torch
import torch.optim as optim

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from tqdm import tqdm
import gc
import os

print("imports done")

[2m2025-05-25 05:49.57[0m [[32m[1minfo     [0m] [1mRegister Shimmy environments. [0m
imports done


In [6]:
print(d3rlpy.__version__)

2.8.1


In [7]:
os.listdir("../input/")

['diabetes', 'prediction-of-sepsis']

## Diabetes dataset EDA

In [8]:
diabetes_path = "../input/diabetes/diabetic_data.csv"
diabetes_df = pd.read_csv(diabetes_path)
diabetes_df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [9]:
# use this code to free up gpu memory
gc.collect()
# torch.cuda.empty_cache()
print("freed up some memory!")

freed up some memory!


In [10]:
print(diabetes_df.columns.tolist())

['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']


In [11]:
diabetes_df.replace('?', np.nan).isna().sum()

encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [12]:
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [13]:
diabetes_df.describe(include="all")

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
count,101766.0,101766.0,101766,101766,101766,101766,101766.0,101766.0,101766.0,101766.0,...,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766
unique,,,6,3,10,10,,,,,...,1,4,4,2,2,2,2,2,2,3
top,,,Caucasian,Female,[70-80),?,,,,,...,No,No,No,No,No,No,No,No,Yes,NO
freq,,,76099,54708,26068,98569,,,,,...,101766,47383,101060,101753,101765,101764,101765,54755,78363,54864
mean,165201600.0,54330400.0,,,,,2.024006,3.715642,5.754437,4.395987,...,,,,,,,,,,
std,102640300.0,38696360.0,,,,,1.445403,5.280166,4.064081,2.985108,...,,,,,,,,,,
min,12522.0,135.0,,,,,1.0,1.0,1.0,1.0,...,,,,,,,,,,
25%,84961190.0,23413220.0,,,,,1.0,1.0,1.0,2.0,...,,,,,,,,,,
50%,152389000.0,45505140.0,,,,,1.0,1.0,7.0,4.0,...,,,,,,,,,,
75%,230270900.0,87545950.0,,,,,3.0,4.0,7.0,6.0,...,,,,,,,,,,


In [14]:
diabetes_df[["readmitted"]].value_counts()   # reward

readmitted
NO            54864
>30           35545
<30           11357
Name: count, dtype: int64

In [15]:
diabetes_df[["insulin"]].value_counts()     # action

insulin
No         47383
Steady     30849
Down       12218
Up         11316
Name: count, dtype: int64

In [16]:
def flatten_concatenation(matrix):
    flat_list = []
    for row in matrix:
        flat_list += row
    return flat_list
    

In [17]:
diabetes_demographics = ["race", "gender", "age"]
diabetes_admission = ["admission_type_id", "discharge_disposition_id", "admission_source_id"]
diabetes_clinical = ["time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications", 
                     "number_outpatient", "number_emergency", "number_inpatient", "diag_1", "diag_2", 
                     "diag_3", "number_diagnoses"]
diabetes_lab_tests = ["max_glu_serum", "A1Cresult"]
diabetes_states = flatten_concatenation([diabetes_demographics, diabetes_admission, diabetes_clinical,
                                         diabetes_lab_tests])
diabetes_states

['race',
 'gender',
 'age',
 'admission_type_id',
 'discharge_disposition_id',
 'admission_source_id',
 'time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'diag_1',
 'diag_2',
 'diag_3',
 'number_diagnoses',
 'max_glu_serum',
 'A1Cresult']

In [18]:
diabetes_df = diabetes_df.replace('?', np.nan)

**Basic Cleaning & Feature Selection**

Drop columns with too many missing values or not directly useful for this RL task.
- For example: `weight`, `payer_code`, `medical_specialty` often have many missing values.
- `encounter_id` and `patient_nbr` are for identification, not features for the model state itself, but `patient_nbr` could be used for more advanced episode construction, as we'll do later on...

In [19]:
cols_to_drop = ['encounter_id', 'patient_nbr', 'weight', 'payer_code', 'medical_specialty']
diabetes_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# For simplicity, drop rows with any remaining NaN values in key columns
# A more robust approach would be imputation, but let's keep it simpler for now.
# Key columns for state, action, reward:
key_cols_for_nan_check = ['race', 'gender', 'age', 'time_in_hospital', 'insulin', 'readmitted'] # Add more as needed
diabetes_df.dropna(subset=key_cols_for_nan_check, inplace=True)

In [20]:
# --- Define Action Space ---
# Action: Insulin ('No', 'Steady', 'Up', 'Down')
action_map = {'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3}
diabetes_df['action'] = diabetes_df['insulin'].map(action_map)
# Drop rows where insulin action is not defined (if any after previous cleaning)
diabetes_df.dropna(subset=['action'], inplace=True)
diabetes_df['action'] = diabetes_df['action'].astype(int)

In [21]:
# --- Define Reward ---
reward_map = {'NO': 10, '>30': 0, '<30': -10}
diabetes_df['reward'] = diabetes_df['readmitted'].map(reward_map)
diabetes_df.dropna(subset=['reward'], inplace=True) # Should not happen if 'readmitted' is clean
diabetes_df['reward'] = diabetes_df['reward'].astype(float)

In [22]:
diabetes_df

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,action,reward
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,No,NO,0,10.0
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,No,No,No,No,Ch,Yes,>30,2,0.0
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,No,No,No,Yes,NO,0,10.0
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,No,No,No,No,No,Ch,Yes,NO,2,10.0
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,No,No,No,No,No,Ch,Yes,NO,1,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,[70-80),1,3,7,3,51,0,16,...,No,No,No,No,No,Ch,Yes,>30,3,0.0
101762,AfricanAmerican,Female,[80-90),1,4,5,5,33,3,18,...,No,No,No,No,No,No,Yes,NO,1,10.0
101763,Caucasian,Male,[70-80),1,1,7,1,53,0,9,...,No,No,No,No,No,Ch,Yes,NO,3,10.0
101764,Caucasian,Female,[80-90),2,3,7,10,45,2,21,...,No,No,No,No,No,Ch,Yes,NO,2,10.0


In [23]:
# --- Feature Engineering for State ---
# Convert 'age' categories to numerical (e.g., midpoint)
age_map = {'[0-10)': 5, '[10-20)': 15, '[20-30)': 25, '[30-40)': 35, '[40-50)': 45,
           '[50-60)': 55, '[60-70)': 65, '[70-80)': 75, '[80-90)': 85, '[90-100)': 95}
diabetes_df['age_numeric'] = diabetes_df['age'].map(age_map)

# For diagnoses (diag_1, diag_2, diag_3) - very high cardinality
# Simplification: Use broader categories or just presence/absence for now.
# Example: Convert to numeric if they are ICD9 codes (this is simplified)
def simplify_diag(diag_col):
    # Replace non-numeric or special codes before attempting conversion
    diabetes_df[diag_col] = diabetes_df[diag_col].str.replace('[^0-9.]', '', regex=True) # Keep only numbers and dots
    diabetes_df[diag_col] = pd.to_numeric(diabetes_df[diag_col], errors='coerce').fillna(0).astype(int)
    # Further binning or categorization could be done here
    # For instance, group by ICD-9 main chapters (e.g., 001-139 infectious, 140-239 neoplasms, etc.)
    # For now, simple numeric conversion or treat as categorical with many levels (later one-hot encoding)

simplify_diag('diag_1')
simplify_diag('diag_2')
simplify_diag('diag_3')

In [24]:
diabetes_df.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,action,reward,age_numeric
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,NO,0,10.0,5
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,No,No,No,Ch,Yes,>30,2,0.0,15
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,No,No,Yes,NO,0,10.0,25
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,No,No,No,No,Ch,Yes,NO,2,10.0,35
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,No,No,No,No,Ch,Yes,NO,1,10.0,45


In [25]:
# Identify categorical and numerical features for the state
# Exclude 'insulin' (it's our action base), 'readmitted' (reward base), 'action', 'reward'
potential_state_cols = diabetes_df.drop(columns=['insulin', 'readmitted', 'action', 'reward', 'age']).columns

categorical_features = []
numerical_features = []

for col in potential_state_cols:
    if diabetes_df[col].dtype == 'object' or diabetes_df[col].nunique() < 20 : # Heuristic for categorical
        categorical_features.append(col)
    elif pd.api.types.is_numeric_dtype(diabetes_df[col]):
        numerical_features.append(col)

print(f"Selected categorical features for state:\n{categorical_features}")
print(f"\nSelected numerical features for state:\n{numerical_features}")

Selected categorical features for state:
['race', 'gender', 'admission_type_id', 'admission_source_id', 'time_in_hospital', 'num_procedures', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'age_numeric']

Selected numerical features for state:
['discharge_disposition_id', 'num_lab_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3']


In [26]:
# Ensure 'age_numeric' is in numerical_features if it wasn't caught
if 'age_numeric' not in numerical_features and 'age_numeric' in diabetes_df.columns:
    numerical_features.append('age_numeric')
if 'age' in categorical_features: # remove original age if age_numeric is used
    categorical_features.remove('age')

# Impute remaining NaNs (e.g., SimpleImputer for numerical, mode for categorical)
# And apply scaling / encoding using ColumnTransformer

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', MinMaxScaler())]), numerical_features),
        ('cat', Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), categorical_features)
    ],
    remainder='drop' # or 'passthrough'
)

In [27]:
# Fit and transform the state features
# Note: Fitting the preprocessor on the whole dataset before split for simplicity here.
# For rigorous evaluation, fit only on training data.
processed_states = preprocessor.fit_transform(diabetes_df)
print("Shape of processed states:", processed_states.shape)

# Get feature names after one-hot encoding for context (optional, d3rlpy doesn't strictly need them)
try:
    ohe_feature_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features)
    all_feature_names = numerical_features + list(ohe_feature_names)
    print(f"Total number of features after processing: {len(all_feature_names)}")
except Exception as e:
    print(f"Could not get OHE feature names: {e}")

Shape of processed states: (99493, 166)
Total number of features after processing: 166


In [28]:
# use this code to free up gpu memory
gc.collect()
# torch.cuda.empty_cache()
print("freed up some memory!")

freed up some memory!


## Formulate the MDP (Markov Decision Process) of the Sepsis dataset

In [29]:
# --- Prepare data for d3rlpy ---
observations = processed_states
actions = diabetes_df['action'].values
rewards = diabetes_df['reward'].values
terminals = np.ones_like(rewards) # Each encounter is a terminal step in this simple model

# Ensure data types
observations = observations.astype('float32')
actions = actions.astype('int32')
rewards = rewards.astype('float32')
terminals = terminals.astype('int32')   # or bool

# Create MDPDataset
dataset = MDPDataset(
    observations=observations,
    actions=actions,
    rewards=rewards,
    terminals=terminals
)

print("MDP created!")

[2m2025-05-25 05:50.02[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int32')], shape=[(1,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(166,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(1,)])[0m
[2m2025-05-25 05:50.02[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.DISCRETE: 2>[0m
[2m2025-05-25 05:50.02[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m4[0m
MDP created!


In [30]:
type(dataset)

d3rlpy.dataset.compat.MDPDataset

In [31]:
help(dataset)

Help on MDPDataset in module d3rlpy.dataset.compat object:

class MDPDataset(d3rlpy.dataset.replay_buffer.ReplayBuffer)
 |  MDPDataset(observations: Union[numpy.ndarray[Any, numpy.dtype[Any]], Sequence[numpy.ndarray[Any, numpy.dtype[Any]]]], actions: numpy.ndarray[typing.Any, numpy.dtype[typing.Any]], rewards: numpy.ndarray[typing.Any, numpy.dtype[numpy.float32]], terminals: numpy.ndarray[typing.Any, numpy.dtype[numpy.float32]], timeouts: Optional[numpy.ndarray[Any, numpy.dtype[numpy.float32]]] = None, transition_picker: Optional[d3rlpy.dataset.transition_pickers.TransitionPickerProtocol] = None, trajectory_slicer: Optional[d3rlpy.dataset.trajectory_slicers.TrajectorySlicerProtocol] = None, action_space: Optional[d3rlpy.constants.ActionSpace] = None, action_size: Optional[int] = None)
 |  
 |  Backward-compability class of MDPDataset.
 |  
 |  This is a wrapper class that has a backward-compatible constructor
 |  interface.
 |  
 |  Args:
 |      observations (ObservationSequence): Obs

In [32]:
dataset.dataset_info

DatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(166,)]), action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=4)

In [33]:
dataset.size()

99493

In [34]:
# dataset.sample_transition()

In [35]:
# help(dataset.sample_transition())

In [36]:
# # Check the following link to know more about methods of MDP:
# # https://d3rlpy.readthedocs.io/en/v2.8.1/references/generated/d3rlpy.dataset.MDPDataset.html

# def fn():
#     state = dataset.sample_transition().observation
#     action = dataset.sample_transition().action
#     reward = dataset.sample_transition().reward
#     next_state = dataset.sample_transition().next_observation
#     next_action = dataset.sample_transition().next_action
#     terminal = dataset.sample_transition().terminal
#     rewards_to_go = dataset.sample_transition().rewards_to_go
#     print("state:\t", state)
#     print("\naction:\t", action)
#     print("\nreward:\t", reward)
#     print("\nnext state:\t", next_state)
#     print("\nnext action:\t", next_action)
#     print("\nterminal:\t", terminal)
#     print("\nrewards to go:\t", rewards_to_go)

# fn()

In [37]:
help(MDPDataset)

Help on class MDPDataset in module d3rlpy.dataset.compat:

class MDPDataset(d3rlpy.dataset.replay_buffer.ReplayBuffer)
 |  MDPDataset(observations: Union[numpy.ndarray[Any, numpy.dtype[Any]], Sequence[numpy.ndarray[Any, numpy.dtype[Any]]]], actions: numpy.ndarray[typing.Any, numpy.dtype[typing.Any]], rewards: numpy.ndarray[typing.Any, numpy.dtype[numpy.float32]], terminals: numpy.ndarray[typing.Any, numpy.dtype[numpy.float32]], timeouts: Optional[numpy.ndarray[Any, numpy.dtype[numpy.float32]]] = None, transition_picker: Optional[d3rlpy.dataset.transition_pickers.TransitionPickerProtocol] = None, trajectory_slicer: Optional[d3rlpy.dataset.trajectory_slicers.TrajectorySlicerProtocol] = None, action_space: Optional[d3rlpy.constants.ActionSpace] = None, action_size: Optional[int] = None)
 |  
 |  Backward-compability class of MDPDataset.
 |  
 |  This is a wrapper class that has a backward-compatible constructor
 |  interface.
 |  
 |  Args:
 |      observations (ObservationSequence): Obse

In [38]:
# dataset.episodes

In [39]:
type(dataset.episodes)

list

In [40]:
len(dataset.episodes)

99493

In [41]:
help(dataset.episodes[0])

Help on Episode in module d3rlpy.dataset.components object:

class Episode(builtins.object)
 |  Episode(observations: Union[numpy.ndarray[Any, numpy.dtype[Any]], Sequence[numpy.ndarray[Any, numpy.dtype[Any]]]], actions: numpy.ndarray[typing.Any, numpy.dtype[typing.Any]], rewards: numpy.ndarray[typing.Any, numpy.dtype[numpy.float32]], terminated: bool) -> None
 |  
 |  Standard episode implementation.
 |  
 |  Args:
 |      observations: Sequence of observations.
 |      actions: Sequence of actions.
 |      rewards: Sequence of rewards.
 |      terminated: Flag of environment termination.
 |  
 |  Methods defined here:
 |  
 |  __delattr__(self, name)
 |      Implement delattr(self, name).
 |  
 |  __eq__(self, other)
 |      Return self==value.
 |  
 |  __hash__(self)
 |      Return hash(self).
 |  
 |  __init__(self, observations: Union[numpy.ndarray[Any, numpy.dtype[Any]], Sequence[numpy.ndarray[Any, numpy.dtype[Any]]]], actions: numpy.ndarray[typing.Any, numpy.dtype[typing.Any]], r

In [42]:
def fn():
    episode = dataset.episodes[0]
    observations = episode.observations
    actions = episode.actions
    rewards = episode.rewards
    terminated = episode.terminated
    print("observation:\t", observations)
    print("\nactions:\t", actions)
    print("\nrewards:\t", rewards)
    print("\nterminated:\t", terminated)


fn()

observation:	 [[0.8888889  0.3053435  0.         0.         0.         0.
  0.25025025 0.         0.         0.         0.         0.
  1.         0.         0.         1.         0.         0.
  0.         0.         0.         0.         0.         1.
  0.         0.         1.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         1.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         1.         0.         0.
  0.         0.         0.         0.         1.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         1.         0.
  1.         0.         0.         1.         0.         0.
  0.         1.         0.         0.         0.         1.
  0.         0.         0.

In [43]:
# Split dataset into training and testing
train_episodes, test_episodes = train_test_split(dataset.episodes, test_size=0.2, random_state=42)

print("train test split done!")
print("No. of train episodes:\t", len(train_episodes))
print("No. of test episodes:\t", len(test_episodes))

train test split done!
No. of train episodes:	 79594
No. of test episodes:	 19899


In [44]:
def episode_list_to_mdp_dataset(episode_list):
    """
    copied from https://github.com/FlemmingKondrup/DeepVent/blob/main/utils/load_utils.py
    """
    states = []
    actions = []
    rewards = []
    terminals = []
    for episode in episode_list:
        for i in range(len(episode.observations)):
            states.append(episode.observations[i])
            actions.append(episode.actions[i])
            rewards.append(episode.rewards[i])
            terminals.append(0)
        terminals[-1] = 1
    
    return MDPDataset(
        np.array(states),
        np.array(actions),
        np.array(rewards),
        np.array(terminals)
    )
    

In [45]:
train_data = episode_list_to_mdp_dataset(train_episodes)
test_data = episode_list_to_mdp_dataset(test_episodes)
print("train test split done!")

[2m2025-05-25 05:50.04[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int32')], shape=[(1,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(166,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(1,)])[0m
[2m2025-05-25 05:50.04[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.DISCRETE: 2>[0m
[2m2025-05-25 05:50.04[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m4[0m
[2m2025-05-25 05:50.04[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int32')], shape=[(1,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(166,)])[0m [36mreward_signature[0m=[35mSignature(dt

In [54]:
# use this code to free up gpu memory
gc.collect()
torch.cuda.empty_cache()
print("freed up some memory!")

freed up some memory!


## Gymnasium MDP

## Apply Offline DRL on the Diabetes dataset

In [47]:
# --- Initialize Algorithms ---
# For GPU usage, add use_gpu=True if CUDA is available (e.g., d3rlpy.algos.CQL(use_gpu=True))
# Kaggle notebooks often have GPUs.

In [48]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [49]:
# 1. Behavioral Cloning (BC)

bc_optim_factory = d3rlpy.optimizers.AdamFactory(weight_decay=1e-4)

bc = d3rlpy.algos.DiscreteBCConfig(learning_rate=3e-4,
                                   optim_factory=bc_optim_factory,
                                   batch_size=256,
                                   beta=0.5, # Action selection temperature. Lower for more deterministic.
                                   # use_gpu=True # Uncomment if GPU is available
                                  ).create(device=device)

print("created discrete behavioral cloning model...")

created discrete behavioral cloning model...


In [50]:
help(bc.fit)

Help on method fit in module d3rlpy.algos.qlearning.base:

fit(dataset: d3rlpy.dataset.replay_buffer.ReplayBufferBase, n_steps: int, n_steps_per_epoch: int = 10000, experiment_name: Optional[str] = None, with_timestamp: bool = True, logging_steps: int = 500, logging_strategy: d3rlpy.constants.LoggingStrategy = <LoggingStrategy.EPOCH: 'epoch'>, logger_adapter: d3rlpy.logging.logger.LoggerAdapterFactory = <d3rlpy.logging.file_adapter.FileAdapterFactory object at 0x79c2ffe17990>, show_progress: bool = True, save_interval: int = 1, evaluators: Optional[dict[str, d3rlpy.metrics.evaluators.EvaluatorProtocol]] = None, callback: Optional[Callable[[Self, int, int], NoneType]] = None, epoch_callback: Optional[Callable[[Self, int, int], NoneType]] = None) -> list[tuple[int, dict[str, float]]] method of d3rlpy.algos.qlearning.bc.DiscreteBC instance
    Trains with given dataset.
    
    .. code-block:: python
    
        algo.fit(episodes, n_steps=1000000)
    
    Args:
        dataset: ReplayB

In [51]:
bc.fit(dataset=train_data,
       n_steps=3)

[2m2025-05-25 05:50.04[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(166,)]), action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=4)[0m
[2m2025-05-25 05:50.04[0m [[32m[1mdebug    [0m] [1mBuilding models...            [0m
[2m2025-05-25 05:50.05[0m [[32m[1mdebug    [0m] [1mModels have been built.       [0m
[2m2025-05-25 05:50.05[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/DiscreteBC_20250525055005[0m
[2m2025-05-25 05:50.05[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [166], 'action_size': 4, 'config': {'type': 'discrete_bc', 'params': {'batch_size': 256, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, '

[]

In [53]:
# os.listdir("../working/d3rlpy_logs/DiscreteBC_20250525044327")

In [56]:
# import json

# # Open and read the JSON file
# with open('../working/d3rlpy_logs/DiscreteBC_20250525044327/params.json', 'r') as file:
#     params_json_file = json.load(file)

# # Print the data
# print(params_json_file)

In [None]:
# # 2. Neural Fitted Q-iteration (NFQ) / Fitted Q-Iteration (FQI)
# nfq = d3rlpy.algos.NFQ(learning_rate=3e-4,
#           batch_size=256,   
#           n_epochs=10, # Adjust epochs as needed
#           # use_gpu=True
#          )

In [None]:
# # 3. DQN (Offline variant)
# dqn = d3rlpy.algos.DQN(learning_rate=3e-4,
#           batch_size=256,
#           n_epochs=10, # Adjust epochs as needed
#           # use_gpu=True
#          )

In [58]:
# 4. Conservative Q-Learning (CQL)
disc_cql = d3rlpy.algos.DiscreteCQLConfig(alpha=5.0, # Higher alpha for more conservatism
                                          learning_rate=3e-4,
                                          batch_size=256,
                                          # n_epochs=10, # Adjust epochs as needed
                                          # gamma=,
                                          # use_gpu=True
                                         ).create(device=device)

print("created discrete CQL model...")

created discrete CQL model...


In [None]:
# !rm -rf d3rlpy_logs/

In [59]:
os.listdir("../working/")

['.virtual_documents', 'd3rlpy_logs']

In [60]:
disc_cql.fit(dataset=train_data,
             n_steps=10000)

[2m2025-05-25 05:50.49[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(166,)]), action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=4)[0m
[2m2025-05-25 05:50.49[0m [[32m[1mdebug    [0m] [1mBuilding models...            [0m
[2m2025-05-25 05:50.49[0m [[32m[1mdebug    [0m] [1mModels have been built.       [0m
[2m2025-05-25 05:50.49[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/DiscreteCQL_20250525055049[0m
[2m2025-05-25 05:50.49[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [166], 'action_size': 4, 'config': {'type': 'discrete_cql', 'params': {'batch_size': 256, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}},

Epoch 1/1:   0%|          | 0/10000 [00:00<?, ?it/s]

[2m2025-05-25 05:52.26[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20250525055049: epoch=1 step=10000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.004800861620903015, 'time_algorithm_update': 0.004588833856582641, 'loss': 6.947201228094101, 'td_loss': 4.839465103936195, 'conservative_loss': 0.4215472247123718, 'time_step': 0.00957240936756134}[0m [36mstep[0m=[35m10000[0m
[2m2025-05-25 05:52.26[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20250525055049/model_10000.d3[0m


[(1,
  {'time_sample_batch': 0.004800861620903015,
   'time_algorithm_update': 0.004588833856582641,
   'loss': 6.947201228094101,
   'td_loss': 4.839465103936195,
   'conservative_loss': 0.4215472247123718,
   'time_step': 0.00957240936756134})]

In [61]:
def print_episode_info(idx):
    episode = test_data.episodes[idx]
    print("observations:\n", episode.observations[idx])
    print("\naction:\t", episode.actions[idx])
    print("reward:\t", episode.rewards[idx])


In [62]:
print_episode_info(0)

observations:
 [0.         0.         0.1375     0.07142857 0.         0.
 0.7857858  0.5935936  0.25025025 0.7777778  0.         0.
 1.         0.         0.         1.         0.         0.
 1.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         1.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         1.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         1.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         1.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         1.         0.
 1.         0.         0.         1.         0.         0.
 0.         1.         0.         0.         0.         1.
 0.         0.         0.         1.     

In [63]:
test_data.episodes[0].observations.shape
# np.expand_dims(test_data.episodes[0], axis=0)

(1, 166)

In [64]:
np.expand_dims(test_data.episodes[0].observations, axis=0).shape

(1, 1, 166)

In [65]:
test_data.size()

19899

In [66]:
def fn(idx):
    episode = test_data.episodes[idx]
    obs = np.expand_dims(episode.observations, axis=0)
    real_action = episode.actions[0][0]
    # predict action
    action = disc_cql.predict(obs)
    # predict action-value
    action_value = disc_cql.predict_value(obs, action)
    print("Action:\t\t", action[0])
    print("Actual Action:\t", real_action)
    print("Action-value:\t", action_value)



fn(42)

Action:		 1
Actual Action:	 1
Action-value:	 [1.8559906]


In [67]:
def accuracy_cql():
    test_size = test_data.size()
    correct = 0
    incorrect = 0 
    for i in range(test_size):
        episode = test_data.episodes[i]
        obs = np.expand_dims(episode.observations, axis=0)
        real_action = episode.actions[0][0]
        # predict action
        action = disc_cql.predict(obs)
        # predict action-value
        action_value = disc_cql.predict_value(obs, action)
        if real_action == action[0]:
            correct += 1
        else:
            incorrect += 1
        # print("Action:\t\t", action[0])
        # print("Actual Action:\t", real_action)
        # print("Action-value:\t", action_value)
    acc = correct/test_size
    print("No. correct actions:\t", correct)
    print("No. incorrect actions:\t", incorrect)
    print("Accuracy:\t\t", acc)
    

In [68]:
accuracy_cql()

No. correct actions:	 15823
No. incorrect actions:	 4076
Accuracy:		 0.7951655862103624


In [84]:
test_data.episodes[0]

Episode(observations=array([[0.        , 0.        , 0.1375    , 0.07142857, 0.        ,
        0.        , 0.7857858 , 0.5935936 , 0.25025025, 0.7777778 ,
        0.        , 0.        , 1.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.     

In [69]:
# use this code to free up gpu memory
gc.collect()
torch.cuda.empty_cache()
print("freed up some memory!")

freed up some memory!


## Formulate Multi-Step MDP of Diabetes Dataset

## Apply Offline DRL on the Multi-Step Diabetes dataset

## Sepsis prediction dataset EDA

In [19]:
# sepsis_path = "../input/prediction-of-sepsis/"
# os.listdir(sepsis_path)

['SHA256SUMS.txt',
 'utility_sepsis_diagram.svg',
 'physionet_challenge_2019_ccm_manuscript.pdf',
 'training_setA',
 'utility_nonsepsis_diagram.svg',
 'LICENSE.txt',
 'training_setB',
 'Dataset.csv']

In [20]:
# sepsis_df = pd.read_csv(os.path.join(sepsis_path, "Dataset.csv"))
# sepsis_df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0.1,Unnamed: 0,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Patient_ID
0,0,0,,,,,,,,,...,,,68.54,0,,,-0.02,1,0,17072
1,1,1,65.0,100.0,,,72.0,,16.5,,...,,,68.54,0,,,-0.02,2,0,17072
2,2,2,78.0,100.0,,,42.5,,,,...,,,68.54,0,,,-0.02,3,0,17072
3,3,3,73.0,100.0,,,,,17.0,,...,,,68.54,0,,,-0.02,4,0,17072
4,4,4,70.0,100.0,,129.0,74.0,69.0,14.0,,...,,330.0,68.54,0,,,-0.02,5,0,17072
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552205,21,21,83.0,99.0,,121.0,77.0,54.0,22.0,,...,,,88.00,0,1.0,0.0,-2.93,22,0,113911
1552206,22,22,80.0,92.0,,102.0,73.0,51.0,24.0,,...,,,88.00,0,1.0,0.0,-2.93,23,0,113911
1552207,23,23,95.0,97.0,36.7,128.5,83.0,58.5,25.0,,...,,,88.00,0,1.0,0.0,-2.93,24,0,113911
1552208,24,24,104.0,99.0,,127.0,85.0,59.0,24.0,,...,,,88.00,0,1.0,0.0,-2.93,25,0,113911


In [21]:
# # use this code to free up gpu memory
# gc.collect()
# # torch.cuda.empty_cache()
# print("freed up some memory!")

freed up some memory!


## Applying Offline DRL on the Sepsis dataset