In [1]:
import os 
import numpy as np

# Root folder of the RAINS data (hospital folders, README and the metadata spreadsheet).
path = "/media/cansu/DiskSpace/Cansu/HE_Prediction/RAINS_vicorob"

# Alphabetically sorted directory listing, printed as a quick sanity check.
all_files = sorted(os.listdir(path))
print(all_files)

['Hospital Clínic', 'Hospital Josep Trueta', 'Hospital Sant Pau', 'Hospital Vall Hebrón', 'README', 'all_data_inc_trueta.xlsx']


In [2]:
import pandas as pd
# Load the per-patient table (volumes and HE labels) and preview the first five rows.
excel_file = "/media/cansu/DiskSpace/Cansu/HE_Prediction/RAINS_vicorob/all_data_inc_trueta.xlsx"
df = pd.read_excel(excel_file)
df.head(5)

Unnamed: 0,Hospital name,id,"Basal volume, ml","FU volume, ml","Absolute vol diff, ml",Relative vol diff (FU_vol/Basal_vol),HE vicorob,HE real
0,Hospital Clínic,2098,5.02,4.87,-0.15,0.97,0.0,
1,Hospital Clínic,233632,36.07,39.24,3.171,1.088,0.0,
2,Hospital Clínic,261065,9.5,11.41,1.915,1.202,0.0,
3,Hospital Clínic,34333,,,,,,
4,Hospital Clínic,397280,9.37,8.33,-1.038,0.889,0.0,


In [3]:
# For every case from Hospital Sant Pau and Hospital Vall Hebrón, turn the id into a
# string of at least 3 characters by adding leading zeros (e.g. 16 -> "016").
# NOTE(review): presumably this matches the patient folder names on disk — confirm.
PADDED_ID_HOSPITALS = ['Hospital Sant Pau', 'Hospital Vall Hebrón']
needs_padding = df['Hospital name'].isin(PADDED_ID_HOSPITALS)

# Cast to object first: writing strings into a numeric column relies on implicit
# upcasting, which pandas has deprecated (it emits a FutureWarning and will raise).
df['id'] = df['id'].astype(object)
df.loc[needs_padding, 'id'] = df.loc[needs_padding, 'id'].map(lambda x: str(x).zfill(3))

# Save the updated dataframe back to the Excel file.
# NOTE: this overwrites the spreadsheet that was loaded above.
output_excel = "/media/cansu/DiskSpace/Cansu/HE_Prediction/RAINS_vicorob/all_data_inc_trueta.xlsx"
df.to_excel(output_excel, index=False)
print(f"Updated dataframe saved to {output_excel}")

Updated dataframe saved to /media/cansu/DiskSpace/Cansu/HE_Prediction/RAINS_vicorob/all_data_inc_trueta.xlsx


In [4]:
# Re-display the sorted directory listing computed in the first cell.
all_files

['Hospital Clínic',
 'Hospital Josep Trueta',
 'Hospital Sant Pau',
 'Hospital Vall Hebrón',
 'README',
 'all_data_inc_trueta.xlsx']

In [5]:
# Build a CSV that makes the data easy to read later:
# for each patient, find the Basal and FU files and store their paths in df columns.

# Add 4 new path columns (image + mask for both time points), initially empty.
for path_col in ('basal_img_path', 'basal_mask_path', 'fu_img_path', 'fu_mask_path'):
    df[path_col] = ''

base_path = "/media/cansu/DiskSpace/Cansu/HE_Prediction/RAINS_vicorob"

def collect_files(hospital_dir):
    """Recursively list medical-image files below ``hospital_dir``.

    A file is kept when its lower-cased name ends with ``.nii``, ``.nii.gz``,
    ``.mha`` or ``.nrrd``.

    Args:
        hospital_dir: Directory to walk.

    Returns:
        list[str]: Full paths (directory joined with file name) in ``os.walk`` order.
    """
    image_suffixes = ('.nii', '.nii.gz', '.mha', '.nrrd')
    return [
        os.path.join(folder, entry)
        for folder, _, entries in os.walk(hospital_dir)
        for entry in entries
        if entry.lower().endswith(image_suffixes)
    ]

# File names expected inside every <hospital>/<id>/Basal and <hospital>/<id>/FU1 folder.
IMAGE_NAME = "CT_SS.nii.gz"
MASK_NAME = "hematoma_mask_vicorob_reviewed_reoriented.nii.gz"

# Cases lacking the Basal and/or FU image: (hospital, id, has_basal, has_fu).
# NOTE(review): only the images are tracked here, not the masks, while the later
# filter also requires both masks — so the two counts can differ.
missing = []
for hospital in df['Hospital name'].unique():
    hosp_dir = os.path.join(base_path, hospital)

    for index, row in df[df['Hospital name'] == hospital].iterrows():
        pid = str(row['id'])
        timepoint_dirs = {
            'basal': os.path.join(hosp_dir, pid, 'Basal'),
            'fu': os.path.join(hosp_dir, pid, 'FU1'),
        }

        # Record each path, or an empty string when the file is not on disk.
        found = {}
        for timepoint, timepoint_dir in timepoint_dirs.items():
            for kind, file_name in (('img', IMAGE_NAME), ('mask', MASK_NAME)):
                candidate = os.path.join(timepoint_dir, file_name)
                found[f'{timepoint}_{kind}'] = os.path.exists(candidate)
                df.at[index, f'{timepoint}_{kind}_path'] = candidate if found[f'{timepoint}_{kind}'] else ''

        if not (found['basal_img'] and found['fu_img']):
            missing.append((hospital, pid, found['basal_img'], found['fu_img']))

# Summary of incomplete cases (at most the first 50 are shown).
print(f"Missing entries (hospital, id, has_basal, has_fu): {len(missing)}")
if missing:
    print(missing[:50])

# Save the table augmented with the path columns.
df.to_csv('/media/cansu/DiskSpace/Cansu/HE_Prediction/all_data_with_paths.csv', index=False)
print('Saved CSV with paths to all_data_with_paths.csv')

Missing entries (hospital, id, has_basal, has_fu): 24
[('Hospital Clínic', '34333', True, False), ('Hospital Clínic', '4084062', True, False), ('Hospital Clínic', '4731571', True, False), ('Hospital Clínic', '5229590', True, False), ('Hospital Clínic', '5281637', True, False), ('Hospital Clínic', '5295375', True, False), ('Hospital Clínic', '600801', True, False), ('Hospital Clínic', '70293754', True, False), ('Hospital Clínic', '70588150', True, False), ('Hospital Sant Pau', '016', True, False), ('Hospital Sant Pau', '033', True, False), ('Hospital Vall Hebrón', '016', True, False), ('Hospital Vall Hebrón', '017', True, False), ('Hospital Vall Hebrón', '023', True, False), ('Hospital Vall Hebrón', '053', True, False), ('Hospital Vall Hebrón', '054', True, False), ('Hospital Vall Hebrón', '056', False, True), ('Hospital Vall Hebrón', '097', True, False), ('Hospital Vall Hebrón', '121', True, False), ('Hospital Vall Hebrón', '126', True, False), ('Hospital Vall Hebrón', '139', True, Fal

In [8]:
# Keep only the cases for which all four files (Basal/FU image and mask) were found.
required_path_cols = ['basal_img_path', 'fu_img_path', 'basal_mask_path', 'fu_mask_path']
has_all_files = df[required_path_cols].ne('').all(axis=1)
df_filtered = df[has_all_files]
df_filtered.to_csv('/media/cansu/DiskSpace/Cansu/HE_Prediction/data_with_both_basal_fu.csv', index=False)
print('Saved filtered CSV with both basal and fu images to data_with_both_basal_fu.csv')

Saved filtered CSV with both basal and fu images to data_with_both_basal_fu.csv


In [9]:
# Compare the filtered dataframe with the original one and report how many rows were dropped.
n_original, n_filtered = len(df), len(df_filtered)
print(f"Original dataframe entries: {n_original}")
print(f"Filtered dataframe entries: {n_filtered}")
print(f"Number of entries removed: {n_original - n_filtered}")

Original dataframe entries: 482
Filtered dataframe entries: 456
Number of entries removed: 26


In [10]:
# Number of entries from each hospital in the filtered dataframe.
entries_per_hospital = df_filtered['Hospital name'].value_counts()
print("Entries per hospital in filtered dataframe:")
print(entries_per_hospital)

Entries per hospital in filtered dataframe:
Hospital name
Hospital Josep Trueta    209
Hospital Vall Hebrón     167
Hospital Clínic           47
Hospital Sant Pau         33
Name: count, dtype: int64


In [12]:
# For each hospital, tabulate how many filtered cases carry each "HE vicorob" value.
for hospital in df_filtered['Hospital name'].unique():
    from_this_hospital = df_filtered['Hospital name'] == hospital
    print(f"Hospital: {hospital}")
    print(df_filtered.loc[from_this_hospital, 'HE vicorob'].value_counts())

Hospital: Hospital Clínic
HE vicorob
0.0    39
1.0     8
Name: count, dtype: int64
Hospital: Hospital Sant Pau
HE vicorob
0.0    24
1.0     9
Name: count, dtype: int64
Hospital: Hospital Vall Hebrón
HE vicorob
0.0    137
1.0     30
Name: count, dtype: int64
Hospital: Hospital Josep Trueta
HE vicorob
0.0    166
1.0     43
Name: count, dtype: int64


In [None]:
# # Remove files with specific name in Hospital Vall Hebrón folder - space cleanup
# path_hospital = "/media/cansu/DiskSpace/Cansu/HE_Prediction/RAINS_vicorob/Hospital Vall Hebrón"

# for cases in os.listdir(path_hospital):
#     basal_folder = os.path.join(path_hospital, cases, "Basal")
#     fu_folder = os.path.join(path_hospital, cases, "FU1")

#     # remove every file whose name contains the target mask file name (currently hematoma_mask_fold4.nii.gz)
#     for folder in [basal_folder, fu_folder]:
#         if os.path.exists(folder):
#             for file in os.listdir(folder):
#                 if "hematoma_mask_fold4.nii.gz" in file:
#                     # print(f"Removing file: {os.path.join(folder, file)}")
#                     os.remove(os.path.join(folder, file))

# Redesigning the Dataset and Dataloading process

In [16]:
df = pd.read_csv("/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/data_with_both_basal_fu.csv")

# Convert all ids to strings (zero-padded to at least 3 characters) and write them to one
# txt file per hospital. Trueta is currently included: the line excluding it is disabled.
df['id'] = df['id'].astype(str).str.zfill(3)
hospitals = df['Hospital name'].unique().tolist()
# hospitals.remove('Hospital Josep Trueta')

for hospital in hospitals:
    output_txt = f"/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/{hospital.replace(' ', '_')}_ids.txt"
    ids = df.loc[df['Hospital name'] == hospital, 'id'].tolist()
    # One id per line, each terminated by a newline.
    with open(output_txt, 'w') as f:
        f.writelines(f"{pid}\n" for pid in ids)
    print(f"Saved IDs for {hospital} to {output_txt}")

Saved IDs for Hospital Clínic to /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/Hospital_Clínic_ids.txt
Saved IDs for Hospital Sant Pau to /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/Hospital_Sant_Pau_ids.txt
Saved IDs for Hospital Vall Hebrón to /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/Hospital_Vall_Hebrón_ids.txt
Saved IDs for Hospital Josep Trueta to /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/Hospital_Josep_Trueta_ids.txt


In [None]:
import os
# from sklearn.utils.class_weight import compute_class_weight
# import sys; sys.path.insert(0, os.path.abspath("../"))
# from dataset import *
# from utils import *
# from model import *
import torch.utils.data as data
import random
import argparse
import yaml
from tqdm import tqdm
import pandas as pd
import SimpleITK as sitk
from torch.utils.data import Dataset
from pathlib import Path


class ISLES24(Dataset):
    """ISLES24 dataset that reads one HDF5 file per sample.

    Sample names come from ``<base_dir>/splits/fold_{fold}_{split}_files.txt``:
    one entry per line, where anything after the first comma is ignored.
    Blank lines are skipped and surrounding whitespace is stripped.

    ``fold`` is mandatory. There is no fallback to a legacy ``{split}_files.txt``
    file: a missing ``fold`` or a missing fold file raises ``FileNotFoundError``.

    Args:
        base_dir: Root directory containing ``splits/`` and
            ``h5_files_preprocessed_no_znorm/``.
        split: Split name used to build the split-file name.
        transform: Optional callable applied to the sample dict.
        fold: Fold number used to build the split-file name.

    Raises:
        FileNotFoundError: If ``fold`` is ``None`` or the split file does not exist.
    """

    def __init__(self, base_dir=None, split='train', transform=None, fold: int = None):
        self._base_dir = base_dir
        self.transform = transform
        self.sample_list = []

        splits_dir = os.path.join(self._base_dir, 'splits')

        if fold is None:
            raise FileNotFoundError("Fold argument is required but not provided.")

        path = os.path.join(splits_dir, f'fold_{fold}_{split}_files.txt')
        if not os.path.exists(path):
            raise FileNotFoundError(f"Fold-specific file not found: {path}")
        print(f"Using fold {fold} split file: {os.path.basename(path)}")

        with open(path, 'r') as f:
            # Keep the first comma-separated field of each line; drop empty entries so a
            # trailing blank line does not become a bogus sample name.
            names = (line.split(',')[0].strip() for line in f)
            self.image_list = [name for name in names if name]

        print("Total {} samples in {} set.".format(len(self.image_list), split))

    def __len__(self):
        """Return the number of samples listed in the split file."""
        return len(self.image_list)

    @staticmethod
    def _read_first_dataset(h5f, keys, h5_path):
        """Return the content of the first dataset in ``keys`` present in ``h5f``.

        Raises:
            KeyError: If none of ``keys`` exists in the file.
        """
        for key in keys:
            if key in h5f:
                return h5f[key][:]
        wanted = ' or '.join(repr(key) for key in keys)
        raise KeyError(f"No {wanted} dataset found in {h5_path}")

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx: Integer position in the split list, or a file name given directly.

        Returns:
            dict: ``image``, ``label`` (cast to uint8), ``patient_id`` (the part of the
            file's base name before the first underscore) and ``idx`` (as passed in),
            after the optional transform.

        Raises:
            KeyError: If the file has neither 'data'/'image' or neither 'label'/'gt'.
        """
        # Imported here because the notebook's import cell does not import h5py.
        import h5py

        image_name = idx if isinstance(idx, str) else self.image_list[idx]

        h5_path = os.path.join(self._base_dir, "h5_files_preprocessed_no_znorm", image_name)
        with h5py.File(h5_path, 'r') as h5f:
            # Files may store the image under 'data' or 'image', the label under 'label' or 'gt'.
            image = self._read_first_dataset(h5f, ('data', 'image'), h5_path)
            label = self._read_first_dataset(h5f, ('label', 'gt'), h5_path)

        # basename() makes this robust to image_name being a full path.
        patient_id = os.path.basename(image_name).split('_')[0]

        sample = {'image': image, 'label': label.astype(np.uint8), 'patient_id': patient_id}
        if self.transform:
            sample = self.transform(sample)
        # Keep the original idx for backward compatibility.
        sample["idx"] = idx

        return sample
    


class PredictionDataset(Dataset):
    """Dataset of basal CT images and hematoma masks with HE labels, looked up by patient id.

    Only the "test on other hospitals" configuration is implemented
    (``split='test'`` with ``test_other_hospitals=True``). In that mode the patient ids
    are read from the per-hospital id files listed in ``OTHER_HOSPITALS_ID_FILES``,
    located in ``OTHER_HOSPITALS_SPLITS_DIR``.

    Ids are handled as zero-padded strings (at least 3 characters) on both sides, so the
    id files and the metadata CSV compare equal. The same id can exist in more than one
    hospital, so each id remembers the hospital whose file it came from and that
    hospital's metadata row is preferred.

    Args:
        base_dir: Project root (stored, not otherwise used here).
        split: Split name; only ``'test'`` is supported.
        transform: Optional callable applied to the sample dict.
        fold: Fold number (stored, not otherwise used here).
        md_path: Path of the metadata CSV. The code reads the columns ``id``,
            ``Hospital name`` (optional), ``basal_img_path``, ``basal_mask_path``,
            ``HE vicorob`` and ``HE real``.
        test_other_hospitals: Must be ``True`` together with ``split='test'``.

    Raises:
        NotImplementedError: For any other split/flag combination. Previously this
            surfaced later as an AttributeError on ``image_list``.
    """

    # Directory holding the per-hospital id lists.
    OTHER_HOSPITALS_SPLITS_DIR = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/splits"
    # (hospital name used for disambiguation, id-list file name), in loading order.
    # NOTE(review): hospital names are assumed to match the 'Hospital name' column — confirm.
    OTHER_HOSPITALS_ID_FILES = (
        ("Hospital Clínic", "Hospital_Clínic_ids.txt"),
        ("Hospital Josep Trueta", "Hospital_Josep_Trueta_pseudo_ids.txt"),
        ("Hospital Sant Pau", "Hospital_Sant_Pau_ids.txt"),
        ("Hospital Vall Hebrón", "Hospital_Vall_Hebrón_ids.txt"),
    )

    def __init__(self, base_dir=None, split='train', transform=None, fold: int = None,
                 md_path: Path = None, test_other_hospitals: bool = False):
        self._base_dir = base_dir
        self.split = split
        self.fold = fold
        # Must be stored: __getitem__ reads self.transform.
        self.transform = transform

        # Read ids as text so leading zeros survive, then normalise the padding.
        self.md_df = pd.read_csv(md_path, dtype={'id': str})
        self.md_df['id'] = self.md_df['id'].str.strip().str.zfill(3)

        if not (test_other_hospitals and split == 'test'):
            raise NotImplementedError(
                "PredictionDataset currently supports only split='test' with "
                "test_other_hospitals=True."
            )

        ids, hospitals = [], []
        for hospital, file_name in self.OTHER_HOSPITALS_ID_FILES:
            id_file = os.path.join(self.OTHER_HOSPITALS_SPLITS_DIR, file_name)
            hospital_ids = (
                pd.read_csv(id_file, header=None, dtype=str)[0]
                .str.strip()
                .str.zfill(3)
                .tolist()
            )
            ids.extend(hospital_ids)
            hospitals.extend([hospital] * len(hospital_ids))

        # Plain lists so that integer indexing returns a single id (a DataFrame would
        # index columns instead of rows).
        self.image_list = ids
        self._hospital_list = hospitals
        print("Total {} samples in test set from other hospitals.".format(len(self.image_list)))

    def __len__(self):
        """Return the number of patient ids in the dataset."""
        return len(self.image_list)

    def _lookup_row(self, idx):
        """Resolve ``idx`` to ``(patient_id, matching metadata rows)``.

        Args:
            idx: Integer position in ``image_list`` or a patient id string.

        Returns:
            tuple: The normalised id and a non-empty DataFrame whose first row is used.

        Raises:
            ValueError: If the id is not present in the metadata.
        """
        if isinstance(idx, str):
            image_name, hospital = idx.strip().zfill(3), None
        else:
            image_name, hospital = self.image_list[idx], self._hospital_list[idx]

        rows = self.md_df[self.md_df['id'] == image_name]
        # The same id may belong to several hospitals; prefer the one the id came from.
        if len(rows) > 1 and hospital is not None and 'Hospital name' in rows.columns:
            same_hospital = rows[rows['Hospital name'] == hospital]
            if not same_hospital.empty:
                rows = same_hospital

        if rows.empty:
            raise ValueError(f"ID {image_name} not found in metadata dataframe.")
        return image_name, rows

    def __getitem__(self, idx):
        """Load the basal image and mask for one patient.

        Returns:
            dict: ``image``, ``mask`` (cast to uint8), ``patient_id``, ``pseudo_label``
            (``HE vicorob``), ``gt_label`` (``HE real``) and ``idx`` (as passed in),
            after the optional transform.
        """
        image_name, row = self._lookup_row(idx)

        basal_img_path = row['basal_img_path'].values[0]
        basal_mask_path = row['basal_mask_path'].values[0]

        # Read both volumes with SimpleITK and convert them to numpy arrays.
        basal_img = sitk.GetArrayFromImage(sitk.ReadImage(basal_img_path))
        basal_mask = sitk.GetArrayFromImage(sitk.ReadImage(basal_mask_path))

        sample = {
            'image': basal_img,
            'mask': basal_mask.astype(np.uint8),
            'patient_id': image_name,
            'pseudo_label': row['HE vicorob'].values[0],
            'gt_label': row['HE real'].values[0],
        }

        if self.transform:
            sample = self.transform(sample)

        sample["idx"] = idx
        return sample

from torch.utils.data import DataLoader

repo_path = os.getcwd()
CONFIG_PATH = repo_path + '/configs'

# The config is loaded from a fixed absolute location. The previous
# os.path.join(CONFIG_PATH, <absolute path>) discarded CONFIG_PATH anyway, because
# joining with an absolute path drops everything before it.
CONFIG_FILE = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/configs/config_eff_t5_repeat_1_othermain.yml"
with open(CONFIG_FILE) as file:
    config = yaml.safe_load(file)

gpu = config["GPU"]
# Environment values must be strings; YAML yields an int for an entry such as `GPU: 0`.
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)
threshold_name = config["threshold_name"]
experiment_sample = config["experiment_sample"]
threshold = config["threshold_percentage"]  # 0.5

# # NUM_WORKERS=8
# # MAX_EPOCHS=20
# # MASK=True
# # USE_2D=True
# # FILTER_SEGMENTED = True
# # PATIENCE = config["PATIENCE_EARLYSTOP"]
# # MODEL = config["MODEL"]
# # BACKBONE = config["BACKBONE"]
# # model = f"{MODEL}"
# # gradient_accumulation_steps =  1
# # pw_based = "mean" # the way the pw is calculated.
# # test_pw_based = "mean"
# # no_ivh = config["NO_IVH"] # Eliminate ivh cases from the dataset
# # task = config["TASK"]
# # lesion = config["LESION"]
# # test_type = config["TEST_TYPE"]

# Metadata table with the image/mask paths and HE labels of every usable case.
md_path = repo_path + "/data/data_with_both_basal_fu.csv"
df = pd.read_csv(md_path)

# Smoke test: build the external-hospital test set and wrap it in a DataLoader.
dataset = PredictionDataset(base_dir=repo_path, split='test', transform=None, fold=1,
                            md_path=md_path, test_other_hospitals=True)
dataloader = DataLoader(dataset, batch_size=2, num_workers=4, shuffle=False)
dataloader

Total 250 samples in test set from other hospitals.


<torch.utils.data.dataloader.DataLoader at 0x798e635c1ab0>

In [10]:
# Pull the label, identifier and path columns out of df as plain numpy arrays.
pseudo_labels = df['HE vicorob'].to_numpy()
gt_labels = df['HE real'].to_numpy()
patient_id = df['id'].to_numpy()
hospital_bname = df['Hospital name'].to_numpy()
basal_img_path = df['basal_img_path'].to_numpy()
basal_mask_path = df['basal_mask_path'].to_numpy()
fu_img_path = df['fu_img_path'].to_numpy()
fu_mask_path = df['fu_mask_path'].to_numpy()

In [11]:
# Inspect the pseudo-label ("HE vicorob") array extracted above.
pseudo_labels

array([0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
       0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0.,
       0., 0., 0., 1., 1.