In [1]:
import os 
import numpy as np

path = "/media/cansu/DiskSpace/Cansu/HE_Prediction/RAINS_vicorob"
all_files = os.listdir(path)
all_files.sort()
print(all_files)

['Hospital Clínic', 'Hospital Josep Trueta', 'Hospital Sant Pau', 'Hospital Vall Hebrón', 'README', 'all_data_inc_trueta.xlsx']


In [2]:
import pandas as pd
excel_file = "/media/cansu/DiskSpace/Cansu/HE_Prediction/RAINS_vicorob/all_data_inc_trueta.xlsx"
df = pd.read_excel(excel_file)
df.head()

Unnamed: 0,Hospital name,id,"Basal volume, ml","FU volume, ml","Absolute vol diff, ml",Relative vol diff (FU_vol/Basal_vol),HE vicorob,HE real
0,Hospital Clínic,2098,5.02,4.87,-0.15,0.97,0.0,
1,Hospital Clínic,233632,36.07,39.24,3.171,1.088,0.0,
2,Hospital Clínic,261065,9.5,11.41,1.915,1.202,0.0,
3,Hospital Clínic,34333,,,,,,
4,Hospital Clínic,397280,9.37,8.33,-1.038,0.889,0.0,


In [3]:
# for all the cases of hospital sant pau, make their ids 3 digits by adding leading zeros
df.loc[df['Hospital name'] == 'Hospital Sant Pau', 'id'] = df.loc[df['Hospital name'] == 'Hospital Sant Pau', 'id'].apply(lambda x: str(x).zfill(3))
df.loc[df['Hospital name'] == 'Hospital Vall Hebrón', 'id'] = df.loc[df['Hospital name'] == 'Hospital Vall Hebrón', 'id'].apply(lambda x: str(x).zfill(3))
df.head()

# save updated dataframe to excel file
output_excel = "/media/cansu/DiskSpace/Cansu/HE_Prediction/RAINS_vicorob/all_data_inc_trueta.xlsx"
df.to_excel(output_excel, index=False)
print(f"Updated dataframe saved to {output_excel}")

Updated dataframe saved to /media/cansu/DiskSpace/Cansu/HE_Prediction/RAINS_vicorob/all_data_inc_trueta.xlsx


In [4]:
all_files

['Hospital Clínic',
 'Hospital Josep Trueta',
 'Hospital Sant Pau',
 'Hospital Vall Hebrón',
 'README',
 'all_data_inc_trueta.xlsx']

In [5]:
# create a csv file to read the data easily.
# For each patient find Basal and FU paths and save to df columns

# add 2 new columns for the paths
df['basal_img_path'] = ''
df['basal_mask_path'] = ''
df['fu_img_path'] = ''
df['fu_mask_path'] = ''

base_path = "/media/cansu/DiskSpace/Cansu/HE_Prediction/RAINS_vicorob"

def collect_files(hospital_dir):
    files = []
    for root, _, filenames in os.walk(hospital_dir):
        for fname in filenames:
            if fname.lower().endswith(('.nii', '.nii.gz', '.mha', '.nrrd')):
                files.append(os.path.join(root, fname))
    return files

missing = []
for hospital in df['Hospital name'].unique():
    hosp_dir = os.path.join(base_path, hospital)
    hospital_df = df[df['Hospital name'] == hospital]

    for index, row in hospital_df.iterrows():
        pid = str(row['id'])
        basal_path = os.path.join(hosp_dir, pid, 'Basal')
        fu_path = os.path.join(hosp_dir, pid, 'FU1')

        basal_image = os.path.join(basal_path, f"CT_SS.nii.gz")
        basal_mask = os.path.join(basal_path, f"hematoma_mask_vicorob_reviewed_reoriented.nii.gz")
        fu_image = os.path.join(fu_path, f"CT_SS.nii.gz")
        fu_mask = os.path.join(fu_path, f"hematoma_mask_vicorob_reviewed_reoriented.nii.gz")

        # record paths (empty string if not found)
        df.at[index, 'basal_img_path'] = basal_image if os.path.exists(basal_image) else ''
        df.at[index, 'basal_mask_path'] = basal_mask if os.path.exists(basal_mask) else ''
        df.at[index, 'fu_img_path'] = fu_image if os.path.exists(fu_image) else ''
        df.at[index, 'fu_mask_path'] = fu_mask if os.path.exists(fu_mask) else ''

        if not os.path.exists(basal_image) or not os.path.exists(fu_image):
            missing.append((hospital, pid, os.path.exists(basal_image), os.path.exists(fu_image)))

# summary
print(f"Missing entries (hospital, id, has_basal, has_fu): {len(missing)}")
if missing:
    print(missing[:50])

# save augmented table
df.to_csv('/media/cansu/DiskSpace/Cansu/HE_Prediction/all_data_with_paths.csv', index=False)
print('Saved CSV with paths to all_data_with_paths.csv')

Missing entries (hospital, id, has_basal, has_fu): 24
[('Hospital Clínic', '34333', True, False), ('Hospital Clínic', '4084062', True, False), ('Hospital Clínic', '4731571', True, False), ('Hospital Clínic', '5229590', True, False), ('Hospital Clínic', '5281637', True, False), ('Hospital Clínic', '5295375', True, False), ('Hospital Clínic', '600801', True, False), ('Hospital Clínic', '70293754', True, False), ('Hospital Clínic', '70588150', True, False), ('Hospital Sant Pau', '016', True, False), ('Hospital Sant Pau', '033', True, False), ('Hospital Vall Hebrón', '016', True, False), ('Hospital Vall Hebrón', '017', True, False), ('Hospital Vall Hebrón', '023', True, False), ('Hospital Vall Hebrón', '053', True, False), ('Hospital Vall Hebrón', '054', True, False), ('Hospital Vall Hebrón', '056', False, True), ('Hospital Vall Hebrón', '097', True, False), ('Hospital Vall Hebrón', '121', True, False), ('Hospital Vall Hebrón', '126', True, False), ('Hospital Vall Hebrón', '139', True, Fal

In [8]:
# filter the ones which have both basal and fu images
df_filtered = df[(df['basal_img_path'] != '') & (df['fu_img_path'] != '') & (df['basal_mask_path'] != '') & (df['fu_mask_path'] != '')]
df_filtered.to_csv('/media/cansu/DiskSpace/Cansu/HE_Prediction/data_with_both_basal_fu.csv', index=False)
print('Saved filtered CSV with both basal and fu images to data_with_both_basal_fu.csv')

Saved filtered CSV with both basal and fu images to data_with_both_basal_fu.csv


In [9]:
# compare the filtered dataframe with the original one and print the number of entries removed
print(f"Original dataframe entries: {len(df)}")
print(f"Filtered dataframe entries: {len(df_filtered)}")
print(f"Number of entries removed: {len(df) - len(df_filtered)}")

Original dataframe entries: 482
Filtered dataframe entries: 456
Number of entries removed: 26


In [10]:
# number of entries from eachh hospital in the filtered dataframe
print("Entries per hospital in filtered dataframe:")
print(df_filtered['Hospital name'].value_counts())

Entries per hospital in filtered dataframe:
Hospital name
Hospital Josep Trueta    209
Hospital Vall Hebrón     167
Hospital Clínic           47
Hospital Sant Pau         33
Name: count, dtype: int64


In [12]:
# for each hospital count the number of "he vicorob" columns, if they are 1. and list numbers based on the hospital name
for hospital in df_filtered['Hospital name'].unique():
    hosp_df = df_filtered[df_filtered['Hospital name'] == hospital]
    he_vicorob_counts = hosp_df['HE vicorob'].value_counts()
    print(f"Hospital: {hospital}")
    print(he_vicorob_counts)

Hospital: Hospital Clínic
HE vicorob
0.0    39
1.0     8
Name: count, dtype: int64
Hospital: Hospital Sant Pau
HE vicorob
0.0    24
1.0     9
Name: count, dtype: int64
Hospital: Hospital Vall Hebrón
HE vicorob
0.0    137
1.0     30
Name: count, dtype: int64
Hospital: Hospital Josep Trueta
HE vicorob
0.0    166
1.0     43
Name: count, dtype: int64


In [None]:
# # Remove files with specific name in Hospital Vall Hebrón folder - space cleanup
# path_hospital = "/media/cansu/DiskSpace/Cansu/HE_Prediction/RAINS_vicorob/Hospital Vall Hebrón"

# for cases in os.listdir(path_hospital):
#     basal_folder = os.path.join(path_hospital, cases, "Basal")
#     fu_folder = os.path.join(path_hospital, cases, "FU1")

#     # remove the files if it contains name hematoma_mask_fold0.nii.gz
#     for folder in [basal_folder, fu_folder]:
#         if os.path.exists(folder):
#             for file in os.listdir(folder):
#                 if "hematoma_mask_fold4.nii.gz" in file:
#                     # print(f"Removing file: {os.path.join(folder, file)}")
#                     os.remove(os.path.join(folder, file))

# Redesigning the Dataset and Dataloading process

In [16]:
df = pd.read_csv("/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/data_with_both_basal_fu.csv")
##
# bring all ids to string type and save them as txt file from all hospitals excepts Trueta
df['id'] = df['id'].astype(str).str.zfill(3)
hospitals = df['Hospital name'].unique().tolist()
# hospitals.remove('Hospital Josep Trueta')

for hospital in hospitals:
    hosp_df = df[df['Hospital name'] == hospital]
    ids = hosp_df['id'].tolist()
    output_txt = f"/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/{hospital.replace(' ', '_')}_ids.txt"
    with open(output_txt, 'w') as f:
        for pid in ids:
            f.write(f"{pid}\n")
    print(f"Saved IDs for {hospital} to {output_txt}")

Saved IDs for Hospital Clínic to /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/Hospital_Clínic_ids.txt
Saved IDs for Hospital Sant Pau to /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/Hospital_Sant_Pau_ids.txt
Saved IDs for Hospital Vall Hebrón to /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/Hospital_Vall_Hebrón_ids.txt
Saved IDs for Hospital Josep Trueta to /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/Hospital_Josep_Trueta_ids.txt


In [2]:
import os
# from sklearn.utils.class_weight import compute_class_weight
# import sys; sys.path.insert(0, os.path.abspath("../"))
# from dataset import *
# from utils import *
# from model import *
import torch.utils.data as data
import random
import argparse
import yaml
from tqdm import tqdm
import pandas as pd
import SimpleITK as sitk
from torch.utils.data import Dataset
from pathlib import Path
import torchvision.transforms.functional as T


import numpy as np
import SimpleITK as sitk
import torch

def load_nifti(path):
    return sitk.GetArrayFromImage(sitk.ReadImage(path))

def normalize_ct(volume):
    vmin, vmax = np.percentile(volume, [1, 99])
    volume = np.clip(volume, vmin, vmax)
    return (volume - vmin) / (vmax - vmin + 1e-6)

def has_lesion(mask_slice, threshold=2):
    return mask_slice.sum() > threshold


class SliceIndexBuilder:
    def __init__(self, df, filter_lesion=True, threshold=2):
        self.df = df
        self.filter_lesion = filter_lesion
        self.threshold = threshold
        self.index = []

        self._build()

    def _build(self):
        for row_idx, row in self.df.iterrows():
            mask = load_nifti(row["basal_mask_path"])

            for z in range(mask.shape[0]):
                if self.filter_lesion:
                    if not has_lesion(mask[z], self.threshold):
                        continue

                self.index.append((row_idx, z))

    def __len__(self):
        return len(self.index)
    

from torch.utils.data import Dataset
import torchvision.transforms.functional as TF
import random
import copy

def find_normalization_parameters(image):
    """
    image: numpy array with shape [1, H, W] or [D, H, W]
    """
    norm_img = copy.deepcopy(image)
    norm_parms = (
        np.nanmin(norm_img, axis=(-3, -2, -1), keepdims=True),
        np.nanmax(norm_img, axis=(-3, -2, -1), keepdims=True)
    )
    return norm_parms


def normalize_image(image, parameters):
    """
    image: numpy array
    parameters: (min, max)
    """
    return (image - parameters[0]) / (parameters[1] - parameters[0] + 1e-6)


class HE2DSliceDataset(Dataset):
    def __init__(
        self,
        df,
        slice_index,
        augment=False
    ):
        self.df = df.reset_index(drop=True)
        self.slice_index = slice_index
        self.augment = augment

    def __len__(self):
        return len(self.slice_index)

    def __getitem__(self, idx):
        row_idx, z = self.slice_index[idx]
        row = self.df.iloc[row_idx]

        img_3d = load_nifti(row["basal_img_path"])
        mask_3d = load_nifti(row["basal_mask_path"])

        slice_2d = img_3d[z]                    # [H, W]
        mask_2d = mask_3d[z].astype(np.uint8)

        # add channel dim -> [1, H, W]
        slice_2d = slice_2d[np.newaxis, ...]

        # === ORIGINAL TRAINING NORMALIZATION ===
        norm_params = find_normalization_parameters(slice_2d)
        slice_2d = normalize_image(slice_2d, norm_params)

        img = torch.tensor(slice_2d, dtype=torch.float32)
        mask = torch.tensor(mask_2d, dtype=torch.uint8).unsqueeze(0)

        label = int(row["HE real"]) if not pd.isna(row["HE real"]) else -1
        patient_id = row["id"]

        if self.augment:
            if random.random() < 0.5:
                img = TF.hflip(img)
                mask = TF.hflip(mask)

            if random.random() < 0.3:
                angle = random.uniform(-10, 10)
                img = TF.rotate(img, angle)
                mask = TF.rotate(mask, angle)

        return {
            "image": img,
            "mask": mask,
            "label": torch.tensor(label),
            "patient_id": patient_id,
            "slice_idx": z,
        }

repo_path = os.getcwd()
CONFIG_PATH = repo_path + '/configs'
with open(os.path.join(CONFIG_PATH, "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/configs/config_eff_t5_repeat_1_othermain.yml")) as file:
    config = yaml.safe_load(file)

gpu = config["GPU"]
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
threshold_name = config["threshold_name"]
experiment_sample= config["experiment_sample"]
threshold = config["threshold_percentage"] # 0.5

# # from sklearn.model_selection import StratifiedKFold
# # def split_labeled_dataframe(df, fold=0, n_splits=5):
# #     skf = StratifiedKFold(
# #         n_splits=n_splits,
# #         shuffle=True,
# #         random_state=42
# #     )

# #     y = df["HE real"].astype(int)
# #     splits = list(skf.split(df, y))

# #     train_idx, val_idx = splits[fold]
# #     return df.iloc[train_idx], df.iloc[val_idx]

from torch.utils.data import DataLoader

md_path = repo_path + "/data/data_with_both_basal_fu.csv"
df = pd.read_csv(md_path)
df

Unnamed: 0,Hospital name,id,"Basal volume, ml","FU volume, ml","Absolute vol diff, ml",Relative vol diff (FU_vol/Basal_vol),HE vicorob,HE real,basal_img_path,basal_mask_path,fu_img_path,fu_mask_path
0,Hospital Clínic,2098,5.02,4.87,-0.150,0.970,0.0,,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
1,Hospital Clínic,233632,36.07,39.24,3.171,1.088,0.0,,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
2,Hospital Clínic,261065,9.50,11.41,1.915,1.202,0.0,,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
3,Hospital Clínic,397280,9.37,8.33,-1.038,0.889,0.0,,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
4,Hospital Clínic,4141022,32.23,37.98,5.753,1.179,0.0,,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
...,...,...,...,...,...,...,...,...,...,...,...,...
451,Hospital Josep Trueta,pt217,0.45,22.26,21.800,49.060,1.0,1.0,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
452,Hospital Josep Trueta,pt218,7.19,7.24,0.050,1.010,0.0,0.0,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
453,Hospital Josep Trueta,pt219,1.09,1.39,0.300,1.280,0.0,0.0,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
454,Hospital Josep Trueta,pt220,48.01,46.48,-1.530,0.970,0.0,0.0,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...


In [4]:
import os
import torch
import numpy as np
import SimpleITK as sitk
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

def load_nifti(path):
    assert path.endswith((".nii", ".nii.gz")), path
    assert os.path.exists(path), path
    img = sitk.ReadImage(path, sitk.sitkFloat32)
    img = sitk.GetArrayFromImage(img)
    return img

def find_normalization_parameters(image):
    """
    image: numpy array with shape [1, H, W] or [D, H, W]
    """
    norm_img = copy.deepcopy(image)
    norm_parms = (
        np.nanmin(norm_img, axis=(-3, -2, -1), keepdims=True),
        np.nanmax(norm_img, axis=(-3, -2, -1), keepdims=True)
    )
    return norm_parms

def normalize_image(image, parameters):
    """
    image: numpy array
    parameters: (min, max)
    """
    return (image - parameters[0]) / (parameters[1] - parameters[0] + 1e-6)


class HE2DInferenceDataset(Dataset):
    def __init__(self, patient_ids, metadata_df):
        self.patient_ids = patient_ids
        self.df = metadata_df.set_index("id")
        self.samples = []
        self._build_index()

    def _build_index(self):
        for pid in self.patient_ids:
            try:
                row = self.df.loc[pid]
                path = row["basal_img_path"]
                print(f"Loading {pid}: {path}")

                img = load_nifti(path)

                for z in range(img.shape[0]):
                    self.samples.append((pid, z))

            except Exception as e:
                print(f"❌ Failed for pid={pid}")
                raise e

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        pid, z = self.samples[idx]
        row = self.df.loc[pid]

        img = load_nifti(row["basal_img_path"])
        pseudo_label = row["HE vicorob"]
        gt_label = row["HE real"] if not pd.isna(row["HE real"]) else -1

        norm_params = find_normalization_parameters(img[z][np.newaxis, ...])
        img = normalize_image(img[z][np.newaxis, ...], norm_params)
        img = torch.tensor(img, dtype=torch.float32)

        # concat 2 images with the mask image together so that there will be 3 channels
        mask = load_nifti(row["basal_mask_path"])
        mask_slice = mask[z].astype(np.uint8)
        mask = torch.from_numpy(mask_slice[np.newaxis, ...]).long()
        # # concat image + image + mask
        img = torch.cat([img, img], dim=0)
        img = torch.cat([img, mask], dim=0)

        return img, pid, z, pseudo_label, gt_label

checkpoint_path = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/checkpoints/org_da_sy_es5_hf05_repeat_2711_1_othermain/0/last_model-v1.ckpt"

from model import ImageClassifier3
model = ImageClassifier3.load_from_checkpoint(checkpoint_path)
model.eval().cuda(device=0)

def read_ids(txt_path):
    with open(txt_path, "r") as f:
        return [line.strip() for line in f if line.strip()]

txt_files = {
    # "Hospital_josep_trueta_pseudo": "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/splits/Hospital_Josep_Trueta_pseudo_ids.txt",
    "Hospital_josep_trueta_gt": "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/splits/Hospital_Josep_Trueta_gt_ids.txt", # test for fold 0. 
    # "Hospital_clinic": "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/splits/Hospital_Clínic_ids.txt",
    # "Hospital_sant_pau": "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/splits/Hospital_Sant_Pau_ids.txt",
}

import pandas as pd

md_path = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/data_with_both_basal_fu.csv"
metadata_df = pd.read_csv(md_path)

results = []

for split_name, txt_path in txt_files.items():
    patient_ids = read_ids(txt_path)
    ds = HE2DInferenceDataset(patient_ids, metadata_df)
    loader = DataLoader(ds, batch_size=4, num_workers=8, shuffle=False)

    with torch.no_grad():
        for img, pid, z, pseudo_label, gt_label in tqdm(loader, desc=f"Inference {split_name}"):
            pseudo_label = np.array(pseudo_label).astype(int)
            gt_label = np.array(gt_label).astype(int)
            img = img.cuda(device=0)
            logits = model(img)
            probs = torch.sigmoid(logits).cpu().numpy().squeeze()
            prediction = (probs >= 0.5).astype(int)

            for i in range(len(pid)):
                results.append({
                    "split": split_name,
                    "patient_id": pid[i],
                    "slice_idx": int(z[i]),
                    "prob_he": float(probs[i]),
                    "pseudo_label": pseudo_label[i],
                    "gt_label": gt_label[i],
                    "prediction": prediction[i],
                })

res_df = pd.DataFrame(results)

patient_df = (
    res_df
    .groupby(["split", "patient_id"])
    .agg(
        mean_prob=("prob_he", "mean"),
        max_prob=("prob_he", "max"),
        n_slices=("prob_he", "count"),
        pseudo_label=("pseudo_label", "first"),
        gt_label=("gt_label", "first"),
        prediction=("prediction", lambda x: int(np.mean(x) >= 0.5)),
        )
    .reset_index()
)

res_df.to_csv("/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/test_on_others/slice_level_predictions.csv", index=False)
patient_df.to_csv("/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/test_on_others/patient_level_predictions.csv", index=False)

Lightning automatically upgraded your loaded checkpoint from v1.9.0 to v2.0.9. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file checkpoints/org_da_sy_es5_hf05_repeat_2711_1_othermain/0/last_model-v1.ckpt`


Using eff s
Using focal loss
Loading pt196: /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/RAINS_vicorob/Hospital Josep Trueta/pt196/Basal/CT_SS.nii.gz
Loading pt022: /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/RAINS_vicorob/Hospital Josep Trueta/pt022/Basal/CT_SS.nii.gz
Loading pt186: /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/RAINS_vicorob/Hospital Josep Trueta/pt186/Basal/CT_SS.nii.gz
Loading pt181: /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/RAINS_vicorob/Hospital Josep Trueta/pt181/Basal/CT_SS.nii.gz
Loading pt058: /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/RAINS_vicorob/Hospital Josep Trueta/pt058/Basal/CT_SS.nii.gz
Loading pt205: /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/RAINS_vicorob/Hospital Josep Trueta/pt205/Basal/CT_SS.nii.gz
Loading pt086: /media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/RAINS_vicorob/Hospital Josep Trueta/pt086/Basal/CT_SS.nii.gz
Loading pt140: /media/cansu/DiskSpace/Ca

Inference Hospital_josep_trueta_gt: 100%|██████████| 355/355 [01:43<00:00,  3.44it/s]


Todos:
1. get the prediction results using the old code -- dataset and dataloader. for the test set for fold 0. 
2. get the pred results for the same data but for using new dataset and dataloader.

In [3]:
import os
from sklearn.utils.class_weight import compute_class_weight
import sys; sys.path.insert(0, os.path.abspath("../"))
from dataset import *
from utils import *
from model import *
from model import ImageClassifier2
import torch.utils.data as data
import random
import argparse
import yaml
from pytorch_lightning.callbacks import ModelCheckpoint
from tqdm import tqdm

# chcekpoint for the best da_sy(5) model
checkpoint_path = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/checkpoints/org_da_sy_es5_hf05_repeat_2711_1_othermain/0/last_model-v1.ckpt"

model = ImageClassifier3.load_from_checkpoint(checkpoint_path)
model.eval().cuda(device=0)

unseen_neg_samples = pd.read_csv("/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/not_sampled_negative_52_cases.csv")

X_train =[]
y_train = []
X_val = []
X_test = []
y_val = []
y_test = []

md_path = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/metadata.csv"

for i in tqdm(range(len(unseen_neg_samples))):
    # add the index numbers of the unseen negative samples to the X_test list
    X_test.append(unseen_neg_samples["index"].tolist()[i])
    y_test.append(unseen_neg_samples["label"].tolist()[i])

indexes = X_train, X_val, X_test, y_train, y_val, y_test

dm = HEPredDataModule(split_indexes=indexes, 
                        filter_slices=True,
                        mask=True, 
                        batch_size=1, 
                        num_workers=8, 
                        use_2d=True, 
                        return_type='image',
                        under_sampling=False,
                        over_sampling=False,
                        threshold=0.5,
                        md_path=md_path,
                        basal_fu=False,
                        roi = False, problem = "prediction",
                        image_size = 512,
                        roi_size=512, lesion=False,
                        test_type = "t5",
                        apply_hflip = False,
                        apply_affine = False,
                        apply_gaussian_blur= False,
                        affine_degree=10,
                        affine_translate=0,
                        affine_scale=1.0,
                        affine_shear=0, 
                        hflip_p = 0.5, affine_p = 0.5,)
dm.setup()

# Read the test dataset from the datamodule

test_loader = dm.test_dataloader()

img_list = []
label_list = []
id_list = []
snum_list = []

for batch in test_loader:
    image, label, id, snum = batch
    img_list.append(image)
    label_list.append(label)
    id_list.append(id)
    snum_list.append(snum)

probabilities = []
for i in range(len(id_list)):
    with torch.no_grad():
        y_hat = model(img_list[i].cuda(device=0))
        probabilities.append(y_hat.squeeze().cpu().numpy().tolist())

#make a dataframe with the id, snum and probabilities
df = pd.DataFrame(list(zip(id_list, snum_list, probabilities)), columns = ["id", "snum", "probabilities"])
# change the id column to integer
df["id"] = df["id"].astype(int)

# for each patient id take the probablity average of the slices
df_patient_averages = pd.DataFrame(columns = ["id", "average_probabilities"])

id_dict = {}
for i in range(len(df)):
    current_id = df["id"][i]
    probability = df["probabilities"][i]
    if current_id in id_dict:
        id_dict[current_id].append(probability)
    else:
        id_dict[current_id] = [probability]

# for each id, take the average of the probablities
for id, probablities in id_dict.items():
    average = sum(probablities) / len(probablities)
    # DataFrame.append is removed in recent pandas versions. Use .loc to add a new row instead.
    df_patient_averages.loc[len(df_patient_averages)] = {"id": id, "average_probabilities": average}
            

Using eff s
Using focal loss


100%|██████████| 52/52 [00:00<00:00, 49909.34it/s]


number of slices after filtering in test  1264
number of slices in the test set:  1264


In [5]:
# add another column as predicted labels where if the probablity is greater than 0.5, label is 1, else 0
df_patient_averages["predicted_labels"] = df_patient_averages["average_probabilities"].apply(lambda x: 1 if x > 0.5 else 0)
df_patient_averages

Unnamed: 0,id,average_probabilities,predicted_labels
0,118,-2.455591,0
1,142,-0.822859,0
2,72,-1.060153,0
3,135,-1.050091,0
4,80,0.556763,1
5,167,-1.479259,0
6,132,0.509654,1
7,78,-1.902653,0
8,62,-1.396133,0
9,63,-2.072023,0


In [6]:
# add the actual labels to the df_patient_averages where all of them are 0's
df_patient_averages["actual_labels"] = 0
df_patient_averages

Unnamed: 0,id,average_probabilities,predicted_labels,actual_labels
0,118,-2.455591,0,0
1,142,-0.822859,0,0
2,72,-1.060153,0,0
3,135,-1.050091,0,0
4,80,0.556763,1,0
5,167,-1.479259,0,0
6,132,0.509654,1,0
7,78,-1.902653,0,0
8,62,-1.396133,0,0
9,63,-2.072023,0,0


In [7]:
def compute_sensitivity_specificity(preds, labels):
    true_positives = (preds == 1) & (labels == 1)
    true_negatives = (preds == 0) & (labels == 0)
    false_positives = (preds == 1) & (labels == 0)
    false_negatives = (preds == 0) & (labels == 1)
    sensitivity = true_positives.sum() / (true_positives.sum() + false_negatives.sum())
    specificity = true_negatives.sum() / (true_negatives.sum() + false_positives.sum())
    return sensitivity, specificity

# calculate the accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(df_patient_averages["actual_labels"], df_patient_averages["predicted_labels"])

# sensitivity and specificity
sensitivity, specificity = compute_sensitivity_specificity(df_patient_averages["predicted_labels"], df_patient_averages["actual_labels"])

print("Accuracy: ", accuracy)
print("Sensitivity: ", sensitivity)
print("Specificity: ", specificity)

Accuracy:  0.9038461538461539
Sensitivity:  nan
Specificity:  0.9038461538461539


  sensitivity = true_positives.sum() / (true_positives.sum() + false_negatives.sum())


# create new csv files for the ones I will test for (similar way to the negative samples)

In [8]:
data_path = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/data_with_both_basal_fu.csv"

# read all data 
df = pd.read_csv(data_path)
df

Unnamed: 0,Hospital name,id,"Basal volume, ml","FU volume, ml","Absolute vol diff, ml",Relative vol diff (FU_vol/Basal_vol),HE vicorob,HE real,basal_img_path,basal_mask_path,fu_img_path,fu_mask_path
0,Hospital Clínic,2098,5.02,4.87,-0.150,0.970,0.0,,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
1,Hospital Clínic,233632,36.07,39.24,3.171,1.088,0.0,,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
2,Hospital Clínic,261065,9.50,11.41,1.915,1.202,0.0,,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
3,Hospital Clínic,397280,9.37,8.33,-1.038,0.889,0.0,,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
4,Hospital Clínic,4141022,32.23,37.98,5.753,1.179,0.0,,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
...,...,...,...,...,...,...,...,...,...,...,...,...
451,Hospital Josep Trueta,pt217,0.45,22.26,21.800,49.060,1.0,1.0,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
452,Hospital Josep Trueta,pt218,7.19,7.24,0.050,1.010,0.0,0.0,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
453,Hospital Josep Trueta,pt219,1.09,1.39,0.300,1.280,0.0,0.0,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...
454,Hospital Josep Trueta,pt220,48.01,46.48,-1.530,0.970,0.0,0.0,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...,/media/cansu/DiskSpace/Cansu/HE_prediction_imp...


In [19]:
hospital_clinic_ids = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/splits"
# hospital_true_ta_pseudoids = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/splits/Hospital_Josep_Trueta_pseudo_ids.txt"
hospital_santpau_ids = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/splits/Hospital_Sant_Pau_ids.txt"
hospital_vallhebron_ids = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/splits/Hospital_Vall_Hebrón_ids.txt"

data_path = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/data_with_both_basal_fu.csv"

# read all data 
df = pd.read_csv(data_path)

# read txt files and convert to list
def read_ids(txt_path):
    with open(txt_path, "r") as f:
        return [line.strip() for line in f if line.strip()]
    
# hospital_true_ta_pseudoids_list = read_ids(hospital_true_ta_pseudoids)
hospital_santpau_ids_list = read_ids(hospital_santpau_ids)
hospital_vallhebron_ids_list = read_ids(hospital_vallhebron_ids)
all_ids =  hospital_santpau_ids_list + hospital_vallhebron_ids_list

hos_clinic_path = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/RAINS_vicorob/Hospital Clínic"
hos_josep_path = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/RAINS_vicorob/Hospital Josep Trueta"
hos_san_pau_path = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/RAINS_vicorob/Hospital Sant Pau"
hos_vall_hebron_path = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/RAINS_vicorob/Hospital Vall Hebrón"

# create a dataframe with columns: patient_id,ct_ss_path,ct_nc_path,mask_path,label,volume,index
data = []
for pid in all_ids:
    # if pid in hospital_true_ta_pseudoids_list:
    #     hospital_path = hos_josep_path
    if pid in hospital_santpau_ids_list:
        hospital_path = hos_san_pau_path
    elif pid in hospital_vallhebron_ids_list:
        hospital_path = hos_vall_hebron_path
    else:
        continue

    basal_path = os.path.join(hospital_path, pid, "Basal")
    ct_ss_path = os.path.join(basal_path, "CT_SS.nii.gz")
    ct_nc_path = os.path.join(basal_path, "CT_NC.nii.gz")
    mask_path = os.path.join(basal_path, "hematoma_mask_vicorob_reviewed_reoriented.nii.gz")

    # labels are from the df 
    label = df.loc[df["id"] == pid, "HE vicorob"].values[0]

    # give their int(pid) as index if it is integer or give a random index
    index = int(pid)

    data.append({
        "patient_id": pid,
        "ct_ss_path": ct_ss_path,
        "ct_nc_path": ct_nc_path,
        "mask_path": mask_path,
        "label": label,
        "index": index
    })
# svc to dataframe
data_df = pd.DataFrame(data)
data_df.to_csv("/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/metadata_clinic_trueta_santpau_vallhebron.csv", index=False)

In [21]:
import os
from sklearn.utils.class_weight import compute_class_weight
import sys; sys.path.insert(0, os.path.abspath("../"))
from dataset import *
from utils import *
from model import *
from model import ImageClassifier2
import torch.utils.data as data
import random
import argparse
import yaml
from pytorch_lightning.callbacks import ModelCheckpoint
from tqdm import tqdm

# chcekpoint for the best da_sy(5) model
checkpoint_path = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/checkpoints/org_da_sy_es5_hf05_repeat_2711_1_othermain/0/last_model-v1.ckpt"

model = ImageClassifier3.load_from_checkpoint(checkpoint_path)
model.eval().cuda(device=0)

unseen_neg_samples = pd.read_csv("/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/metadata_clinic_trueta_santpau_vallhebron.csv")

X_train =[]
y_train = []
X_val = []
X_test = []
y_val = []
y_test = []

md_path = "/media/cansu/DiskSpace/Cansu/HE_prediction_improved/data/metadata_clinic_trueta_santpau_vallhebron.csv"

for i in tqdm(range(len(unseen_neg_samples))):
    # add the index numbers of the unseen negative samples to the X_test list
    X_test.append(unseen_neg_samples["index"].tolist()[i])
    y_test.append(unseen_neg_samples["label"].tolist()[i])

indexes = X_train, X_val, X_test, y_train, y_val, y_test

dm = HEPredDataModule(split_indexes=indexes, 
                        filter_slices=True,
                        mask=True, 
                        batch_size=1, 
                        num_workers=8, 
                        use_2d=True, 
                        return_type='image',
                        under_sampling=False,
                        over_sampling=False,
                        threshold=0.5,
                        md_path=md_path,
                        basal_fu=False,
                        roi = False, problem = "prediction",
                        image_size = 512,
                        roi_size=512, lesion=False,
                        test_type = "t5",
                        apply_hflip = False,
                        apply_affine = False,
                        apply_gaussian_blur= False,
                        affine_degree=10,
                        affine_translate=0,
                        affine_scale=1.0,
                        affine_shear=0, 
                        hflip_p = 0.5, affine_p = 0.5,)
dm.setup()

# Read the test dataset from the datamodule

test_loader = dm.test_dataloader()

img_list = []
label_list = []
id_list = []
snum_list = []

for batch in test_loader:
    image, label, id, snum = batch
    img_list.append(image)
    label_list.append(label)
    id_list.append(id)
    snum_list.append(snum)

probabilities = []
for i in range(len(id_list)):
    with torch.no_grad():
        y_hat = model(img_list[i].cuda(device=0))
        probabilities.append(y_hat.squeeze().cpu().numpy().tolist())

#make a dataframe with the id, snum and probabilities
df = pd.DataFrame(list(zip(id_list, snum_list, probabilities)), columns = ["id", "snum", "probabilities"])
# change the id column to integer
df["id"] = df["id"].astype(int)

# for each patient id take the probablity average of the slices
df_patient_averages = pd.DataFrame(columns = ["id", "average_probabilities"])

id_dict = {}
for i in range(len(df)):
    current_id = df["id"][i]
    probability = df["probabilities"][i]
    if current_id in id_dict:
        id_dict[current_id].append(probability)
    else:
        id_dict[current_id] = [probability]

# for each id, take the average of the probablities
for id, probablities in id_dict.items():
    average = sum(probablities) / len(probablities)
    # DataFrame.append is removed in recent pandas versions. Use .loc to add a new row instead.
    df_patient_averages.loc[len(df_patient_averages)] = {"id": id, "average_probabilities": average}
            

Using eff s
Using focal loss


100%|██████████| 200/200 [00:00<00:00, 97598.70it/s]


RuntimeError: Exception thrown in SimpleITK ImageFileReader_Execute: /tmp/SimpleITK-build/ITK/Modules/IO/NIFTI/src/itkNiftiImageIO.cxx:2135:
ITK ERROR: ITK only supports orthonormal direction cosines.  No orthonormal definition found!

In [None]:
# add another column as predicted labels where if the probablity is greater than 0.5, label is 1, else 0
df_patient_averages["predicted_labels"] = df_patient_averages["average_probabilities"].apply(lambda x: 1 if x > 0.5 else 0)
# add the actual labels to the df_patient_averages where all of them are 0's
df_patient_averages["actual_labels"] = 0