### Preparing CFV splits for
 Retrieve available data on the disk and validate it
* Match data table with it
* Write file names to the table
* Split table into N-cross splits + separate split for test
* Use table as CSV Dataset to retrieve the image data

In [1]:
import os
import glob
import sys; sys.path.append("../")
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import KFold

In [11]:
nfolds = 5
test_split = True

dataset_source = "/home/oleksii/projects/ohif-orthanc-postgres-docker/datasets/2_classification/prostate_class_dataset_demography_final_psa_vol_20240418_feat.csv"
df = pd.read_csv(dataset_source, sep=';')

In [12]:
df.csPCa

0        True
1        True
2        True
3        True
4        True
        ...  
4474    False
4475    False
4476    False
4477    False
4478    False
Name: csPCa, Length: 4479, dtype: bool

In [25]:

def get_alta_dataset(data_path, seg_path):
    studies = os.listdir(data_path)
    records = []
    for study_id in studies:
        
        study_path = os.path.join(data_path, study_id)
        study_path_seg = os.path.join(seg_path, study_id)

        t2ws = glob.glob(os.path.join(study_path, "*tra_t2w.nrrd"))
        adcs = glob.glob(os.path.join(study_path, "*tra_adc.nrrd"))
        dwis = glob.glob(os.path.join(study_path, "*tra_dwi_calc_bval*.nrrd"))
        segs = glob.glob(os.path.join(study_path_seg, "*.seg.nrrd"))
        
        d = {}
        d['study_orthanc_id'] = study_id
        d["t2w"] = t2ws[0] if t2ws else None  
        d["adc"] = adcs[0] if adcs else None  
        d["seg"] = segs[0] if segs else None  
        d["dwis"] = ','.join(dwis) if dwis else None  
        records.append(d)
         
    ds = pd.DataFrame(records)
    print(len(ds))
    return ds

In [28]:

data_path = "/data/oleksii/Prostate-Classification-Datasets-NRRDS/ALTA-Classification-Dataset-orient-preprocess/"
seg_path = "/data/oleksii/Prostate-Classification-Datasets-NRRDS/ALTA-Classification-Dataset-orient-Segmentations/"
dataset_alta = get_alta_dataset(data_path, seg_path)
print(len(dataset_alta))

# keep full cases
dataset_alta = dataset_alta[(~dataset_alta['dwis'].isnull()) & (~dataset_alta['t2w'].isnull()) & (~dataset_alta['adc'].isnull()) & (~dataset_alta['seg'].isnull())]
# keep only prostate cases
dataset_alta = dataset_alta[dataset_alta['dwis'].apply(lambda x: True if ".nrrd" in x.split(",")[0] else False)]

4481
4481


In [29]:
dataset_alta

Unnamed: 0,study_orthanc_id,t2w,adc,seg,dwis
0,bd9ea71b-f9806b9e-fd1683a1-f364452a-b2fac5af,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...
1,d55838e1-51531d28-e9da1f29-926450d2-1bdcbe85,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...
2,c0fc3c30-ac34620e-102b7f21-e6fe2b29-564fe859,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...
3,820deef5-a69db0ef-94f429b6-7a32070c-312a747e,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...
4,0ff11760-bd590db0-78fe0467-335a1905-ee6d9520,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...
...,...,...,...,...,...
4476,f79c558a-20fd8623-9e50577c-0a898c3a-aaad75e5,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...
4477,8712d58f-fb1c4f96-6de8b09b-4b87271b-45258caf,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...
4478,b5dbcc3c-d9e45630-e8f0f35b-411cdad3-d1d0495e,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...
4479,c153e521-48ecf08f-308da823-2d9d8337-deb8c4db,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...,/data/oleksii/Prostate-Classification-Datasets...


In [30]:
# merge preprocessed data with tabular one 
dataset_alta_m = pd.merge(dataset_alta, df, how="inner", on="study_orthanc_id")

In [31]:
dataset_alta_m[dataset_alta_m['csPCa'].isna()]

Unnamed: 0,study_orthanc_id,t2w,adc,seg,dwis,PatientID,class,GS,csPCa,GS_order,...,volume_prostata,PSAd,TZPSAd,PZPSAd,prostatitis,GS6,GS7a,GS7b,GS8,GS9_10


In [32]:
# first, get the test set
# sgkfold = KFold(n_splits=nfolds + 1, shuffle=True, random_state=42)
# splits = list(sgkfold.split(X=df))
# first, get the stratified test set
sgkfold = StratifiedGroupKFold(n_splits=nfolds + 1, shuffle=True, random_state=42)
splits = list(sgkfold.split(X=dataset_alta_m, y=dataset_alta_m['csPCa'].to_list(), groups=dataset_alta_m['PatientID'].to_list()))

# split test from train! 
dataset_test = dataset_alta_m.iloc[splits[0][1]]
dataset_train = dataset_alta_m.iloc[splits[0][0]]

# next, get the cross fold set
sgkfold_train = StratifiedGroupKFold(n_splits=nfolds , shuffle=True, random_state=42)
splits_folds = list(sgkfold_train.split(X=dataset_train, y=dataset_train['csPCa'].to_list(), groups=dataset_train['PatientID'].to_list()))

In [33]:
splits_path = "/home/oleksii/projects/ohif-orthanc-postgres-docker/datasets/2_classification/cross_fold_splits"
os.makedirs(splits_path, exist_ok=True)
df.to_csv(os.path.join(splits_path, os.path.basename(dataset_source)), sep=';', index=False) 

dataset_test.to_csv(os.path.join(splits_path, 'test.csv'), sep=';', index=False)
for i, split in enumerate(splits_folds):
    print(f"Saving fold {i}")
    
    fold_path = os.path.join(splits_path, f'fold_{i}')
    os.makedirs(fold_path, exist_ok=True)
    
    train_fold = dataset_train.iloc[split[0]]
    valid_fold = dataset_train.iloc[split[1]]
    train_fold.to_csv(os.path.join(fold_path, 'train.csv'), sep=';', index=False)
    valid_fold.to_csv(os.path.join(fold_path, 'valid.csv'), sep=';', index=False)

Saving fold 0
Saving fold 1
Saving fold 2
Saving fold 3
Saving fold 4
