In [None]:
# Copyright 2025 Claudio Giovannoni, Carlo Metta, Anna Monreale,
# Salvatore Rinzivillo, Andrea Berti, Sara Colantonio, and
# Francesca Pratesi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Data Mapping and Train/Validation/Test Split

In [None]:
import pandas as pd
import os
import sys

Mounted at /content/drive


In [None]:
# when using google colaboratory
# Mount Google Drive first: the project code directory appended to sys.path
# below lives on the mounted drive, so the two project imports at the end of
# this cell can only resolve after the mount and the path append have run.
from google.colab import drive
drive.mount('/content/drive')
sys.path.append('/content/drive/MyDrive/ABELE_prostate/claudio/black_box/code/')
# Project-local helper modules, imported by bare name from the directory added
# to sys.path above; later cells call them as crop_data_utils.* and
# mapping_train_test_split_utils.*.
import crop_data_utils
import mapping_train_test_split_utils

In [None]:
# When running outside Google Colab (alternative to the Colab cell above):
# make the local 'code' directory importable and bind the helper modules to the
# bare names the rest of the notebook uses (crop_data_utils and
# mapping_train_test_split_utils). A plain
# `import black_box.code.crop_data_utils` only binds the top-level name
# `black_box`, so later calls such as crop_data_utils.file_checker(...) would
# raise NameError.
sys.path.append(os.path.abspath('code'))
from black_box.code import crop_data_utils
from black_box.code import mapping_train_test_split_utils

In [None]:
# Input: per-patient folders of cropped images.
source_root = '/content/drive/My Drive/ABELE_prostate/claudio/black_box/data/PICAI_crop/prostate_centered/'
# Output: root folder for the generated CSV files. The per-split folders are
# derived from it; because destination_root ends with '/', os.path.join yields
# exactly '<destination_root>train/', '<destination_root>valid/' and
# '<destination_root>test/'.
destination_root = '/content/drive/My Drive/ABELE_prostate/claudio/black_box/data/CSV/prostate_centered/'
destination_root_train = os.path.join(destination_root, 'train/')
destination_root_valid = os.path.join(destination_root, 'valid/')
destination_root_test = os.path.join(destination_root, 'test/')

In [None]:
# Sanity-check the cropped-image tree before building the mapping. The helper is
# defined in crop_data_utils (not shown here); judging by the recorded output it
# reports the number of patient folders, the number of files and the total size
# -- confirm the exact checks in that module.
crop_data_utils.file_checker(source_root)

1428it [01:05, 21.93it/s] 


Total number of patient folders: 1427
Total number of patient files: 21222
Total size of d: 0.10 Gigabytes





# Mapping

In [None]:
# Load the clinical marksheet CSV into labels_df; the 'patient_id', 'mri_date',
# 'case_ISUP' and 'case_csPCa' columns are used by the cells below.
csv_path = '/content/drive/MyDrive/ABELE_prostate/PICAI_original/clinical_information/marksheet.csv'
labels_df = pd.read_csv(csv_path)
# Print the (rows, columns) shape, then display the first rows as the cell output.
print(labels_df.shape)
labels_df.head()

(1500, 12)


Unnamed: 0,patient_id,study_id,mri_date,patient_age,psa,psad,prostate_volume,histopath_type,lesion_GS,lesion_ISUP,case_ISUP,case_csPCa
0,10000,1000000,2019-07-02,73,7.7,,55.0,MRBx,0+0,0.0,0,NO
1,10001,1000001,2016-05-27,64,8.7,0.09,102.0,,,,0,NO
2,10002,1000002,2021-04-18,58,4.2,0.06,74.0,,,,0,NO
3,10003,1000003,2019-04-05,72,13.0,,71.5,SysBx,0+0,0.0,0,NO
4,10004,1000004,2020-10-21,67,8.0,0.1,78.0,SysBx+MRBx,"0+0,0+0",0.0,0,NO


### CSV analysis

In [None]:
# Bucket every study by its case-level ISUP grade: grades 0 and 1 form one
# group, every other value (including missing ones) falls into 'others'.
is_isup_0_or_1 = labels_df['case_ISUP'].isin([0, 1])
grouped_labels = is_isup_0_or_1.map({True: '0 and 1', False: 'others'})
print(grouped_labels.value_counts())

0 and 1    1075
others      425
Name: case_ISUP, dtype: int64


In [None]:
# Number of distinct patients, followed by the absolute and percentage
# distribution of the case-level csPCa label over all studies.
study_cspca_counts = labels_df['case_csPCa'].value_counts()
print(labels_df['patient_id'].nunique(), '\n')
print(study_cspca_counts, '\n')
print(study_cspca_counts / len(labels_df) * 100)

In [None]:
# For every patient with more than one study, check whether the case_csPCa
# label is the same across that patient's studies. Both dicts map
# patient_id -> number of studies.
patient_study_counts = labels_df['patient_id'].value_counts()
with_changes = {}
without_changes = {}

# Only patients with several studies matter here; filtering keeps the
# value_counts() ordering, so the dicts are filled in the same order.
repeat_patients = patient_study_counts[patient_study_counts > 1]
for patient_id, study_count in repeat_patients.items():
    same_patient = labels_df['patient_id'] == patient_id
    distinct_outcomes = labels_df.loc[same_patient, 'case_csPCa'].unique()
    bucket = with_changes if len(distinct_outcomes) > 1 else without_changes
    bucket[patient_id] = study_count

num_patients_with_changes = len(with_changes)
num_patients_without_changes = len(without_changes)

print("Patients with changes in case_csPCa:")
print(with_changes)
print("Patients without changes in case_csPCa:")
print(without_changes)
print(f"Number of patients with changes: {num_patients_with_changes}")
print(f"Number of patients without changes: {num_patients_without_changes}")

Patients with changes in case_csPCa:
{10540: 2, 10490: 2, 10634: 2, 11054: 2, 10273: 2, 10548: 2}
Patients without changes in case_csPCa:
{10936: 3, 10281: 2, 10905: 2, 10576: 2, 10580: 2, 10629: 2, 10404: 2, 10129: 2, 10417: 2, 10153: 2, 10131: 2, 11059: 2, 11087: 2, 10512: 2, 11004: 2, 11383: 2, 10193: 2}
Number of patients with changes: 6
Number of patients without changes: 17


In [None]:
# Create a one-row-per-patient version of the marksheet: when a patient has
# several studies, keep only the most recent one to avoid duplicates.
patient_labels_df = (
    labels_df
    .assign(mri_date=pd.to_datetime(labels_df['mri_date']))
    # Newest study first, so keep='first' below retains the most recent study
    # of each patient_id.
    .sort_values(by='mri_date', ascending=False)
    .drop_duplicates(subset='patient_id', keep='first')
)

# Shape, number of distinct patients, and absolute / percentage label counts.
patient_cspca_counts = patient_labels_df['case_csPCa'].value_counts()
print(patient_labels_df.shape)
print(patient_labels_df['patient_id'].nunique(), '\n')
print(patient_cspca_counts, '\n')
print(patient_cspca_counts / len(patient_labels_df) * 100)

(1476, 12)
1476 

NO     1051
YES     425
Name: case_csPCa, dtype: int64 

NO     71.205962
YES    28.794038
Name: case_csPCa, dtype: float64


In [None]:
# Build mapping.csv: one row per image file with its patient id, full image
# path, binary label and modality.
output_folder = '/content/drive/My Drive/ABELE_prostate/claudio/black_box/data/CSV/'
os.makedirs(output_folder, exist_ok=True)

# Explicit marksheet value -> class index. Any other value (e.g. a missing
# entry) raises instead of being silently labelled as positive.
label_by_cspca = {'NO': 0, 'YES': 1}

patient_ids = []
image_paths = []
labels = []
modalities = []
# Patients listed in the marksheet that have no image folder under source_root.
missing_patients = []

# Iterate through each patient in the de-duplicated marksheet
for patient_id, case_csPCa in zip(patient_labels_df['patient_id'],
                                  patient_labels_df['case_csPCa']):
    if case_csPCa not in label_by_cspca:
        raise ValueError(
            f"Unexpected case_csPCa value {case_csPCa!r} for patient {patient_id}"
        )
    label = label_by_cspca[case_csPCa]

    # Images of a patient live in a folder named after the patient ID.
    patient_path = os.path.join(source_root, f"{patient_id}")
    if not os.path.isdir(patient_path):
        missing_patients.append(patient_id)
        continue

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of mapping.csv reproducible across runs and file systems.
    for image in sorted(os.listdir(patient_path)):
        image_path = os.path.join(patient_path, image)
        # Only the third field returned by the filename parser (the image
        # type / modality) is needed here.
        _, _, image_type, _ = crop_data_utils.parse_image_filename(image, class_label=False)
        patient_ids.append(patient_id)
        image_paths.append(image_path)
        labels.append(label)
        modalities.append(image_type)

# Make skipped patients visible instead of dropping them silently.
print(f"Patients in the marksheet without an image folder: {len(missing_patients)}")

# Create a DataFrame from the lists
mapping_df = pd.DataFrame({
    'patient_id': patient_ids,
    'image_path': image_paths,
    'label': labels,
    'modality': modalities
})
# Save the mapping DataFrame as a CSV file
mapping_csv_path = os.path.join(output_folder, 'mapping.csv')
mapping_df.to_csv(mapping_csv_path, index=False)

# Dataset split

In [None]:
# Reload the image-level mapping from disk and summarise its class balance
# (absolute count and percentage of rows per label).
labeled_df = pd.read_csv('/content/drive/My Drive/ABELE_prostate/claudio/black_box/data/CSV/mapping.csv')
total_samples = len(labeled_df)
label_counts = labeled_df['label'].value_counts().sort_index()
percentage_per_class = label_counts.div(total_samples).mul(100)

result_df = pd.DataFrame(
    {'Count': label_counts, 'Percentage': percentage_per_class}
).sort_index()

# Mapping frame, number of distinct patients and balance table, one per line.
print(labeled_df, labeled_df['patient_id'].nunique(), result_df, sep="\n")

       patient_id                                         image_path  label  \
0           11326  /content/drive/My Drive/ABELE_prostate/claudio...      0   
1           11326  /content/drive/My Drive/ABELE_prostate/claudio...      0   
2           11326  /content/drive/My Drive/ABELE_prostate/claudio...      0   
3           11326  /content/drive/My Drive/ABELE_prostate/claudio...      0   
4           11326  /content/drive/My Drive/ABELE_prostate/claudio...      0   
...           ...                                                ...    ...   
21217       10795  /content/drive/My Drive/ABELE_prostate/claudio...      0   
21218       10795  /content/drive/My Drive/ABELE_prostate/claudio...      0   
21219       10795  /content/drive/My Drive/ABELE_prostate/claudio...      0   
21220       10795  /content/drive/My Drive/ABELE_prostate/claudio...      0   
21221       10795  /content/drive/My Drive/ABELE_prostate/claudio...      0   

      modality  
0          adc  
1          adc  


## train test split (NO validation)

In [None]:
# Split the mapping into train and test only (no validation set). The split
# logic lives in mapping_train_test_split_utils (not shown here); test_size is
# presumably the held-out fraction and random_state the seed -- confirm there.
train_df, test_df = mapping_train_test_split_utils.custom_train_test_split(
    labeled_df,
    test_size=0.2,
    random_state=34,
)
mapping_train_test_split_utils.check_dataframes(train_df, test_df)

In [None]:
# @title export to gdrive
# Write the train/test frames of the no-validation split to Google Drive.
# df_path ends with '/', so os.path.join appends the file name directly.
df_path = '/content/drive/MyDrive/ABELE_prostate/claudio/black_box/data/CSV/dataframes/no_val/'
traindf_path = os.path.join(df_path, 'traindf.csv')
testdf_path = os.path.join(df_path, 'testdf.csv')
os.makedirs(df_path, exist_ok=True)

for split_df, split_path in ((train_df, traindf_path), (test_df, testdf_path)):
    split_df.to_csv(split_path, index=False)

print('train df shape:', train_df.shape)
print('test df shape:', test_df.shape)

train df shape: (16971, 4)
test df shape: (4251, 4)


## train test split (with validation)

In [None]:
# Split the mapping into train / validation / test. Judging by the recorded
# sizes, test_size=0.2 holds out 20% and val_size=0.5 sends half of that
# held-out part to validation -- confirm in mapping_train_test_split_utils.
train_df, valid_df, test_df = mapping_train_test_split_utils.custom_train_test_split(
    labeled_df,
    test_size=0.2,
    val_size=0.5,
    random_state=34,
)

# Note the argument order expected by the checker here: train, test, validation.
mapping_train_test_split_utils.check_dataframes(train_df, test_df, valid_df)

Total size: 21222
Train dataframe size: 16971
Validation dataframe size: 2125
Test dataframe size: 2126
Train dataframe unique IDs: 1140
Validation dataframe unique IDs: 143
Test dataframe unique IDs: 143 

Train dataframe samples:     patient_id                                         image_path  label  \
30       10302  /content/drive/My Drive/ABELE_prostate/claudio...      0   
31       10302  /content/drive/My Drive/ABELE_prostate/claudio...      0   
32       10302  /content/drive/My Drive/ABELE_prostate/claudio...      0   
33       10302  /content/drive/My Drive/ABELE_prostate/claudio...      0   
34       10302  /content/drive/My Drive/ABELE_prostate/claudio...      0   

   modality  
30      adc  
31      adc  
32      adc  
33      adc  
34      adc   

Validation dataframe samples:     patient_id                                         image_path  label  \
15       10965  /content/drive/My Drive/ABELE_prostate/claudio...      1   
16       10965  /content/drive/My Drive/ABE

In [None]:
# @title export to gdrive
# Write the train/validation/test frames of the split with validation to
# Google Drive. df_path ends with '/', so os.path.join appends the file name
# directly.
df_path = '/content/drive/MyDrive/ABELE_prostate/claudio/black_box/data/CSV/dataframes/val/'
traindf_path = os.path.join(df_path, 'traindf.csv')
valdf_path = os.path.join(df_path, 'valdf.csv')
testdf_path = os.path.join(df_path, 'testdf.csv')
os.makedirs(df_path, exist_ok=True)

for split_df, split_path in (
    (train_df, traindf_path),
    (valid_df, valdf_path),
    (test_df, testdf_path),
):
    split_df.to_csv(split_path, index=False)

print('train df shape:', train_df.shape)
print('validation df shape:', valid_df.shape)
print('test df shape:', test_df.shape)

train df shape: (16971, 4)
validation df shape: (2125, 4)
test df shape: (2126, 4)


## Create Separate DF for each Image Modality

In [None]:
# Read the original CSV files.
# NOTE: traindf_path / valdf_path / testdf_path are whatever the most recently
# executed export cell assigned; this cell needs the split *with* validation
# (the no-validation export cell does not define valdf_path).
train_df = pd.read_csv(traindf_path)
valid_df = pd.read_csv(valdf_path)
test_df = pd.read_csv(testdf_path)

# Create separate DataFrames for each modality (rows whose 'modality' column
# equals 'adc', 't2w' or 'hbv'). The individual names are kept because later
# cells reference them directly.
train_df_adc = train_df[train_df['modality'] == 'adc']
valid_df_adc = valid_df[valid_df['modality'] == 'adc']
test_df_adc = test_df[test_df['modality'] == 'adc']

train_df_t2w = train_df[train_df['modality'] == 't2w']
valid_df_t2w = valid_df[valid_df['modality'] == 't2w']
test_df_t2w = test_df[test_df['modality'] == 't2w']

train_df_hbv = train_df[train_df['modality'] == 'hbv']
valid_df_hbv = valid_df[valid_df['modality'] == 'hbv']
test_df_hbv = test_df[test_df['modality'] == 'hbv']

# Report the shape of every split, grouped by modality and separated by a blank
# line. Labels are generated so they stay consistent (the earlier hand-written
# labels contained the typos 'teset' and 'T2w').
modality_splits = {
    'ADC': (train_df_adc, valid_df_adc, test_df_adc),
    'T2W': (train_df_t2w, valid_df_t2w, test_df_t2w),
    'HBV': (train_df_hbv, valid_df_hbv, test_df_hbv),
}
for position, (modality_label, split_dfs) in enumerate(modality_splits.items()):
    if position:
        print()
    for split_name, split_df in zip(('train', 'valid', 'test'), split_dfs):
        print(f'{split_name} {modality_label} df shape:', split_df.shape)

train ADC df shape: (6068, 4)
valid ADC df shape: (725, 4)
test ADC df shape: (742, 4)

train T2W df shape: (6054, 4)
valid T2w df shape: (744, 4)
teset T2W df shape: (720, 4)

train HBV df shape: (6092, 4)
valid HBV df shape: (735, 4)
teset HBV df shape: (742, 4)


In [None]:
# checking class distribution in the dfs
# The printed "dataframe N" index follows this list: for each modality
# (ADC, T2W, HBV) the order is train, test, valid.
dataframes = [
    train_df_adc, test_df_adc, valid_df_adc,
    train_df_t2w, test_df_t2w, valid_df_t2w,
    train_df_hbv, test_df_hbv, valid_df_hbv,
]

for i, df in enumerate(dataframes):
    total_samples = len(df)
    label_counts = df['label'].value_counts().sort_index()
    percentage_per_class = label_counts.div(total_samples).mul(100)

    # Count and percentage per label, side by side.
    result_df = pd.DataFrame(
        {'Count': label_counts, 'Percentage': percentage_per_class}
    ).sort_index()

    print(f"Class balance for dataframe {i + 1}:\n")
    print(result_df)
    print(f"\n{'=' * 40}\n")

Class balance for dataframe 1:

   Count  Percentage
0   4449   73.319051
1   1619   26.680949


Class balance for dataframe 2:

   Count  Percentage
0    532   71.698113
1    210   28.301887


Class balance for dataframe 3:

   Count  Percentage
0    527   72.689655
1    198   27.310345


Class balance for dataframe 4:

   Count  Percentage
0   4436   73.273869
1   1618   26.726131


Class balance for dataframe 5:

   Count  Percentage
0    520   72.222222
1    200   27.777778


Class balance for dataframe 6:

   Count  Percentage
0    529   71.102151
1    215   28.897849


Class balance for dataframe 7:

   Count  Percentage
0   4468   73.342088
1   1624   26.657912


Class balance for dataframe 8:

   Count  Percentage
0    532   71.698113
1    210   28.301887


Class balance for dataframe 9:

   Count  Percentage
0    527    71.70068
1    208    28.29932




In [None]:
# Export the per-modality train/validation/test frames, one folder per modality.
df_path_adc = '/content/drive/MyDrive/ABELE_prostate/claudio/black_box/data/CSV/prostate_centered/val/adc/'
df_path_t2w = '/content/drive/MyDrive/ABELE_prostate/claudio/black_box/data/CSV/prostate_centered/val/t2w/'
df_path_hbv = '/content/drive/MyDrive/ABELE_prostate/claudio/black_box/data/CSV/prostate_centered/val/hbv/'

# Target files: <modality folder>/<split>df_<modality>.csv
traindf_adc_path = os.path.join(df_path_adc, 'traindf_adc.csv')
valdf_adc_path = os.path.join(df_path_adc, 'valdf_adc.csv')
testdf_adc_path = os.path.join(df_path_adc, 'testdf_adc.csv')

traindf_t2w_path = os.path.join(df_path_t2w, 'traindf_t2w.csv')
valdf_t2w_path = os.path.join(df_path_t2w, 'valdf_t2w.csv')
testdf_t2w_path = os.path.join(df_path_t2w, 'testdf_t2w.csv')

traindf_hbv_path = os.path.join(df_path_hbv, 'traindf_hbv.csv')
valdf_hbv_path = os.path.join(df_path_hbv, 'valdf_hbv.csv')
testdf_hbv_path = os.path.join(df_path_hbv, 'testdf_hbv.csv')

# Make sure every modality folder exists before writing into it.
for modality_dir in (df_path_adc, df_path_t2w, df_path_hbv):
    os.makedirs(modality_dir, exist_ok=True)

# Written in the same order as before: ADC, T2W, HBV; train, valid, test.
modality_exports = (
    (train_df_adc, traindf_adc_path),
    (valid_df_adc, valdf_adc_path),
    (test_df_adc, testdf_adc_path),
    (train_df_t2w, traindf_t2w_path),
    (valid_df_t2w, valdf_t2w_path),
    (test_df_t2w, testdf_t2w_path),
    (train_df_hbv, traindf_hbv_path),
    (valid_df_hbv, valdf_hbv_path),
    (test_df_hbv, testdf_hbv_path),
)
for export_df, export_path in modality_exports:
    export_df.to_csv(export_path, index=False)