In [33]:
import os
import pandas as pd
import numpy as np
# display all the columns and rows in pandas
pd.set_option('display.max_columns', None)

In [23]:
split_dir = '/home/mamur/TUM/MLMI/data/splits'
data_dir = "/home/mamur/TUM/MLMI/data/"

In [3]:
# list the files in splits directory
files = os.listdir(split_dir)

In [4]:
pathologies = []
non_pathologies = ['normal_val', 'normal_test', 'normal_train', 'normal_test_ann', 'ixi_normal_train', '_ann', '_neg']
for file in files:
    # if file does not contain any of the non_pathologies as substring
    if not any([non_pathology in file for non_pathology in non_pathologies]):
        pathology = file.split('.')[0]
        pathologies.append(pathology)

In [5]:
pathologies

['posttreatment',
 'lesions',
 'mass_all',
 'intraventricular',
 'encephalomalacia',
 'absent_septum',
 'mass',
 'edema',
 'other',
 'artefacts',
 'craniatomy',
 'wml',
 'ea_mass',
 'dural',
 'resection',
 'enlarged_ventricles',
 'sinus',
 'local']

In [6]:
# check if there are any duplicates in splits files
train_csv_ixi = os.path.join(split_dir, 'ixi_normal_train.csv')
train_csv_fastMRI = os.path.join(split_dir, 'normal_train.csv')
val_csv = os.path.join(split_dir, 'normal_val.csv')
test_csv_fastMRI = os.path.join(split_dir, 'normal_test.csv')
test_gt_csv_fastMRI = os.path.join(split_dir, 'normal_test_ann.csv')

# Load csv files
train_files_ixi = pd.read_csv(train_csv_ixi)['filename'].tolist()
train_files_fastMRI = pd.read_csv(train_csv_fastMRI)['filename'].tolist()
val_files = pd.read_csv(val_csv)['filename'].tolist()
test_files_fastMRI = pd.read_csv(test_csv_fastMRI)['filename'].tolist()
test_gt_files_fastMRI = pd.read_csv(test_gt_csv_fastMRI)['filename'].tolist()

In [7]:
# intersection between train_csv_ixi and train_csv_fastMRI
duplicated_files = set(train_files_ixi).intersection(set(train_files_fastMRI))

In [8]:
# check if all duplicated files are also in train_csv_fastMRI if true, then we can remove them from train_csv_ixi
if all([duplicated_file in train_files_fastMRI for duplicated_file in duplicated_files]):
    print('All duplicated files are also in train_csv_fastMRI')
    # remove duplicated files from train_csv_ixi
    print(len(duplicated_files))

else:
    print('Not all duplicated files are also in train_csv_fastMRI')
    # remove duplicated files from train_csv_ixi
    # print length of duplicated files
    print(len(duplicated_files))

train_files_ixi = [train_file for train_file in train_files_ixi if train_file not in duplicated_files]

All duplicated files are also in train_csv_fastMRI
130


In [9]:
print("check duplicates again after removing them from train_csv_ixi")
len(set(train_files_ixi).intersection(set(train_files_fastMRI)))

check duplicates again after removing them from train_csv_ixi


0

In [10]:
training_samples = train_files_ixi + train_files_fastMRI
val_samples = val_files
test_samples = test_files_fastMRI
test_samples_gt = test_gt_files_fastMRI

In [11]:
# print the lenght of each set
print('training samples: ', len(training_samples))
print('validation samples: ', len(val_samples))
print('test samples: ', len(test_samples))
print('test samples gt: ', len(test_samples_gt))


training samples:  711
validation samples:  15
test samples:  30
test samples gt:  30


In [12]:
# create a df from training_samples val_samples and test_samples and for each sample add split value and for test samples also add gt image
train_df = pd.DataFrame(training_samples, columns=['image'])
train_df['split'] = 'train'
train_df['gt_positive'] = 'None'
train_df['gt_negative'] = 'None'
train_df['pathology'] = 'None'

val_df = pd.DataFrame(val_samples, columns=['image'])
val_df['split'] = 'val'
val_df['gt_positive'] = 'None'
val_df['gt_negative'] = 'None'
val_df['pathology'] = 'None'

test_df = pd.DataFrame(test_samples, columns=['image'])
test_df['split'] = 'test'
test_df['gt_positive'] = 'None'
test_df['gt_negative'] = 'None'
test_df['pathology'] = 'None'
test_df['gt_image'] = test_samples_gt
# concat all dfs
df = pd.concat([train_df, val_df, test_df], ignore_index=True)

In [13]:
df = df[["image", "split", "pathology", "gt_image", "gt_positive", "gt_negative"]]

In [14]:
df.to_excel('dataset.xlsx', index=False)

In [29]:
def create_pathology_df(pathologies, split_dir):
    # List to hold all the rows before creating a DataFrame
    all_rows = []

    for pathology in pathologies:
        normal_file = pathology + '.csv'
        neg_file = pathology + '_neg.csv'
        ann_file = pathology + '_ann.csv'

        # Load csv files
        normal_files = pd.read_csv(os.path.join(split_dir, normal_file))['filename'].tolist()
        neg_files = pd.read_csv(os.path.join(split_dir, neg_file))['filename'].tolist()
        ann_files = pd.read_csv(os.path.join(split_dir, ann_file))['filename'].tolist() if ann_file != "local_ann.csv" else []

        # Create dictionaries for faster lookup
        neg_files_dict = {format_basename(file): file for file in neg_files}
        ann_files_dict = {format_basename(file): file for file in ann_files} if ann_files else {}

        # Create a row for each csv files contents
        for normal_file_path in normal_files:
            formatted_basename = format_basename(normal_file_path)
            # Find corresponding negative and annotation files, if they exist
            full_mask_file_path = normal_file_path.split('.png')[0] + '_brain_map_full.png'
            # check if full_mask_file_path exists
            if os.path.exists(data_dir + full_mask_file_path.split("./data/")[-1]):
                gt_image = full_mask_file_path
            else:
                gt_image = 'None'
            neg_file_path = neg_files_dict.get(formatted_basename, 'None')
            ann_file_path = ann_files_dict.get(formatted_basename, 'None')

            # Append the new row to the list
            all_rows.append({'image': normal_file_path, "gt_image":gt_image, 'gt_positive': ann_file_path,
                             'gt_negative': neg_file_path, 'pathology': pathology})

    # Convert the list of dictionaries to a DataFrame
    return pd.DataFrame(all_rows, columns=['image', "pathology", "gt_image", 'gt_positive', 'gt_negative'])

def format_basename(file_path):
    """Format the basename of the file according to the specified format."""
    temp = os.path.basename(file_path).split("_")
    return "_".join(part.split(".")[0] for part in temp[:5])

In [30]:
pathology_df = create_pathology_df(pathologies, split_dir=split_dir)

In [34]:
def divide_dataset_into_splits(df, seed=None):
    """
    Randomly divide the dataset into three equal splits: train, test, and val.

    :param df: The DataFrame to split.
    :param seed: Optional; a seed for the random number generator for reproducibility.
    :return: DataFrame with an additional column 'split' indicating the assigned split.
    """
    # Ensure reproducibility if a seed is provided
    if seed is not None:
        np.random.seed(seed)

    # Shuffle the DataFrame
    shuffled_df = df.sample(frac=1).reset_index(drop=True)

    # Calculate the size of each split
    split_size = len(shuffled_df) // 3

    # Assign the splits
    shuffled_df['split'] = 'train'
    shuffled_df.loc[split_size:2*split_size-1, 'split'] = 'test'
    shuffled_df.loc[2*split_size:, 'split'] = 'val'

    return shuffled_df

# Example usage:
# pathology_df = create_pathology_df(pathologies, split_dir)
# split_df = divide_dataset_into_splits(pathology_df, seed=123)


In [35]:
pathology_df = divide_dataset_into_splits(pathology_df, seed=123)

In [38]:
pathology_df = pathology_df[["image", "split", "pathology", "gt_image", "gt_positive", "gt_negative"]]

In [39]:
pathology_df.to_excel('pathologies_dataset.xlsx', index=False)

In [37]:
df

Unnamed: 0,image,split,pathology,gt_image,gt_positive,gt_negative
0,./data/IXI/sub-IXI002_ses-M00_T1w_space-MNI152...,train,,,,
1,./data/IXI/sub-IXI012_ses-M00_T1w_space-MNI152...,train,,,,
2,./data/IXI/sub-IXI013_ses-M00_T1w_space-MNI152...,train,,,,
3,./data/IXI/sub-IXI014_ses-M00_T1w_space-MNI152...,train,,,,
4,./data/IXI/sub-IXI015_ses-M00_T1w_space-MNI152...,train,,,,
...,...,...,...,...,...,...
751,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,test,,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,,
752,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,test,,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,,
753,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,test,,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,,
754,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,test,,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,,


In [41]:
# since there are no common images between df and pathology_df, we can just concat them
df = pd.concat([df, pathology_df], ignore_index=True)

In [42]:
df

Unnamed: 0,image,split,pathology,gt_image,gt_positive,gt_negative
0,./data/IXI/sub-IXI002_ses-M00_T1w_space-MNI152...,train,,,,
1,./data/IXI/sub-IXI012_ses-M00_T1w_space-MNI152...,train,,,,
2,./data/IXI/sub-IXI013_ses-M00_T1w_space-MNI152...,train,,,,
3,./data/IXI/sub-IXI014_ses-M00_T1w_space-MNI152...,train,,,,
4,./data/IXI/sub-IXI015_ses-M00_T1w_space-MNI152...,train,,,,
...,...,...,...,...,...,...
1067,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,val,mass,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,./data/fastMRI/brain_mid_anno_pos_png/file_bra...,./data/fastMRI/brain_mid_anno_neg_png/file_bra...
1068,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,val,mass_all,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,./data/fastMRI/brain_mid_anno_pos_png/file_bra...,./data/fastMRI/brain_mid_anno_neg_png/file_bra...
1069,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,val,posttreatment,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,./data/fastMRI/brain_mid_anno_pos_png/file_bra...,./data/fastMRI/brain_mid_anno_neg_png/file_bra...
1070,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,val,local,./data/fastMRI/brain_mid_png/file_brain_AXT1_2...,,./data/fastMRI/brain_mid_anno_neg_png/file_bra...


In [44]:
# sort on split column
df = df.sort_values(by=['split'])

In [45]:
df.to_excel('master_dataset.xlsx', index=False)