In [87]:
import os
import pickle
import numpy as np
import glob
import random
import re
from collections import Counter
import pandas as pd

class_labels = ["CBFB_MYH11","control","NPM1","PML_RARA","RUNX1_RUNX1T1"]
n_classes = len(class_labels)

#Function to get list of image_paths in one folder
def get_image_path_list(folder_path):
    tif_files = glob.glob(f"{folder_path}/*.tif")
    return tif_files

"""
shuffle list into n new lists:
num_lists: number of new lists
items_per_list: item per each new list
"""
def shuffle_into_lists(original_list, num_lists, items_per_list,seed=4):
    # Shuffle the original list in-place
    random.seed(seed)
    random.shuffle(original_list)
    total_items = len(original_list)
    
    # Check if the specified number of lists and items per list are valid
    if num_lists * items_per_list > total_items:
        raise ValueError("Invalid parameters: Not enough items in the original list.")
    
    result_lists = [original_list[i:i+items_per_list] for i in range(0, num_lists*items_per_list, items_per_list)]
    return result_lists

#exracts the number of image in the file_path e.g. "image_123.tif"
def extract_number_image(file_path):
    # Use a regular expression to find the number after "image_" and before ".tif"
    match = re.search(r'image_(\d+).tif', file_path)
    return int(match.group(1))

# Shuffle into n patients with n images
n_patients=5
n_images=10
experiment_name = "experiment_2"

# Set the path to your data directory
data_directory = '../../data'

In [88]:


# Get a list of all class folders in the data directory
class_folders = [folder for folder in os.listdir(data_directory) if os.path.isdir(os.path.join(data_directory, folder))]
print(class_folders)
# Dictionary to store shuffled image paths for each class
shuffled_images = {}

# Iterate over each class folder
for class_folder in class_folders:
    class_path = os.path.join(data_directory, class_folder)
    print(class_path)
    print(class_folder)
    
    # Get a list of all patient folders in the class folder
    patient_folders = [folder for folder in os.listdir(class_path) if os.path.isdir(os.path.join(class_path, folder))]
    
    # List to store image paths for the current class
    class_image_paths = []
    
    # Iterate over each patient folder in the class
    for patient_folder in patient_folders:
        patient_path = os.path.join(class_path, patient_folder)

        # Get a list of all image file paths of the current patient
        image_paths = get_image_path_list(patient_path)

        # Extend the class_image_paths list with the current patient's image paths
        class_image_paths.extend(image_paths)
    
    shuffled_lists = shuffle_into_lists(class_image_paths, n_patients, n_images)
    shuffled_images[class_folder] = shuffled_lists
print(shuffled_images)
   
# Now you have a dictionary where keys are class folders and values are lists of shuffled image paths
# You can access them like this: shuffled_images['NPM1'], shuffled_images['control'], etc.
        

['PML_RARA', 'CBFB_MYH11', 'NPM1', 'RUNX1_RUNX1T1', 'control']
../../data/PML_RARA
PML_RARA
../../data/CBFB_MYH11
CBFB_MYH11
../../data/NPM1
NPM1
../../data/RUNX1_RUNX1T1
RUNX1_RUNX1T1
../../data/control
control
{'PML_RARA': [['../../data/PML_RARA/CWF/image_153.tif', '../../data/PML_RARA/ALE/image_293.tif', '../../data/PML_RARA/CMR/image_304.tif', '../../data/PML_RARA/ALE/image_138.tif', '../../data/PML_RARA/BHS/image_42.tif', '../../data/PML_RARA/CWF/image_380.tif', '../../data/PML_RARA/ALE/image_478.tif', '../../data/PML_RARA/CWF/image_60.tif', '../../data/PML_RARA/CWF/image_282.tif', '../../data/PML_RARA/BHS/image_116.tif'], ['../../data/PML_RARA/ALE/image_195.tif', '../../data/PML_RARA/BHS/image_384.tif', '../../data/PML_RARA/BHS/image_382.tif', '../../data/PML_RARA/ALE/image_7.tif', '../../data/PML_RARA/CMR/image_76.tif', '../../data/PML_RARA/CMR/image_294.tif', '../../data/PML_RARA/CWF/image_323.tif', '../../data/PML_RARA/CWF/image_270.tif', '../../data/PML_RARA/CMR/image_46.tif'

In [89]:
# Save the new patients
output_folder = '../../artificialdata/'+experiment_name+'/data'
print("Output Folder:", output_folder)

patient_classes = []

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Iterate over each class and its shuffled image lists
for class_folder, shuffled_lists in shuffled_images.items():
    # Create a folder for the current class in the output directory
    output_class_folder = os.path.join(output_folder, class_folder)
    print("Output Class Folder:", output_class_folder)
    os.makedirs(output_class_folder, exist_ok=True)
    
    # Iterate over each shuffled patient folder
    for i, shuffled_patient_folder in enumerate(shuffled_lists):
        # Create a new folder for the shuffled patient
        new_patient_folder = os.path.join(output_class_folder, f'patient_{i+1}')
        os.makedirs(new_patient_folder, exist_ok=True)

        # Store class of each patient in a list for the metadate
        patient_classes.append(class_folder)

        # Sort the shuffled image paths for the current patient
        shuffled_patient_folder.sort()
        
        # Save the shuffled image paths into a text file
        txt_file_path = os.path.join(new_patient_folder, 'images.txt')
        print(txt_file_path)
        with open(txt_file_path, 'w') as txt_file:
            for image_path in shuffled_patient_folder:
                txt_file.write(image_path + '\n')
#print(patient_classes)


Output Folder: ../../artificialdata/experiment_2/data
Output Class Folder: ../../artificialdata/experiment_2/data/PML_RARA
../../artificialdata/experiment_2/data/PML_RARA/patient_1/images.txt
../../artificialdata/experiment_2/data/PML_RARA/patient_2/images.txt
../../artificialdata/experiment_2/data/PML_RARA/patient_3/images.txt
../../artificialdata/experiment_2/data/PML_RARA/patient_4/images.txt
../../artificialdata/experiment_2/data/PML_RARA/patient_5/images.txt
Output Class Folder: ../../artificialdata/experiment_2/data/CBFB_MYH11
../../artificialdata/experiment_2/data/CBFB_MYH11/patient_1/images.txt
../../artificialdata/experiment_2/data/CBFB_MYH11/patient_2/images.txt
../../artificialdata/experiment_2/data/CBFB_MYH11/patient_3/images.txt
../../artificialdata/experiment_2/data/CBFB_MYH11/patient_4/images.txt
../../artificialdata/experiment_2/data/CBFB_MYH11/patient_5/images.txt
Output Class Folder: ../../artificialdata/experiment_2/data/NPM1
../../artificialdata/experiment_2/data/NP

In [90]:

# Iterate over each class and its shuffled image lists
for class_folder, shuffled_lists in shuffled_images.items():
    # Iterate over each shuffled patient folder
    for patient, shuffled_patient_folder in enumerate(shuffled_lists):
        print(f"Save patient {patient + 1} features")
        array_list = []
        previous_patient_id = None

        # Iterate through each image path
        for image_path in shuffled_patient_folder:
            patient_id = image_path[:image_path.find("/image")]
            if previous_patient_id != patient_id:
                features = np.load(os.path.join(patient_id, "fnl34_bn_features_layer_7.npy"))
                previous_patient_id = patient_id

            array_list.append(features[extract_number_image(image_path)])

        # Concatenate all features for one artificial patient
        artificial_features = np.stack(array_list, axis=0)  # Use np.stack for concatenation along a new axis

        # Output directory for saving feature vectors
        output_npy_folder = os.path.join(output_folder, class_folder, f"patient_{patient+1}")
        os.makedirs(output_npy_folder, exist_ok=True)

        # Save the array to the .npy file
        output_npy_file = os.path.join(output_npy_folder, "fnl34_bn_features_layer_7.npy")
        np.save(output_npy_file, artificial_features)


Save patient 1 features
Save patient 2 features
Save patient 3 features
Save patient 4 features
Save patient 5 features
Save patient 1 features
Save patient 2 features
Save patient 3 features
Save patient 4 features


Save patient 5 features
Save patient 1 features
Save patient 2 features
Save patient 3 features
Save patient 4 features
Save patient 5 features
Save patient 1 features
Save patient 2 features
Save patient 3 features
Save patient 4 features
Save patient 5 features
Save patient 1 features
Save patient 2 features
Save patient 3 features
Save patient 4 features
Save patient 5 features


In [91]:
#Save metadata file
columns = ['patient_id', 'sex_1f_2m', 'age', 'bag_label', 'instance_count',
       'leucocytes_per_µl', 'pb_myeloblast', 'pb_promyelocyte',
       'pb_myelocyte', 'pb_metamyelocyte', 'pb_neutrophil_band',
       'pb_neutrophil_segmented', 'pb_eosinophil', 'pb_basophil',
       'pb_monocyte', 'pb_lymph_typ', 'pb_lymph_atyp_react',
       'pb_lymph_atyp_neopl', 'pb_other', 'pb_total']
artifcialmetadata = pd.DataFrame(columns=columns)
artifcialmetadata['patient_id'] = [f"patient{i%5 + 1}" for i in range(n_patients*n_classes)]
artifcialmetadata['bag_label'] = patient_classes
artifcialmetadata['instance_count'] = n_images
artifcialmetadata.to_csv(os.path.dirname(output_folder)+'/metadata.csv', index=False)



In [92]:

experiment_1_directory = output_folder  # Replace with the actual output folder

# Iterate through each AML subtype
for aml_subtype in class_labels:
    class_folder = os.path.join(experiment_1_directory, aml_subtype)

    # Check if the input folder exists
    if os.path.exists(class_folder):
        # Find the number of patients in the input folder
        patient_folders = [folder for folder in os.listdir(class_folder) if folder.startswith("patient_")]

        # Iterate through each patient
        for patient_folder in patient_folders:
            patient_number = int(patient_folder[len("patient_"):])  # Extract the patient number from the folder name
            input_patient_folder = os.path.join(class_folder, patient_folder)

            # Read the existing images.txt file
            txt_file_path = os.path.join(input_patient_folder, 'images.txt')
            with open(txt_file_path, 'r') as txt_file:
                image_file_paths = [line.strip() for line in txt_file.readlines()]

            # Save the image_file_paths using pickle
            with open(os.path.join(input_patient_folder, 'image_file_paths'), 'wb') as fp:
                pickle.dump(image_file_paths, fp)
