In [71]:
import numpy as np
import glob
import os
import random
import re
from collections import Counter
import pandas as pd
import pickle

class_labels = ["CBFB_MYH11","control","NPM1","PML_RARA","RUNX1_RUNX1T1"]
n_classes = len(class_labels)

#Function to get list of image_paths in one folder
def get_image_path_list(folder_path):
    tif_files = glob.glob(f"{folder_path}/*.tif")
    return tif_files


#exracts the number of image in the file_path e.g. "image_123.tif"
def extract_number_image(file_path):
    # Use a regular expression to find the number after "image_" and before ".tif"
    match = re.search(r'image_(\d+).tif', file_path)
    return int(match.group(1))

def get_patient_name(path):
    return re.search(r"/data/\w+/([A-Z]{3})", path).group(1)

def get_class_name(path):
    return re.search(r"/data/(\w+)", path).group(1)

def get_classification_patient(patient_folder):
    probs_path = patient_folder + '/single_cell_probabilities.npy'
    sc_probs = np.load(probs_path)
    sc_class= np.argmax(sc_probs, axis=1)
    return sc_class


In [72]:
data_directory = '../../data'
subtype=data_directory+"/NPM1"
df = pd.DataFrame(columns=["patient","AML_subtype", "SC_Label", "image_path"])
# Save class classification count for each patient in csv file
for folder_class in os.listdir(data_directory):
    folder_class = os.path.join(data_directory, folder_class)
    
    if os.path.isdir(folder_class):
       print(folder_class)
       AML_subtype=get_class_name(folder_class)
       for folder_patient in os.listdir(folder_class):
            folder_patient = os.path.join(folder_class, folder_patient)
            
            if os.path.isdir(folder_patient):
                images=get_image_path_list(folder_patient)
                sc_classes=get_classification_patient(folder_patient)

                for image in images:
                    number=extract_number_image(image)
                    df.loc[len(df)]=[get_patient_name(folder_patient), AML_subtype, sc_classes[number],image]

../../data/PML_RARA


../../data/CBFB_MYH11
../../data/NPM1
../../data/RUNX1_RUNX1T1
../../data/control


In [73]:
aml_subtype="NPM1"
sc_class_labels= ['eosinophil granulocyte', 'reactive lymphocyte',
       'neutrophil granulocyte (segmented)', 'typical lymphocyte',
       'other', 'neutrophil granulocyte (band)', 'monocyte',
       'large granulated lymphocyte', 'atypical promyelocyte',
       'basophil granulocyte', 'smudge cell', 'neoplastic lymphocyte',
       'promyelocyte', 'myelocyte', 'myeloblast', 'metamyelocyte',
       'normo', 'plasma cell', 'hair cell', 'bilobed M3v',
       'mononucleosis']
#df_meanstd=pd.read_csv(data_directory+"/mean_std_sc_class.csv")
df_sc_res=pd.read_csv(data_directory+"/single_cell_results.csv",index_col=0).drop("patient", axis=1)
df_meanstd = df_sc_res.groupby(["AML_subtype"]).agg(["mean","std"])
# Sample data with normal distribution

In [74]:
# Shuffle into n patients with n images
n_patients=5
experiment_name = "experiment_3"

In [75]:
patient_classes=[]
number_images=[]
output_folder = '../../artificialdata/'+experiment_name+'/data'
#Iterate over all AML subtypes 
for aml_subtype in class_labels:
    output_class_folder=output_folder+"/"+aml_subtype
    #Get distribution
    class_means = df_meanstd.loc[aml_subtype,:].loc[:,"mean"].values
    class_variances = df_meanstd.loc[aml_subtype,:].loc[:,"std"].values
    for patient_number in range(n_patients):
       
        #calculate how many images from each sc class
        generated_data = np.random.normal(class_means,class_variances,21).astype(int)
        generated_data = generated_data*(generated_data>0)
        image_file_paths=[]
        #randomly choose sc-image and concatinate them into image_file_paths
        for cell_type_number, cell_type in enumerate(sc_class_labels):
            df_cell_type=df[df["SC_Label"]==cell_type_number]
            #print(cell_type)
            file_path=df_cell_type["image_path"].values
            #print(file_path)
            image_file_paths+=(np.random.choice(file_path, size=generated_data[cell_type_number]).tolist())
            #print(random_image)
        #Store patient classes and number images for meta data
        patient_classes.append(aml_subtype)
        number_images.append(len(image_file_paths))
        new_patient_folder = os.path.join(output_class_folder, f'patient_{patient_number+1}')
        os.makedirs(new_patient_folder, exist_ok=True)

        # Sort the shuffled image paths for the current patient
        image_file_paths.sort()
        # Save the shuffled image paths into a text file
        txt_file_path = os.path.join(new_patient_folder, 'images.txt')
        with open(txt_file_path, 'w') as txt_file:
            for image_path in image_file_paths:
                txt_file.write(image_path + '\n')

        with open( os.path.join(new_patient_folder,"image_file_paths") ,'wb') as fp:
            pickle.dump(image_file_paths,fp)
    

In [76]:
for aml_subtype in class_labels:
    output_class_folder=output_folder+"/"+aml_subtype    
    for patient_number in range(n_patients):
        patient_folder = os.path.join(output_class_folder, f'patient_{patient_number+1}')
        with open( os.path.join(patient_folder,"image_file_paths") ,'rb') as fp:
            image_file_paths=pickle.load(fp)
        print(image_file_paths)
        print(f"Save patient {patient_number + 1} features")
        array_list=[]
        previous_patient_id=None
        # Iterate through each image path
        for image_path in image_file_paths:
            patient_id = image_path[:image_path.find("/image")]
            if previous_patient_id!=patient_id:
                #print(f"New patient: {patient_id}, old patient : {previous_patient_id}")
                features=np.load(patient_id+"/fnl34_bn_features_layer_7.npy")
            array_list.append([features[extract_number_image(image_path)]])
            previous_patient_id=patient_id
        #Concatenate all features for one artificial patient    
        artificial_features = np.concatenate(array_list,axis=0)
        output_npy_file = output_folder+f"/{aml_subtype}/patient_{patient_number+1}/fnl34_bn_features_layer_7.npy"
        # Save the array to the .npy file
        np.save(output_npy_file, artificial_features)

['../../data/CBFB_MYH11/AQK/image_105.tif', '../../data/CBFB_MYH11/AQK/image_156.tif', '../../data/CBFB_MYH11/AQK/image_187.tif', '../../data/CBFB_MYH11/AQK/image_19.tif', '../../data/CBFB_MYH11/AQK/image_20.tif', '../../data/CBFB_MYH11/AQK/image_216.tif', '../../data/CBFB_MYH11/AQK/image_244.tif', '../../data/CBFB_MYH11/AQK/image_253.tif', '../../data/CBFB_MYH11/AQK/image_26.tif', '../../data/CBFB_MYH11/AQK/image_313.tif', '../../data/CBFB_MYH11/AQK/image_323.tif', '../../data/CBFB_MYH11/AQK/image_324.tif', '../../data/CBFB_MYH11/AQK/image_34.tif', '../../data/CBFB_MYH11/AQK/image_5.tif', '../../data/CBFB_MYH11/AQK/image_59.tif', '../../data/CBFB_MYH11/AQK/image_68.tif', '../../data/CBFB_MYH11/AQK/image_93.tif', '../../data/CBFB_MYH11/BJK/image_11.tif', '../../data/CBFB_MYH11/BJK/image_128.tif', '../../data/CBFB_MYH11/BJK/image_146.tif', '../../data/CBFB_MYH11/BJK/image_158.tif', '../../data/CBFB_MYH11/BJK/image_160.tif', '../../data/CBFB_MYH11/BJK/image_177.tif', '../../data/CBFB_MYH

In [78]:
#Save metadata file
columns = ['patient_id', 'sex_1f_2m', 'age', 'bag_label', 'instance_count',
       'leucocytes_per_µl', 'pb_myeloblast', 'pb_promyelocyte',
       'pb_myelocyte', 'pb_metamyelocyte', 'pb_neutrophil_band',
       'pb_neutrophil_segmented', 'pb_eosinophil', 'pb_basophil',
       'pb_monocyte', 'pb_lymph_typ', 'pb_lymph_atyp_react',
       'pb_lymph_atyp_neopl', 'pb_other', 'pb_total']
artifcialmetadata = pd.DataFrame(columns=columns)
artifcialmetadata['patient_id'] = [f"patient{i%5 + 1}" for i in range(n_patients*n_classes)]
artifcialmetadata['bag_label'] = patient_classes
artifcialmetadata['instance_count'] = number_images
artifcialmetadata.to_csv(os.path.dirname(output_folder)+'/metadata.csv', index=False)

In [82]:
path="../../artificialdata/experiment_3/data/CBFB_MYH11/patient_5/image_file_paths"
with open(path, "rb") as fp:
    file_paths=pickle.load(fp)
print(len(file_paths))
path_to_features="../../artificialdata/experiment_3/data/CBFB_MYH11/patient_5/fnl34_bn_features_layer_7.npy"
features=np.load(path_to_features)
print(features.shape)

486
(486, 512, 5, 5)
