In [73]:
import numpy as np
import glob
import os
import random
import re
from collections import Counter
import pandas as pd
import pickle

class_labels = ["CBFB_MYH11","control","NPM1","PML_RARA","RUNX1_RUNX1T1"]
n_classes = len(class_labels)

#Function to get list of image_paths in one folder
def get_image_path_list(folder_path):
    tif_files = glob.glob(f"{folder_path}/*.tif")
    return tif_files


#exracts the number of image in the file_path e.g. "image_123.tif"
def extract_number_image(file_path):
    # Use a regular expression to find the number after "image_" and before ".tif"
    match = re.search(r'image_(\d+).tif', file_path)
    return int(match.group(1))

def get_patient_name(path):
    return re.search(r"/data/\w+/([A-Z]{3})", path).group(1)

def get_class_name(path):
    return re.search(r"/data/(\w+)", path).group(1)

def get_classification_patient(patient_folder):
    probs_path = patient_folder + '/single_cell_probabilities.npy'
    sc_probs = np.load(probs_path)
    sc_class= np.argmax(sc_probs, axis=1)
    return sc_class

data_directory = '../../data'
subtype=data_directory+"/NPM1"
n_patients=5
experiment_name = "experiment_3"
output_folder = '../../artificialdata/'+experiment_name+'/data'
output_folder_csv='../../artificialdata/'+experiment_name


In [74]:
#Iterate over real dataset and store image paths in a dataframe df
df = pd.DataFrame(columns=["patient","AML_subtype", "SC_Label", "image_path"])
for folder_class in os.listdir(data_directory):
    folder_class = os.path.join(data_directory, folder_class)    
    if os.path.isdir(folder_class):
       #print(folder_class)
       AML_subtype=get_class_name(folder_class)
       for folder_patient in os.listdir(folder_class):
            folder_patient = os.path.join(folder_class, folder_patient)
            if os.path.isdir(folder_patient):
                images=get_image_path_list(folder_patient)
                sc_classes=get_classification_patient(folder_patient)
                #print(sc_classes)
                for image in images:
                    number=extract_number_image(image)
                    df.loc[len(df)]=[get_patient_name(folder_patient), AML_subtype, sc_classes[number],image]

In [75]:
df

Unnamed: 0,patient,AML_subtype,SC_Label,image_path
0,CWF,PML_RARA,8,../../data/PML_RARA/CWF/image_377.tif
1,CWF,PML_RARA,14,../../data/PML_RARA/CWF/image_250.tif
2,CWF,PML_RARA,10,../../data/PML_RARA/CWF/image_476.tif
3,CWF,PML_RARA,14,../../data/PML_RARA/CWF/image_244.tif
4,CWF,PML_RARA,10,../../data/PML_RARA/CWF/image_182.tif
...,...,...,...,...
11117,AVL,control,2,../../data/control/AVL/image_9.tif
11118,AVL,control,2,../../data/control/AVL/image_329.tif
11119,AVL,control,4,../../data/control/AVL/image_52.tif
11120,AVL,control,3,../../data/control/AVL/image_305.tif


In [76]:
#calculate mean and std for each cell type that will be later used to sample data with normal distribution
sc_class_labels= ['eosinophil granulocyte', 'reactive lymphocyte',
       'neutrophil granulocyte (segmented)', 'typical lymphocyte',
       'other', 'neutrophil granulocyte (band)', 'monocyte',
       'large granulated lymphocyte', 'atypical promyelocyte',
       'basophil granulocyte', 'smudge cell', 'neoplastic lymphocyte',
       'promyelocyte', 'myelocyte', 'myeloblast', 'metamyelocyte',
       'normo', 'plasma cell', 'hair cell', 'bilobed M3v',
       'mononucleosis']
df_sc_res=pd.read_csv(data_directory+"/single_cell_results.csv",index_col=0).drop("patient", axis=1)
df_meanstd = df_sc_res.groupby(["AML_subtype"]).agg(["mean","std"])

In [77]:
df_meanstd

Unnamed: 0_level_0,eosinophil granulocyte,eosinophil granulocyte,reactive lymphocyte,reactive lymphocyte,neutrophil granulocyte (segmented),neutrophil granulocyte (segmented),typical lymphocyte,typical lymphocyte,other,other,...,normo,normo,plasma cell,plasma cell,hair cell,hair cell,bilobed M3v,bilobed M3v,mononucleosis,mononucleosis
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
AML_subtype,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
CBFB_MYH11,1.2,0.83666,7.0,7.141428,16.8,16.284962,105.6,79.651114,21.2,10.009995,...,1.8,2.48998,5.2,4.549725,1.2,0.83666,0.8,1.30384,11.8,16.177144
NPM1,4.6,5.412947,4.6,6.542171,32.2,56.40656,77.0,64.074176,49.4,34.158454,...,11.6,16.009372,7.8,6.180615,7.4,9.864076,2.8,2.588436,12.6,17.826946
PML_RARA,1.4,1.516575,2.8,3.34664,12.6,11.148991,39.8,56.45972,88.4,36.637413,...,9.2,16.709279,4.4,6.426508,2.4,3.361547,16.0,28.956864,2.8,3.834058
RUNX1_RUNX1T1,0.8,1.788854,1.0,1.414214,10.4,10.899541,75.8,79.67873,25.6,16.379866,...,4.0,3.535534,14.4,10.922454,3.2,3.114482,12.6,12.739702,5.0,4.795832
control,11.6,5.549775,10.0,9.0,179.8,70.489006,50.2,17.412639,66.4,47.395147,...,0.8,1.788854,1.4,2.19089,0.8,0.447214,0.0,0.0,3.8,3.898718


In [78]:
#This cell creates artificial patients and stores the single cell couunts per patient in cell_type_counts_dict, also it counts the selected_images_per_patient as a sanity check
# Dictionary stores cell type counts (how often each cell type appears)
cell_type_counts_dict = {}

# Dictionary stores the selected images and counts them per SC class per patient
selected_images_per_patient = {}

# Iterate over all AML subtypes
for aml_subtype in class_labels:
    output_class_folder = output_folder + "/" + aml_subtype
    # Get distribution
    class_means = df_meanstd.loc[aml_subtype, :].loc[:, "mean"].values
    class_variances = df_meanstd.loc[aml_subtype, :].loc[:, "std"].values
    for patient_number in range(n_patients):
        print(f"Generating data for patient {patient_number+1} of subtype {aml_subtype}...")
        # Calculate how many images from each SC class
        generated_data = np.random.normal(class_means, class_variances, 21).astype(int)
        generated_data = generated_data * (generated_data > 0)
        image_file_paths = []
        # Randomly choose SC image and concatenate them into image_file_paths
        selected_images_count = {}
        for cell_type_number, cell_type in enumerate(sc_class_labels):
            df_cell_type = df[df["SC_Label"] == cell_type_number]
            # print(f"\tImages for cell type {cell_type}...")
            file_path = df_cell_type["image_path"].values
            image_paths = np.random.choice(file_path, size=generated_data[cell_type_number]).tolist()
            print(f"\t\tSelected {len(image_paths)} images for {cell_type}")
            image_file_paths += image_paths
            # Store selected images count per SC class
            selected_images_count[cell_type] = len(image_paths)
        # Store selected images count per SC class per patient
        patient_id = f"{aml_subtype}/patient_{patient_number+1}"
        selected_images_per_patient[patient_id] = selected_images_count
        # Store patient classes and number images for metadata
        new_patient_folder = os.path.join(output_class_folder, f'patient_{patient_number+1}')
        os.makedirs(new_patient_folder, exist_ok=True)

        # Sort the shuffled image paths for the current patient
        image_file_paths.sort()
        # Save the shuffled image paths into a text file
        txt_file_path = os.path.join(new_patient_folder, 'images.txt')
        with open(txt_file_path, 'w') as txt_file:
            for image_path in image_file_paths:
                txt_file.write(image_path + '\n')

        with open(os.path.join(new_patient_folder, "image_file_paths"), 'wb') as fp:
            pickle.dump(image_file_paths, fp)

        # Count cell types in the current patient
        cell_type_count = {cell_type: image_file_paths.count(cell_type) for cell_type in set(image_file_paths)}
        print(f"\tCell type count for patient {patient_number+1}: {cell_type_count}")

        # Add the cell type count to the dictionary with the patient as the key
        # and the cell type count as the value
        cell_type_counts_dict[(aml_subtype, patient_id)] = cell_type_count
        print(f"\tCell type counts dictionary for patient {patient_id}: {cell_type_count}")

# Print selected images count per SC class per patient
print("\nSelected Images Count per SC class per Patient:")
for patient_id, sc_counts in selected_images_per_patient.items():
    print(patient_id)
    for sc_class, count in sc_counts.items():
        print(f"\t{sc_class}: {count}")


Generating data for patient 1 of subtype CBFB_MYH11...
		Selected 1 images for eosinophil granulocyte
		Selected 8 images for reactive lymphocyte
		Selected 22 images for neutrophil granulocyte (segmented)
		Selected 113 images for typical lymphocyte
		Selected 0 images for other
		Selected 4 images for neutrophil granulocyte (band)
		Selected 35 images for monocyte
		Selected 33 images for large granulated lymphocyte
		Selected 8 images for atypical promyelocyte
		Selected 0 images for basophil granulocyte
		Selected 2 images for smudge cell
		Selected 0 images for neoplastic lymphocyte
		Selected 0 images for promyelocyte
		Selected 0 images for myelocyte
		Selected 36 images for myeloblast
		Selected 0 images for metamyelocyte
		Selected 0 images for normo
		Selected 10 images for plasma cell
		Selected 1 images for hair cell
		Selected 1 images for bilobed M3v
		Selected 46 images for mononucleosis
	Cell type count for patient 1: {'../../data/NPM1/BHR/image_11.tif': 1, '../../data/

In [79]:
'''# Print selected images count per SC class per patient
print("\nSelected Images Count per SC class per Patient:")
for patient_id, sc_counts in selected_images_per_patient.items():
    #print(patient_id)
    for sc_class, count in sc_counts.items():
        print(f"\t{sc_class}: {count}")'''
selected_images_per_patient
cell_type_counts_dict

{('CBFB_MYH11',
  'CBFB_MYH11/patient_1'): {'../../data/NPM1/BHR/image_11.tif': 1, '../../data/control/BXR/image_148.tif': 1, '../../data/PML_RARA/BHS/image_26.tif': 1, '../../data/RUNX1_RUNX1T1/BKR/image_325.tif': 1, '../../data/control/AEC/image_295.tif': 1, '../../data/CBFB_MYH11/DQV/image_384.tif': 1, '../../data/CBFB_MYH11/BSN/image_230.tif': 1, '../../data/CBFB_MYH11/AQK/image_16.tif': 1, '../../data/PML_RARA/BIK/image_68.tif': 1, '../../data/CBFB_MYH11/DQV/image_473.tif': 1, '../../data/PML_RARA/ALE/image_221.tif': 1, '../../data/CBFB_MYH11/BJK/image_481.tif': 1, '../../data/control/AEC/image_368.tif': 1, '../../data/NPM1/CZI/image_276.tif': 1, '../../data/RUNX1_RUNX1T1/BKR/image_210.tif': 1, '../../data/CBFB_MYH11/BJK/image_364.tif': 1, '../../data/RUNX1_RUNX1T1/BKR/image_38.tif': 1, '../../data/CBFB_MYH11/AQK/image_374.tif': 1, '../../data/CBFB_MYH11/DQV/image_162.tif': 2, '../../data/CBFB_MYH11/AQK/image_217.tif': 1, '../../data/control/BXR/image_286.tif': 1, '../../data/CBFB

In [80]:
#saving images in npy files
for aml_subtype in class_labels:
    output_class_folder=output_folder+"/"+aml_subtype    
    for patient_number in range(n_patients):
        patient_folder = os.path.join(output_class_folder, f'patient_{patient_number+1}')
        with open( os.path.join(patient_folder,"image_file_paths") ,'rb') as fp:
            image_file_paths=pickle.load(fp)
        array_list=[]
        previous_patient_id=None
        # Iterate through each image path
        for image_path in image_file_paths:
            patient_id = image_path[:image_path.find("/image")]
            if previous_patient_id!=patient_id:
                #print(f"New patient: {patient_id}, old patient : {previous_patient_id}")
                features=np.load(patient_id+"/fnl34_bn_features_layer_7.npy")
            array_list.append([features[extract_number_image(image_path)]])
            previous_patient_id=patient_id 
        #Concatenate all features for one artificial patient    
        artificial_features = np.concatenate(array_list,axis=0)
        output_npy_file = output_folder+f"/{aml_subtype}/patient_{patient_number+1}/fnl34_bn_features_layer_7.npy"
        # Save the array to the .npy file
        np.save(output_npy_file, artificial_features)

In [81]:
#Create metadata including single cell types, not including age, gender and leucocytes_per_µl
# Create a list to hold the rows of the DataFrame
rows = []
'''sc_class_labels= ['eosinophil granulocyte', 'reactive lymphocyte',
       'neutrophil granulocyte (segmented)', 'typical lymphocyte',
       'other', 'neutrophil granulocyte (band)', 'monocyte',
       'large granulated lymphocyte', 'atypical promyelocyte',
       'basophil granulocyte', 'smudge cell', 'neoplastic lymphocyte',
       'promyelocyte', 'myelocyte', 'myeloblast', 'metamyelocyte',
       'normo', 'plasma cell', 'hair cell', 'bilobed M3v',
       'mononucleosis']'''

# Fill in the DataFrame with values from the dictionary
for patient_id, cell_counts in selected_images_per_patient.items():
    myeloblast_count = cell_counts.get('myeloblast', 0)
    # Extract counts for other cell types
    promyelocyte_count = cell_counts.get('promyelocyte', 0)
    myelocyte_count = cell_counts.get('myelocyte', 0)
    metamyelocyte_count = cell_counts.get('metamyelocyte', 0)
    neutrophil_band_count = cell_counts.get('neutrophil granulocyte (band)', 0)
    neutrophil_segmented_count = cell_counts.get('neutrophil granulocyte (segmented)', 0)
    eosinophil_count = cell_counts.get('eosinophil granulocyte', 0)
    basophil_count = cell_counts.get('basophil granulocyte', 0)
    monocyte_count = cell_counts.get('monocyte', 0)
    lymph_typ_count = cell_counts.get('typical lymphocyte', 0)
    lymph_atyp_react_count = cell_counts.get('reactive lymphocyte', 0)
    lymph_atyp_neopl_count = cell_counts.get('neoplastic lymphocyte', 0)
    other_count = cell_counts.get('other', 0)
    total_count = sum(cell_counts.values())#check

  
    index = patient_id.find("/patient")
    bag = patient_id[:index] if index != -1 else patient_id

    id = patient_id.find("/")
    patient = patient_id[id + 1:] if index != -1 else ""

    age = 0 #dummy
    row = {'patient_id': patient,
           'sex_1f_2m': None,  # Placeholder value
           'age': age,
           'bag_label': bag, 
           'instance_count': total_count,#check
           'leucocytes_per_µl': None,  # Placeholder value
           'pb_myeloblast': myeloblast_count,
           'pb_promyelocyte': promyelocyte_count,
           'pb_myelocyte': myelocyte_count,
           'pb_metamyelocyte': metamyelocyte_count,
           'pb_neutrophil_band': neutrophil_band_count,
           'pb_neutrophil_segmented': neutrophil_segmented_count,
           'pb_eosinophil': eosinophil_count,
           'pb_basophil': basophil_count,
           'pb_monocyte': monocyte_count,
           'pb_lymph_typ': lymph_typ_count,
           'pb_lymph_atyp_react': lymph_atyp_react_count,
           'pb_lymph_atyp_neopl': lymph_atyp_neopl_count,
           'pb_other': other_count,
           'pb_total': total_count}#check
    rows.append(row)

# Create DataFrame from the list of rows
artificial_metadata = pd.DataFrame(rows)

# Save the DataFrame to a CSV file
metadata_csv_path = os.path.join(output_folder_csv, 'metadata.csv')
artificial_metadata.to_csv(metadata_csv_path, index=False)
print(f"Metadata saved to {metadata_csv_path}")


Metadata saved to ../../artificialdata/experiment_3/metadata.csv


In [82]:
artificial_metadata


Unnamed: 0,patient_id,sex_1f_2m,age,bag_label,instance_count,leucocytes_per_µl,pb_myeloblast,pb_promyelocyte,pb_myelocyte,pb_metamyelocyte,pb_neutrophil_band,pb_neutrophil_segmented,pb_eosinophil,pb_basophil,pb_monocyte,pb_lymph_typ,pb_lymph_atyp_react,pb_lymph_atyp_neopl,pb_other,pb_total
0,patient_1,,0,CBFB_MYH11,320,,36,0,0,0,4,22,1,0,35,113,8,0,0,320
1,patient_2,,0,CBFB_MYH11,414,,81,1,0,2,3,18,0,0,195,56,5,2,17,414
2,patient_3,,0,CBFB_MYH11,258,,66,2,2,5,19,32,0,0,74,0,16,2,12,258
3,patient_4,,0,CBFB_MYH11,441,,150,2,2,8,0,36,1,0,77,75,1,5,25,441
4,patient_5,,0,CBFB_MYH11,423,,178,1,0,6,4,25,2,0,140,0,2,0,19,423
5,patient_1,,0,control,296,,0,0,0,0,0,117,21,13,15,23,22,1,50,296
6,patient_2,,0,control,396,,2,0,0,0,20,175,10,2,34,42,0,3,102,396
7,patient_3,,0,control,401,,1,0,0,0,13,180,16,8,30,34,13,4,69,401
8,patient_4,,0,control,321,,0,0,0,0,14,126,16,0,17,59,20,5,42,321
9,patient_5,,0,control,246,,0,0,0,0,9,109,16,0,13,22,0,0,43,246
