In [85]:
import numpy as np
import glob
import os
import random
import re
from collections import Counter
import pandas as pd
import pickle

class_labels = ["CBFB_MYH11","control","NPM1","PML_RARA","RUNX1_RUNX1T1"]
n_classes = len(class_labels)

#Function to get list of image_paths in one folder
def get_image_path_list(folder_path):
    tif_files = glob.glob(f"{folder_path}/*.tif")
    return tif_files


#exracts the number of image in the file_path e.g. "image_123.tif"
def extract_number_image(file_path):
    # Use a regular expression to find the number after "image_" and before ".tif"
    match = re.search(r'image_(\d+).tif', file_path)
    return int(match.group(1))

def get_patient_name(path):
    return re.search(r"/data/\w+/([A-Z]{3})", path).group(1)

def get_class_name(path):
    return re.search(r"/data/(\w+)", path).group(1)

def get_classification_patient(patient_folder):
    probs_path = patient_folder + '/single_cell_probabilities.npy'
    sc_probs = np.load(probs_path)
    sc_class= np.argmax(sc_probs, axis=1)
    return sc_class


In [86]:
data_directory = '../../data'
subtype=data_directory+"/NPM1"
df = pd.DataFrame(columns=["patient","AML_subtype", "SC_Label", "image_path"])
# Save class classification count for each patient in csv file
for folder_class in os.listdir(data_directory):
    folder_class = os.path.join(data_directory, folder_class)
    
    if os.path.isdir(folder_class):
       #print(folder_class)
       AML_subtype=get_class_name(folder_class)
       for folder_patient in os.listdir(folder_class):
            folder_patient = os.path.join(folder_class, folder_patient)
            
            if os.path.isdir(folder_patient):
                images=get_image_path_list(folder_patient)
                sc_classes=get_classification_patient(folder_patient)
                #print(sc_classes)
                for image in images:
                    number=extract_number_image(image)
                    df.loc[len(df)]=[get_patient_name(folder_patient), AML_subtype, sc_classes[number],image]

In [87]:
aml_subtype="NPM1"
sc_class_labels= ['eosinophil granulocyte', 'reactive lymphocyte',
       'neutrophil granulocyte (segmented)', 'typical lymphocyte',
       'other', 'neutrophil granulocyte (band)', 'monocyte',
       'large granulated lymphocyte', 'atypical promyelocyte',
       'basophil granulocyte', 'smudge cell', 'neoplastic lymphocyte',
       'promyelocyte', 'myelocyte', 'myeloblast', 'metamyelocyte',
       'normo', 'plasma cell', 'hair cell', 'bilobed M3v',
       'mononucleosis']
#df_meanstd=pd.read_csv(data_directory+"/mean_std_sc_class.csv")
df_sc_res=pd.read_csv(data_directory+"/single_cell_results.csv",index_col=0).drop("patient", axis=1)
df_meanstd = df_sc_res.groupby(["AML_subtype"]).agg(["mean","std"])
# Sample data with normal distribution

In [88]:
# Shuffle into n patients with n images
n_patients=5
experiment_name = "experiment_3"

In [97]:
# Define a dictionary to store cell type counts
cell_type_counts_dict = {}

# Define a dictionary to store the selected images count per SC class per patient
selected_images_per_patient = {}

# Iterate over all AML subtypes
for aml_subtype in class_labels:
    output_class_folder = output_folder + "/" + aml_subtype
    # Get distribution
    class_means = df_meanstd.loc[aml_subtype, :].loc[:, "mean"].values
    class_variances = df_meanstd.loc[aml_subtype, :].loc[:, "std"].values
    for patient_number in range(n_patients):
        print(f"Generating data for patient {patient_number+1} of subtype {aml_subtype}...")
        # Calculate how many images from each SC class
        generated_data = np.random.normal(class_means, class_variances, 21).astype(int)
        generated_data = generated_data * (generated_data > 0)
        image_file_paths = []
        # Randomly choose SC image and concatenate them into image_file_paths
        selected_images_count = {}
        for cell_type_number, cell_type in enumerate(sc_class_labels):
            df_cell_type = df[df["SC_Label"] == cell_type_number]
            # print(f"\tImages for cell type {cell_type}...")
            file_path = df_cell_type["image_path"].values
            image_paths = np.random.choice(file_path, size=generated_data[cell_type_number]).tolist()
            print(f"\t\tSelected {len(image_paths)} images for {cell_type}")
            image_file_paths += image_paths
            # Store selected images count per SC class
            selected_images_count[cell_type] = len(image_paths)
        # Store selected images count per SC class per patient
        patient_id = f"{aml_subtype}/patient_{patient_number+1}"
        selected_images_per_patient[patient_id] = selected_images_count
        # Store patient classes and number images for metadata
        new_patient_folder = os.path.join(output_class_folder, f'patient_{patient_number+1}')
        os.makedirs(new_patient_folder, exist_ok=True)

        # Sort the shuffled image paths for the current patient
        image_file_paths.sort()
        # Save the shuffled image paths into a text file
        txt_file_path = os.path.join(new_patient_folder, 'images.txt')
        with open(txt_file_path, 'w') as txt_file:
            for image_path in image_file_paths:
                txt_file.write(image_path + '\n')

        with open(os.path.join(new_patient_folder, "image_file_paths"), 'wb') as fp:
            pickle.dump(image_file_paths, fp)

        # Count cell types in the current patient
        cell_type_count = {cell_type: image_file_paths.count(cell_type) for cell_type in set(image_file_paths)}
        print(f"\tCell type count for patient {patient_number+1}: {cell_type_count}")

        # Add the cell type count to the dictionary with the patient as the key
        # and the cell type count as the value
        cell_type_counts_dict[(aml_subtype, patient_id)] = cell_type_count
        print(f"\tCell type counts dictionary for patient {patient_id}: {cell_type_count}")

# Print selected images count per SC class per patient
print("\nSelected Images Count per SC class per Patient:")
for patient_id, sc_counts in selected_images_per_patient.items():
    print(patient_id)
    for sc_class, count in sc_counts.items():
        print(f"\t{sc_class}: {count}")


Generating data for patient 1 of subtype CBFB_MYH11...
		Selected 0 images for eosinophil granulocyte
		Selected 8 images for reactive lymphocyte
		Selected 33 images for neutrophil granulocyte (segmented)
		Selected 252 images for typical lymphocyte
		Selected 22 images for other
		Selected 1 images for neutrophil granulocyte (band)
		Selected 116 images for monocyte
		Selected 13 images for large granulated lymphocyte
		Selected 8 images for atypical promyelocyte
		Selected 0 images for basophil granulocyte
		Selected 3 images for smudge cell
		Selected 1 images for neoplastic lymphocyte
		Selected 2 images for promyelocyte
		Selected 0 images for myelocyte
		Selected 122 images for myeloblast
		Selected 2 images for metamyelocyte
		Selected 0 images for normo
		Selected 7 images for plasma cell
		Selected 2 images for hair cell
		Selected 3 images for bilobed M3v
		Selected 3 images for mononucleosis
	Cell type count for patient 1: {'../../data/control/BXR/image_58.tif': 1, '../../d

In [98]:
# Print selected images count per SC class per patient
print("\nSelected Images Count per SC class per Patient:")
for patient_id, sc_counts in selected_images_per_patient.items():
    print(patient_id)
    for sc_class, count in sc_counts.items():
        print(f"\t{sc_class}: {count}")




Selected Images Count per SC class per Patient:
CBFB_MYH11/patient_1
	eosinophil granulocyte: 0
	reactive lymphocyte: 8
	neutrophil granulocyte (segmented): 33
	typical lymphocyte: 252
	other: 22
	neutrophil granulocyte (band): 1
	monocyte: 116
	large granulated lymphocyte: 13
	atypical promyelocyte: 8
	basophil granulocyte: 0
	smudge cell: 3
	neoplastic lymphocyte: 1
	promyelocyte: 2
	myelocyte: 0
	myeloblast: 122
	metamyelocyte: 2
	normo: 0
	plasma cell: 7
	hair cell: 2
	bilobed M3v: 3
	mononucleosis: 3
CBFB_MYH11/patient_2
	eosinophil granulocyte: 0
	reactive lymphocyte: 4
	neutrophil granulocyte (segmented): 17
	typical lymphocyte: 139
	other: 14
	neutrophil granulocyte (band): 7
	monocyte: 96
	large granulated lymphocyte: 45
	atypical promyelocyte: 16
	basophil granulocyte: 0
	smudge cell: 1
	neoplastic lymphocyte: 3
	promyelocyte: 0
	myelocyte: 0
	myeloblast: 127
	metamyelocyte: 8
	normo: 0
	plasma cell: 0
	hair cell: 1
	bilobed M3v: 2
	mononucleosis: 15
CBFB_MYH11/patient_3
	eo

In [99]:
selected_images_per_patient

{'CBFB_MYH11/patient_1': {'eosinophil granulocyte': 0,
  'reactive lymphocyte': 8,
  'neutrophil granulocyte (segmented)': 33,
  'typical lymphocyte': 252,
  'other': 22,
  'neutrophil granulocyte (band)': 1,
  'monocyte': 116,
  'large granulated lymphocyte': 13,
  'atypical promyelocyte': 8,
  'basophil granulocyte': 0,
  'smudge cell': 3,
  'neoplastic lymphocyte': 1,
  'promyelocyte': 2,
  'myelocyte': 0,
  'myeloblast': 122,
  'metamyelocyte': 2,
  'normo': 0,
  'plasma cell': 7,
  'hair cell': 2,
  'bilobed M3v': 3,
  'mononucleosis': 3},
 'CBFB_MYH11/patient_2': {'eosinophil granulocyte': 0,
  'reactive lymphocyte': 4,
  'neutrophil granulocyte (segmented)': 17,
  'typical lymphocyte': 139,
  'other': 14,
  'neutrophil granulocyte (band)': 7,
  'monocyte': 96,
  'large granulated lymphocyte': 45,
  'atypical promyelocyte': 16,
  'basophil granulocyte': 0,
  'smudge cell': 1,
  'neoplastic lymphocyte': 3,
  'promyelocyte': 0,
  'myelocyte': 0,
  'myeloblast': 127,
  'metamyelocyt

In [100]:
for aml_subtype in class_labels:
    output_class_folder=output_folder+"/"+aml_subtype    
    for patient_number in range(n_patients):
        patient_folder = os.path.join(output_class_folder, f'patient_{patient_number+1}')
        with open( os.path.join(patient_folder,"image_file_paths") ,'rb') as fp:
            image_file_paths=pickle.load(fp)
        array_list=[]
        previous_patient_id=None
        # Iterate through each image path
        for image_path in image_file_paths:
            patient_id = image_path[:image_path.find("/image")]
            if previous_patient_id!=patient_id:
                #print(f"New patient: {patient_id}, old patient : {previous_patient_id}")
                features=np.load(patient_id+"/fnl34_bn_features_layer_7.npy")
            array_list.append([features[extract_number_image(image_path)]])
            previous_patient_id=patient_id 
        #Concatenate all features for one artificial patient    
        artificial_features = np.concatenate(array_list,axis=0)
        output_npy_file = output_folder+f"/{aml_subtype}/patient_{patient_number+1}/fnl34_bn_features_layer_7.npy"
        # Save the array to the .npy file
        np.save(output_npy_file, artificial_features)

In [103]:
# Create a list to hold the rows of the DataFrame
rows = []

# Fill in the DataFrame with values from the dictionary
for patient_id, cell_counts in selected_images_per_patient.items():
    myeloblast_count = cell_counts.get('myeloblast', 0)
    # Extract counts for other cell types
    promyelocyte_count = cell_counts.get('promyelocyte', 0)
    myelocyte_count = cell_counts.get('myelocyte', 0)
    metamyelocyte_count = cell_counts.get('metamyelocyte', 0)
    neutrophil_band_count = cell_counts.get('neutrophil_band', 0)
    neutrophil_segmented_count = cell_counts.get('neutrophil_segmented', 0)
    eosinophil_count = cell_counts.get('eosinophil', 0)
    basophil_count = cell_counts.get('basophil', 0)
    monocyte_count = cell_counts.get('monocyte', 0)
    lymph_typ_count = cell_counts.get('lymph_typ', 0)
    lymph_atyp_react_count = cell_counts.get('lymph_atyp_react', 0)
    lymph_atyp_neopl_count = cell_counts.get('lymph_atyp_neopl', 0)
    other_count = cell_counts.get('other', 0)
    total_count = sum(cell_counts.values())

  
    index = patient_id.find("/patient")
    bag = patient_id[:index] if index != -1 else patient_id

    id = patient_id.find("/")
    patient = patient_id[id + 1:] if index != -1 else ""

    age = 0 #dummy
    row = {'patient_id': patient,
           'sex_1f_2m': None,  # Placeholder value
           'age': age,
           'bag_label': bag,  # Placeholder value
           'instance_count': total_count,
           'leucocytes_per_µl': None,  # Placeholder value
           'pb_myeloblast': myeloblast_count,
           'pb_promyelocyte': promyelocyte_count,
           'pb_myelocyte': myelocyte_count,
           'pb_metamyelocyte': metamyelocyte_count,
           'pb_neutrophil_band': neutrophil_band_count,
           'pb_neutrophil_segmented': neutrophil_segmented_count,
           'pb_eosinophil': eosinophil_count,
           'pb_basophil': basophil_count,
           'pb_monocyte': monocyte_count,
           'pb_lymph_typ': lymph_typ_count,
           'pb_lymph_atyp_react': lymph_atyp_react_count,
           'pb_lymph_atyp_neopl': lymph_atyp_neopl_count,
           'pb_other': other_count,
           'pb_total': total_count}
    rows.append(row)

# Create DataFrame from the list of rows
artificial_metadata = pd.DataFrame(rows)

# Save the DataFrame to a CSV file
metadata_csv_path = os.path.join(output_folder, 'metadata.csv')
artificial_metadata.to_csv(metadata_csv_path, index=False)
print(f"Metadata saved to {metadata_csv_path}")


Metadata saved to ../../artificialdata/experiment_3/data/metadata.csv


In [104]:
artificial_metadata


Unnamed: 0,patient_id,sex_1f_2m,age,bag_label,instance_count,leucocytes_per_µl,pb_myeloblast,pb_promyelocyte,pb_myelocyte,pb_metamyelocyte,pb_neutrophil_band,pb_neutrophil_segmented,pb_eosinophil,pb_basophil,pb_monocyte,pb_lymph_typ,pb_lymph_atyp_react,pb_lymph_atyp_neopl,pb_other,pb_total
0,patient_1,,0,CBFB_MYH11,598,,122,2,0,2,0,0,0,0,116,0,0,0,22,598
1,patient_2,,0,CBFB_MYH11,495,,127,0,0,8,0,0,0,0,96,0,0,0,14,495
2,patient_3,,0,CBFB_MYH11,545,,87,0,0,8,0,0,0,0,99,0,0,0,18,545
3,patient_4,,0,CBFB_MYH11,571,,97,2,1,5,0,0,0,0,225,0,0,0,39,571
4,patient_5,,0,CBFB_MYH11,625,,97,0,0,0,0,0,0,0,178,0,0,0,16,625
5,patient_1,,0,control,371,,1,0,0,0,0,0,0,0,0,0,0,0,101,371
6,patient_2,,0,control,253,,0,0,1,0,0,0,0,0,29,0,0,0,53,253
7,patient_3,,0,control,401,,1,0,0,0,0,0,0,0,38,0,0,0,63,401
8,patient_4,,0,control,272,,0,0,0,0,0,0,0,0,21,0,0,0,31,272
9,patient_5,,0,control,168,,0,0,0,1,0,0,0,0,19,0,0,0,38,168
