In [164]:
import random
import os
import re
import shutil
import pandas as pd
import numpy as np

#copies patients of real data to mixed data and 
#returns dict that contains number of real patients per leukemia class
def copyfiles_real(src_folder, dest_folder):
    patients_per_folder = {}  # Dictionary to store the number of patients per folder
    for root, dirs, files in os.walk(src_folder):
        for file in files:
            if file == 'fnl34_bn_features_layer_7.npy':
                src_path = os.path.join(root, file)
                # Extract class and patient information from the source path
                class_name = get_class_name(src_path)
                patient_name = get_patient_name(src_path)
                # Update patients_per_folder dictionary
                folder_path = os.path.join(dest_folder, class_name)
                patients_per_folder[folder_path] = patients_per_folder.get(folder_path, 0) + 1
                # Create destination folders if they don't exist
                dest_path = os.path.join(dest_folder, class_name, patient_name)
                os.makedirs(dest_path, exist_ok=True)
                # Copy the file to the destination
                shutil.copy(src_path, os.path.join(dest_path, file))
    
    # Print the number of patients per folder
    #print(patients_per_folder)
    return patients_per_folder

def get_patient_name(path):
    return re.search(r"/data/\w+/([0-9a-zA-Z_]*)", path).group(1)

def get_class_name(path):
    return re.search(r"/data/(\w+)", path).group(1)

'''def copyfiles_art(src_folder, dest_folder, patient_counts):
    class_names = sorted(os.listdir(src_folder))
    for class_index, class_name in enumerate(class_names):
        if class_name not in patient_counts:
            break
        patients = os.listdir(os.path.join(src_folder, class_name))
        num_patients = min(patient_counts[class_name], len(patients))
        selected_patients = random.sample(patients, num_patients)
        for patient in selected_patients:
            patient_path = os.path.join(src_folder, class_name, patient)
            for file in os.listdir(patient_path):
                src_path = os.path.join(patient_path, file)
                dest_path = os.path.join(dest_folder, class_name, patient)
                os.makedirs(dest_path, exist_ok=True)
                shutil.copy(src_path, os.path.join(dest_path, file))'''
#copy artificial patients
#return names of these patients per class as tuple (class, patient)
def copyfiles_art(src_folder, dest_folder, patient_counts):
    selected_patients = []
    class_names = sorted(os.listdir(src_folder))
    for class_index, class_name in enumerate(class_names):
        if class_name not in patient_counts:
            break
        patients = os.listdir(os.path.join(src_folder, class_name))
        num_patients = min(patient_counts[class_name], len(patients))
        selected_patient_names = random.sample(patients, num_patients)
        for patient_name in selected_patient_names:
            selected_patients.append((patient_name, class_name))
            patient_path = os.path.join(src_folder, class_name, patient_name)
            for file in os.listdir(patient_path):
                src_path = os.path.join(patient_path, file)
                dest_path = os.path.join(dest_folder, class_name, patient_name)
                os.makedirs(dest_path, exist_ok=True)
                shutil.copy(src_path, os.path.join(dest_path, file))
    #print(selected_patients)
    return selected_patients

In [165]:
#This cell copies feature files from real data and artificialdata (based on exp 3) to a new folder called mixeddata
#Specify your source and destination folders
src_data_folder = '../../data'
src_artificialdata_folder = '../../artificialdata/experiment_3/data'
mixeddata_folder = '../../mixeddata'
percentage=[10,20,30,40,50]
for p in percentage:
    folder_name = f"{p}_percent"
    int_mixeddata_folder = os.path.join(mixeddata_folder, folder_name)
    dest_mixeddata_folder = os.path.join(int_mixeddata_folder, 'data')
    
    os.makedirs(dest_mixeddata_folder, exist_ok=True)
    art_percentage=p
    #Copy files from data folder
    patients_per_folder=copyfiles_real(src_data_folder, dest_mixeddata_folder)

    num_art_patient = {}
    for key, value in patients_per_folder.items():
        folder_name = os.path.basename(key)
        num_art_patient[folder_name] = round((value * art_percentage)/(100-art_percentage))

    #Copy files from artificialdata folder
    selected_patients=copyfiles_art(src_artificialdata_folder, dest_mixeddata_folder, num_art_patient)
    
    # Create DataFrame from selected_patients with all specified columns
    
    # This cell creates metadata.csv 
    df1 = pd.read_csv(os.path.dirname(src_data_folder)+"/metadata.csv")
    df2 = pd.read_csv(os.path.dirname(src_artificialdata_folder)+"/metadata.csv")

    #TODO: remove patients that were not selected

    selected_patients_df = pd.DataFrame(selected_patients, columns=['patient_id', 'bag_label'])
    filtered_df2 = df2.merge(selected_patients_df, on=['patient_id', 'bag_label'], how='inner')    

    # Concatenate the dataframes
  
    #df1_selected = df1[['patient_id', 'bag_label']]
    #filtered_df2_selected = filtered_df2[['patient_id', 'bag_label']]

    # Concatenate the selected dataframes
    conc_df = pd.concat([df1, filtered_df2], ignore_index=True)        
    #conc_df = pd.concat([df1, filtered_df2], ignore_index=True)
    
    # Write the concatenated dataframe to a new CSV file
    conc_df.to_csv(os.path.dirname(dest_mixeddata_folder)+'/metadata.csv', index=False)
    
    
    # Creating output folder
    output_folder_path = os.path.join(int_mixeddata_folder, 'output')
    os.makedirs(output_folder_path, exist_ok=True)

    # Creating datafolder_1 folder within output folder
    datafolder_1_path = os.path.join(output_folder_path, 'result_folder_1')
    os.makedirs(datafolder_1_path, exist_ok=True)


In [166]:
conc_df

Unnamed: 0,patient_id,sex_1f_2m,age,bag_label,instance_count,leucocytes_per_¬µl,pb_myeloblast,pb_promyelocyte,pb_myelocyte,pb_metamyelocyte,...,pb_neutrophil_segmented,pb_eosinophil,pb_basophil,pb_monocyte,pb_lymph_typ,pb_lymph_atyp_react,pb_lymph_atyp_neopl,pb_other,pb_total,leucocytes_per_µl
0,ABF,2.0,444.0,RUNX1_RUNX1T1,497,4441.0,87.0,0.0,1.0,0.0,...,2.0,0.0,0.0,2.0,8.0,0.0,0.0,0.0,100.0,
1,AEC,2.0,35.0,control,488,,0.0,0.0,0.0,0.0,...,77.0,2.0,1.0,4.0,16.0,0.0,0.0,0.0,100.0,
2,AIH,2.0,496.0,control,196,,0.0,0.0,0.0,0.0,...,63.0,2.0,1.0,4.0,30.0,0.0,0.0,0.0,100.0,
3,ALA,2.0,751.0,NPM1,494,138.0,85.0,0.0,1.0,0.0,...,4.0,0.0,0.0,3.0,6.0,0.0,0.0,0.0,100.0,
4,ALE,1.0,438.0,PML_RARA,487,85.0,96.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,100.0,
5,AQK,1.0,463.0,CBFB_MYH11,420,,23.0,0.0,0.0,0.0,...,7.0,0.0,0.0,14.0,56.0,0.0,,0.0,100.0,
6,ATM,1.0,301.0,control,347,,0.0,0.0,0.0,0.0,...,69.0,3.0,2.0,4.0,22.0,0.0,0.0,0.0,100.0,
7,AVL,2.0,371.0,control,388,,0.0,0.0,0.0,0.0,...,66.0,2.0,1.0,4.0,27.0,0.0,0.0,0.0,100.0,
8,BHG,2.0,658.0,RUNX1_RUNX1T1,497,35.0,72.0,3.0,7.0,0.0,...,9.0,0.0,0.0,2.0,4.0,0.0,,0.0,100.0,
9,BHR,1.0,399.0,NPM1,488,30.0,82.0,0.0,0.0,0.0,...,1.0,2.0,0.0,7.0,8.0,0.0,0.0,0.0,100.0,


In [167]:
filtered_df2

Unnamed: 0,patient_id,sex_1f_2m,age,bag_label,instance_count,leucocytes_per_µl,pb_myeloblast,pb_promyelocyte,pb_myelocyte,pb_metamyelocyte,pb_neutrophil_band,pb_neutrophil_segmented,pb_eosinophil,pb_basophil,pb_monocyte,pb_lymph_typ,pb_lymph_atyp_react,pb_lymph_atyp_neopl,pb_other,pb_total
0,patient_1,,,CBFB_MYH11,246,,,,,,,,,,,,,,,
1,patient_2,,,CBFB_MYH11,374,,,,,,,,,,,,,,,
2,patient_3,,,CBFB_MYH11,593,,,,,,,,,,,,,,,
3,patient_4,,,CBFB_MYH11,440,,,,,,,,,,,,,,,
4,patient_5,,,CBFB_MYH11,437,,,,,,,,,,,,,,,
5,patient_1,,,control,521,,,,,,,,,,,,,,,
6,patient_2,,,control,388,,,,,,,,,,,,,,,
7,patient_3,,,control,470,,,,,,,,,,,,,,,
8,patient_4,,,control,363,,,,,,,,,,,,,,,
9,patient_5,,,control,270,,,,,,,,,,,,,,,


In [168]:
conc_df

Unnamed: 0,patient_id,sex_1f_2m,age,bag_label,instance_count,leucocytes_per_¬µl,pb_myeloblast,pb_promyelocyte,pb_myelocyte,pb_metamyelocyte,...,pb_neutrophil_segmented,pb_eosinophil,pb_basophil,pb_monocyte,pb_lymph_typ,pb_lymph_atyp_react,pb_lymph_atyp_neopl,pb_other,pb_total,leucocytes_per_µl
0,ABF,2.0,444.0,RUNX1_RUNX1T1,497,4441.0,87.0,0.0,1.0,0.0,...,2.0,0.0,0.0,2.0,8.0,0.0,0.0,0.0,100.0,
1,AEC,2.0,35.0,control,488,,0.0,0.0,0.0,0.0,...,77.0,2.0,1.0,4.0,16.0,0.0,0.0,0.0,100.0,
2,AIH,2.0,496.0,control,196,,0.0,0.0,0.0,0.0,...,63.0,2.0,1.0,4.0,30.0,0.0,0.0,0.0,100.0,
3,ALA,2.0,751.0,NPM1,494,138.0,85.0,0.0,1.0,0.0,...,4.0,0.0,0.0,3.0,6.0,0.0,0.0,0.0,100.0,
4,ALE,1.0,438.0,PML_RARA,487,85.0,96.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,100.0,
5,AQK,1.0,463.0,CBFB_MYH11,420,,23.0,0.0,0.0,0.0,...,7.0,0.0,0.0,14.0,56.0,0.0,,0.0,100.0,
6,ATM,1.0,301.0,control,347,,0.0,0.0,0.0,0.0,...,69.0,3.0,2.0,4.0,22.0,0.0,0.0,0.0,100.0,
7,AVL,2.0,371.0,control,388,,0.0,0.0,0.0,0.0,...,66.0,2.0,1.0,4.0,27.0,0.0,0.0,0.0,100.0,
8,BHG,2.0,658.0,RUNX1_RUNX1T1,497,35.0,72.0,3.0,7.0,0.0,...,9.0,0.0,0.0,2.0,4.0,0.0,,0.0,100.0,
9,BHR,1.0,399.0,NPM1,488,30.0,82.0,0.0,0.0,0.0,...,1.0,2.0,0.0,7.0,8.0,0.0,0.0,0.0,100.0,


In [169]:
selected_patients_df


Unnamed: 0,patient_id,bag_label
0,patient_4,CBFB_MYH11
1,patient_3,CBFB_MYH11
2,patient_1,CBFB_MYH11
3,patient_2,CBFB_MYH11
4,patient_5,CBFB_MYH11
5,patient_1,NPM1
6,patient_2,NPM1
7,patient_3,NPM1
8,patient_4,NPM1
9,patient_5,NPM1


In [170]:
filtered_df2

Unnamed: 0,patient_id,sex_1f_2m,age,bag_label,instance_count,leucocytes_per_µl,pb_myeloblast,pb_promyelocyte,pb_myelocyte,pb_metamyelocyte,pb_neutrophil_band,pb_neutrophil_segmented,pb_eosinophil,pb_basophil,pb_monocyte,pb_lymph_typ,pb_lymph_atyp_react,pb_lymph_atyp_neopl,pb_other,pb_total
0,patient_1,,,CBFB_MYH11,246,,,,,,,,,,,,,,,
1,patient_2,,,CBFB_MYH11,374,,,,,,,,,,,,,,,
2,patient_3,,,CBFB_MYH11,593,,,,,,,,,,,,,,,
3,patient_4,,,CBFB_MYH11,440,,,,,,,,,,,,,,,
4,patient_5,,,CBFB_MYH11,437,,,,,,,,,,,,,,,
5,patient_1,,,control,521,,,,,,,,,,,,,,,
6,patient_2,,,control,388,,,,,,,,,,,,,,,
7,patient_3,,,control,470,,,,,,,,,,,,,,,
8,patient_4,,,control,363,,,,,,,,,,,,,,,
9,patient_5,,,control,270,,,,,,,,,,,,,,,
