In [3]:
import random
import os
import re
import shutil
import pandas as pd

# Copies the real patients' feature files into the mixed-data tree and returns a
# dict with the number of copied feature files per destination class folder
def copyfiles_real(src_folder, dest_folder):
    """Copy every real feature file below ``src_folder`` into ``dest_folder``.

    The tree under ``src_folder`` is walked recursively. Each file named
    ``fnl34_bn_features_layer_7.npy`` is copied to
    ``dest_folder/<class>/<patient>/``, where class and patient are parsed
    from the file's source path by ``get_class_name`` / ``get_patient_name``.

    Args:
        src_folder: Root directory that is searched for feature files.
        dest_folder: Root of the destination tree; sub-folders are created
            on demand.

    Returns:
        dict mapping ``os.path.join(dest_folder, <class>)`` to the number of
        feature files found for that class. The dict is also printed.
    """
    feature_filename = 'fnl34_bn_features_layer_7.npy'
    counts_by_class_dir = {}

    for current_dir, _subdirs, filenames in os.walk(src_folder):
        # Only directories that actually hold a feature file are of interest
        if feature_filename not in filenames:
            continue

        source_file = os.path.join(current_dir, feature_filename)
        # Class and patient are encoded in the source path itself
        leukemia_class = get_class_name(source_file)
        patient_id = get_patient_name(source_file)

        # Tally one more feature file for this class (keyed by destination path)
        class_dir = os.path.join(dest_folder, leukemia_class)
        counts_by_class_dir[class_dir] = counts_by_class_dir.get(class_dir, 0) + 1

        # Make sure the patient folder exists, then copy the file into it
        patient_dir = os.path.join(class_dir, patient_id)
        os.makedirs(patient_dir, exist_ok=True)
        shutil.copy(source_file, os.path.join(patient_dir, feature_filename))

    # Report the per-class counts
    print(counts_by_class_dir)
    return counts_by_class_dir

def get_patient_name(path):
    """Return the patient folder name encoded in ``path``.

    The path is expected to contain ``/data/<class>/<patient>``; the patient
    part is the run of letters, digits and underscores that follows the class
    folder.

    Args:
        path: File or directory path containing a ``/data/<class>/<patient>``
            segment.

    Returns:
        The patient name as a string.

    Raises:
        ValueError: If ``path`` contains no ``/data/<class>/`` segment.
    """
    # The pattern is written with "/" separators; normalise OS-specific
    # separators (e.g. "\\" on Windows) so os.path.join-built paths still match.
    normalized = path.replace(os.sep, "/")
    match = re.search(r"/data/\w+/([0-9a-zA-Z_]*)", normalized)
    if match is None:
        # Fail with a readable message instead of "'NoneType' has no attribute 'group'"
        raise ValueError(f"Cannot extract patient name from path: {path!r}")
    return match.group(1)

def get_class_name(path):
    """Return the class folder name encoded in ``path``.

    The class is the run of word characters directly following the first
    ``/data/`` segment of the path.

    Args:
        path: File or directory path containing a ``/data/<class>`` segment.

    Returns:
        The class name as a string.

    Raises:
        ValueError: If ``path`` contains no ``/data/<class>`` segment.
    """
    # The pattern is written with "/" separators; normalise OS-specific
    # separators (e.g. "\\" on Windows) so os.path.join-built paths still match.
    normalized = path.replace(os.sep, "/")
    match = re.search(r"/data/(\w+)", normalized)
    if match is None:
        # Fail with a readable message instead of "'NoneType' has no attribute 'group'"
        raise ValueError(f"Cannot extract class name from path: {path!r}")
    return match.group(1)

def copyfiles_art(src_folder, dest_folder, patient_counts):
    """Copy a random subset of artificial patients per class into ``dest_folder``.

    For every entry of ``src_folder`` that has a requested count in
    ``patient_counts``, up to that many patient sub-directories are drawn at
    random (without replacement) and all of their files are copied to
    ``dest_folder/<class>/<patient>/``.

    Args:
        src_folder: Directory laid out as ``<class>/<patient>/<files>``.
        dest_folder: Root of the destination tree; sub-folders are created
            on demand.
        patient_counts: Mapping of class name to the number of patients to
            copy for that class.
    """
    for class_name in sorted(os.listdir(src_folder)):
        # Skip entries without a requested count. This must not stop the loop:
        # classes that sort after an unlisted entry still have to be processed.
        if class_name not in patient_counts:
            continue

        class_dir = os.path.join(src_folder, class_name)
        # Only directories are patients; stray files would break os.listdir below.
        # Sorting gives random.sample a stable input, so a seeded RNG always
        # yields the same selection.
        patients = sorted(
            entry for entry in os.listdir(class_dir)
            if os.path.isdir(os.path.join(class_dir, entry))
        )
        # Never request more patients than are available
        num_patients = min(patient_counts[class_name], len(patients))

        for patient in random.sample(patients, num_patients):
            patient_path = os.path.join(class_dir, patient)
            dest_path = os.path.join(dest_folder, class_name, patient)
            for file in os.listdir(patient_path):
                # Created lazily so that a patient without files leaves no empty folder
                os.makedirs(dest_path, exist_ok=True)
                shutil.copy(os.path.join(patient_path, file), os.path.join(dest_path, file))

In [4]:
#This cell builds one mixed dataset per percentage under mixeddata/<p>_percent:
#  data/         real feature files plus randomly chosen artificial patients (based on exp 3)
#  metadata.csv  real and artificial metadata concatenated
#  output/result_folder_1  empty folder for later results
#Specify your source and destination folders
src_data_folder = '../../data'
src_artificialdata_folder = '../../artificialdata/experiment_3/data'
mixeddata_folder = '../../mixeddata'
percentage=[10,20,30,40,50]

# Seed the RNG that copyfiles_art draws from (random.sample) so that repeated
# runs of this cell start from the same random sequence.
random.seed(42)

# The metadata does not depend on the percentage, so read and concatenate it once
df1 = pd.read_csv(os.path.dirname(src_data_folder)+"/metadata.csv")
df2 = pd.read_csv(os.path.dirname(src_artificialdata_folder)+"/metadata.csv")
# NOTE(review): every row of the artificial metadata is kept, not only rows of
# the patients that get copied below - confirm this is intended.
conc_df = pd.concat([df1, df2], ignore_index=True)

for p in percentage:
    folder_name = f"{p}_percent"
    int_mixeddata_folder = os.path.join(mixeddata_folder, folder_name)
    dest_mixeddata_folder = os.path.join(int_mixeddata_folder, 'data')

    # NOTE(review): existing content is not cleared; files from an earlier run
    # stay in place and new copies are added on top of them.
    os.makedirs(dest_mixeddata_folder, exist_ok=True)
    art_percentage=p
    #Copy files from data folder
    patients_per_folder=copyfiles_real(src_data_folder, dest_mixeddata_folder)

    # Number of artificial patients per class such that they make up
    # art_percentage % of the mixed set: n_art / (n_real + n_art) = p / 100
    num_art_patient = {}
    for key, value in patients_per_folder.items():
        # Keys are destination class folders; their basename is the class name
        class_name = os.path.basename(key)
        num_art_patient[class_name] = round((value * art_percentage)/(100-art_percentage))

    print(num_art_patient)
    #Copy files from artificialdata folder
    copyfiles_art(src_artificialdata_folder, dest_mixeddata_folder, num_art_patient)

    # Write the concatenated metadata next to the data folder
    conc_df.to_csv(os.path.dirname(dest_mixeddata_folder)+'/metadata.csv', index=False)

    # Creating output folder
    output_folder_path = os.path.join(int_mixeddata_folder, 'output')
    os.makedirs(output_folder_path, exist_ok=True)

    # Creating result_folder_1 folder within output folder
    datafolder_1_path = os.path.join(output_folder_path, 'result_folder_1')
    os.makedirs(datafolder_1_path, exist_ok=True)

{'../../mixeddata/10_percent/data/PML_RARA': 5, '../../mixeddata/10_percent/data/CBFB_MYH11': 5, '../../mixeddata/10_percent/data/NPM1': 5, '../../mixeddata/10_percent/data/RUNX1_RUNX1T1': 5, '../../mixeddata/10_percent/data/control': 5}
{'PML_RARA': 1, 'CBFB_MYH11': 1, 'NPM1': 1, 'RUNX1_RUNX1T1': 1, 'control': 1}
{'../../mixeddata/20_percent/data/PML_RARA': 5, '../../mixeddata/20_percent/data/CBFB_MYH11': 5, '../../mixeddata/20_percent/data/NPM1': 5, '../../mixeddata/20_percent/data/RUNX1_RUNX1T1': 5, '../../mixeddata/20_percent/data/control': 5}
{'PML_RARA': 1, 'CBFB_MYH11': 1, 'NPM1': 1, 'RUNX1_RUNX1T1': 1, 'control': 1}
{'../../mixeddata/30_percent/data/PML_RARA': 5, '../../mixeddata/30_percent/data/CBFB_MYH11': 5, '../../mixeddata/30_percent/data/NPM1': 5, '../../mixeddata/30_percent/data/RUNX1_RUNX1T1': 5, '../../mixeddata/30_percent/data/control': 5}
{'PML_RARA': 2, 'CBFB_MYH11': 2, 'NPM1': 2, 'RUNX1_RUNX1T1': 2, 'control': 2}
{'../../mixeddata/40_percent/data/PML_RARA': 5, '../