In [1]:
import os.path
from tqdm import tqdm
import torch
import torch.nn.functional as F
from dataset_mixed import *  # dataset
from model import *  # actual MIL model
from sklearn import metrics as metrics
import csv
import os
import shutil
import pandas as pd
import re

In [2]:
CLASSES = ['control', 'RUNX1_RUNX1T1', 'NPM1', 'CBFB_MYH11', 'PML_RARA']
num_classes = 5
seed=42
experiment_source=f'experiment_3_seed{seed}'
real_data_source='/mnt/volume/shared/data_file/data'
SOURCE_FOLDER = '/mnt/volume/shared/data_file/artificialdata/'+experiment_source
TARGET_FOLDER = '/mnt/volume/shared/exp4/dropout_original/result_folder_1'
output_folder = f'/mnt/volume/shared/data_file/mixed_uncertain_seed{seed}'

def get_patient_name(path):
    return re.search(r"/data/\w+/([0-9a-zA-Z_]*)", path).group(1)

def get_class_name(path):
    return re.search(r"/data/(\w+)", path).group(1)

In [3]:
patients = {}
with open(os.path.join(SOURCE_FOLDER,'metadata.csv'), newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)
    idx = 0
    for line in reader:
        key = line[0] + "_" + line[3]
        patients[key] = [os.path.join(SOURCE_FOLDER,
                                         "data",
                                         line[3],
                                         line[0],
                                         "fnl34_bn_features_layer_7.npy"), line[3]]

In [4]:
def update_misclassification_count(probability_vector, one_hot_target, current_misclassification_count):
    """
    Update misclassification count based on the given probability vector and one-hot encoded target vector.

    Parameters:
    - probability_vector: Tensor representing the probability vector.
    - one_hot_target: Tensor representing the one-hot encoded target vector.
    - current_misclassification_count: Current misclassification count.

    Returns:
    - Updated misclassification count.
    """
    #print("Target: ", one_hot_target)
    #print(probability_vector)
    # Convert probability vector to one-hot encoded vector
    one_hot_prediction = torch.zeros_like(probability_vector)
    one_hot_prediction[0, torch.argmax(probability_vector).item()] = 1
    #print("One hot prediction is: ", one_hot_prediction)

    # Find the index where the target is 1
    target_index = torch.argmax(one_hot_target).item()
    #print("Target index is :", target_index)

    # Check for misclassification and update count
    if torch.argmax(one_hot_prediction).item() != target_index:
        current_misclassification_count += 1

    return current_misclassification_count

In [5]:
# Number of Monte Carlo samples
num_samples = 10

class_converter = {}

with open(os.path.join(TARGET_FOLDER,'class_conversion.csv'), newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)
    for line in reader:
        class_converter[line[1]] = int(line[0])

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# load best performing model, and launch on test set
model = torch.load(os.path.join(TARGET_FOLDER, "state_dictmodel.pt"),map_location="cpu")
model = model.to(device)
model.train()

# Initialize arrays to store uncertainties
all_uncertainties = {}
missclassification_counts = {}
max_uncertainties = {}
sum_uncertainties = {}

# Perform Monte Carlo Dropout
with torch.no_grad():
    for p in patients.keys():
        # Make predictions with dropout enabled multiple times
        pred = []
        missclassification_count = 0
        for j in range(num_samples):
            path, lbl_name = patients[p]
            lbl = np.zeros(5)
            lbl[class_converter[lbl_name]] = 1

            bag = np.load(path)

            bag = torch.tensor(bag).to(device)
            bag = torch.unsqueeze(bag,0)
            prediction, _, _, _,  = model(bag)
            pred.append(F.softmax(prediction,dim=1).cpu().detach().numpy())

            # Class missclassification count update
            missclassification_count = update_misclassification_count(F.softmax(prediction,dim=1), torch.tensor(lbl), missclassification_count)

        # Convert the list of tensors to a PyTorch tensor
        pred_tensor = [torch.from_numpy(arr) for arr in pred]
        pred_tensor = torch.stack(pred_tensor)
        
        # Compute mean and uncertainty
        mean_prediction = pred_tensor.mean(dim=0)
        #print('Mean prediction is:', mean_prediction)
        
        uncertainty = pred_tensor.std(dim=0)
        #print('Uncertainty tensor is:', uncertainty)
        
        uncertainty_value_max = torch.max(uncertainty).item()
        uncertainty_value_sum = torch.sum(uncertainty).item()
        path=os.path.dirname(path)
        # Save uncertainty results along with the path
        max_uncertainties[p] = {'path': path, 'data': uncertainty.cpu().numpy().squeeze(), 'uncertainty': uncertainty_value_max}
        sum_uncertainties[p] = {'path': path, 'data': uncertainty.cpu().numpy().squeeze(), 'uncertainty': uncertainty_value_sum}
        missclassification_counts[p] = {'path': path, 'uncertainty' : missclassification_count / num_samples}

In [6]:
def sort_and_print(uncertainties):
    # Sort misclassification counts dictionary by values (highest to lowest)
    sorted_uncertainties = dict(sorted(uncertainties.items(), key=lambda item: item[1]['uncertainty'], reverse=True))
    
    # Print sorted misclassification counts
    for p, data in sorted_uncertainties.items():
        uncertainty_rate = data['uncertainty']
        print(f"Patient {p}: Uncertainty - {uncertainty_rate:.4}")


In [7]:
sort_and_print(max_uncertainties)
sort_and_print(sum_uncertainties)
sort_and_print(missclassification_counts)

Patient patient_30_CBFB_MYH11: Uncertainty - 0.2385
Patient patient_10_PML_RARA: Uncertainty - 0.2088
Patient patient_19_NPM1: Uncertainty - 0.1975
Patient patient_34_RUNX1_RUNX1T1: Uncertainty - 0.185
Patient patient_8_RUNX1_RUNX1T1: Uncertainty - 0.1847
Patient patient_32_NPM1: Uncertainty - 0.1827
Patient patient_25_NPM1: Uncertainty - 0.1752
Patient patient_7_CBFB_MYH11: Uncertainty - 0.1724
Patient patient_32_PML_RARA: Uncertainty - 0.1697
Patient patient_20_PML_RARA: Uncertainty - 0.166
Patient patient_21_PML_RARA: Uncertainty - 0.166
Patient patient_24_RUNX1_RUNX1T1: Uncertainty - 0.1625
Patient patient_8_NPM1: Uncertainty - 0.1617
Patient patient_11_NPM1: Uncertainty - 0.156
Patient patient_10_RUNX1_RUNX1T1: Uncertainty - 0.1554
Patient patient_31_PML_RARA: Uncertainty - 0.1552
Patient patient_27_NPM1: Uncertainty - 0.155
Patient patient_11_RUNX1_RUNX1T1: Uncertainty - 0.1539
Patient patient_39_PML_RARA: Uncertainty - 0.1539
Patient patient_33_PML_RARA: Uncertainty - 0.1532
Pat

In [8]:
def select_paths(uncertainties, percentage):
    sorted_uncertainties = dict(sorted(uncertainties.items(), key=lambda item: item[1]['uncertainty'], reverse=True))
    
    # Calculate the number of paths based on the percentage
    num_paths = int(len(sorted_uncertainties) * (percentage / 100.0))
    
    # Select the top percentage of paths
    selected_paths = {p: data['path'] for p, data in list(sorted_uncertainties.items())[:num_paths]}
    
    return selected_paths

metadata_path = SOURCE_FOLDER + "/metadata.csv"

def create_metadata(selected_paths, target_folder):
    # Load the existing metadata CSV file into a DataFrame
    metadata_df = pd.read_csv(metadata_path)

    # Dictionary to store the extracted information
    metadata_dict = {}

    # Loop over each key in selected_paths
    for key in selected_paths:
        path = selected_paths[key]

        # Extract patient_id and bag_label from the key
        match = re.search(r'\d', key)
        if match:
            index_of_second_digit = key.find('_', match.start() + 1)
            if index_of_second_digit != -1:
                patient_id = key[:index_of_second_digit + 1]  # Include everything up to the second underscore
                patient_id = patient_id[:-1]
                bag_label = key[index_of_second_digit + 1:]  # Remove the first underscore in bag_label
            else:
                # If no second underscore found, consider the entire key as patient_id
                patient_id = key
                bag_label = ''
        else:
            # If no digit found, consider the entire key as patient_id
            patient_id = key
            bag_label = ''

        # Filter the metadata DataFrame based on patient_id and bag_label
        metadata_row = metadata_df[(metadata_df['patient_id'] == patient_id) & (metadata_df['bag_label'] == bag_label)]

        # Check if any matching rows are found
        if not metadata_row.empty:
            # Get the first matching row (assuming there's only one match)
            metadata_dict[key] = metadata_row.iloc[0].to_dict()
        else:
            # Handle the case when no matching rows are found
            print(f"No metadata found for {key}: {path}")

    # Create a new DataFrame from the extracted metadata dictionary
    new_metadata_df = pd.DataFrame.from_dict(metadata_dict, orient='index')


    # Save the new metadata DataFrame to a CSV file in the target folder
    new_metadata_path = os.path.join(target_folder, 'metadata_uncertain_patients.csv')
    new_metadata_df.to_csv(new_metadata_path, index=False)

    return new_metadata_path


#returns dict that contains number of real patients per leukemia class
def get_realpatients_filepaths_dictionary(src_folder):
    paths = {}
    for class_name in os.listdir(src_folder):
        class_path=os.path.join(src_folder, class_name)
        for file in  os.listdir(class_path):
            src_path = os.path.join(class_path, file)
            # Extract class and patient information from the source path
            patient_name = get_patient_name(src_path)
            if class_name not in paths.keys():
                paths[class_name] = []
            paths[class_name].append(src_path)
    return paths

In [9]:
def save_patient_filepaths(selected_paths, new_folder,paths_real_patients):
    # save filepaths to the new folder
    print(f"Save file paths for uncertain patients in {new_folder}")
    os.makedirs(new_folder, exist_ok=True)
    # save filepath in dictionary paths with keys as class and values list of file paths
    paths_uncertain_patients = {}
    #Initilialise 
    for class_name in CLASSES:
        if class_name not in paths_uncertain_patients.keys():
                    paths_uncertain_patients[class_name] = []
            
    for p, path in selected_paths.items():   
        class_name = get_class_name(path)
        paths_uncertain_patients[class_name].append(path)    
    
    #Concatenate the two dictionaries
    paths_mixed_patients={}
    for key, value in paths_real_patients.items():
        paths_mixed_patients[key] = value
        paths_mixed_patients[key] += paths_uncertain_patients[key]
    print(paths_mixed_patients)   
    
    #Save ditionary
    with open(new_folder+'/file_paths.pkl', 'wb') as f:
        pickle.dump(paths_mixed_patients, f)


In [10]:
# Iterate over different percentages from 10 to 50 and save uncertain patients
paths_real_patients=get_realpatients_filepaths_dictionary(real_data_source)
for percentage in range(10, 51, 10):
    # Define folder to save the uncertain patients
    new_folder_misscl = output_folder+f'/misscl_{percentage}_percent'
    new_folder_max = output_folder+f'/max_{percentage}_percent'
    new_folder_sum = output_folder+f'/sum_{percentage}_percent'

    # Select uncertain paths based on the current percentage for missclassification_counts
    selected_misscl_paths = select_paths(missclassification_counts, percentage)
    # Save uncertain patients to the new folder
    save_patient_filepaths(selected_misscl_paths, new_folder_misscl,paths_real_patients)
    create_metadata(selected_misscl_paths, new_folder_misscl)

    # Select uncertain paths based on the current percentage for max_uncertainties
    selected_max_paths = select_paths(max_uncertainties, percentage)
    # Save uncertain patients to the new folder
    save_patient_filepaths(selected_max_paths, new_folder_max,paths_real_patients)
    create_metadata(selected_max_paths, new_folder_max)

    # Select uncertain paths based on the current percentage for sum_uncertainties
    selected_sum_paths = select_paths(sum_uncertainties, percentage)
    # Save uncertain patients to the new folder
    save_patient_filepaths(selected_sum_paths, new_folder_sum,paths_real_patients)
    create_metadata(selected_sum_paths, new_folder_sum)


Save file paths for uncertain patients in /mnt/volume/shared/data_file/mixed_uncertain_seed42/misscl_10_percent
{'control': ['/mnt/volume/shared/data_file/data/control/CIQ', '/mnt/volume/shared/data_file/data/control/FDW', '/mnt/volume/shared/data_file/data/control/ICS', '/mnt/volume/shared/data_file/data/control/WXO', '/mnt/volume/shared/data_file/data/control/NXO', '/mnt/volume/shared/data_file/data/control/ATM', '/mnt/volume/shared/data_file/data/control/MPP', '/mnt/volume/shared/data_file/data/control/DJJ', '/mnt/volume/shared/data_file/data/control/UNX', '/mnt/volume/shared/data_file/data/control/UOU', '/mnt/volume/shared/data_file/data/control/AVL', '/mnt/volume/shared/data_file/data/control/WSD', '/mnt/volume/shared/data_file/data/control/ZNK', '/mnt/volume/shared/data_file/data/control/TBZ', '/mnt/volume/shared/data_file/data/control/RDV', '/mnt/volume/shared/data_file/data/control/MKF', '/mnt/volume/shared/data_file/data/control/LCW', '/mnt/volume/shared/data_file/data/control

In [11]:

def concatenate_metadata(original_metadata_path,uncertain_patients_folder):
    # Load the original metadata CSV file
    original_metadata_df = pd.read_csv(original_metadata_path)

    # Loop over folders in uncertain_patients
    for folder in os.listdir(uncertain_patients_folder):
        folder_path = os.path.join(uncertain_patients_folder, folder)
        #print(folder_path)

        # Check if the item in the uncertain_patients_folder is a directory
        if os.path.isdir(folder_path):
            # Look for metadata.csv in each folder
            metadata_file_path = os.path.join(folder_path, 'metadata_uncertain_patients.csv')
            print("metadata is:")
            print(metadata_file_path)

            if os.path.exists(metadata_file_path):
                # Load the metadata from the current folder
                uncertain_metadata_df = pd.read_csv(metadata_file_path)

                # Concatenate with the original metadata
                concatenated_metadata_df = pd.concat([original_metadata_df, uncertain_metadata_df], ignore_index=True)
                # Save the concatenated metadata to a new CSV file in the output folder
                output_folder_new = uncertain_patients_folder + f'/{folder}'
                os.makedirs(output_folder_new, exist_ok=True)
                output_file_path = os.path.join(output_folder_new, 'metadata.csv')
                print("Output is:")
                print(output_file_path)
                concatenated_metadata_df.to_csv(output_file_path, index=False)

In [12]:
original_metadata_path="/mnt/volume/shared/data_file/metadata.csv"
concatenate_metadata(original_metadata_path,output_folder)

metadata is:
/mnt/volume/shared/data_file/mixed_uncertain_seed42/misscl_30_percent/metadata_uncertain_patients.csv
Output is:
/mnt/volume/shared/data_file/mixed_uncertain_seed42/misscl_30_percent/metadata.csv
metadata is:
/mnt/volume/shared/data_file/mixed_uncertain_seed42/sum_40_percent/metadata_uncertain_patients.csv
Output is:
/mnt/volume/shared/data_file/mixed_uncertain_seed42/sum_40_percent/metadata.csv
metadata is:
/mnt/volume/shared/data_file/mixed_uncertain_seed42/sum_50_percent/metadata_uncertain_patients.csv
Output is:
/mnt/volume/shared/data_file/mixed_uncertain_seed42/sum_50_percent/metadata.csv
metadata is:
/mnt/volume/shared/data_file/mixed_uncertain_seed42/max_20_percent/metadata_uncertain_patients.csv
Output is:
/mnt/volume/shared/data_file/mixed_uncertain_seed42/max_20_percent/metadata.csv
metadata is:
/mnt/volume/shared/data_file/mixed_uncertain_seed42/sum_10_percent/metadata_uncertain_patients.csv
Output is:
/mnt/volume/shared/data_file/mixed_uncertain_seed42/sum_10_