In [1]:
import csv
import os
import os.path
import shutil

import numpy as np
import torch
import torch.nn.functional as F
from sklearn import metrics as metrics
from tqdm import tqdm

from dataset import *  # dataset
from model import *  # actual MIL model

In [2]:
# Class labels used in this experiment; the class count is derived from the list.
CLASSES = [
    'control',
    'RUNX1_RUNX1T1',
    'NPM1',
    'CBFB_MYH11',
    'PML_RARA',
]
num_classes = len(CLASSES)

# SOURCE_FOLDER holds metadata.csv and the per-patient feature files;
# TARGET_FOLDER holds the trained model and its class_conversion.csv.
SOURCE_FOLDER = '/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41'
TARGET_FOLDER = '/mnt/volume/shared/exp4/dropout_original/result_folder_1'

In [3]:
# Build the patient index from metadata.csv:
#   "<patient id>_<label>" -> [path to the patient's feature file, label]
# Column 0 of the CSV is used as the patient id and column 3 as the class label.
patients = {}
with open(os.path.join(SOURCE_FOLDER, 'metadata.csv'), newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the header row
    for line in reader:
        if not line:
            # csv.reader yields an empty list for blank lines (e.g. a trailing newline)
            continue
        patient_id, label = line[0], line[3]
        patients[f"{patient_id}_{label}"] = [
            os.path.join(SOURCE_FOLDER,
                         "data",
                         label,
                         patient_id,
                         "fnl34_bn_features_layer_7.npy"),
            label,
        ]

In [4]:
# Display the patient index built above: key -> [feature file path, label].
patients

{'patient_1_CBFB_MYH11': ['/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/CBFB_MYH11/patient_1/fnl34_bn_features_layer_7.npy',
  'CBFB_MYH11'],
 'patient_2_CBFB_MYH11': ['/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/CBFB_MYH11/patient_2/fnl34_bn_features_layer_7.npy',
  'CBFB_MYH11'],
 'patient_3_CBFB_MYH11': ['/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/CBFB_MYH11/patient_3/fnl34_bn_features_layer_7.npy',
  'CBFB_MYH11'],
 'patient_4_CBFB_MYH11': ['/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/CBFB_MYH11/patient_4/fnl34_bn_features_layer_7.npy',
  'CBFB_MYH11'],
 'patient_5_CBFB_MYH11': ['/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/CBFB_MYH11/patient_5/fnl34_bn_features_layer_7.npy',
  'CBFB_MYH11'],
 'patient_6_CBFB_MYH11': ['/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/CBFB_MYH11/patient_6/fnl34_bn_features_layer_7.npy',
  'CBFB_M

In [5]:
def update_misclassification_count(probability_vector, one_hot_target, current_misclassification_count):
    """
    Increment the misclassification count when the predicted class differs from the target class.

    The predicted class is the arg-max of ``probability_vector`` and the target class is the
    arg-max of ``one_hot_target``. Both tensors describe a single sample; they are compared
    through their flattened arg-max, so a plain vector of shape ``(C,)`` and a vector with a
    leading batch dimension of size one, shape ``(1, C)``, are both accepted.

    Parameters:
    - probability_vector: Tensor with the class probabilities (or scores) of one sample.
    - one_hot_target: Tensor with the one-hot encoded target of the same sample.
    - current_misclassification_count: Current misclassification count.

    Returns:
    - ``current_misclassification_count + 1`` if the prediction is wrong, otherwise the
      count unchanged.

    Raises:
    - ValueError: if the two tensors do not hold the same number of elements, e.g. when a
      multi-sample batch is passed; the flattened arg-max would then not be a class index.
    """
    if probability_vector.numel() != one_hot_target.numel():
        raise ValueError(
            "probability_vector and one_hot_target must describe the same single sample "
            f"(got {probability_vector.numel()} and {one_hot_target.numel()} elements)"
        )

    # torch.argmax without `dim` works on the flattened tensor, so the class index is
    # obtained directly; no intermediate one-hot prediction tensor is required.
    predicted_index = torch.argmax(probability_vector).item()
    target_index = torch.argmax(one_hot_target).item()

    if predicted_index != target_index:
        current_misclassification_count += 1

    return current_misclassification_count

In [6]:
# Number of Monte Carlo samples (stochastic forward passes per patient)
num_samples = 10

# Class name -> integer class index, read from the CSV stored next to the model.
# Column 0 of the CSV is the index and column 1 the class name.
class_converter = {}

with open(os.path.join(TARGET_FOLDER, 'class_conversion.csv'), newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the header row
    for line in reader:
        class_converter[line[1]] = int(line[0])

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# load best performing model, and launch on test set
# SECURITY: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints that come from a trusted source.
model = torch.load(os.path.join(TARGET_FOLDER, "state_dictmodel.pt"), map_location="cpu")
model = model.to(device)
# Train mode is used so that the model's dropout stays active during the repeated passes.
# NOTE(review): train mode also changes the behaviour of layers such as BatchNorm if the
# model contains any - confirm, or switch to eval() and re-enable only the dropout modules.
# NOTE(review): no random seed is set, so the sampled predictions differ between runs.
model.train()

# Result containers, keyed by patient
all_uncertainties = {}  # not filled in this cell; kept so the name stays defined
missclassification_counts = {}
max_uncertainties = {}
sum_uncertainties = {}

# Perform Monte Carlo Dropout
with torch.no_grad():
    for p, (path, lbl_name) in patients.items():
        # The label and the bag are identical for every pass, so they are built once
        # per patient instead of once per Monte Carlo sample.
        lbl = np.zeros(num_classes)
        lbl[class_converter[lbl_name]] = 1
        target = torch.tensor(lbl)

        # Load the feature file and add a leading batch dimension of size one
        bag = torch.unsqueeze(torch.tensor(np.load(path)).to(device), 0)

        # Make predictions with dropout enabled multiple times
        pred = []
        missclassification_count = 0
        for _sample in range(num_samples):
            # The model returns four values; only the first one is used here.
            # Presumably it holds the class logits of shape (1, num_classes) - TODO confirm.
            prediction, _, _, _ = model(bag)
            probabilities = F.softmax(prediction, dim=1)
            pred.append(probabilities.cpu())

            # Class missclassification count update
            missclassification_count = update_misclassification_count(
                probabilities, target, missclassification_count)

        # Stack the per-pass probabilities along a new leading "sample" dimension
        pred_tensor = torch.stack(pred)

        # Per-class standard deviation across the Monte Carlo samples
        uncertainty = pred_tensor.std(dim=0)

        uncertainty_value_max = torch.max(uncertainty).item()
        uncertainty_value_sum = torch.sum(uncertainty).item()

        # Save uncertainty results along with the path
        per_class_uncertainty = uncertainty.cpu().numpy().squeeze()
        max_uncertainties[p] = {'path': path, 'data': per_class_uncertainty, 'uncertainty': uncertainty_value_max}
        sum_uncertainties[p] = {'path': path, 'data': per_class_uncertainty, 'uncertainty': uncertainty_value_sum}
        # Fraction of the passes whose predicted class differed from the label
        missclassification_counts[p] = {'path': path, 'uncertainty': missclassification_count / num_samples}

In [7]:
def sort_and_print(uncertainties):
    """
    Print one line per patient, ordered from the highest to the lowest 'uncertainty' value.

    Parameters:
    - uncertainties: dict mapping a patient key to a dict that contains an 'uncertainty' entry.

    Entries with equal values keep the order they have in the input dict.
    """
    def _uncertainty_of(entry):
        # entry is a (patient, record) pair coming from dict.items()
        return entry[1]['uncertainty']

    ranked = sorted(uncertainties.items(), key=_uncertainty_of, reverse=True)
    for patient, record in ranked:
        print(f"Patient {patient}: Uncertainty - {record['uncertainty']:.4}")


In [27]:
# Rank patients by the largest per-class standard deviation across the Monte Carlo passes.
sort_and_print(max_uncertainties)

Patient patient_21_NPM1: Uncertainty - 0.2158
Patient patient_32_PML_RARA: Uncertainty - 0.2128
Patient patient_27_NPM1: Uncertainty - 0.205
Patient patient_1_RUNX1_RUNX1T1: Uncertainty - 0.2017
Patient patient_16_CBFB_MYH11: Uncertainty - 0.1858
Patient patient_33_RUNX1_RUNX1T1: Uncertainty - 0.1829
Patient patient_41_NPM1: Uncertainty - 0.1817
Patient patient_24_NPM1: Uncertainty - 0.1796
Patient patient_23_PML_RARA: Uncertainty - 0.1692
Patient patient_12_PML_RARA: Uncertainty - 0.1691
Patient patient_22_NPM1: Uncertainty - 0.1678
Patient patient_27_CBFB_MYH11: Uncertainty - 0.1677
Patient patient_8_PML_RARA: Uncertainty - 0.1671
Patient patient_30_PML_RARA: Uncertainty - 0.1669
Patient patient_4_NPM1: Uncertainty - 0.1656
Patient patient_6_PML_RARA: Uncertainty - 0.161
Patient patient_24_RUNX1_RUNX1T1: Uncertainty - 0.1578
Patient patient_24_CBFB_MYH11: Uncertainty - 0.156
Patient patient_18_CBFB_MYH11: Uncertainty - 0.1549
Patient patient_16_PML_RARA: Uncertainty - 0.1514
Patient 

{'patient_21_NPM1': {'path': '/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/NPM1/patient_21/fnl34_bn_features_layer_7.npy',
  'data': array([2.9297078e-02, 1.3487830e-04, 2.0779137e-01, 6.4921798e-04,
         2.1575654e-01], dtype=float32),
  'uncertainty': 0.21575653553009033},
 'patient_32_PML_RARA': {'path': '/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/PML_RARA/patient_32/fnl34_bn_features_layer_7.npy',
  'data': array([0.01287644, 0.21275061, 0.09623634, 0.1570558 , 0.00322883],
        dtype=float32),
  'uncertainty': 0.2127506136894226},
 'patient_27_NPM1': {'path': '/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/NPM1/patient_27/fnl34_bn_features_layer_7.npy',
  'data': array([2.0399259e-01, 5.3999072e-05, 2.0502192e-01, 2.4955377e-03,
         1.3213155e-03], dtype=float32),
  'uncertainty': 0.2050219178199768},
 'patient_1_RUNX1_RUNX1T1': {'path': '/mnt/volume/shared/data_file/artificialdata/experiment

In [28]:
# Rank patients by the sum of the per-class standard deviations across the Monte Carlo passes.
sort_and_print(sum_uncertainties)

Patient patient_32_PML_RARA: Uncertainty - 0.4821
Patient patient_21_NPM1: Uncertainty - 0.4536
Patient patient_1_RUNX1_RUNX1T1: Uncertainty - 0.4194
Patient patient_12_PML_RARA: Uncertainty - 0.4192
Patient patient_27_NPM1: Uncertainty - 0.4129
Patient patient_30_PML_RARA: Uncertainty - 0.3995
Patient patient_24_RUNX1_RUNX1T1: Uncertainty - 0.3846
Patient patient_16_CBFB_MYH11: Uncertainty - 0.3769
Patient patient_24_NPM1: Uncertainty - 0.3694
Patient patient_16_PML_RARA: Uncertainty - 0.3689
Patient patient_8_PML_RARA: Uncertainty - 0.3687
Patient patient_33_RUNX1_RUNX1T1: Uncertainty - 0.3682
Patient patient_41_NPM1: Uncertainty - 0.3657
Patient patient_13_RUNX1_RUNX1T1: Uncertainty - 0.3488
Patient patient_23_NPM1: Uncertainty - 0.3487
Patient patient_28_RUNX1_RUNX1T1: Uncertainty - 0.3486
Patient patient_22_NPM1: Uncertainty - 0.3482
Patient patient_25_PML_RARA: Uncertainty - 0.3443
Patient patient_23_PML_RARA: Uncertainty - 0.3437
Patient patient_18_CBFB_MYH11: Uncertainty - 0.34

{'patient_32_PML_RARA': {'path': '/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/PML_RARA/patient_32/fnl34_bn_features_layer_7.npy',
  'data': array([0.01287644, 0.21275061, 0.09623634, 0.1570558 , 0.00322883],
        dtype=float32),
  'uncertainty': 0.48214802145957947},
 'patient_21_NPM1': {'path': '/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/NPM1/patient_21/fnl34_bn_features_layer_7.npy',
  'data': array([2.9297078e-02, 1.3487830e-04, 2.0779137e-01, 6.4921798e-04,
         2.1575654e-01], dtype=float32),
  'uncertainty': 0.4536290764808655},
 'patient_1_RUNX1_RUNX1T1': {'path': '/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/RUNX1_RUNX1T1/patient_1/fnl34_bn_features_layer_7.npy',
  'data': array([1.6254070e-01, 5.1617655e-05, 2.0171596e-01, 4.9626329e-03,
         5.0148252e-02], dtype=float32),
  'uncertainty': 0.41941916942596436},
 'patient_12_PML_RARA': {'path': '/mnt/volume/shared/data_file/artificialda

In [29]:
# Rank patients by the fraction of Monte Carlo passes whose predicted class differed from the label.
sort_and_print(missclassification_counts)

Patient patient_11_CBFB_MYH11: Uncertainty - 1.0
Patient patient_22_CBFB_MYH11: Uncertainty - 1.0
Patient patient_30_CBFB_MYH11: Uncertainty - 1.0
Patient patient_33_CBFB_MYH11: Uncertainty - 1.0
Patient patient_40_CBFB_MYH11: Uncertainty - 1.0
Patient patient_32_control: Uncertainty - 1.0
Patient patient_3_NPM1: Uncertainty - 1.0
Patient patient_5_NPM1: Uncertainty - 1.0
Patient patient_6_NPM1: Uncertainty - 1.0
Patient patient_20_NPM1: Uncertainty - 1.0
Patient patient_32_NPM1: Uncertainty - 1.0
Patient patient_5_PML_RARA: Uncertainty - 1.0
Patient patient_10_PML_RARA: Uncertainty - 1.0
Patient patient_18_PML_RARA: Uncertainty - 1.0
Patient patient_20_PML_RARA: Uncertainty - 1.0
Patient patient_28_PML_RARA: Uncertainty - 1.0
Patient patient_34_PML_RARA: Uncertainty - 1.0
Patient patient_2_RUNX1_RUNX1T1: Uncertainty - 1.0
Patient patient_3_RUNX1_RUNX1T1: Uncertainty - 1.0
Patient patient_4_RUNX1_RUNX1T1: Uncertainty - 1.0
Patient patient_5_RUNX1_RUNX1T1: Uncertainty - 1.0
Patient pati

{'patient_11_CBFB_MYH11': {'path': '/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/CBFB_MYH11/patient_11/fnl34_bn_features_layer_7.npy',
  'uncertainty': 1.0},
 'patient_22_CBFB_MYH11': {'path': '/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/CBFB_MYH11/patient_22/fnl34_bn_features_layer_7.npy',
  'uncertainty': 1.0},
 'patient_30_CBFB_MYH11': {'path': '/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/CBFB_MYH11/patient_30/fnl34_bn_features_layer_7.npy',
  'uncertainty': 1.0},
 'patient_33_CBFB_MYH11': {'path': '/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/CBFB_MYH11/patient_33/fnl34_bn_features_layer_7.npy',
  'uncertainty': 1.0},
 'patient_40_CBFB_MYH11': {'path': '/mnt/volume/shared/data_file/artificialdata/experiment_3_seed1_41/data/CBFB_MYH11/patient_40/fnl34_bn_features_layer_7.npy',
  'uncertainty': 1.0},
 'patient_32_control': {'path': '/mnt/volume/shared/data_file/artificialdata/exp

In [12]:
def select_paths(uncertainties, criterion):
    """
    Return the paths of the top-ranked patients.

    Patients are ranked by their 'uncertainty' value from highest to lowest (entries with
    equal values keep their input order) and the ranking is cut with ``[:criterion]``.

    Parameters:
    - uncertainties: dict mapping a patient key to a dict with 'uncertainty' and 'path' entries.
    - criterion: slice end applied to the ranking, i.e. the number of patients to keep.

    Returns:
    - dict mapping each selected patient key to its 'path', in ranking order.
    """
    def _score(entry):
        return entry[1]['uncertainty']

    ranked = sorted(uncertainties.items(), key=_score, reverse=True)

    chosen = {}
    for patient, record in ranked[:criterion]:
        chosen[patient] = record['path']
    return chosen

In [16]:
def _relative_patient_dir(patient_dir, source_root=None):
    """Return the part of ``patient_dir`` that is re-created below the export folder."""
    if source_root is not None:
        relative_dir = os.path.relpath(patient_dir, source_root)
        if relative_dir == os.pardir or relative_dir.startswith(os.pardir + os.sep):
            raise ValueError(f"{patient_dir!r} is not located inside {source_root!r}")
        return relative_dir

    # Without an explicit root, keep everything from the right-most "data" directory on,
    # e.g. ".../<experiment>/data/<class>/<patient>" -> "data/<class>/<patient>".
    parts = os.path.normpath(patient_dir).split(os.sep)
    if "data" not in parts:
        raise ValueError(f"no 'data' directory found in {patient_dir!r}; pass source_root explicitly")
    data_position = len(parts) - 1 - parts[::-1].index("data")
    return os.path.join(*parts[data_position:])


def save_uncertain_patients(selected_paths, new_folder, source_root=None,
                            feature_file="fnl34_bn_features_layer_7.npy",
                            additional_files=("image_file_paths", "images.txt")):
    """
    Copy the feature file and its companion files of every selected patient to ``new_folder``.

    For each patient the folder that contains the feature file is walked recursively and
    every file that is named in ``additional_files`` or whose name ends with ``feature_file``
    is copied with ``shutil.copy2``. The "data/<class>/<patient>" sub-folder structure is
    re-created below ``new_folder``. The location is derived with ``os.path`` rather than
    from a hard-coded experiment name, so any source folder is supported.

    Parameters:
    - selected_paths: dict mapping a patient key to the path of that patient's feature file.
    - new_folder: destination root folder; created if it does not exist.
    - source_root: optional folder the re-created structure is taken relative to. When
      omitted, the structure starts at the right-most "data" directory of each path.
    - feature_file: name suffix of the feature files to copy.
    - additional_files: exact names of the companion files to copy.

    Raises:
    - ValueError: if a path lies outside ``source_root``, or if ``source_root`` is omitted
      and the path contains no "data" directory.
    """
    wanted_names = set(additional_files)

    for path in selected_paths.values():
        # Folder that holds the patient's feature file
        patient_dir = os.path.dirname(path)

        # Create target subfolder structure in the new folder
        target_folder = os.path.join(new_folder, _relative_patient_dir(patient_dir, source_root))
        os.makedirs(target_folder, exist_ok=True)

        # Copy the content from the source to the destination, including additional files
        for root, _dirs, files in os.walk(patient_dir):
            for file in files:
                if file in wanted_names or file.endswith(feature_file):
                    source_file = os.path.join(root, file)
                    relative_path = os.path.relpath(source_file, patient_dir)
                    destination_file = os.path.join(target_folder, relative_path)
                    os.makedirs(os.path.dirname(destination_file), exist_ok=True)
                    shutil.copy2(source_file, destination_file)


In [17]:
# Define folder to save the uncertain patients
new_folder = '/mnt/volume/shared/data_file/artificialdata/uncertain_patients_misscl'
selected_misscl_paths = select_paths(missclassification_counts, 10)
save_uncertain_patients(selected_misscl_paths, new_folder)

In [18]:
# Define folder to save the uncertain patients
new_folder = '/mnt/volume/shared/data_file/artificialdata/uncertain_patients_max'
selected_max_paths = select_paths(max_uncertainties, 10)
save_uncertain_patients(selected_max_paths, new_folder)

In [None]:
# Define folder to save the uncertain patients
new_folder = '/mnt/volume/shared/data_file/artificialdata/uncertain_patients_sum'
selected_sum_paths = select_paths(sum_uncertainties, 10)
save_uncertain_patients(selected_sum_paths, new_folder)