In [1]:
# Fetch the project repository; later cells import helpers from its src/ folder
# and read the slide metadata CSV from its data/ folder.
!git clone https://github.com/andreazenotto/tempProjectRepo.git

Cloning into 'tempProjectRepo'...
remote: Enumerating objects: 114, done.[K
remote: Counting objects: 100% (114/114), done.[K
remote: Compressing objects: 100% (79/79), done.[K
remote: Total 114 (delta 51), reused 92 (delta 29), pack-reused 0 (from 0)[K
Receiving objects: 100% (114/114), 22.68 KiB | 7.56 MiB/s, done.
Resolving deltas: 100% (51/51), done.


In [1]:
%%capture
# Install the openslide-tools system package and the openslide-python package
# (presumably required by wsi_utils to read the slides - confirm).
# %%capture hides the installer output.
!apt update && apt install -y openslide-tools
!pip install openslide-python

In [2]:
import gdown
import os
from tqdm import tqdm
import pandas as pd
import sys
import shutil
import numpy as np

sys.path.append('tempProjectRepo/mesothelioma_project/src')
from wsi_utils import load_wsi, extract_patches, count_patches

In [3]:
# Locations inside the cloned repository: the slide metadata table and the
# root folder under which extracted patches are written.
data_csv = "tempProjectRepo/mesothelioma_project/data/raw-data.csv"
output_dir = "tempProjectRepo/mesothelioma_project/data/patches"

# One-letter diagnosis code (as stored in the CSV) -> full subtype name.
diagnosis_map = dict(E="epithelioid", S="sarcomatoid", B="biphasic")

# The metadata file is whitespace-separated: any run of blanks is one delimiter.
df = pd.read_csv(data_csv, sep=r"\s+")

## Fine-tuning satThresh on the patches of the first slide

In [15]:
# Sweep the saturation threshold on the first slide and report how many
# patches count_patches finds for each value.
first_slide_name = "M-1.ndpi"
if not os.path.exists(first_slide_name):
    # On a fresh kernel the slide has not been downloaded yet, so fetch it by
    # the id listed for it in the metadata table before trying to open it.
    first_slide_id = df.loc[df['filename'] == first_slide_name, 'id'].iloc[0]
    gdown.download(id=first_slide_id, quiet=True)

slide = load_wsi(first_slide_name)
values = range(15, 45, 5)

for satThresh in values:
    print(f"Tresh = {satThresh} -> {count_patches(slide, 1, satThresh)}")

Tresh = 15 -> 5284
Tresh = 20 -> 3973
Tresh = 25 -> 3591
Tresh = 30 -> 3369
Tresh = 35 -> 3133
Tresh = 40 -> 2922


## Count patches

In [21]:
# Maps "<slide name>_<lower-case diagnosis code>" to the number of patches
# counted in that slide.
count_dict = {}

for filename, gdrive_id, diagnosis_code in zip(df['filename'], df['id'], df['diagnosis']):
    slide_stem = filename.split(".")[0]
    name = f"{slide_stem}_{diagnosis_code.lower()}"
    # Placeholder entry; it is overwritten once the slide has been counted.
    count_dict[name] = 0

    # Download the slide by its id, then open it by file name.
    gdown.download(id=gdrive_id, quiet=True)
    slide = load_wsi(filename)

    count_dict[name] = count_patches(slide)

print(count_dict)

{'M-1_b': 3369, 'M-2_e': 4881, 'M-3_e': 1884, 'M-4_e': 2787, 'M-5_e': 5104, 'M-6_b': 3649, 'M-7_e': 369, 'M-8_e': 1772, 'M-9_e': 1308, 'M-10_e': 2305, 'M-11_e': 1151, 'M-12_e': 1965, 'M-13_e': 8193, 'M-14_e': 3052, 'M-15_e': 851, 'M-16_e': 1687, 'M-17_e': 3466, 'M-18_e': 2136, 'M-19_e': 3329, 'M-20_e': 1402, 'M-21_e': 881, 'M-22_e': 3677, 'M-23_e': 3966, 'M-24_b': 2414, 'M-25_e': 1507, 'M-26_e': 3502, 'M-27_e': 5910, 'M-28_e': 1594, 'M-29_e': 5198, 'M-30_b': 4696, 'M-31_e': 737, 'M-32_b': 3951, 'M-33_e': 3023, 'M-34_e': 4370, 'M-35_e': 2746, 'M-37_e': 3069, 'M-38_e': 4032, 'M-39_e': 6056, 'M-40_b': 368, 'M-41_e': 186, 'M-42_e': 1136, 'M-43_e': 19, 'M-44_e': 1186, 'M-45_e': 6439, 'M-46_e': 5111, 'M-47_b': 548, 'M-48_e': 1206, 'M-49_e': 1979, 'M-50_e': 2002, 'M-51_b': 3006, 'M-52_e': 5701, 'M-53_e': 6762, 'M-54_e': 1851, 'M-55_e': 3516, 'M-56_e': 5695, 'M-57_e': 2745, 'M-58_e': 5367, 'M-59_e': 7822, 'M-60_e': 7646, 'M-61_e': 1047, 'M-62_b': 1317, 'M-63_e': 2320, 'M-64_e': 4319, 'M-65_s':

In [26]:
def topk_wsi_names(dict_count_patches, k=5):
    """Pick, for each diagnosis label, the k slides with the most patches.

    Keys of ``dict_count_patches`` are expected to look like
    ``"<slide>_<label>"``; the label is the text after the last underscore and
    must be ``e``, ``b`` or ``s``. Keys with any other suffix are ignored.

    Args:
        dict_count_patches: mapping from slide key to its patch count.
        k: maximum number of slides to select per label. If a label has fewer
            than ``k`` slides, all of them are selected.

    Returns:
        A list of integer positions into ``list(dict_count_patches)``, grouped
        by label in the order e, b, s. Within a group the positions are sorted
        by descending patch count; ties keep the dictionary order.

    Side effects:
        Prints one line per label listing the selected keys.
    """
    labels = ('e', 'b', 's')
    d_keys = list(dict_count_patches.keys())
    counts = list(dict_count_patches.values())

    # Group key positions by the label suffix only, so letters that happen to
    # occur inside the slide name cannot influence the classification.
    positions_by_label = {label: [] for label in labels}
    for position, key in enumerate(d_keys):
        label = key.rsplit('_', 1)[-1]
        if label in positions_by_label:
            positions_by_label[label].append(position)

    top_n = max(int(k), 0)
    topk_indices = []
    for label in labels:
        # A stable descending sort gives the true top-k and a deterministic order.
        ranked = sorted(
            positions_by_label[label],
            key=lambda position: counts[position],
            reverse=True,
        )
        indices_of_interest = ranked[:top_n]

        # Collect every selected position so callers can index the source table.
        topk_indices.extend(indices_of_interest)
        print(f"Top {k} indices for label {label}:", end='\t')
        for index in indices_of_interest:
            print(d_keys[index], end='\t')
        print()

    return topk_indices

In [28]:
# Select 5 slides per diagnosis label (intended: those with the most patches)
# from the counts gathered in count_dict; the result is a list of positions.
topk_indices = topk_wsi_names(count_dict, k=5)

Top 5 indices for label e:	M-59_e	M-13_e	M-70_e	M-68_e	M-85_e	
Top 5 indices for label b:	M-30_b	M-73_b	M-108_b	M-90_b	M-92_b	
Top 5 indices for label s:	M-65_s	M-101_s	M-86_s	M-114_s	M-87_s	


## Segmentation and Patching

In [5]:
# Hard-coded row positions into df for the slides to process, so this section
# can run without re-running the counting cells.
# NOTE(review): appears to be a copy of a topk_wsi_names result from an earlier
# run - confirm it is still in sync with the metadata CSV.
topk_indices = [57, 12, 68, 66, 82, 29, 71, 101, 87, 89, 63, 96, 83, 107, 84]

In [None]:
# For every selected slide: download it, then extract its patches into
# <output_dir>/<diagnosis name>/<slide id>.
for _, row in tqdm(df.iloc[topk_indices].iterrows(), total=len(topk_indices)):
    filename = row['filename']
    gdrive_id = row['id']
    diagnosis_code = row['diagnosis']
    diagnosis_name = diagnosis_map[diagnosis_code]

    # Download the slide by its id (the module is gdown, not down).
    gdown.download(id=gdrive_id, quiet=True)

    slide_id = os.path.splitext(filename)[0]
    slide_output_dir = os.path.join(output_dir, diagnosis_name, slide_id)

    # Load and process the WSI. The file is opened by its bare name, the same
    # way the counting loop opens freshly downloaded slides.
    slide = load_wsi(filename)
    extract_patches(slide, slide_output_dir)