In [None]:
!git clone https://github.com/MLinApp-polito/2025-machine-learning-in-applications-project-mlia-2025-fp06.git

In [None]:
%%capture
!apt update && apt install -y openslide-tools
!pip install openslide-python

In [None]:
import gdown
import os
from tqdm import tqdm
import pandas as pd
import sys
import shutil
import numpy as np

sys.path.append('tempProjectRepo/mesothelioma_project/src')
from wsi_utils import load_wsi, extract_patches, count_patches

In [None]:
# Locations inside the cloned project repository.
data_csv = "tempProjectRepo/mesothelioma_project/data/raw-data.csv"
output_dir = "tempProjectRepo/mesothelioma_project/data/patches"

# One-letter diagnosis code (the CSV `diagnosis` column) -> name used for the
# per-diagnosis output sub-directory.
diagnosis_map = dict(E="epithelioid", S="sarcomatoid", B="biphasic")

# The slide index is parsed with runs of whitespace as the column separator.
df = pd.read_csv(data_csv, sep=r"\s+")

## Fine-tuning satThresh on the patches of the first slide

In [1]:
# Slide used for the satThresh sweep. Fetch it when it is not already in the working
# directory so the cell also runs on a fresh kernel; like the other download cells,
# this assumes gdown saves the file under the name listed in the CSV.
sample_filename = "M-101.ndpi"
if not os.path.exists(sample_filename):
    sample_id = df.loc[df["filename"] == sample_filename, "id"].iloc[0]
    gdown.download(id=sample_id, quiet=True)

slide = load_wsi(sample_filename)
values = range(15, 45, 5)
# Deliberately not called `dir`: that name would shadow the built-in dir() for
# every later cell in the kernel.
patches_test_dir = "patches_test"

# Report how many patches each candidate threshold yields on this slide.
for satThresh in values:
    print(f"Tresh = {satThresh} -> {count_patches(slide, 1, satThresh)}")

Tresh = 15 -> 2579
Tresh = 20 -> 2315
Tresh = 25 -> 2121
Tresh = 30 -> 1962
Tresh = 35 -> 1704
Tresh = 40 -> 1403


## Count patches

In [None]:
# Maps "<slide stem>_<lowercase diagnosis code>" -> number of patches in that slide.
# Insertion order follows the row order of `df`; the later `df.iloc[topk_indices]`
# lookup depends on that alignment.
count_dict = {}

for _, row in tqdm(df.iterrows(), total=len(df)):
    filename = row['filename']
    gdrive_id = row['id']
    diagnosis_code = row['diagnosis']

    # Same stem extraction as the patching cell, so both cells name a slide identically.
    name = os.path.splitext(filename)[0] + "_" + diagnosis_code.lower()

    # NOTE(review): nothing in this cell removes the downloaded slide afterwards,
    # so the files accumulate in the working directory.
    gdown.download(id=gdrive_id, quiet=True)

    # Load the WSI and count the patches. The entry is written only after the count
    # succeeded, so a failed download or load never leaves a placeholder 0 behind.
    slide = load_wsi(filename)
    count_dict[name] = count_patches(slide)

# Duplicate names would collapse dict entries and shift every later position
# relative to the rows of `df`.
assert len(count_dict) == len(df), "duplicate slide names: count_dict is not aligned with df"

In [None]:
def topk_wsi_names(dict_count_patches, k=5):
    """Select, for each diagnosis label, the k slides with the most patches.

    Args:
        dict_count_patches: mapping from a key of the form ``"<slide>_<label>"`` to
            its patch count. The label is the text after the last underscore;
            ``'e'`` and ``'b'`` are recognised and every other key is treated as
            ``'s'`` (the catch-all the function has always used).
        k: maximum number of slides to keep per label. When a label has fewer than
            ``k`` slides, all of them are returned.

    Returns:
        A list of integer positions into the dictionary's insertion order, grouped
        as e, then b, then s. Within each group positions are sorted by decreasing
        patch count, ties keeping insertion order.

    Side effects:
        Prints one tab-separated line per label with the selected keys.
    """
    d_keys = list(dict_count_patches.keys())
    # Values in the same (insertion) order as the keys.
    counts = np.asarray(list(dict_count_patches.values()), dtype=float)

    # Group dictionary positions by label. Only the suffix is inspected, so letters
    # inside the slide name itself cannot change the label.
    positions_by_label = {'e': [], 'b': [], 's': []}
    for position, key in enumerate(d_keys):
        suffix = key.rsplit('_', 1)[-1]
        label = suffix if suffix in ('e', 'b') else 's'
        positions_by_label[label].append(position)

    n_wanted = max(k, 0)
    topk_indices = []
    for label in ('e', 'b', 's'):
        label_positions = np.asarray(positions_by_label[label], dtype=int)

        # Stable sort on the negated counts = descending by count. Slicing (rather
        # than argpartition) also copes with labels that have fewer than k slides.
        order = np.argsort(-counts[label_positions], kind='stable')[:n_wanted]

        # Translate positions inside the label group back to dictionary positions.
        indices_of_interest = [int(p) for p in label_positions[order]]

        # Collect everything in one list so the next cell can index the dataframe.
        topk_indices.extend(indices_of_interest)

        print(f"Top {k} indices for label {label}:", end='\t')
        for index in indices_of_interest:
            print(d_keys[index], end='\t')
        print()

    return topk_indices

In [None]:
# Number of slides to keep per diagnosis label.
TOP_K = 5
# Positions (in count_dict insertion order) of the slides selected for patching.
topk_indices = topk_wsi_names(count_dict, k=TOP_K)

Top 5 indices for label e:	M-59_e	M-13_e	M-70_e	M-68_e	M-85_e	
Top 5 indices for label b:	M-30_b	M-73_b	M-108_b	M-90_b	M-92_b	
Top 5 indices for label s:	M-65_s	M-101_s	M-86_s	M-114_s	M-87_s	


## Segmentation and Patching

In [None]:
# Only the rows selected by topk_wsi_names are downloaded and patched.
selected_rows = df.iloc[topk_indices]

for _, row in tqdm(selected_rows.iterrows(), total=len(topk_indices)):
    filename, gdrive_id, diagnosis_code = row['filename'], row['id'], row['diagnosis']
    diagnosis_name = diagnosis_map[diagnosis_code]

    gdown.download(id=gdrive_id, quiet=True)

    # Destination handed to extract_patches: <output_dir>/<diagnosis name>/<slide stem>.
    slide_id, _ext = os.path.splitext(filename)
    slide_output_dir = os.path.join(output_dir, diagnosis_name, slide_id)

    # Open the downloaded WSI and run patch extraction on it.
    slide = load_wsi(filename)
    extract_patches(slide, slide_output_dir)

100%|██████████| 15/15 [1:08:22<00:00, 273.47s/it]
