In [None]:
# Fetch the project repository; later cells read its src/ and data/ folders via relative paths.
!git clone https://github.com/andreazenotto/tempProjectRepo.git

In [None]:
%%capture
# Install the OpenSlide system tools and the openslide-python bindings.
# NOTE(review): %%capture silences this cell's output, so a failed install is not visible here.
!apt update && apt install -y openslide-tools
!pip install openslide-python

In [None]:
import gdown
import os
from tqdm import tqdm
import pandas as pd
import sys

sys.path.append('tempProjectRepo/mesothelioma_project/src')
from wsi_utils import load_wsi, extract_patches, count_patches

In [None]:
# Table listing the slides, and the folder that receives the extracted patches
data_csv = "tempProjectRepo/mesothelioma_project/data/raw-data.csv"
output_dir = "tempProjectRepo/mesothelioma_project/data/patches"

# One-letter diagnosis code (the table's 'diagnosis' column) -> full subtype name
diagnosis_map = dict(E="epithelioid", S="sarcomatoid", B="biphasic")

# Columns in the table are separated by runs of whitespace
df = pd.read_csv(data_csv, sep=r"\s+")

# Row range [start_idx, end_idx) processed by the counting cell; defaults to every row
start_idx, end_idx = 0, len(df)

## Fine-tuning satThresh on the patches of the first slide

In [None]:
# Compare how many patches the first slide yields for several saturation thresholds.
# NOTE(review): "M-1.ndpi" must already be in the working directory; no earlier cell
# in this notebook downloads it, so this cell fails after a fresh restart.
slide = load_wsi("M-1.ndpi")
values = [23, 25, 27, 29, 30]

for satThresh in values:
    # Computed outside the f-string: nesting double quotes inside a double-quoted
    # f-string is a syntax error before Python 3.12.
    # NOTE(review): extract_patches is called with `threshold=` further down;
    # confirm that count_patches really names this keyword `thresh`.
    n_patches = count_patches(slide, "patches_test", level=1, thresh=satThresh)
    print(f"Tresh = {satThresh} -> {n_patches}")

## Count patches

In [None]:
def topk_wsi_names(dict_count_patches, k=5):
    """Return the positions of the k slides with the most patches for each diagnosis.

    Parameters
    ----------
    dict_count_patches : dict
        Maps a slide name to its patch count. The diagnosis is read from the
        name itself: names containing 'epithelioid' or 'biphasic' are assigned
        to those labels, and every other name is treated as sarcomatoid.
    k : int, default 5
        Maximum number of slides kept per label. A label with fewer than k
        slides contributes all of them; a negative k is treated as 0.

    Returns
    -------
    list of int
        Positions, in the dictionary's insertion order, of the selected
        slides. They are grouped by label in the order epithelioid, biphasic,
        sarcomatoid, and within each group sorted by decreasing patch count.

    Also prints one line per label with the names of the selected slides.
    """
    d_keys = list(dict_count_patches.keys())
    counts = list(dict_count_patches.values())

    # Group dictionary positions by label; the dict literal fixes the e, b, s order.
    positions_by_label = {'e': [], 'b': [], 's': []}
    for position, key in enumerate(d_keys):
        if 'epithelioid' in key:
            positions_by_label['e'].append(position)
        elif 'biphasic' in key:
            positions_by_label['b'].append(position)
        else:
            positions_by_label['s'].append(position)

    k = max(k, 0)
    topk_indices = []
    for label, positions in positions_by_label.items():
        # Rank this label's slides by patch count and keep at most k of them.
        ranked = sorted(positions, key=lambda position: counts[position], reverse=True)
        indices_of_interest = ranked[:k]

        # Collect every selected position so the caller can extract just those slides.
        topk_indices.extend(indices_of_interest)
        print(f"Top {k} indices for label {label}:", end='\t')
        for index in indices_of_interest:
            print(d_keys[index], end='\t')
        print()

    # Return only after all three labels are processed.
    return topk_indices

In [None]:
# Download every slide in the selected row range and record how many patches it yields.
count_dict = {}

selected_rows = df.iloc[start_idx:end_idx]
# total matches the rows actually iterated, not the whole table
for _, row in tqdm(selected_rows.iterrows(), total=len(selected_rows)):
    filename = row['filename']
    gdrive_id = row['id']
    diagnosis_code = row['diagnosis']
    diagnosis_name = diagnosis_map[diagnosis_code]

    # The diagnosis is embedded in the key because topk_wsi_names assigns each
    # slide to a label by searching the key for the diagnosis name.
    name = filename.split(".")[0] + " - " + diagnosis_name

    wsi_url = f"https://drive.google.com/uc?id={gdrive_id}"
    gdown.download(wsi_url, quiet=False)

    slide_id = os.path.splitext(filename)[0]
    slide_output_dir = os.path.join(output_dir, diagnosis_name, slide_id)

    # Load and process the WSI
    slide = load_wsi(filename)
    # Store the count under the diagnosis-tagged key: exactly one entry per slide,
    # so dictionary positions line up with the rows of the slice.
    count_dict[name] = count_patches(slide, slide_output_dir, level=1)

# topk_wsi_names returns positions within count_dict, i.e. within the slice;
# shift them by start_idx so they are valid positions for df.iloc.
topk_indices = [start_idx + position for position in topk_wsi_names(count_dict, k=5)]

## Segmentation and Patching

In [None]:
# Download the slides selected by the counting step and extract their patches.
# total is the number of selected slides, so the progress bar can reach 100%.
for _, row in tqdm(df.iloc[topk_indices].iterrows(), total=len(topk_indices)):
    filename = row['filename']
    gdrive_id = row['id']
    diagnosis_code = row['diagnosis']
    diagnosis_name = diagnosis_map[diagnosis_code]

    wsi_url = f"https://drive.google.com/uc?id={gdrive_id}"
    gdown.download(wsi_url, quiet=False)

    # Patches are written to <output_dir>/<diagnosis>/<slide id>
    slide_id = os.path.splitext(filename)[0]
    slide_output_dir = os.path.join(output_dir, diagnosis_name, slide_id)

    # Load and process the WSI
    slide = load_wsi(filename)
    # 30 is one of the threshold values tried in the tuning cell
    extract_patches(slide, slide_output_dir, level=1, threshold=30)

In [None]:
from google.colab import files
import shutil

# Path of the folder to download
folder_path = "tempProjectRepo/mesothelioma_project/data/patches"

# Name of the zip file to create
zip_filename = "patches.zip"

# Compress the folder; make_archive adds the ".zip" extension itself,
# so it receives the file name without it
archive_base = zip_filename.replace(".zip", "")
shutil.make_archive(base_name=archive_base, format='zip', root_dir=folder_path)

# Download the zip file
files.download(zip_filename)