In [1]:
import glob
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from openslide import OpenSlide
import h5py
from omegaconf import OmegaConf
from matplotlib import patches
from PIL import Image
import json
from shapely import Polygon
from shapely.plotting import plot_polygon
from shapely.geometry import mapping
from shapely.geometry import box
from shapely.geometry import Polygon
from rtree import index
from collections import Counter
from tqdm import tqdm
import re

In [2]:
# Load the preprocessing configuration and keep only its "project_mil" section.
conf_preproc = OmegaConf.load("conf/preproc.yaml")["project_mil"]

In [None]:
DATA_ROOT = conf_preproc.data_root_dir
DATA_ROOT

In [4]:
# Sorted array of the paths of all "*.mrxs" files located directly in DATA_ROOT
# (the glob pattern is not recursive).
slides = np.sort(np.array( glob.glob( os.path.join(DATA_ROOT, "*.mrxs")) ))

In [None]:
slides.shape, slides[:5]

## Loading of raw metadata and general cleaning

In [6]:
# Read the raw metadata spreadsheet and discard the eight unnamed columns
# ("Unnamed: 14" through "Unnamed: 21") in a single chained expression.
metadata_df = pd.read_excel(conf_preproc.metadata_xls).drop(
    columns=[
        'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17',
        'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21',
    ]
)

In [None]:
metadata_df

### clean column "distant metastasis yes/no"

In [8]:
# Encode the answers as 1 ("yes") / 0 (anything else, including missing values).
# A value that is already 1 stays 1, so re-running this cell after the column has
# been encoded does not silently reset every label to 0.
metadata_df["Distant metastasis yes/no"] = metadata_df["Distant metastasis yes/no"].apply(
    lambda answer: 1 if answer in ("yes", 1) else 0
)

In [9]:
Counter(metadata_df["Distant metastasis yes/no"])

Counter({0: 232, 1: 63})

### add column "M" (for this cohort, M = 0 for all patients)

In [10]:
metadata_df["M"] = 0

In [None]:
metadata_df

### clean column "T"

In [None]:
Counter(metadata_df["T"])

In [13]:
# Lookup from string T labels (including lettered sub-categories) to the integer
# T category they are folded into.
T_mapping = {
    **dict.fromkeys(('1c', '1b', '1'), 1),
    **dict.fromkeys(('4b', '4c'), 4),
}

In [14]:
# Translate the labels listed in T_mapping (any other value passes through
# unchanged), then cast the whole column to int.
metadata_df['T'] = (
    metadata_df['T']
    .apply(lambda t_label: T_mapping.get(str(t_label), t_label))
    .astype(int)
)

In [None]:
Counter(metadata_df["T"])

### clean column "N"

In [None]:
Counter(metadata_df["N"])

In [17]:
# Discard the rows whose N value is the placeholder 'x' and renumber the index from 0.
metadata_df = metadata_df.loc[metadata_df['N'].ne('x')].reset_index(drop=True)

In [18]:
# Replace the column outright instead of writing through `.loc[:, 'N']`.
# Since pandas 2.0 a full-column `.loc` assignment sets the values in place and keeps
# the column's existing dtype (presumably object here, as the column held the string
# 'x'), so 'N' would not actually become an integer column. Plain column assignment
# swaps in the int-typed Series, matching how 'T' is cast above.
metadata_df['N'] = metadata_df['N'].astype(int)

In [None]:
metadata_df

In [None]:
Counter(metadata_df["N"])

In [None]:
metadata_df

In [22]:
# Four boolean row masks over metadata_df, built from the integer T, N and M columns.
# They are paired by position with `choices` in the later np.select call, so the
# order of the entries matters.
# NOTE(review): the groupings look like TNM anatomic stage groups — confirm against
# the staging reference used for this cohort.
# Rows that match none of the masks (for example T == 0 and N == 0) fall through to
# the np.select default.
conditions = [
    # Mask 1: T1 N0 M0.
    (metadata_df['T'] == 1) & (metadata_df['N'] == 0) & (metadata_df['M'] == 0),
    
    # Mask 2: T0-T2 with N1, T2 with N0, or T3 with N0 (all M0).
    ((metadata_df['T'] == 0) & (metadata_df['N'] == 1) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 1) & (metadata_df['N'] == 1) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 2) & (metadata_df['N'] == 0) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 2) & (metadata_df['N'] == 1) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 3) & (metadata_df['N'] == 0) & (metadata_df['M'] == 0)),
    
    # Mask 3: T0-T3 with N2, T3 with N1, T4 with N0-N2, or T0-T4 with N3 (all M0).
    ((metadata_df['T'] == 0) & (metadata_df['N'] == 2) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 1) & (metadata_df['N'] == 2) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 2) & (metadata_df['N'] == 2) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 3) & (metadata_df['N'] == 2) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 3) & (metadata_df['N'] == 1) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 4) & (metadata_df['N'] == 1) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 4) & (metadata_df['N'] == 2) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 4) & (metadata_df['N'] == 0) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 0) & (metadata_df['N'] == 3) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 1) & (metadata_df['N'] == 3) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 2) & (metadata_df['N'] == 3) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 3) & (metadata_df['N'] == 3) & (metadata_df['M'] == 0)) |
    ((metadata_df['T'] == 4) & (metadata_df['N'] == 3) & (metadata_df['M'] == 0)),
    
    # Mask 4: M1, regardless of T and N.
    (metadata_df['M'] == 1)
]

In [23]:
# Labels assigned by np.select; entry i is the label for the i-th mask in `conditions`.
choices = [1, 2, 3, 4]

In [24]:
# For each row, take the label of the first mask in `conditions` that is True;
# rows matching no mask get None (the `default`).
metadata_df.loc[:, 'clinical_stage'] = np.select(conditions, choices, default=None)

In [None]:
metadata_df

In [None]:
Counter(metadata_df["clinical_stage"])

In [27]:
metadata_df.to_csv("metadata_csv.csv", index=False)

## Collect slides into biopsy bags and create labels_df

#### slides on disk

In [None]:
conf_preproc.mask_save_dir

In [None]:
# Sorted array of the paths of all "*.h5" files located directly in
# conf_preproc.mask_save_dir; the second line displays its shape and first ten entries.
h5_files_on_disk  = np.sort(np.array( glob.glob( os.path.join(conf_preproc.mask_save_dir, "*.h5")) ))
h5_files_on_disk.shape, h5_files_on_disk[:10]

In [30]:
# Reduce every .h5 path to its file name (directory part removed).
h5_file_names_on_disk = np.array(list(map(os.path.basename, h5_files_on_disk)))
h5_file_names_on_disk.shape, h5_file_names_on_disk[:10]

((337,),
 array(['1.h5', '10.1.h5', '10.2.h5', '100.h5', '101.h5', '102.h5',
        '103.h5', '104.h5', '105.1.h5', '105.2.h5'], dtype='<U8'))

In [31]:
def extract_file_name_without_version(name):
    """Return the part of ``name`` that precedes its last two dot-separated fields.

    The name is split from the right on at most two dots and the leftmost piece is
    returned, so ``"10.1.h5"`` gives ``"10"``. A name with only one dot, such as
    ``"100.h5"``, gives ``"100"``, and a name without dots is returned unchanged.
    """
    stem, *_ = name.rsplit(".", maxsplit=2)
    return stem

In [32]:
# Strip the version/extension part from every file name, then de-duplicate while
# keeping first-seen order (dict.fromkeys preserves insertion order, unlike a set).
h5_file_names_on_disk_without_version = np.array(
    list(dict.fromkeys([extract_file_name_without_version(name) for name in h5_file_names_on_disk]))
)
# Display the number of distinct base names and the first ten of them.
h5_file_names_on_disk_without_version.shape, h5_file_names_on_disk_without_version[:10]

((294,),
 array(['1', '10', '100', '101', '102', '103', '104', '105', '106', '107'],
       dtype='<U4'))

#### metadata df

In [33]:
def extract_base_slide_number(value):
    """Return the base slide number of the first slide listed in ``value``.

    ``value`` may be a non-string (it is converted with ``str``) and may list several
    comma-separated slides. Only the first entry is used, on the original author's
    premise that all versions listed together belong to the same patient.

    The base slide number is the leading run of digits plus any letters that follow
    it immediately, so ``"105.1, 105.2"`` gives ``"105"`` and ``"12a.2"`` gives ``"12a"``.

    Raises:
        ValueError: if the first entry does not start with a digit (for example a
            missing cell that was read as NaN).
    """
    if not isinstance(value, str):
        value = str(value)

    # Only the first entry determines the result, so later entries (including an
    # empty one left behind by a trailing comma) are deliberately not parsed.
    first_slide = value.split(',')[0].strip()

    match = re.match(r'^\d+[a-zA-Z]*', first_slide)
    if match is None:
        # Fail with a message that names the offending value instead of the opaque
        # AttributeError raised by calling .group() on None.
        raise ValueError(f"cannot extract a base slide number from {value!r}")
    return match.group()

In [34]:
# Derive the base slide number of every row from its "Slide number" value.
metadata_df["base_slide_number"] = metadata_df["Slide number"].apply(extract_base_slide_number)

# Move the new column so it sits immediately to the right of "Slide number":
# pop() removes it from its current position and insert() re-adds it at the target one.
slide_number_col_index = metadata_df.columns.get_loc("Slide number")
metadata_df.insert(slide_number_col_index + 1, "base_slide_number", metadata_df.pop("base_slide_number"))

In [None]:
metadata_df

In [37]:
slide_number_in_table_without_version = metadata_df["base_slide_number"].unique()
slide_number_in_table_without_version.shape, slide_number_in_table_without_version[:20]

((287,),
 array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
        '13', '14', '15', '16', '17', '18', '19', '20'], dtype=object))

In [38]:
set(slide_number_in_table_without_version) - set(h5_file_names_on_disk_without_version) # in table but no h5 on disk

{'65'}

In [39]:
set(h5_file_names_on_disk_without_version) - set(slide_number_in_table_without_version) # in disk but not in table

{'188', '222', '260', '274', '288', '41', '51', '97'}

In [None]:
# Remove the table rows whose base slide number has no .h5 file on disk,
# then renumber the index and display the result.
no_h5_on_disk_idx = metadata_df.index[
    ~metadata_df["base_slide_number"].isin(h5_file_names_on_disk_without_version)
].values
metadata_df = metadata_df.drop(index=no_h5_on_disk_idx).reset_index(drop=True)
metadata_df

### generate labels_df

In [41]:
# Build labels_df: one row per (metadata row, matching .h5 file) pair, so a biopsy
# with several slide versions on disk contributes several rows.

# Step 1: Collect the new rows as plain dicts
new_rows = []

# Step 2: Iterate through each base slide number
for base_slide_number in metadata_df['base_slide_number'].unique():
    # Match "<base>.h5" and "<base>.<version>.h5". The base is passed through
    # re.escape so it is always matched literally, whatever characters it contains.
    pattern = re.compile(rf'^{re.escape(base_slide_number)}(\.\d+)?\.h5$')

    # Files on disk that belong to this base slide number, in on-disk (sorted) order
    matching_slides = [slide for slide in h5_file_names_on_disk if pattern.search(slide)]

    # The metadata rows for this base number are the same for every matching file,
    # so look them up once per base number rather than once per file.
    matching_rows = metadata_df[metadata_df['base_slide_number'] == base_slide_number]

    for slide in matching_slides:
        for _, row in matching_rows.iterrows():
            # Copy the metadata row and tag it with the file it is paired with
            new_row = row.to_dict()
            new_row['slide_id'] = slide
            new_rows.append(new_row)

# Step 3: Create the new DataFrame from the list of new rows
labels_df = pd.DataFrame(new_rows)

# Step 4: Reorder the columns so the three identifier columns come first
# ('slide_id' ends up as the third column)
cols = ['Slide number', 'base_slide_number', 'slide_id'] + [
    col for col in labels_df.columns
    if col not in ['Slide number', 'base_slide_number', 'slide_id']
]
labels_df = labels_df[cols]

In [None]:
metadata_df

In [None]:
labels_df

In [45]:
labels_df["base_slide_number"].unique().shape

(286,)

In [46]:
# TODO: save metadata_df and labels_df (this cell does not contain the saving code)

In [51]:
def get_biopsy_idx_to_slide_idx(labels_df):
    """Group the row labels of ``labels_df`` by biopsy.

    Rows that share a ``base_slide_number`` form one biopsy. Biopsies are returned in
    order of first appearance in ``labels_df`` (``sort=False``).

    Args:
        labels_df: DataFrame with a ``base_slide_number`` column.

    Returns:
        A 1-D object array with one entry per biopsy; entry ``i`` is the array of
        ``labels_df`` index values of the rows belonging to biopsy ``i``.
    """
    per_biopsy_indices = [
        group.index.values
        for _, group in labels_df.groupby('base_slide_number', sort=False)
    ]

    # Fill a pre-allocated object array entry by entry. Passing the list straight to
    # np.array(..., dtype=object) produces a 2-D object array whenever every group
    # happens to have the same length (e.g. exactly one slide per biopsy), which
    # changes both the shape of the result and the dtype of each result[i].
    biopsy_idx_to_slide_idx = np.empty(len(per_biopsy_indices), dtype=object)
    for position, slide_indices in enumerate(per_biopsy_indices):
        biopsy_idx_to_slide_idx[position] = slide_indices
    return biopsy_idx_to_slide_idx

In [52]:
biopsy_idx_to_slide_idx = get_biopsy_idx_to_slide_idx(labels_df)

In [53]:
biopsy_idx_to_slide_idx[:20]

array([array([0]), array([1]), array([2, 3]), array([4, 5]),
       array([6, 7]), array([ 8,  9, 10]), array([11]), array([12, 13]),
       array([14]), array([15, 16]), array([17]), array([18]),
       array([19]), array([20]), array([21, 22]), array([23]),
       array([24]), array([25]), array([26, 27, 28]), array([29])],
      dtype=object)