# WSI pre-processing 

## 1.1 obtain classified patches for each WSI

Please refer to https://github.com/choosehappy/HistoQC to install HistoQC and then run the following:

python -m histoqc -c v2.1 -n 3 "/scratch_tmp/prj/cb_normalbreast/prj_BreastAgeNet/WSIs/NKI/*.mrxs" -o "/scratch_tmp/prj/cb_normalbreast/prj_BreastAgeNet/QCs/NKI"

This outputs a _mask_use.png file that contains the detected foreground tissue regions


Then refer to https://github.com/cancerbioinformatics/NBT-Classifier to obtain classified patches for each WSI and then run the following:

python main.py \
  --wsi_folder /scratch_tmp/prj/cb_normalbreast/prj_BreastAgeNet/WSIs/NKI \
  --mask_folder /scratch_tmp/prj/cb_normalbreast/prj_BreastAgeNet/QCs/NKI \
  --output_folder /scratch_tmp/prj/cb_normalbreast/prj_BreastAgeNet/FEATURES/NKI \
  --model_type TC_512 \
  --patch_size_microns 128 \
  --roi_width 250
  
This outputs a slide1_TC_512_patch.csv file that contains the detected and classified patches.

## 1.2 extract features from patches using pre-trained models

Please run the following

python extractFeatures.py --model UNI --stain augmentation --cohort NKI

## 1.3 clean data
The following step further clean the data by removing invalid slides that either failed to obtain features or contain epithelium patches (with a confidence higher than 0.9) less than 5.

In [23]:
import os
import h5py
import glob
import numpy as np
import pandas as pd
from pathlib import Path
from utils_model import add_ageGroup, parse_wsi_id


def print_summary(df):
    df["patient_id"] = df["patient_id"].astype(str)
    print(f"Number of unique WSI IDs: {df['wsi_id'].nunique()}")
    print(f"Number of unique patient IDs: {df['patient_id'].nunique()}")
    print(f"Overall age range: {df['age'].min()} - {df['age'].max()}")
        
    age_groups, counts = np.unique(df["age_group"], return_counts=True)
    print("Unique age groups and counts:", dict(zip(age_groups, counts)))
    print("\nAge range per cohort:")
    
    for cohort, group in df.groupby("cohort"):
        print(f"  {cohort}: {group['age'].min()} - {group['age'].max()}")
    
    df = df.groupby(["age_group", "cohort"]).agg(
        num_patients=("patient_id", "nunique"),
        num_wsis=("wsi_id", "nunique")
    ).reset_index()
    
    pivot_df = df.pivot(index="cohort", columns="age_group", values=["num_wsis", "num_patients"])
    formatted_df = pivot_df.apply(lambda x: x["num_wsis"].astype(str) + "/" + x["num_patients"].astype(str), axis=1)
    formatted_df["Total"] = df.groupby("cohort")["num_wsis"].sum().astype(str) + "/" + df.groupby("cohort")["num_patients"].sum().astype(str)
    col_sum_wsis = df.groupby("age_group")["num_wsis"].sum()
    col_sum_patients = df.groupby("age_group")["num_patients"].sum()
    formatted_df.loc["Total"] = col_sum_wsis.astype(str) + "/" + col_sum_patients.astype(str)
    formatted_df.loc["Total", "Total"] = df["num_wsis"].sum().astype(str) + "/" + df["num_patients"].sum().astype(str)
    print(formatted_df)

    
    
def clean_data(meta_pt, FEATURES, model_name="UNI", stainFunc="reinhard"):
    clinic_df = pd.read_csv(meta_pt)
    clinic_df = add_ageGroup(clinic_df)
    
    print("-" * 30)
    print("filtering features...")
    h5_dict = {"wsi_id": [], "h5df": []} 
    for wsi_id in list(clinic_df["wsi_id"]):
        file = glob.glob(f'{FEATURES}/*/{wsi_id}*/{wsi_id}*{model_name}*{stainFunc}*.h5')
        if file:
            for i in file:
                h5_dict["wsi_id"].append(i.split("/")[-2])
                h5_dict["h5df"].append(i)  
                
    h5_df = pd.DataFrame(h5_dict)  
    clinic_df = clinic_df.merge(h5_df, on="wsi_id", how="right")  

    valid_wsi = []
    valid_patches = []
    for fea_pt in clinic_df["h5df"]: 
        with h5py.File(fea_pt, "r") as file:
            bag = np.array(file["embeddings"])
            bag = np.squeeze(bag)
            img_id = np.array(file["patch_id"])
        img_id = [i.decode("utf-8") for i in img_id]
        bag_df = pd.DataFrame(bag)
        bag_df.index = img_id
    
        csv_pt = glob.glob(f"{fea_pt.split('_bagFeature_')[0]}*patch.csv")[0]
        df = pd.read_csv(csv_pt)
    
        valid_id = list(df['patch_id'][df['TC_epi'] > 0.9])
        valid_id = list(set(valid_id) & set(bag_df.index))
        valid_patches.extend(valid_id)
        if valid_id:
            wsi_id = parse_wsi_id(valid_id[0])
            valid_wsi.extend([wsi_id] * len(valid_id))
    
    a, b = np.unique(valid_wsi, return_counts=True)
    filtered_a = [i for i, count in zip(a, b) if count >= 5]
    
    print("-" * 30)
    print("filtering patches...")
    clinic_df = clinic_df[clinic_df["wsi_id"].isin(filtered_a)].copy()
    clinic_df["h5df"] = [Path(i) for i in list(clinic_df["h5df"])]

    return clinic_df

In [39]:
FEATURES = "/scratch_tmp/prj/cb_normalbreast/prj_BreastAgeNet/FEATURES"
meta_pt = "/scratch_tmp/prj/cb_normalbreast/prj_BreastAgeNet/Metadata/train_NR.csv"
train_df = clean_data(meta_pt, FEATURES)
print_summary(train_df)
train_df.to_csv("/scratch_tmp/prj/cb_normalbreast/prj_BreastAgeNet/Metadata/train_NR_clean.csv", index=False)

Number of unique WSI IDs: 747
Number of unique patient IDs: 747
Overall age range: 19 - 90
Unique age groups and counts: {0: 225, 1: 180, 2: 157, 3: 185}

Age range per cohort:
  SGK: 19 - 90
age_group        0        1        2        3    Total
cohort                                                
SGK        225/225  180/180  157/157  185/185  747/747
Total      225/225  180/180  157/157  185/185  747/747


In [40]:
FEATURES = "/scratch_tmp/prj/cb_normalbreast/prj_BreastAgeNet/FEATURES"
meta_pt = "/scratch_tmp/prj/cb_normalbreast/prj_BreastAgeNet/Metadata/test_NR.csv"
NR_df = clean_data(meta_pt, FEATURES, model_name="UNI", stainFunc="augmentation")
print_summary(NR_df)
NR_df.to_csv("/scratch_tmp/prj/cb_normalbreast/prj_BreastAgeNet/Metadata/test_NR_clean.csv", index=False)

Number of unique WSI IDs: 131
Number of unique patient IDs: 45
Overall age range: 17 - 73
Unique age groups and counts: {0: 69, 1: 29, 2: 14, 3: 19}

Age range per cohort:
  BCI: 31 - 31
  EPFL: 17 - 39
  KHP: 22 - 73
age_group          0         1         2         3   Total
cohort                                                    
BCI          1.0/1.0   nan/nan   nan/nan   nan/nan     1/1
EPFL         8.0/8.0   1.0/1.0   nan/nan   nan/nan     9/9
KHP        60.0/18.0  28.0/8.0  14.0/4.0  19.0/5.0  121/35
Total          69/27      29/9      14/4      19/5  131/45


In [41]:
FEATURES = "/scratch_tmp/prj/cb_normalbreast/prj_BreastAgeNet/FEATURES"
meta_pt = "/scratch_tmp/prj/cb_normalbreast/prj_BreastAgeNet/Metadata/test_BRCA.csv"
BRCA_df = clean_data(meta_pt, FEATURES, model_name="UNI", stainFunc="augmentation")
print_summary(BRCA_df)
BRCA_df.to_csv("/scratch_tmp/prj/cb_normalbreast/prj_BreastAgeNet/Metadata/test_BRCA_clean.csv", index=False)

Number of unique WSI IDs: 250
Number of unique patient IDs: 143
Overall age range: 20 - 58
Unique age groups and counts: {0: 122, 1: 93, 2: 19, 3: 16}

Age range per cohort:
  BCI: 20 - 56
  KHP: 27 - 58
  NKI: 21 - 58
age_group       0      1      2     3    Total
cohort                                        
BCI           4/4    1/1    3/3   1/1      9/9
KHP         77/27  72/28   10/4  13/6   172/65
NKI         41/41  20/20    6/6   2/2    69/69
Total      122/72  93/49  19/13  16/9  250/143
