"""
Copyright 2026 Zsolt Bedőházi

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

### Similar grouping to the original patient-level one, but here we group the slides at the biopsy level only!

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
from collections import Counter
import h5py
import glob
from omegaconf import OmegaConf
import ast

In [2]:
# Read the preprocessing config and narrow it down to the section used in this notebook
full_conf = OmegaConf.load("../conf/preproc.yaml")
preproc_conf = full_conf['classic_mil_on_embeddings_bag']['jmcs_224_224_patches']

### Get labels and slide biopsy map

In [3]:
# Table that provides the per-patient 'Pathological' label merged in further below
splits_subdir = 'cv_splits_multi_stratified_sklearn_s_a_r_mo_paper_patients_rev/'
final_df = pd.read_csv(preproc_conf.cv_splits + splits_subdir + 'final_df.csv')
final_df.shape

(575, 15)

In [None]:
final_df.head(2)

In [5]:
# Table whose biopsy_id, slide_id and patient_ngsci_id columns are used below;
# the first CSV column is read as the index.
merged_df_latest = pd.read_csv(preproc_conf.cv_splits + "merged_df_latest.csv", index_col=0)

In [6]:
merged_df_latest.shape

(804, 45)

In [7]:
# Keep only the rows whose patient also appears in final_df
cohort_patient_ids = final_df["patient_ngsci_id"].values
in_cohort = merged_df_latest["patient_ngsci_id"].isin(cohort_patient_ids)
merged_df_latest = merged_df_latest[in_cohort]

In [8]:
merged_df_latest.shape

(715, 45)

In [9]:
# Attach the patient-level 'Pathological' label to every row via patient_ngsci_id.
# A left join keeps all rows of merged_df_latest.
patient_labels = final_df[['patient_ngsci_id', 'Pathological']]
merged_df_latest = pd.merge(
    merged_df_latest,
    patient_labels,
    how='left',
    on='patient_ngsci_id',
)

In [10]:
merged_df_latest.shape

(715, 46)

In [11]:
# Folder containing the slide-level embedding files (<slide_id>.h5)
slide_embeddings_folder = preproc_conf.uni_finetuned_emb_dir_macenko_improved_v2

# Sorted file names in that folder; show the count plus the first and last three as a sanity check
slide_embeddings_files = np.array(sorted(os.listdir(slide_embeddings_folder)))
slide_embeddings_files.shape, slide_embeddings_files[:3], slide_embeddings_files[-3:]

((11880,),
 array(['00046960-74ec-4750-b972-c4bf78eb40d6.h5',
        '0004acbe-72b3-44b6-bd13-cb6c08347716.h5',
        '0006656e-ad7e-41b0-abf2-bd9287714e54.h5'], dtype='<U39'),
 array(['fffc201f-1fbf-4839-9e84-76dd2d5e05dc.h5',
        'fffe280f-4ab0-446b-bc47-c879df1b972e.h5',
        'fffe5f42-8627-45c3-b000-0afc2fa1dd1f.h5'], dtype='<U39'))

In [12]:
# NOTE: the file listing above is not strictly needed; the file paths are constructed directly when loading the embeddings.

### Group embeddings into biopsies

In [13]:
def load_h5_file_features(filename):
    """Read the 'coords' and 'features' datasets of an HDF5 file.

    Args:
        filename: Path of the HDF5 file to open read-only.

    Returns:
        Tuple ``(coords, features)`` with the full content of both datasets.
    """
    with h5py.File(filename, "r") as h5_handle:
        # Unpacking consumes the generator while the file is still open.
        coords, features = (h5_handle[key][()] for key in ("coords", "features"))
    return coords, features

In [14]:
def save_h5_file_biopsy_bag(filename, features, coords, slides, label):
    """Write one biopsy bag to an HDF5 file, replacing any existing file.

    Args:
        filename: Path of the HDF5 file to create (opened in "w" mode).
        features: Data stored in the "features" dataset.
        coords: Data stored in the "coords" dataset.
        slides: Data stored in the "slides" dataset.
        label: Data stored in the "label" dataset.
    """
    bag_content = {
        "features": features,
        "coords": coords,
        "slides": slides,
        "label": label,
    }
    with h5py.File(filename, "w") as h5_out:
        for dataset_name, values in bag_content.items():
            h5_out.create_dataset(dataset_name, data=values)

In [15]:
int(merged_df_latest.Pathological.iloc[0])

1

In [None]:
preproc_conf.uni_finetuned_biopsy_dir_macenko_improved_v2_biopsies_rev

In [17]:
# Build one HDF5 "bag" per row of merged_df_latest (one biopsy) by stacking the
# patch embeddings, coordinates and slide names of all slides listed for it.
save_dir = preproc_conf.uni_finetuned_biopsy_dir_macenko_improved_v2_biopsies_rev
os.makedirs(save_dir, exist_ok=True)

empty_bag_count = 0  # biopsies for which no slide embedding could be loaded

biopsy_rows = zip(
    merged_df_latest.biopsy_id,
    merged_df_latest.slide_id,
    merged_df_latest.Pathological,
)

for biopsy_id, slide_ids_repr, pathological in tqdm(biopsy_rows, total=len(merged_df_latest)):

    features_biopsy = []  # per-slide embedding arrays
    coords_biopsy = []  # per-slide coordinate arrays
    slides_biopsy = []  # per-slide arrays repeating the slide name once per patch
    label_biopsy = int(pathological)  # label stored together with the bag

    # slide_id holds the string representation of a Python list. literal_eval
    # parses literals only, so CSV content can never execute code (unlike eval).
    slide_list = ast.literal_eval(slide_ids_repr)

    for s in slide_list:
        slide_path = os.path.join(slide_embeddings_folder, s + '.h5')

        # A slide without an embedding file is expected (can be due to the
        # filtered dataset): skip it quietly.
        if not os.path.isfile(slide_path):
            continue

        try:
            coords, features = load_h5_file_features(slide_path)
        except (OSError, KeyError) as e:
            # Unreadable file or missing dataset: keep going, but do not hide it.
            print(f"Skipping slide {s}: {e}")
            continue

        # Append only after a successful load so the three lists stay aligned.
        features_biopsy.append(features.astype(np.float32))
        coords_biopsy.append(coords)
        slides_biopsy.append(np.repeat(s, coords.shape[0], axis=0))

    if not features_biopsy:
        # Biopsy whose slides are all missing or unreadable: nothing to write.
        empty_bag_count += 1
        print(f"No embeddings found for biopsy {biopsy_id}; bag not written")
        continue

    # Concatenate along axis 0, i.e. along the patches. The inputs are already
    # float32, so no further cast (and extra copy) of the features is needed.
    features_biopsy = np.concatenate(features_biopsy)
    coords_biopsy = np.concatenate(coords_biopsy)
    slides_biopsy = np.concatenate(slides_biopsy).astype(np.bytes_)  # h5py cannot store unicode arrays

    # Write errors are deliberately not caught here: they are real failures and
    # must not be counted as empty bags.
    save_h5_file_biopsy_bag(
        os.path.join(save_dir, biopsy_id + '.h5'),
        features_biopsy,
        coords_biopsy,
        slides_biopsy,
        label_biopsy,
    )

100%|██████████| 715/715 [06:39<00:00,  1.79it/s]


In [18]:
empty_bag_count

0

### Check

In [19]:
def load_h5_file_biopsy_bag(filename):
    """Read back a biopsy bag written by ``save_h5_file_biopsy_bag``.

    Args:
        filename: Path of the HDF5 file to open read-only.

    Returns:
        Tuple ``(features, coords, slides, label)`` with the full content of
        the four datasets of the same names.
    """
    dataset_names = ("features", "coords", "slides", "label")
    with h5py.File(filename, "r") as bag_file:
        return tuple(bag_file[name][()] for name in dataset_names)

In [20]:
save_dir = preproc_conf.uni_finetuned_biopsy_dir_macenko_improved_v2_biopsies_rev

In [23]:
!ls $save_dir | wc -l

715


In [24]:
# Spot-check one saved biopsy bag: reload it and display the shapes of
# features, coords and slides together with the stored label.
file_to_check = save_dir+'67e0a116-6f4b-4a4c-93c8-21418ca230a4.h5'
features, coords, slides, label = load_h5_file_biopsy_bag(file_to_check)
features.shape, coords.shape, slides.shape, label

((17926, 1024), (17926, 2), (17926,), np.int64(1))

In [25]:
features[0].dtype

dtype('float32')

In [None]:
save_dir