"""
Copyright 2026 Zsolt Bedőházi

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
from collections import Counter
import h5py
import glob
from omegaconf import OmegaConf
import ast

In [2]:
# Load the preprocessing config and keep only the section this notebook uses
preproc_conf = OmegaConf.load("../conf/preproc.yaml")['classic_mil_on_embeddings_bag']['jmcs_224_224_patches']

### Get labels and the slide-to-biopsy map

In [3]:
# Table driving the rest of the notebook; the cells below read its
# slide_id, Pathological and patient_ngsci_id columns.
cv_splits_dir = '../cv_splits_paper/cv_splits_multi_stratified_sklearn_s_a_r_mo_paper_patients_rev/'
final_df_with_excluded_slides_cluster_tsne_1_2_3 = pd.read_csv(
    cv_splits_dir + 'final_df_with_excluded_slides_cluster_tsne_1_2_3.csv'
)
final_df_with_excluded_slides_cluster_tsne_1_2_3.shape

(574, 15)

In [None]:
# Preview the first two rows of the table
final_df_with_excluded_slides_cluster_tsne_1_2_3.head(2)

In [5]:
# Folder holding the per-slide embedding files (<slide id>.h5)
slide_embeddings_folder = preproc_conf.uni_finetuned_emb_dir_macenko_improved_v2

# Sorted directory listing as an array; display its size and both ends as a quick sanity check
embedding_file_names = os.listdir(slide_embeddings_folder)
embedding_file_names.sort()
slide_embeddings_files = np.array(embedding_file_names)
slide_embeddings_files.shape, slide_embeddings_files[:3], slide_embeddings_files[-3:]

((11880,),
 array(['00046960-74ec-4750-b972-c4bf78eb40d6.h5',
        '0004acbe-72b3-44b6-bd13-cb6c08347716.h5',
        '0006656e-ad7e-41b0-abf2-bd9287714e54.h5'], dtype='<U39'),
 array(['fffc201f-1fbf-4839-9e84-76dd2d5e05dc.h5',
        'fffe280f-4ab0-446b-bc47-c879df1b972e.h5',
        'fffe5f42-8627-45c3-b000-0afc2fa1dd1f.h5'], dtype='<U39'))

In [6]:
# Total number of slides listed in the table: each slide_id cell is the string
# form of a Python list, so parse it and add up the list lengths.
final_df_with_excluded_slides_cluster_tsne_1_2_3['slide_id'].map(ast.literal_eval).map(len).sum()

np.int64(9489)

### Group embeddings into biopsies

In [7]:
def load_h5_file_features(filename):
    """Read the ``coords`` and ``features`` datasets of an HDF5 file.

    Both datasets are read completely while the file is open, so the
    returned values remain usable after it has been closed.

    Returns:
        A ``(coords, features)`` tuple, in that order.
    """
    with h5py.File(filename, "r") as h5_in:
        return tuple(h5_in[key][()] for key in ("coords", "features"))

In [8]:
def save_h5_file_biopsy_bag(filename, features, coords, slides, label):
    """Write one bag to ``filename`` as an HDF5 file.

    The file is opened in ``"w"`` mode, so an existing file at that path is
    replaced. Four datasets are created, named ``features``, ``coords``,
    ``slides`` and ``label``, each holding the argument of the same name.
    """
    datasets = {
        "features": features,
        "coords": coords,
        "slides": slides,
        "label": label,
    }
    with h5py.File(filename, "w") as h5_out:
        for dataset_name, payload in datasets.items():
            h5_out.create_dataset(dataset_name, data=payload)

In [9]:
# Sanity check: the Pathological value of the first row converts to a plain int
int(final_df_with_excluded_slides_cluster_tsne_1_2_3.Pathological.iloc[0])

1

In [10]:
# Number of rows in the table; the bag-building loop below iterates once per row
final_df_with_excluded_slides_cluster_tsne_1_2_3.shape[0]

574

In [None]:
# Show the configured directory that the bag files are written to below
preproc_conf.uni_finetuned_biopsy_dir_macenko_improved_v2_patients_excluded_slides_cluster_tsne_1_2_3_rev

In [12]:
# Build one bag file per table row: gather the embeddings of every slide listed
# in the row's slide_id cell and write them, together with the row's
# Pathological label, to <save_dir>/<patient_ngsci_id>.h5.
save_dir = preproc_conf.uni_finetuned_biopsy_dir_macenko_improved_v2_patients_excluded_slides_cluster_tsne_1_2_3_rev
os.makedirs(save_dir, exist_ok=True)

# Rows for which no bag file was written (no loadable slide, or the
# concatenate/save step failed).
empty_bag_count = 0
# Listed slides that have no embedding file in slide_embeddings_folder.
missing_slide_count = 0

bag_table = final_df_with_excluded_slides_cluster_tsne_1_2_3
bag_rows = zip(bag_table.patient_ngsci_id, bag_table.slide_id, bag_table.Pathological)

for patient_id, slide_list_literal, pathological in tqdm(bag_rows, total=bag_table.shape[0]):

    features_biopsy = []  # grouped embeddings
    coords_biopsy = []  # grouped coordinates
    slides_biopsy = []  # slide name repeated once per coordinate row
    label_biopsy = int(pathological)  # target to predict

    # The cell holds the string form of a Python list. literal_eval only accepts
    # literals, so unlike eval it cannot execute code that ended up in the CSV.
    slide_list = ast.literal_eval(slide_list_literal)

    for s in slide_list:
        slide_path = os.path.join(slide_embeddings_folder, s + '.h5')

        # A slide without an embedding file is expected (can be due to the
        # filtered dataset): skip it quietly, but keep a count.
        if not os.path.isfile(slide_path):
            missing_slide_count += 1
            continue

        # Anything else that goes wrong while reading (corrupt file, missing
        # dataset) is reported instead of being silently swallowed; the slide
        # is still skipped so the remaining slides are processed.
        try:
            coords, features = load_h5_file_features(slide_path)
        except (OSError, KeyError) as e:
            print(f"Skipping unreadable slide file {slide_path}: {e}")
            continue

        features_biopsy.append(features.astype(np.float32))
        coords_biopsy.append(coords)
        slides_biopsy.append(np.repeat(s, coords.shape[0], axis=0))

    # Every listed slide was missing or unreadable: nothing to write.
    if not features_biopsy:
        empty_bag_count += 1
        print(f"No slide embeddings found for {patient_id}; bag not written")
        continue

    try:
        # concat along axis 0, so along samples; the per-slide arrays are
        # already float32, so no further cast (and copy) is needed
        features_biopsy = np.concatenate(features_biopsy)
        coords_biopsy = np.concatenate(coords_biopsy)
        slides_biopsy = np.concatenate(slides_biopsy).astype(np.bytes_)  # need to convert for h5

        save_h5_file_biopsy_bag(os.path.join(save_dir, patient_id + '.h5'),
                                features_biopsy,
                                coords_biopsy,
                                slides_biopsy,
                                label_biopsy)
    except (ValueError, TypeError, OSError) as e:
        # Keep going with the other rows, but make the failure visible.
        empty_bag_count += 1
        print(f"Could not write bag for {patient_id}: {e}")

if missing_slide_count:
    print(f"{missing_slide_count} listed slides had no embedding file and were skipped")

100%|██████████| 574/574 [06:57<00:00,  1.37it/s]


In [13]:
# Number of rows for which the loop above did not write a bag file
empty_bag_count

0

### Check the number of slides after t-SNE slide filtering

In [14]:
def load_h5_file_biopsy_bag(filename):
    """Read a bag file back from disk.

    The four datasets are read completely while the file is open.

    Returns:
        A ``(features, coords, slides, label)`` tuple, in that order.
    """
    dataset_names = ("features", "coords", "slides", "label")
    with h5py.File(filename, "r") as h5_in:
        return tuple(h5_in[name][()] for name in dataset_names)

In [15]:
# IPython shell escape: count the entries in save_dir
!ls $save_dir | wc -l

574


In [16]:
# Read one saved bag back and display the array shapes and the stored label
file_to_check = save_dir+'01d90f36-0a15-4962-872d-80d6db3f5519.h5'
features, coords, slides, label = load_h5_file_biopsy_bag(file_to_check)
features.shape, coords.shape, slides.shape, label

((8748, 1024), (8748, 2), (8748,), np.int64(3))

In [17]:
# Confirm the dtype of the feature vectors read back from the bag
features[0].dtype

dtype('float32')

In [18]:
# Add up the number of distinct slide names stored in each saved bag
total_slides = 0

patient_ids = final_df_with_excluded_slides_cluster_tsne_1_2_3.patient_ngsci_id.values
for p in tqdm(patient_ids):
    file_to_check = save_dir + p + ".h5"
    features, coords, slides, label = load_h5_file_biopsy_bag(file_to_check)

    # `slides` repeats a slide name once per stored row, so deduplicate before counting
    total_slides += np.unique(slides).size

100%|██████████| 574/574 [00:33<00:00, 17.22it/s]


In [19]:
# Distinct slides summed over all saved bags
total_slides

9489