In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
from collections import Counter
import h5py
import random
from omegaconf import OmegaConf
import glob

In [2]:
# Load the preprocessing config and keep only the "project_mil" section.
conf_preproc = OmegaConf.load("../../conf/preproc.yaml")["project_mil"]

In [3]:
# Read the biopsy-level and slide-level tables, then show their shapes.
biopsy_df, labels_df = map(pd.read_csv, ("../../biopsy_df.csv", "../../labels_df.csv"))
tuple(table.shape for table in (biopsy_df, labels_df))

((286, 17), (329, 18))

In [None]:
# Preview only the first rows instead of rendering the whole table.
biopsy_df.head()

In [None]:
# Preview only the first rows instead of rendering the whole table.
labels_df.head()

In [6]:
def get_biopsy_idx_to_slide_idx(labels_df):
    """Group the rows of ``labels_df`` by biopsy.

    Rows sharing the same ``base_slide_number`` form one group; groups are kept
    in order of first appearance (``sort=False``).

    Parameters
    ----------
    labels_df : pandas.DataFrame
        Table with a ``base_slide_number`` column.

    Returns
    -------
    numpy.ndarray
        1-D object array with one entry per group. Each entry is the array of
        index labels (``DataFrame.index`` values) of the rows in that group.
        These equal row positions only when ``labels_df`` has its default
        ``RangeIndex``.
    """
    per_biopsy = [
        group.index.values
        for _, group in labels_df.groupby('base_slide_number', sort=False)
    ]

    # Fill a pre-allocated object array element by element. Passing the list
    # straight to np.array(..., dtype=object) would produce a 2-D array whenever
    # all groups happen to have the same length.
    biopsy_idx_to_slide_idx = np.empty(len(per_biopsy), dtype=object)
    for i, slide_idx in enumerate(per_biopsy):
        biopsy_idx_to_slide_idx[i] = slide_idx
    return biopsy_idx_to_slide_idx

In [7]:
biopsy_idx_to_slide_idx = get_biopsy_idx_to_slide_idx(labels_df)

# The bag-building loop pairs the b-th group with the b-th row of biopsy_df,
# so both tables must describe the same number of biopsies.
assert len(biopsy_idx_to_slide_idx) == len(biopsy_df), (
    f"{len(biopsy_idx_to_slide_idx)} biopsies in labels_df vs "
    f"{len(biopsy_df)} rows in biopsy_df"
)

In [8]:
# Peek at the first five biopsies: each entry holds the labels_df index values of one biopsy's slides.
biopsy_idx_to_slide_idx[:5]

array([array([0]), array([1]), array([2, 3]), array([4, 5]),
       array([6, 7])], dtype=object)

In [None]:
# Folder holding the per-slide embedding files, taken from the config.
slide_embeddings_folder = conf_preproc["uni_finetuned_emb_dir_macenko_new_bracs"]

# All directory entries, sorted as strings, as a NumPy array.
slide_embeddings_files = np.asarray(sorted(os.listdir(slide_embeddings_folder)))

In [10]:
# Number of entries found in the embedding folder and the first three names (sorted as strings).
slide_embeddings_files.shape, slide_embeddings_files[:3]

((337,), array(['1.h5', '10.1.h5', '10.2.h5'], dtype='<U8'))

In [11]:
def load_h5_file_features(filename):
    """Read the ``coords`` and ``features`` datasets of an HDF5 file into memory.

    Parameters
    ----------
    filename : str
        Path of the HDF5 file, opened read-only.

    Returns
    -------
    tuple
        ``(coords, features)``, in that order, with the full content of each
        dataset.
    """
    with h5py.File(filename, "r") as h5_in:
        coords = h5_in['coords'][()]
        features = h5_in['features'][()]
    # Both datasets were read fully inside the block, so the handle can be closed.
    return coords, features

In [12]:
def save_h5_file_biopsy_bag(filename, features, coords, slides, label):
    """Write one biopsy bag to an HDF5 file.

    The file is opened in ``"w"`` mode and receives four datasets named
    ``features``, ``coords``, ``slides`` and ``label``, created in that order
    from the arguments of the same name.
    """
    datasets = {
        "features": features,
        "coords": coords,
        "slides": slides,
        "label": label,
    }
    with h5py.File(filename, "w") as h5_out:
        for dataset_name, dataset_values in datasets.items():
            h5_out.create_dataset(dataset_name, data=dataset_values)

In [13]:
# Sanity check: load the first embedding file and look at the array shapes.
# os.path.join keeps the path valid whether or not the configured folder
# ends with a path separator.
coords, features = load_h5_file_features(
    os.path.join(slide_embeddings_folder, slide_embeddings_files[0])
)
coords.shape, features.shape

((165, 2), (165, 1024))

## Collect embeddings for every biopsy


In [None]:
# Output folder for the per-biopsy files, taken from the preprocessing config.
save_dir = conf_preproc["uni_finetuned_biopsy_dir_macenko_new_bracs"]

# Create the folder (and any missing parents); does nothing if it already exists.
os.makedirs(save_dir, exist_ok=True)
save_dir

In [15]:
# Build one bag per biopsy: gather the embeddings of all its slides and write
# them, together with coordinates, slide names and the label, to one .h5 file.
#
# NOTE(review): biopsy_df rows are matched to the labels_df groups purely by
# position (row b <-> b-th group) — confirm both tables share that ordering,
# otherwise labels and file names get attached to the wrong bag.
for b, slide_list in enumerate(tqdm(biopsy_idx_to_slide_idx)):

    label_biopsy = biopsy_df.clinical_stage.iloc[b]  # target stage to predict
    biopsy_name = biopsy_df.base_slide_number.iloc[b]

    features_biopsy = []  # per-slide embedding arrays
    coords_biopsy = []    # per-slide coordinate arrays
    slides_biopsy = []    # slide name repeated once per coordinate row

    for s in slide_list:
        slide_id = labels_df.slide_id.iloc[s]
        try:
            coords, features = load_h5_file_features(
                os.path.join(slide_embeddings_folder, slide_id)
            )
        except (OSError, KeyError) as e:
            # Best effort: a slide whose file cannot be opened (OSError) or
            # lacks an expected dataset (KeyError) is reported and skipped.
            print('Missing', slide_id, e)
            continue

        # Append only after a successful load so the three lists stay aligned.
        features_biopsy.append(features.astype(np.float16))
        coords_biopsy.append(coords)
        slides_biopsy.append(np.repeat(slide_id, coords.shape[0], axis=0))

    if not features_biopsy:
        # np.concatenate raises on an empty list; there is nothing to save.
        print('Missing', biopsy_name, 'no slide embeddings loaded, biopsy skipped')
        continue

    # Concatenate along axis 0, i.e. along samples.
    features_biopsy = np.concatenate(features_biopsy)  # already float16 per slide
    coords_biopsy = np.concatenate(coords_biopsy)
    # Convert to fixed-width bytes for h5 storage. np.bytes_ replaces
    # np.string_, which NumPy 2.0 removed.
    slides_biopsy = np.concatenate(slides_biopsy).astype(np.bytes_)

    save_h5_file_biopsy_bag(
        os.path.join(save_dir, f"{biopsy_name}.h5"),
        features_biopsy,
        coords_biopsy,
        slides_biopsy,
        label_biopsy,
    )

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 286/286 [00:00<00:00, 344.92it/s]
