In [None]:
import sys
import os
import random
import glob
import ngsci
import h5py
import cv2
import yaml
import matplotlib
import torch
import hydra
import numpy as np
import pandas as pd
from tqdm import tqdm
from openslide import OpenSlide
from PIL import Image
from collections import Counter
import matplotlib.pyplot as plt
from omegaconf import OmegaConf
from omegaconf import DictConfig
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

In [None]:
preproc_conf = OmegaConf.load("/home/ngsci/project/tuberculosis_detection/conf/preproc.yaml") 

In [None]:
preproc_conf = preproc_conf['hipt_stage3_on_embeddings_bag']["uni_224_224_patches"]

In [None]:
data_root_dir = preproc_conf["data_root_dir"]
data_root_dir

In [None]:
tb_labels = pd.read_csv(data_root_dir + "v1/" + "tb-labels.csv")

In [None]:
tb_labels

In [None]:
tb_labels.shape

In [None]:
tb_labels["image_dir"] = tb_labels["file_path"].apply(lambda x: os.path.basename(os.path.dirname(x)))

In [None]:
tb_labels.head(3)

### Generate local test set (1 of 8 stratified folds, ~12.5%)

In [None]:
n_splits = 8

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=87)

In [None]:
data_idx = tb_labels.index.values

train_splits = []
val_splits = []

for train_idx, val_idx in skf.split(data_idx, tb_labels.iloc[data_idx]['tb_positive']):
    
    train_splits.append(train_idx)
    val_splits.append(val_idx)
    
    break

In [None]:
len(train_splits[0]), len(val_splits[0])

In [None]:
print(np.unique(tb_labels.iloc[val_splits[0]]['tb_positive'], return_counts=True))

In [None]:
tb_labels_test = tb_labels.iloc[val_splits[0]]
tb_labels_test

### save test set

In [None]:
# Write the held-out test split to disk.
# The output directory is created if missing (to_csv does not create parent
# directories), and the file path is joined with os.path.join so it is valid
# whether or not the configured directory ends with a path separator.
test_split_dir = str(preproc_conf["cv_split_dir_10fold"])
if test_split_dir:
    os.makedirs(test_split_dir, exist_ok=True)
tb_labels_test.to_csv(os.path.join(test_split_dir, "test_split_stratified.csv"), index=False)

### Generate 10 train, val folds

In [None]:
tb_labels_rest = tb_labels.iloc[ ~np.in1d(tb_labels.index.values, val_splits[0]) ].reset_index(drop=True)

In [None]:
tb_labels_rest

In [None]:
n_splits = 10

In [None]:
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=87)

In [None]:
data_idx = tb_labels_rest.index.values

train_splits = []
val_splits = []

for train_idx, val_idx in skf.split(data_idx, tb_labels_rest.iloc[data_idx]['tb_positive']):
    
    train_splits.append(train_idx)
    val_splits.append(val_idx)

In [None]:
len(train_splits[0]), len(val_splits[0])

In [None]:
for i in range(n_splits):
    print(np.unique(tb_labels_rest.iloc[train_splits[i]]['tb_positive'], return_counts=True))

In [None]:
# Check that no row appears in more than one validation fold.
# Every pair of folds is compared; comparing only fold 0 against the others would
# miss an overlap between, e.g., folds 3 and 7. The assert makes a violation stop
# the notebook instead of relying on someone reading the printed lists.
for i in range(n_splits):
    for j in range(i + 1, n_splits):
        shared_rows = np.intersect1d(val_splits[i], val_splits[j])
        assert shared_rows.size == 0, (
            f"validation folds {i} and {j} overlap: {shared_rows.tolist()}"
        )
print(f"no overlap between any of the {n_splits} validation folds")

### save folds

In [None]:
# Persist every fold as a pair of CSV files (train + val).
# The output directory is created if missing (to_csv does not create parent
# directories), and file paths are joined with os.path.join so they are valid
# whether or not the configured directory ends with a path separator.
cv_split_dir = str(preproc_conf["cv_split_dir_10fold"])
if cv_split_dir:
    os.makedirs(cv_split_dir, exist_ok=True)

for s in range(n_splits):
    # save train set
    tb_labels_rest.iloc[train_splits[s]].to_csv(
        os.path.join(cv_split_dir, f"train_split_stratified_{s}.csv"), index=False
    )

    # save val set
    tb_labels_rest.iloc[val_splits[s]].to_csv(
        os.path.join(cv_split_dir, f"val_split_stratified_{s}.csv"), index=False
    )