In [None]:
import os
from pathlib import Path
import glob
import openslide
import random
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from PIL import Image
import h5py
import torch
import shutil

In [None]:
# --- Dataset roots -----------------------------------------------------------
BRCA_ROOT = Path.home() / "datasets" / "brca-psj-path"
CONTEST_DIR = BRCA_ROOT / "contest-phase-2"

# Downsampled PNG slides (train / holdout) and the training CSV tables.
IMAGE_ROOT = CONTEST_DIR / "png-downsampled-train"
IMAGE_ROOT_TEST = CONTEST_DIR / "png-downsampled-holdout"
TABLE_DIR = CONTEST_DIR / "csv-train"

# Log output location.
LOGGER_DIR = Path.home() / "logs"

# --- CLAM preprocessing outputs ----------------------------------------------
# Lower-case aliases point at the same locations as the constants above.
brca_dir = BRCA_ROOT
clam_train_dir = CONTEST_DIR / 'clam-preprocessing-train'
clam_train_test = CONTEST_DIR / 'clam-preprocessing-holdout'
patches_dir = clam_train_dir / 'patches'
stitches_dir = clam_train_dir / 'stitches'
stitches_dir_test = clam_train_test / 'stitches'

In [None]:
# Slide manifest (one row per slide) and the outcomes table from the training CSVs.
slide_biop_df = pd.read_csv(CONTEST_DIR / "slide-manifest-train.csv")
outcomes_df = pd.read_csv(TABLE_DIR / "outcomes.csv")

# Join the two tables on their shared biopsy identifier (default inner join).
slide_stage_df = pd.merge(slide_biop_df, outcomes_df, on="biopsy_id")
def stage_to_int(stage):
    """Translate a stage label into its integer class.

    Sub-stages collapse onto their parent stage ("IA"/"IB" -> 1, "IIA"/"IIB" -> 2,
    "IIIA"/"IIIB"/"IIIC" -> 3); "0" maps to 0 and "IV" to 4.

    Args:
        stage: The stage label to translate.

    Returns:
        The integer class (0-4), or ``np.nan`` for any label not listed below.
    """
    stage_groups = (
        (("0",), 0),
        (("IA", "IB"), 1),
        (("IIA", "IIB"), 2),
        (("IIIA", "IIIB", "IIIC"), 3),
        (("IV",), 4),
        # NOTE(review): "No Stage Rec" is folded into class 1 rather than
        # treated as missing -- confirm this imputation is intended.
        (("No Stage Rec",), 1),
    )
    for members, stage_class in stage_groups:
        if stage in members:
            return stage_class
    return np.nan


# Encode the stage column in place. Re-running this cell without re-running the
# merge above would send the already-integer values through stage_to_int again,
# turning them all into NaN.
slide_stage_df["stage"] = slide_stage_df["stage"].apply(stage_to_int)

# Keep only the columns needed downstream, drop rows with any missing value
# (including stages that could not be mapped), and store stage as int.
train_slides_df = (
    slide_stage_df[["slide_id", "biopsy_id", "stage", "slide_path"]]
    .copy()
    .dropna(how="any")
    .reset_index(drop=True)
    .astype({"stage": int})
)
train_slides_df.head(5)

In [None]:
# Two parallel lists in DataFrame row order: slide paths and their integer stage.
path_data = train_slides_df["slide_path"].tolist()
labels = train_slides_df["stage"].tolist()

In [None]:
# Hold out 10% of the slides for validation, stratified on the stage label and
# seeded so the split is repeatable.
train_data, eval_data, train_labels, eval_labels = train_test_split(
    path_data,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=1234,
)

In [None]:
# Report how many slides landed in each subset.
print(
    'Training Data: {} - Validation Data: {}'.format(
        len(train_data),
        len(eval_data),
    )
)

In [None]:
def make_dirs(directory=None, splits=('training', 'validation'), n_categories=5):
    """Create the ``<directory>/<split>/<category>`` folder tree.

    Calling ``make_dirs()`` with no arguments builds the same folders as before
    (``training`` and ``validation``, each with sub-folders ``0`` .. ``4``, under
    the project directory). Missing parent directories are now created as well,
    and folders that already exist are left alone.

    Args:
        directory: Root under which the split folders are created. Defaults to
            ``/home/ngsci/project/ami-ahead-wombcare``.
        splits: Names of the split folders to create.
        n_categories: Number of class sub-folders (named ``0`` .. ``n-1``)
            created inside every split folder.
    """
    if directory is None:
        directory = os.path.join('/','home','ngsci','project', 'ami-ahead-wombcare')
    for split in splits:
        split_dir = os.path.join(directory, split)
        # Create the split folder itself so it exists even when n_categories == 0.
        os.makedirs(split_dir, exist_ok=True)
        for category in range(n_categories):
            # exist_ok avoids the check-then-create race of exists() + mkdir().
            os.makedirs(os.path.join(split_dir, str(category)), exist_ok=True)

# Build the training/validation class folders before any image is copied into them.
make_dirs()

In [None]:
def get_image(path_folder, ndpi_filepath, output_dir):
    """Copy the PNG that belongs to a slide into ``output_dir``.

    The PNG is looked up as ``<path_folder>/<slide_id>.png``, where ``slide_id``
    is the file name of ``ndpi_filepath`` without its extension.

    Args:
        path_folder: Directory holding the PNG files (``str`` or ``Path``).
        ndpi_filepath: Path of the original slide; only its stem is used.
        output_dir: Directory to copy into. It is created if it does not exist.

    Returns:
        The path of the copied file, as returned by ``shutil.copy``.

    Raises:
        FileNotFoundError: If the source PNG does not exist.
    """
    slide_id = Path(ndpi_filepath).stem
    # Path(...) lets callers pass a plain string as well as a Path.
    stitch_fp = Path(path_folder) / f'{slide_id}.png'
    # shutil.copy only treats the destination as a directory if it already
    # exists; otherwise it writes one file named after the directory, which
    # each later copy then overwrites.
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    return shutil.copy(stitch_fp, output_dir)

In [None]:
def create_images_to_dir(dataset_split, data_paths, data_labels):
    """Copy every slide's PNG into the class folder of the given split.

    For each position ``i`` the PNG for ``data_paths[i]`` is taken from
    ``IMAGE_ROOT`` and copied (via ``get_image``) into
    ``/home/ngsci/project/ami-ahead-wombcare/<dataset_split>/<data_labels[i]>``.

    Args:
        dataset_split: Name of the split folder (also shown in the progress bar).
        data_paths: Slide paths, indexable by position.
        data_labels: Labels aligned with ``data_paths``.
    """
    split_root = os.path.join('/','home','ngsci','project', 'ami-ahead-wombcare', dataset_split)
    progress_title = '{} Dataset Creation Progress'.format(dataset_split.capitalize())
    for position in tqdm(range(len(data_paths)), desc = progress_title):
        class_dir = os.path.join(split_root, str(data_labels[position]))
        get_image(IMAGE_ROOT, data_paths[position], class_dir)

# Copy the PNGs of both labelled splits into their class folders.
create_images_to_dir('training', train_data, train_labels)
create_images_to_dir('validation', eval_data, eval_labels)

# Collect everything found at <split>/<class>/<file> and report the counts.
train_data_dir = os.path.join('/','home','ngsci', 'project', 'ami-ahead-wombcare', 'training')
eval_data_dir = os.path.join('/','home','ngsci', 'project', 'ami-ahead-wombcare', 'validation')

train_slides_fp = os.path.join(train_data_dir,'*','*')
eval_slides_fp = os.path.join(eval_data_dir,'*','*')

train_slides_list = glob.glob(train_slides_fp)
print('Train Images: {}'.format(len(train_slides_list)))

eval_slides_list = glob.glob(eval_slides_fp)
print('Eval Images: {}'.format(len(eval_slides_list)))

In [None]:
# Holdout manifest: only the slide ids are needed to locate the PNGs.
testing_manifest = pd.read_csv(CONTEST_DIR / "slide-manifest-holdout.csv")
testing_slides = testing_manifest["slide_id"].tolist()

def create_test_images_to_dir(dataset_split, slides_ids):
    """Copy the holdout PNG of every slide id into the split folder.

    Each ``<IMAGE_ROOT_TEST>/<slide_id>.png`` is copied into
    ``/home/ngsci/project/ami-ahead-wombcare/<dataset_split>``.

    Args:
        dataset_split: Name of the destination folder (also shown in the
            progress bar).
        slides_ids: Iterable of slide ids.

    Raises:
        FileNotFoundError: If a slide's PNG is missing from ``IMAGE_ROOT_TEST``.
        FileExistsError: If the destination path exists but is not a directory.
    """
    directory = os.path.join('/','home','ngsci','project', 'ami-ahead-wombcare', dataset_split)
    # make_dirs() only prepares 'training' and 'validation'. If this folder is
    # missing, shutil.copy writes each slide to a single file named after it,
    # and every later copy overwrites the previous one.
    os.makedirs(directory, exist_ok=True)
    for slide_id in tqdm(slides_ids, desc ="{} Dataset Creation Progress".format(dataset_split.capitalize())):
        stitch_fp = IMAGE_ROOT_TEST / f'{slide_id}.png'
        shutil.copy(stitch_fp, directory)


# Copy the holdout PNGs into the flat 'testing' folder.
create_test_images_to_dir('testing', testing_slides)

# Count the files that ended up directly inside that folder.
test_data_dir = os.path.join('/','home','ngsci', 'project', 'ami-ahead-wombcare', 'testing')
test_slides_fp = os.path.join(test_data_dir,'*')
test_slides_list = glob.glob(test_slides_fp)
print(
    'Testing Images: {}'.format(
        len(test_slides_list)
    )
)

In [None]:
def get_image_stats(path, numpy=False, to_float=False):
    """Open an image and return it as a PIL image or a numpy array.

    Args:
        path: Location of the image file, passed to ``Image.open``.
        numpy: When true, return ``np.asarray`` of the image instead of the
            PIL image object.
        to_float: When true (and ``numpy`` is true), divide the array by 255.0.
            Ignored when ``numpy`` is false.
            NOTE(review): assumes 8-bit pixel data so the result lies in
            [0, 1] -- confirm.

    Returns:
        The PIL image, the raw array, or the array divided by 255.0.
    """
    image = Image.open(path)
    if not numpy:
        return image

    pixels = np.asarray(image)
    return pixels / 255.0 if to_float else pixels

In [None]:
# Per-split normalization statistics.
# For every image, take the mean and std over its first two axes
# (assumes arrays are height x width x channels -- TODO confirm), then average
# those per-image values across the split. The reported std is therefore the
# mean of per-image stds, not the std pooled over all pixels.
means_train, stds_train = [], []
means_valid, stds_valid = [], []

for slide_paths, means_acc, stds_acc in (
    (train_slides_list, means_train, stds_train),
    (eval_slides_list, means_valid, stds_valid),
):
    for path in slide_paths:
        arr = get_image_stats(path, numpy=True, to_float=True)
        means_acc.append(arr.mean(axis=(0, 1)))
        stds_acc.append(arr.std(axis=(0, 1)))

MEANS_TRAIN = np.mean(np.vstack(means_train), axis=0)
STDS_TRAIN = np.mean(np.vstack(stds_train), axis=0)

MEANS_VALID = np.mean(np.vstack(means_valid), axis=0)
STDS_VALID = np.mean(np.vstack(stds_valid), axis=0)

print(MEANS_TRAIN, STDS_TRAIN)
print(MEANS_VALID, STDS_VALID)