In [1]:
# Author: Bonaventure F. P. Dossou - bonaventure.dossou@mila.quebec
# Data preparation and dataset creation for model training
# Please run this only once (more details in Solution.md)
import os
from pathlib import Path
import glob
import openslide
import random
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
# Dataset root, the folder holding the NDPI slides, and a two-level glob
# (<slide_dir>/<subfolder>/<file>) listing every slide file found there.
data_dir = os.path.join('/', *'home/ngsci/datasets/brca-psj-path'.split('/'))
slide_dir = os.path.join(data_dir, 'ndpi')
slides_fp = os.path.join(slide_dir, *('*', '*'))
slides_list = glob.glob(slides_fp)

In [3]:
# Locations of the slide -> biopsy mapping and of the outcomes table for both
# dataset versions (v1 and v2).
v1_slide_biopsy_map_fp, v2_slide_biopsy_map_fp = (
    os.path.join(data_dir, version, 'slide-biopsy-map.csv') for version in ('v1', 'v2')
)
v1_outcomes_fp, v2_outcomes_fp = (
    os.path.join(data_dir, version, 'outcomes.csv') for version in ('v1', 'v2')
)

# Read the four CSV files (mappings first, then outcomes).
v1_slide_biopsy_map_df, v2_slide_biopsy_map_df = map(
    pd.read_csv, (v1_slide_biopsy_map_fp, v2_slide_biopsy_map_fp)
)
v1_outcomes_df, v2_outcomes_df = map(pd.read_csv, (v1_outcomes_fp, v2_outcomes_fp))

# Stack both versions. Fully identical mapping rows are dropped; for outcomes only
# the first row per biopsy_id is kept, so a v1 row wins over a v2 row for the same
# biopsy because v1 is concatenated first.
slide_biopsy_map_df = pd.concat(
    [v1_slide_biopsy_map_df, v2_slide_biopsy_map_df]
).drop_duplicates()
outcomes_df = pd.concat([v1_outcomes_df, v2_outcomes_df]).drop_duplicates(subset='biopsy_id')

# Remove every 'A', 'B' and 'C' character from the stage strings
# (e.g. 'IIA' becomes 'II') so that sub-stages collapse into their main stage.
outcomes_df['stage'] = outcomes_df['stage'].str.replace(r"[ABC]", "", regex=True)

52323 3269


In [4]:
# Lookup table: biopsy_id -> stage string (one entry per row of outcomes_df).
biopsy_dict = dict(zip(outcomes_df['biopsy_id'].tolist(), outcomes_df['stage'].tolist()))

In [5]:
# Stage string -> integer class label.
dict_ = {'0': 0, 'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'No Stage Rec': 1} # based on the stats most breast cancer had stage 1

# Attach a label to every slide through its biopsy id.
# Series.map yields NaN where a key is missing, so a biopsy_id with no outcome row
# no longer raises KeyError (a plain ``biopsy_dict[x]`` lookup did). Missing stages
# and stage strings outside ``dict_`` become NaN the same way.
stage_labels = slide_biopsy_map_df['biopsy_id'].map(biopsy_dict).map(dict_)

# Keep only the slides that received a label. Building a new frame in one chain
# (instead of filter + in-place column assignment) avoids SettingWithCopyWarning
# and makes the cell safe to re-run.
slide_biopsy_map_df = (
    slide_biopsy_map_df
    .assign(stage=stage_labels)
    .dropna(subset=['stage'])
    .astype({'stage': int})
)

In [6]:
# Parallel lists: slide file paths and their integer stage labels.
path_data, labels = (
    slide_biopsy_map_df[column].tolist() for column in ('slide_path', 'stage')
)

In [7]:
# 80/20 train/validation split, stratified on the stage label and seeded so the
# same split is produced on every run.
train_data, eval_data, train_labels, eval_labels = train_test_split(
    path_data,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [8]:
# Report how many slides ended up in each split.
split_sizes = (len(train_data), len(eval_data))
print('Training Data: {} - Validation Data: {}'.format(*split_sizes))

Training Data: 32146 - Validation Data: 8037


In [9]:
def make_dirs(directory=None, splits=('train', 'val'), num_categories=5):
    """Create the ``<directory>/<split>/<category>`` folder tree for the image dataset.

    Args:
        directory: Root folder of the dataset. Defaults to
            ``/home/ngsci/project/breast_cancer``.
        splits: Names of the split sub-folders to create.
        num_categories: Number of class folders created inside every split; they
            are named ``0`` .. ``num_categories - 1``.

    The function is idempotent: folders that already exist are left untouched.
    """
    if directory is None:
        directory = os.path.join('/', 'home', 'ngsci', 'project', 'breast_cancer')
    for split in splits:
        split_dir = os.path.join(directory, split)
        # makedirs also creates missing parents (os.mkdir raised FileNotFoundError
        # when ``directory`` itself did not exist yet); exist_ok replaces the racy
        # "check, then create" pattern.
        os.makedirs(split_dir, exist_ok=True)
        for category in range(num_categories):
            os.makedirs(os.path.join(split_dir, str(category)), exist_ok=True)

# Create the output folder tree before any thumbnails are written.
make_dirs()

In [10]:
def downsample_slide(ndpi_filepath, output_dir, size=(224, 224)):
    """Save a small PNG thumbnail of a whole-slide image.

    Args:
        ndpi_filepath: Path of the slide file opened with ``openslide.OpenSlide``.
        output_dir: Folder that receives the thumbnail, written as
            ``<stem of ndpi_filepath>.png``. It is not created here.
        size: ``(width, height)`` handed to ``get_thumbnail``; defaults to
            ``(224, 224)``.
            NOTE(review): per the OpenSlide documentation this is a maximum size
            and the slide's aspect ratio is kept, so the saved image can be
            smaller than ``size`` along one axis -- confirm that the training
            pipeline resizes or pads accordingly.
    """
    slide_id = Path(ndpi_filepath).stem
    # The context manager closes the slide handle even if saving fails.
    with openslide.OpenSlide(ndpi_filepath) as openslide_obj:
        image = openslide_obj.get_thumbnail(size)
        image.save(os.path.join(output_dir, f"{slide_id}.png"))

In [None]:
def create_images_to_dir(dataset_split, data_paths, data_labels,
                         start_index=0, skip_existing=True, base_dir=None):
    """Write one thumbnail per slide into ``<base_dir>/<dataset_split>/<label>/``.

    Args:
        dataset_split: Name of the split sub-folder (e.g. ``'train'`` or ``'val'``).
        data_paths: Sequence of slide file paths.
        data_labels: Sequence of labels, parallel to ``data_paths``; each label is
            used (via ``str``) as the name of the destination sub-folder.
        start_index: Index of the first slide to process. Defaults to 0 so that a
            fresh run covers the whole split; the previous hard-coded offset of
            ``1554+3406`` silently skipped the first 4960 slides of every split.
        skip_existing: When true, slides whose thumbnail file already exists are
            skipped, so an interrupted run can simply be restarted.
            NOTE: a thumbnail left truncated by an interruption is also skipped;
            delete such a file by hand before restarting.
        base_dir: Root folder of the dataset. Defaults to
            ``/home/ngsci/project/breast_cancer``.

    Raises:
        ValueError: If ``data_paths`` and ``data_labels`` differ in length.
    """
    total_data = len(data_paths)
    if total_data != len(data_labels):
        raise ValueError(
            'data_paths and data_labels must have the same length '
            '({} != {})'.format(total_data, len(data_labels))
        )
    if base_dir is None:
        base_dir = os.path.join('/', 'home', 'ngsci', 'project', 'breast_cancer')
    directory = os.path.join(base_dir, dataset_split)
    for index in tqdm(range(start_index, total_data), desc ="Data Creation Progress"):
        data_path, data_label = data_paths[index], data_labels[index]
        output_directory = os.path.join(directory, str(data_label))
        # Same file name that downsample_slide writes: <slide stem>.png
        thumbnail_fp = os.path.join(output_directory, f"{Path(data_path).stem}.png")
        if skip_existing and os.path.exists(thumbnail_fp):
            continue
        downsample_slide(data_path, output_directory)

# Write the thumbnails for the training split, then for the validation split.
create_images_to_dir(dataset_split='train', data_paths=train_data, data_labels=train_labels)
create_images_to_dir(dataset_split='val', data_paths=eval_data, data_labels=eval_labels)

# Count the files found two levels below each split folder.
train_data_dir, eval_data_dir = (
    os.path.join('/', 'home', 'ngsci', 'project', 'breast_cancer', split)
    for split in ('train', 'val')
)
train_slides_fp, eval_slides_fp = (
    os.path.join(split_dir, '*', '*') for split_dir in (train_data_dir, eval_data_dir)
)

train_slides_list = glob.glob(train_slides_fp)
eval_slides_list = glob.glob(eval_slides_fp)

print('Train Images :{}'.format(len(train_slides_list)))
print('Eval Images :{}'.format(len(eval_slides_list)))

In [5]:
# List every entry directly inside the basic-downsampling/holdout folder of the
# dataset and print how many were found.
test_data = os.path.join(
    '/', *'home/ngsci/datasets/brca-psj-path/basic-downsampling/holdout'.split('/')
)
test_slides_fp = os.path.join(test_data, '*')
test_slides_list = glob.glob(test_slides_fp)
print(len(test_slides_list))

16607
