In [1]:
from pathlib import Path

from fastai.vision import *
from fastai.data.transforms import get_image_files
from fastai.vision.core import PILImage, PILMask

import numpy as np
from skimage.morphology.convex_hull import convex_hull_image
from PIL import Image

from matplotlib import pyplot as plt

from tqdm.notebook import tqdm

import os
import cv2
import shutil

from matplotlib import rcParams

%matplotlib inline

# figure size in inches optional
rcParams['figure.figsize'] = 16 ,8

In [2]:
# Load the raw dataset and gt masks
root_path = Path("/scratch/rc4499/pneumonia_clean")
img_path = root_path/"imgs"
mask_path = root_path/"masks_smooth"

img_fnames = get_image_files(img_path)
# mask_fnames = get_image_files(mask_path)

In [3]:
def get_mask_path(dataset_type, fpath):
    if dataset_type == "pneumonia":
        return mask_path/fpath.name
    if dataset_type == "hospital_systems":
        return mask_path/fpath.name

def create_instances(img_fnms, mask_fnms):
    result = []
    for fname in img_fnms:
        mask_name = get_mask_path("pneumonia", fname)
        result.append((fname, mask_name))
        
    return result

In [4]:
instances = create_instances(img_fnames, mask_fnames)

len(instances)

NameError: name 'mask_fnames' is not defined

In [13]:
from sklearn.model_selection import train_test_split

def create_splits(df, already_split = True, combine=False):
    """
    Create the training/val/test split
    """
    if combine:
        return {"combined": df}
    
    train_ratio = 0.80
    validation_ratio = 0.10
    test_ratio = 0.10
    df_train, df_val, df_test = [], [], []
    if already_split:
        for p in df:
            if "/train/" in str(p):
                df_train.append(p)
            elif "/val/" in str(p):
                df_val.append(p)
            elif "/test/" in str(p):
                df_test.append(p)
            else:
                raise Exception("Split not found")
    else:
        # train is now 80% of the entire data set
        # the _junk suffix means that we drop that variable completely
        df_train, df_val = train_test_split(df, test_size=1 - train_ratio, random_state=0)

        # # test is now 10% of the initial data set
        # # validation is now 10% of the initial data set
        df_val, df_test = train_test_split(df_val, test_size=test_ratio/(test_ratio + validation_ratio), random_state=42) 

    print(f"Split {len(df)} instances into train:{len(df_train)}, val:{len(df_val)}, test:{len(df_test)}")
    
    return {"train": df_train, "val": df_val, "test": df_test}

In [18]:
MASK_VERSIONS = ["none", "raw", "convex_hull"]
# MASK_VERSIONS = ["convex_hull"]

splits = create_splits(instances, already_split=False)
# len(splits["combined"])
# splits = {"combined": img_fnames}

Split 5856 instances into train:4684, val:586, test:586


In [19]:
def label_func_pneumonia(fname):
    fname = str(fname)
    if "/NORMAL/" in fname:
        return "NORMAL"
    elif "/PNEUMONIA/" in fname:
        return "PNEUMONIA"
    else:
        raise Exception("Invalid file path")

In [20]:
from PIL import Image, ImageOps

def resize_img(im, desired_size=448):
    old_size = im.shape[:2] # old_size is in (height, width) format

    ratio = float(desired_size)/max(old_size)
    new_size = tuple([int(x*ratio) for x in old_size])

    # new_size should be in (width, height) format
    im = cv2.resize(im, (new_size[1], new_size[0]))

    delta_w = desired_size - new_size[1]
    delta_h = desired_size - new_size[0]
    top, bottom = delta_h//2, delta_h-(delta_h//2)
    left, right = delta_w//2, delta_w-(delta_w//2)

    color = [0, 0, 0]
    new_im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return new_im

In [21]:
def label_func(fname):
    return 0 if "_0.png" in fname else 1

In [22]:
output_path = Path("/scratch/rc4499/masked/pneumonia")

for mask_version in tqdm(MASK_VERSIONS, desc="Mask version"):
    for split_name, split_files in tqdm(splits.items(), desc="Data Split"):

        output_dir = output_path/mask_version/split_name
        
        shutil.rmtree(output_dir, ignore_errors=True)

        for inst in tqdm(split_files, total=len(split_files)):
            img_fname = inst[0]
            mask_fname = inst[1]
            # mask_fname = ""
        
            # load the original input image and display it to our screen
            # image = np.asarray(PILImage.create(img_fname))
            image = cv2.imread(str(img_fname))
            resized_img = resize_img(image)
            mask = np.asarray(PILMask.create(mask_fname))

            if mask_version == "none":
                mask[:,:] = 255
            elif mask_version == "raw":
                pass
            elif mask_version == "convex_hull":
                hull = convex_hull_image(mask)
                mask = hull.astype(np.uint8) * 255
            else:
                raise Exception(f"Invalid mask version {mask_version}")
            
            masked = cv2.bitwise_and(resized_img, resized_img, mask=mask)
            # masked = resized_img
            
            # # display images
            # fig, ax = plt.subplots(1,4)
            # ax[0].imshow(image)
            # ax[1].imshow(resized_img);
            # ax[2].imshow(mask, cmap="gray")
            # ax[3].imshow(masked)
            im = Image.fromarray(masked)
            os.makedirs(f"{output_dir}/{label_func_pneumonia(img_fname)}", exist_ok=True)
            im.save(f"{output_dir}/{label_func_pneumonia(img_fname)}/{img_fname.name}")

Mask version:   0%|          | 0/3 [00:00<?, ?it/s]

Data Split:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4684 [00:00<?, ?it/s]

  0%|          | 0/586 [00:00<?, ?it/s]

  0%|          | 0/586 [00:00<?, ?it/s]

Data Split:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4684 [00:00<?, ?it/s]

  0%|          | 0/586 [00:00<?, ?it/s]

  0%|          | 0/586 [00:00<?, ?it/s]

Data Split:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4684 [00:00<?, ?it/s]



  0%|          | 0/586 [00:00<?, ?it/s]

  0%|          | 0/586 [00:00<?, ?it/s]