In [3]:
'''
import necessary packages and functions
'''

import os
import re
import io
import csv
import cv2
import random
import sys
import glob
import re
import imageio
import itertools

import numpy as np
import matplotlib.pyplot as plt

from scipy import ndarray, ndimage
from skimage import io, color, transform, util, morphology, measure, filters
from skimage.color import rgb2gray
from skimage.io import imsave, imshow, imread
from skimage.transform import rescale, resize, downscale_local_mean
from skimage.io import imshow, imsave
from skimage.color import rgb2gray
from skimage.transform import rescale, resize, downscale_local_mean
from skimage.segmentation import clear_border
from skimage.morphology import ball, disk, remove_small_objects
from skimage.morphology import erosion, dilation, closing, reconstruction
from skimage.morphology import binary_erosion, binary_dilation, binary_closing, binary_opening
from skimage.measure import label, regionprops, perimeter
from skimage.filters import roberts, sobel, threshold_otsu



In [6]:
'''
define move_by_label function to move images into per-label folders based on csv labels
'''

def move_by_label(source_dir, dest_dir, labels):
    """Move image files into per-label sub-folders as listed in a CSV file.

    Parameters
    ----------
    source_dir : str
        Directory that currently holds the files.
    dest_dir : str
        Root directory; each file is renamed to ``dest_dir/<Label>/<External_ID>``.
        The ``<Label>`` sub-folder is not created here, so it must already exist.
    labels : str
        Path to a CSV file whose first row is a header and whose remaining
        rows have exactly two columns: ``External_ID, Label``.

    Notes
    -----
    A file that cannot be moved (missing source, missing destination folder,
    ...) is reported with ``print`` and skipped so the rest of the list is
    still processed.
    """
    # newline='' is what the csv module expects for files it parses itself.
    with open(labels, newline='') as infile:
        reader = csv.reader(infile)
        next(reader, None)  # Skip the header row; tolerate an empty file
        for row in reader:
            if not row:
                # blank line in the CSV (e.g. a trailing newline)
                continue
            External_ID, Label = row
            src = os.path.join(source_dir, External_ID)
            dest = os.path.join(dest_dir, Label, External_ID)
            try:
                os.rename(src, dest)
            except OSError as e:
                # WindowsError only exists on Windows (where it is an alias of
                # OSError); catching OSError keeps the same behaviour there and
                # avoids a NameError on every other platform.
                print (e)

'''
define import_images function to pull from directory, convert to grayscale, append to list
'''
def import_images(dir_name, images, image_names):
    """Load every image under ``dir_name`` as grayscale and collect the results.

    The directory is walked recursively. Files whose name ends in
    ``.jpg``, ``.jpeg``, ``.png``, ``.bmp`` or ``.tiff`` (lower case only)
    are read and converted with ``rgb2gray``.

    Parameters
    ----------
    dir_name : str
        Root directory to search.
    images : list
        Output list; the grayscale image arrays are appended to it.
    image_names : list
        Output list; the bare file names (no directory part) are appended
        in the same order as ``images``.
    """
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    image_pattern = re.compile(r"\.(jpg|jpeg|png|bmp|tiff)$")

    for root, _dirnames, filenames in os.walk(dir_name):
        for filename in filenames:
            if not image_pattern.search(filename):
                continue

            filepath = os.path.join(root, filename)

            ## read the image first so that a failed read cannot leave
            ## image_names one entry longer than images
            # NOTE: ``io`` here is skimage.io -- the later
            # ``from skimage import io`` shadows the stdlib ``import io``.
            image = io.imread(fname=filepath)
            image_gray = rgb2gray(image)

            image_names.append(filename)
            images.append(image_gray)
                
'''
define move_images function to move files between directories
'''            
def move_images(img_list, src_dir, dest_dir):
    """Rename each file named in ``img_list`` from ``src_dir`` into ``dest_dir``.

    ``img_list`` holds bare file names; the same name is used at the
    destination. Files are processed in list order.
    """
    # Lazy generator: each (origin, target) pair is built right before its rename.
    planned_moves = (
        (os.path.join(src_dir, name), os.path.join(dest_dir, name))
        for name in img_list
    )
    for origin, target in planned_moves:
        os.rename(origin, target)
        
'''
define split_dataset function for training, testing, and validation split
'''        
def split_dataset(imgs, src_dir, train_dir, test_dir, validation_dir, rseed = 420):
    """Shuffle file names and move them into train / validation / test folders.

    The split is 80% train, 10% validation, 10% test (sizes are truncated
    with ``int``, any remainder goes to the test set).

    Parameters
    ----------
    imgs : list of str
        Bare file names located in ``src_dir``. The list is sorted and then
        shuffled IN PLACE.
    src_dir : str
        Directory that currently holds the files.
    train_dir, test_dir, validation_dir : str
        Destination directories for each subset.
    rseed : int, optional
        Seed passed to ``random.seed``; note this reseeds the global
        ``random`` generator.
    """
    # Sort first so the shuffle result depends only on the set of names and
    # the seed, not on the order the caller happened to list them in.
    # (The original ``imgs.sort`` was missing its call parentheses.)
    imgs.sort()
    random.seed(rseed)
    random.shuffle(imgs)
    split_1 = int(0.8 * len(imgs))
    split_2 = int(0.9 * len(imgs))
    train = imgs[:split_1]
    validation = imgs[split_1:split_2]
    test = imgs[split_2:]

    move_images(train, src_dir, train_dir)
    move_images(test, src_dir, test_dir)
    move_images(validation, src_dir, validation_dir)

'''
define image augmentation functions to amplify dataset
'''
def random_rotation(image_array: ndarray):
    """Return ``image_array`` rotated by a random angle.

    The angle is drawn uniformly from [-25, 25] degrees using the global
    ``random`` generator and applied with ``skimage.transform.rotate``.
    """
    # pick a random angle between 25 degrees to the left and 25 degrees to the right
    random_degree = random.uniform(-25, 25)
    # ``transform`` is the module bound by ``from skimage import ... transform``;
    # the previous ``sk.transform`` referenced a name that is never imported.
    return transform.rotate(image_array, random_degree)

def horizontal_flip(image_array: ndarray):
    # horizontal flip doesn't need skimage, it's easy as flipping the image array of pixels !
    return image_array[:, ::-1]

def vertical_flip(image_array: ndarray):
    """Mirror the image top-to-bottom by reversing its first axis (rows)."""
    # plain array indexing is enough here, no skimage call required
    reversed_order = slice(None, None, -1)
    return image_array[reversed_order, :]

def _to_writable_image(img):
    """Return ``img`` in a form ``cv2.imwrite`` stores faithfully as 8-bit.

    Floating-point images are treated as intensities in [0, 1] (the
    convention of ``skimage.color.rgb2gray``, which ``import_images`` uses)
    and rescaled to ``uint8`` 0-255. Other dtypes are returned unchanged.
    """
    if np.issubdtype(img.dtype, np.floating):
        # Without rescaling, OpenCV converts unsupported float data straight
        # to 8-bit, so [0, 1] values collapse to 0/1 (an almost black image).
        return np.rint(np.clip(img, 0.0, 1.0) * 255).astype(np.uint8)
    return img


def augment_images(imgs, img_names, out_dir=None):
    """Write flipped copies of every image to an output directory.

    For each image three files are written with ``cv2.imwrite``:
    ``himg<name>`` (horizontal flip), ``vimg<name>`` (vertical flip) and
    ``hvimg<name>`` (both flips).

    Parameters
    ----------
    imgs : list of np.ndarray
        Images to augment.
    img_names : list of str
        File name for each image, same order as ``imgs``.
    out_dir : str, optional
        Directory to write into. When omitted, the module-level ``aug_dir``
        variable is used, which keeps existing callers working.
    """
    if out_dir is None:
        # Backward compatible fallback to the notebook-global variable.
        out_dir = aug_dir

    for i, img in enumerate(imgs):
        img_name = img_names[i]

        himg = horizontal_flip(img)
        vimg = vertical_flip(img)
        hvimg = horizontal_flip(vimg)

        # os.path.join gives the same path as plain concatenation when
        # out_dir ends with a separator, and a correct one when it does not.
        for prefix, flipped in (('himg', himg), ('vimg', vimg), ('hvimg', hvimg)):
            filename = os.path.join(out_dir, prefix + img_name)
            cv2.imwrite(filename, _to_writable_image(flipped))
        
'''
define function to display multiple images in a single plot
'''

def show_images(images, cols = 4, titles = None):
    """Display a list of images in a single figure with matplotlib.
    
    Parameters
    ---------
    images: List of np.arrays compatible with plt.imshow.
    
    cols (Default = 4): Number of columns in figure (number of rows is 
                        set to np.ceil(n_images/float(cols))).
    
    titles: List of titles corresponding to each image. Must have
            the same length as images. When omitted, the images are
            titled 'Image (1)', 'Image (2)', ...

    Nothing is drawn when ``images`` is empty.
    """
    assert((titles is None)or (len(images) == len(titles)))
    n_images = len(images)
    if n_images == 0:
        # An empty grid would also scale the figure to zero size below.
        return
    if titles is None: titles = ['Image (%d)' % i for i in range(1,n_images + 1)]

    # add_subplot takes (nrows, ncols, index) and requires integers.
    rows = int(np.ceil(n_images / float(cols)))

    fig = plt.figure()
    for n, (image, title) in enumerate(zip(images, titles)):
        ax = fig.add_subplot(rows, cols, n + 1)
        # Per-axes colormap for 2-D (grayscale) images instead of plt.gray(),
        # which would change the global default colormap for later plots.
        ax.imshow(image, cmap='gray' if image.ndim == 2 else None)
        ax.set_title(title, fontsize = 100)
    # Grow the figure with the number of images so each panel stays readable.
    fig.set_size_inches(np.array(fig.get_size_inches()) * n_images)
    plt.show()

        

In [5]:
'''
remove non-lung and poor quality images 

'''

# Folder holding the full, unfiltered image set.
SOURCE_ROOT = '../../data_full/images/'
# Root folder for rejected images; move_by_label places each file under
# DEST_ROOT/<Label>/, using the Label column of the CSV below.
DEST_ROOT = '../../data_full/removed/'
# CSV (header row + "External_ID, Label" rows) listing the images to remove.
labels_remove = '../../data_full/labels_remove.csv'

# Moves files on disk: re-running this cell after a successful run will
# report every already-moved file as an error.
move_by_label(SOURCE_ROOT, DEST_ROOT, labels_remove)


In [9]:
'''
move images to 0/1 folders based on labels
must modify the master label list to remove the filenames in the removal list
'''

# Same source folder as the removal step; only the images left there are sorted.
SOURCE_ROOT = '../../data_full/images/'
# Each image is moved to DEST_ROOT/<Label>/ (the later cells read the
# '0' and '1' sub-folders of this directory).
DEST_ROOT = '../../data_full/'
# Master label list (header row + "External_ID, Label" rows).
labels = '../../data_full/labels.csv'

# Entries whose file is no longer in SOURCE_ROOT (e.g. removed above) make
# the rename fail; move_by_label reports those and continues.
move_by_label(SOURCE_ROOT, DEST_ROOT, labels)

            

In [35]:
'''
move images to appropriate folder for training, testing, and validation
'''
# --- label 1 -----------------------------------------------------------
yes_dir = '../../data_full/1/'
yes_imgs = []
yes_names = []
# Fills yes_imgs (grayscale arrays) and yes_names (bare file names).
# Only the names are used by the split below.
# NOTE: import_images walks sub-folders but records bare file names, while
# split_dataset joins each name directly onto yes_dir -- so the images are
# expected to sit directly inside yes_dir.
import_images(yes_dir, yes_imgs, yes_names)

yes_train = '../../data_full/train/1/'
yes_test = '../../data_full/test/1/'
yes_validation = '../../data_full/validation/1/'
# Moves the files out of yes_dir (80% train / 10% validation / 10% test).
split_dataset(yes_names, yes_dir, yes_train, yes_test, yes_validation)

# --- label 0 (same procedure) ------------------------------------------
no_dir = '../../data_full/0/'
no_imgs = []
no_names = []
import_images(no_dir, no_imgs, no_names)

no_train = '../../data_full/train/0/'
no_test = '../../data_full/test/0/'
no_validation = '../../data_full/validation/0/'
split_dataset(no_names, no_dir, no_train, no_test, no_validation)

    

In [37]:
'''
amplify training dataset via image augmentation
write horizontally flipped, vertically flipped, and doubly flipped copies of all images in train folders
(random_rotation is defined above but is not applied in this step)
'''
# --- label 1 -----------------------------------------------------------
img_dir = '../../data_full/train/1'
# augment_images is called below without an output directory and picks up
# this module-level aug_dir, so it must be assigned before each call.
# The trailing '/' matters: the output file name is appended to it.
aug_dir = '../../data_full/augmented/1/'
imgs = []
img_names = []

import_images(img_dir, imgs, img_names)
augment_images(imgs, img_names)

# --- label 0 (same procedure; the variables above are reassigned) -------
img_dir = '../../data_full/train/0'
aug_dir = '../../data_full/augmented/0/'
imgs = []
img_names = []

import_images(img_dir, imgs, img_names)
augment_images(imgs, img_names)

