In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [9]:
import os
os.chdir('drive/My Drive')
ROOT = os.getcwd()
os.chdir(ROOT + '/Colab Notebooks')

import sys
sys.path.append('../src')
from project_utilities.config import DATA_DIR_RAW

import numpy as np
import math
#import scipy.ndimage as nd
import os
import shutil
import random
import glob
import zipfile as zf
import requests as rq
from PIL import Image
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img 

from tqdm import tqdm, tqdm_notebook

# Don't show those pesky 'future warnings'
import warnings

In [2]:
DOWNLOAD_URL = 'https://md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com/rscbjbr9sj-3.zip'
ZIP_PATH = 'raw/chest_xrays.zip'

TRAIN_NORMAL_PATH = 'train/NORMAL'
TRAIN_PNEUMONIA_PATH = 'train/PNEUMONIA'
PREPROC_NORMAL_PATH = 'preprocessed/NORMAL'
PREPROC_PNEUMONIA_PATH = 'preprocessed/PNEUMONIA'
TEST_PNEUMONIA_PATH = 'test/PNUEMONIA'

VALID_NORMAL_PATH = 'valid/NORMAL'
VALID_PNEUMONIA_PATH = 'valid/PNEUMONIA'

NORMAL_DEST = 'preprocessed/Normal'
PNEUMONIA_DEST = 'preprocessed/PNEUMONIA'

In [3]:
def extract_to(zip_file, member, dest_dir):
    
    filename = os.path.basename(member)
    # skip directories
    if not filename: 
        return
    
    test_dir = str(dest_dir) + filename
    if os.path.isfile(test_dir): 
        return

    # copy file (taken from zipfile's extract)
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    source = zip_file.open(member)
    target = open(os.path.join(dest_dir, filename), "wb")
    with source, target:
        shutil.copyfileobj(source, target)
  

In [4]:

def im_array(path):
    x_data = np.array( [np.array(Image.open(path + '/' + file).resize((224,224))) 
                        for file in tqdm_notebook(os.listdir(path), desc='Processing', unit='files') 
                        if len(np.array(Image.open(path + '/' + file)).shape) == 2] )
    return x_data.flatten().reshape(len(x_data), 224**2)

def create_image_array(orig, label=0):
    
    norm_array = im_array(orig)
    
    norm_array = norm_array/255
    norm_y = np.ones((int(norm_array.shape[0]),1)) if label else np.zeros((int(norm_array.shape[0]),1))
    norm_data = np.hstack((norm_array, norm_y))
    return norm_data


In [5]:

def im_augemt(source, destination, num_aug):
    # Code inspired by the following article:  
    # https://www.geeksforgeeks.org/python-data-augmentation/
    if os.path.isdir(destination) is False:
        #print('creating -> {destination}')
        os.mkdir(destination)
        src = ROOT + '/data/raw/' + source
        for name in tqdm_notebook(os.listdir(src)):
            if not name.endswith('jpeg'):
                continue            
            img = load_img(src + '/' + name)
            #print(f'Getting -> {img}')
            image = img_to_array(img)
            image = image.reshape((1,) + image.shape)
             
            datagen =  ImageDataGenerator(rotation_range=10,
                                          samplewise_center=True,
                                          samplewise_std_normalization= True,
                                          width_shift_range=0.2,
                                          height_shift_range=0.2,
                                          shear_range=0.2,
                                          zoom_range=0.2,
                                          fill_mode="nearest",
                                          cval=0.0,
                                          rescale = 0.2,
                                          horizontal_flip=True
                                          )
            #datagen.fit(image)
            for i, batch in enumerate(datagen.flow(image,
                                                   save_to_dir=destination,
                                                   batch_size=1,
                                                   save_prefix='aug',
                                                   save_format='jpeg'
                                                   )
                                                   ):
                if i > num_aug:
                  break

'''  
    p = ag.Pipeline(source, destination ) 
    p.flip_left_right(0.5) 
    p.rotate(0.3, 10, 10) 
    p.skew(0.4, 0.5) 
    p.histogram_equalisation(probability=1.0)
    p.zoom(probability = 0.2, min_factor = 1.1, max_factor = 1.3) 
    p.sample(num_samp, multi_threaded=True) 
  '''  

'  \n    p = ag.Pipeline(source, destination ) \n    p.flip_left_right(0.5) \n    p.rotate(0.3, 10, 10) \n    p.skew(0.4, 0.5) \n    p.histogram_equalisation(probability=1.0)\n    p.zoom(probability = 0.2, min_factor = 1.1, max_factor = 1.3) \n    p.sample(num_samp, multi_threaded=True) \n  '

In [6]:

def copy_from_train():   
    files = os.listdir(TRAIN_NORMAL_PATH)
    for file in tqdm_notebook(files):
        #shutil.move(f'{PREPROC_NORMAL_PATH}/{file}', TRAIN_NORMAL_PATH)
        shutil.copy(f'{TRAIN_NORMAL_PATH}/{file}', PREPROC_NORMAL_PATH)        

    files = os.listdir(TRAIN_PNEUMONIA_PATH)
    for file in tqdm_notebook(files):
        shutil.copy(f'{TRAIN_PNEUMONIA_PATH}/{file}', PREPROC_PNEUMONIA_PATH)
        
    #shutil.rmtree('preprocessed')

In [7]:

def create_train_val():    
    if os.path.isfile(PREPROC_NORMAL_PATH + '/.DS_Store'):
        os.remove(PREPROC_NORMAL_PATH + '/.DS_Store')
        
    if os.path.isfile(PREPROC_PNEUMONIA_PATH + '/.DS_Store'):
        os.remove(PREPROC_PNEUMONIA_PATH + '/.DS_Store')

    train_data = np.concatenate([create_image_array(PREPROC_NORMAL_PATH, label=0),
                                create_image_array(PREPROC_PNEUMONIA_PATH, label=1)], 
                                axis=0
                              )
    
    val_data = np.concatenate([create_image_array(VALID_NORMAL_PATH,  label=0),
                                create_image_array(VALID_PNEUMONIA_PATH, label=1)], 
                                axis=0
                              )
    
    test_data = create_image_array(TEST_PNEUMONIA_PATH, label=1)

    np.random.shuffle(train_data)
    np.random.shuffle(val_data)
    return train_data, val_data, test_data

'''   
    print(f'Shape for train_data: {(train_data.shape)}')
    print(f'Shape for val_data: {val_data.shape}')

    y_train = train_data[:,-1]
    y_train.reshape(y_train.size,1)
    print(f'Shape for y_train: {y_train.shape}')

    X_train = train_data[:,:-1]
    print(f'Shape for X_train: {X_train.shape}')
    
    y_val = val_data[:,-1]
    X_val = val_data[:,:-1]+
    X_val.shape
'''


"   \n    print(f'Shape for train_data: {(train_data.shape)}')\n    print(f'Shape for val_data: {val_data.shape}')\n\n    y_train = train_data[:,-1]\n    y_train.reshape(y_train.size,1)\n    print(f'Shape for y_train: {y_train.shape}')\n\n    X_train = train_data[:,:-1]\n    print(f'Shape for X_train: {X_train.shape}')\n    \n    y_val = val_data[:,-1]\n    X_val = val_data[:,:-1]+\n    X_val.shape\n"

### Create the directory that will hold all the data

In [None]:
          
if os.path.isdir('../data') is False:
    os.mkdir('../data')
    os.mkdir(DATA_DIR_RAW / 'raw')
    r = rq.get(DOWNLOAD_URL, stream=True)
    

#### Download the data

In [None]:

with open(DATA_DIR_RAW / ZIP_PATH, 'wb') as c_ray:
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            c_ray.write(chunk)
    

#### Extract zip file

In [None]:
       
with zf.ZipFile(DATA_DIR_RAW / ZIP_PATH) as archive:
    archive.extract('ZhangLabData.zip', DATA_DIR_RAW / 'raw/')

zang_zip = DATA_DIR_RAW / 'raw/ZhangLabData.zip';


#### From the extracted zip file, extract only the specific paths that hold the files of interest.

In [None]:

with zf.ZipFile(zang_zip) as zang_archive:

    zip_paths = 'CellData/chest_xray/'
    for path in zang_archive.namelist():
        if zip_paths + 'train/NORMAL/' in path:
            extract_to(zang_archive,path, DATA_DIR_RAW / 'raw/train/NORMAL')
        elif zip_paths + 'train/PNEUMONIA/' in path:
            extract_to(zang_archive, path, DATA_DIR_RAW / 'raw/train/PNEUMONIA')
        elif zip_paths + 'test/NORNAL' in path:
            extract_to(zang_archive, path, DATA_DIR_RAW / 'raw/test/NORMAL')
        elif zip_paths + 'test/PNEUMONIA' in path:
            extract_to(zang_archive, path, DATA_DIR_RAW / 'raw/test/PNUEMONIA')
        else:
            continue



#### Remove the zip files. No longer needed.

In [None]:
os.remove(DATA_DIR_RAW / 'raw/chest_xrays.zip')
os.remove(DATA_DIR_RAW / 'raw/ZhangLabData.zip') 

In [None]:
os.chdir(DATA_DIR_RAW / 'raw')
os.getcwd()

'/content/drive/My Drive/data/raw'

#### Manually create a validation set from the train data using 20%

In [None]:
train_norm = len(os.listdir("train/NORMAL"))
train_pnue = len(os.listdir("train/PNEUMONIA"))

if os.path.isdir('valid/NORMAL') is False:
    sample_norm = math.ceil(train_norm * 0.2)
    sample_pnue = math.ceil(train_pnue * 0.2)
    os.makedirs('valid/NORMAL')
    os.makedirs('valid/PNEUMONIA')

    for i in random.sample(glob.glob('train/NORMAL/*'), sample_norm):
        shutil.move(i, 'valid/NORMAL/')      
    for i in random.sample(glob.glob('train/PNEUMONIA/*'), sample_pnue):
        shutil.move(i, 'valid/PNEUMONIA/')


In [None]:
import math
math.ceil((train_pnue - train_norm) / train_pnue)

1

In [10]:
os.chdir(ROOT + '/data/raw')

#### Create augemented data to used with XGBoost

In [None]:
if os.path.isdir('./preprocessed') is False:
  
    os.mkdir('./preprocessed/')   
    os.chdir('./preprocessed/')
  
    # Data augmentation
    im_augemt(TRAIN_NORMAL_PATH, 'NORMAL', 2) 

    #im_augemt(TRAIN_PNEUMONIA_PATH, 'PNEUMONIA', math.ceil((train_pnue - train_norm)/train_pnue) ) 
    os.mkdir('PNEUMONIA')
    os.chdir('../')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(FloatProgress(value=0.0, max=1079.0), HTML(value='')))




In [None]:
os.mkdir('preprocessed/PNEUMONIA')

#### Copy files from train into preprocess to complete the dataset

In [None]:
copy_from_train()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=1079.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(FloatProgress(value=0.0, max=3107.0), HTML(value='')))




In [None]:
train_data, val_data, test_data = create_train_val()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, description='Processing', max=4550.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Processing', max=3106.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Processing', max=270.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Processing', max=777.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Processing', max=390.0, style=ProgressStyle(description_w…




In [None]:

os.mkdir('npy')
np.save('npy/train_data.npy', train_data)
np.save('npy/val_data.npy', val_data)
np.save('npy/test_data.npy', test_data)

In [18]:
def remove_files(mydir):
    print(f'Directory is: {mydir}')  
    filelist = [ f for f in os.listdir(mydir) if os.path.isfile(f)]
    print(f'Num files: {len(filelist)}')
    for f in filelist:
        print(f'Removing file: {f}')      
        os.remove(os.path.join(mydir, f))



In [None]:

dir_list = [TRAIN_NORMAL_PATH, TRAIN_PNEUMONIA_PATH, VALID_NORMAL_PATH, VALID_PNEUMONIA_PATH, TEST_PNEUMONIA_PATH, './npy/']
for d in tqdm_notebook(dir_list, desc='Proccessing', unit='Files'):
    print(f'Checking -> {d}')
    if os.path.isdir(d):
        print(f'Deleting files in -> {d}')
        remove_files(d)       

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, description='Proccessing', max=6.0, style=ProgressStyle(description_wi…

Checking -> train/NORMAL
Deleting files in -> train/NORMAL
Checking -> train/PNEUMONIA
Deleting files in -> train/PNEUMONIA
Checking -> valid/NORMAL
Deleting files in -> valid/NORMAL
Checking -> valid/PNEUMONIA
Deleting files in -> valid/PNEUMONIA
Checking -> test/PNUEMONIA
Deleting files in -> test/PNUEMONIA
Checking -> ./npy/



In [None]:
os.chdir(ROOT)
os.getcwd()
shutil.rmtree('./data')