In [17]:
import glob
import math
import os
import random
import shutil
import sys
import zipfile as zf

# Don't show those pesky 'future warnings'
import warnings

import numpy as np
import requests as rq
from tqdm import tqdm, tqdm_notebook

sys.path.append('../src')
from project_utilities.config import DATA_DIR_RAW

In [10]:
# Where the chest X-ray archive is downloaded from, and where the downloaded
# zip is written (ZIP_PATH is joined onto DATA_DIR_RAW when it is used).
DOWNLOAD_URL = 'https://md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com/rscbjbr9sj-3.zip'
ZIP_PATH = 'raw/chest_xrays.zip'

In [8]:
# Create the folder that receives the download and the extracted images.
# makedirs + exist_ok keeps the cell re-runnable and also creates
# DATA_DIR_RAW itself when it does not exist yet.
os.makedirs(DATA_DIR_RAW / 'raw', exist_ok=True)

In [11]:
# Stream the archive to disk so it is never held in memory in one piece.
# Using the response as a context manager releases the connection when done,
# raise_for_status() stops an HTTP error page from being saved as the zip,
# and the timeout prevents the cell from hanging forever on a dead connection.
with rq.get(DOWNLOAD_URL, stream=True, timeout=60) as r:
    r.raise_for_status()
    with open(DATA_DIR_RAW / ZIP_PATH, 'wb') as c_ray:
        # 1 MiB chunks: far fewer write calls than 1 KiB for a large archive.
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            if chunk:
                c_ray.write(chunk)

In [65]:
def extract_to(zip_file, member, dest_dir):
    """Extract one archive member into ``dest_dir``, dropping its folder path.

    Parameters
    ----------
    zip_file : zipfile.ZipFile
        An open archive to read from.
    member : str
        Name of the entry inside the archive.
    dest_dir : str or os.PathLike
        Folder that receives the file; it is created when missing.

    Returns
    -------
    None
        Directory entries, and files already present in ``dest_dir``, are
        skipped without writing anything.
    """
    filename = os.path.basename(member)
    # Directory entries end with '/', so their basename is empty.
    if not filename:
        return

    # Join with a real path separator: plain string concatenation produced
    # '<dest_dir><filename>', which never exists, so nothing was ever skipped.
    target_path = os.path.join(dest_dir, filename)
    if os.path.isfile(target_path):
        return

    os.makedirs(dest_dir, exist_ok=True)

    # copy file (taken from zipfile's extract); a single `with` guarantees the
    # archive member is closed even when opening the target file fails.
    with zip_file.open(member) as source, open(target_path, "wb") as target:
        shutil.copyfileobj(source, target)
            

In [21]:
            
# The download is a zip that contains another zip; pull the inner archive out
# into DATA_DIR_RAW / 'raw' and remember where it ends up.
zang_zip = DATA_DIR_RAW / 'raw/ZhangLabData.zip'

with zf.ZipFile(DATA_DIR_RAW / ZIP_PATH) as archive:
    archive.extract(member='ZhangLabData.zip', path=DATA_DIR_RAW / 'raw/')

In [22]:
# Copy the train/test images of both classes out of the inner archive into
# raw/<split>/<class> folders, then delete both zip files.
with zf.ZipFile(zang_zip) as zang_archive:

    archive_root = 'CellData/chest_xray/'
    # (folder inside the archive, destination under DATA_DIR_RAW).
    # Routes are tried in this order and the first match wins; entries that
    # match none of them are ignored.
    routes = (
        ('train/NORMAL/', 'raw/train/NORMAL'),
        ('train/PNEUMONIA/', 'raw/train/PNEUMONIA'),
        ('test/NORMAL/', 'raw/test/NORMAL'),
        ('test/PNEUMONIA/', 'raw/test/PNEUMONIA'),
    )

    for entry in zang_archive.namelist():
        for folder, dest in routes:
            if archive_root + folder in entry:
                extract_to(zang_archive, entry, DATA_DIR_RAW / dest)
                break

for leftover in ('raw/chest_xrays.zip', 'raw/ZhangLabData.zip'):
    os.remove(DATA_DIR_RAW / leftover)

In [23]:
# Switch the working directory to the raw data folder. Every relative path
# used further down ('train/...', 'valid/...', 'preprocessed/...', 'npy/...')
# is resolved from here, so this cell must run before any of them.
os.chdir(DATA_DIR_RAW / 'raw')
os.getcwd()

'C:\\Users\\bigal\\OneDrive\\Documents\\FlatIron\\Module_4_Final_Project\\mod-4-project-image-classification\\data\\raw'

In [25]:
# Number of files per class in the training folders (sub-directories excluded).
# isdir must be given the full path: a bare entry name would be resolved
# against the working directory instead of the class folder.
train_norm = len([name for name in os.listdir("train/NORMAL")
                  if not os.path.isdir(os.path.join("train/NORMAL", name))])
train_pnue = len([name for name in os.listdir("train/PNEUMONIA")
                  if not os.path.isdir(os.path.join("train/PNEUMONIA", name))])

# Size of the validation hold-out: 20% of the entries in each class folder,
# rounded up.
sample_norm = math.ceil(len(os.listdir('train/NORMAL')) * 0.2)
sample_pnue = math.ceil(len(os.listdir('train/PNEUMONIA')) * 0.2)

In [None]:
# Hold out a validation set by moving files out of the training folders.
# Each class is handled on its own: a class whose 'valid' folder already
# exists is left alone (re-running never moves a second batch), and a
# half-finished earlier run (one folder present, the other missing) is
# completed instead of crashing in makedirs or being skipped entirely.
split_rng = random.Random(42)  # fixed seed -> same split on every fresh run
for label, n_valid in (('NORMAL', sample_norm), ('PNEUMONIA', sample_pnue)):
    valid_dir = f'valid/{label}'
    if os.path.isdir(valid_dir):
        continue
    os.makedirs(valid_dir)

    # glob returns matches in arbitrary order; sort so the seed always
    # selects the same files.
    candidates = sorted(glob.glob(f'train/{label}/*'))
    for i in split_rng.sample(candidates, n_valid):
        shutil.move(i, valid_dir + '/')


In [34]:
# Folder layout, relative to the working directory.
TRAIN_NORMAL_PATH = 'train/NORMAL'
TRAIN_PNEUMONIA_PATH = 'train/PNEUMONIA'
PREPROC_NORMAL_PATH = 'preprocessed/NORMAL'
PREPROC_PNEUMONIA_PATH = 'preprocessed/PNEUMONIA'

VALID_NORMAL_PATH = 'valid/NORMAL'
VALID_PNEUMONIA_PATH = 'valid/PNEUMONIA'

# Augmentor output folders. Augmentor joins the output folder onto the source
# folder (train/<class>), hence the '../../' to climb back out of it.
# They are derived from the PREPROC_* constants so that the folder Augmentor
# writes to is exactly the folder read back later; the names must match in
# case too, or the read fails on a case-sensitive filesystem.
NORMAL_DEST = '../../' + PREPROC_NORMAL_PATH
PNEUMONIA_DEST = '../../' + PREPROC_PNEUMONIA_PATH

In [35]:
import Augmentor as ag

def im_augemt(source, destination, num_samp):
    """Write ``num_samp`` augmented images generated from ``source``.

    The Augmentor pipeline applies, in this order: left/right flip
    (probability 0.5), rotation with a left and right limit of 10
    (probability 0.3), skew of magnitude 0.5 (probability 0.4), histogram
    equalisation (always) and a zoom between 1.1x and 1.3x (probability 0.2).

    Parameters
    ----------
    source : str
        Folder containing the input images.
    destination : str
        Output folder passed to Augmentor, which resolves it relative to
        ``source``.
    num_samp : int
        Number of samples requested from the pipeline.

    Returns
    -------
    None
    """
    pipeline = ag.Pipeline(source_directory=source, output_directory=destination)
    pipeline.flip_left_right(probability=0.5)
    pipeline.rotate(probability=0.3, max_left_rotation=10, max_right_rotation=10)
    pipeline.skew(probability=0.4, magnitude=0.5)
    pipeline.histogram_equalisation(probability=1.0)
    pipeline.zoom(probability=0.2, min_factor=1.1, max_factor=1.3)
    pipeline.sample(num_samp, multi_threaded=True)
    
# Augmentation requests, one per class: NORMAL asks for
# (train_norm + train_pnue) samples, PNEUMONIA for (train_pnue - train_norm).
augment_jobs = (
    (TRAIN_NORMAL_PATH, NORMAL_DEST, train_norm + train_pnue),
    (TRAIN_PNEUMONIA_PATH, PNEUMONIA_DEST, train_pnue - train_norm),
)
for src_dir, out_dir, n_new in augment_jobs:
    im_augemt(src_dir, out_dir, n_new)

Initialised with 1079 image(s) found.
Output directory set to train/NORMAL\../../preprocessed/Normal.

Processing <PIL.Image.Image image mode=L size=1928x1303 at 0x18FAC946748>: 100%|█| 5233/5233 [03:33<00:00, 24.54 Samples/s]


Initialised with 3106 image(s) found.
Output directory set to train/PNEUMONIA\../../preprocessed/PNEUMONIA.

Processing <PIL.Image.Image image mode=L size=936x696 at 0x18FB7F772B0>: 100%|█| 2535/2535 [00:47<00:00, 53.88 Samples/s]   


In [36]:
# Display the current working directory; the relative paths used in the
# following cells are resolved against it.
os.getcwd()

'C:\\Users\\bigal\\OneDrive\\Documents\\FlatIron\\Module_4_Final_Project\\mod-4-project-image-classification\\data\\raw'

In [6]:
# Display the per-class training file counts (NORMAL, PNEUMONIA).
train_norm, train_pnue

(1079, 3106)

In [40]:
# Move the augmented NORMAL images into the training folder. The listing and
# the move both use PREPROC_NORMAL_PATH, so they always refer to the same
# folder instead of a literal that could drift from the constant.
files = os.listdir(PREPROC_NORMAL_PATH)
for file in tqdm_notebook(files):
    shutil.move(f'{PREPROC_NORMAL_PATH}/{file}', TRAIN_NORMAL_PATH)

HBox(children=(IntProgress(value=0, max=5233), HTML(value='')))




In [41]:
# Move the augmented PNEUMONIA images into the training folder. The listing
# and the move both use PREPROC_PNEUMONIA_PATH, so they always refer to the
# same folder instead of a literal that could drift from the constant.
files = os.listdir(PREPROC_PNEUMONIA_PATH)
for file in tqdm_notebook(files):
    shutil.move(f'{PREPROC_PNEUMONIA_PATH}/{file}', TRAIN_PNEUMONIA_PATH)

HBox(children=(IntProgress(value=0, max=2535), HTML(value='')))




In [44]:
# Delete the staging folder that held the augmented images. The guard keeps
# the cell re-runnable: rmtree raises if the folder is already gone.
if os.path.isdir('preprocessed'):
    shutil.rmtree('preprocessed')

In [56]:
# Drop the '.DS_Store' file from each training folder when one is present.
for class_dir in (TRAIN_NORMAL_PATH, TRAIN_PNEUMONIA_PATH):
    ds_store = class_dir + '/.DS_Store'
    if os.path.isfile(ds_store):
        os.remove(ds_store)

found it


In [53]:
from PIL import Image

def im_array(path, size=224):
    """Load the single-band images in ``path`` as flattened pixel rows.

    Every file in ``path`` is opened as an image. Images with more than one
    band (whose pixel array would not be 2-D) are skipped; the rest are
    resized to ``size`` x ``size`` and flattened.

    Parameters
    ----------
    path : str
        Folder containing the image files.
    size : int, optional
        Edge length in pixels of the resized square image (default 224).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_kept_images, size ** 2)``.
    """
    images = []
    for file in tqdm_notebook(os.listdir(path), desc='Processing', unit='files'):
        # Open each file once and close it deterministically; the original
        # opened and decoded every image twice and never closed the handles.
        with Image.open(path + '/' + file) as img:
            # One band <=> a 2-D pixel array; checking the band count does
            # not require decoding the full-size image.
            if len(img.getbands()) != 1:
                continue
            images.append(np.array(img.resize((size, size))))

    x_data = np.array(images)
    return x_data.reshape(len(x_data), size ** 2)

def create_image_array(orig, label=0):
    """Build a labelled data matrix from the images in ``orig``.

    Parameters
    ----------
    orig : str
        Folder passed on to ``im_array``.
    label : int, optional
        Class marker; any truthy value produces a label column of ones,
        otherwise a column of zeros (default 0).

    Returns
    -------
    numpy.ndarray
        The pixel rows divided by 255 with the label appended as the last
        column.
    """
    pixels = im_array(orig) / 255
    n_rows = int(pixels.shape[0])

    if label:
        labels = np.ones((n_rows, 1))
    else:
        labels = np.zeros((n_rows, 1))

    return np.hstack((pixels, labels))


In [57]:
# Training matrix: NORMAL rows (label 0) stacked on top of PNEUMONIA rows
# (label 1).
train_data = np.vstack((
    create_image_array(TRAIN_NORMAL_PATH, label=0),
    create_image_array(TRAIN_PNEUMONIA_PATH, label=1),
))

HBox(children=(IntProgress(value=0, description='Processing', max=6312, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Processing', max=5641, style=ProgressStyle(description_width=…




In [49]:
# Validation matrix: NORMAL rows (label 0) stacked on top of PNEUMONIA rows
# (label 1).
val_data = np.vstack((
    create_image_array(VALID_NORMAL_PATH, label=0),
    create_image_array(VALID_PNEUMONIA_PATH, label=1),
))

HBox(children=(IntProgress(value=0, description='Processing', max=270, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Processing', max=777, style=ProgressStyle(description_width='…




In [58]:
# Shuffle the rows in place. Pixels and label stay together because the label
# is the last column of each row. A seeded generator makes the row order
# reproducible across kernel restarts.
shuffle_rng = np.random.RandomState(42)
shuffle_rng.shuffle(train_data)
shuffle_rng.shuffle(val_data)

print(f'Shape for train_data: {(train_data.shape)}')
print(f'Shape for val_data: {val_data.shape}')

# Last column is the label. y_train is kept 1-D, like y_val; the former bare
# `y_train.reshape(y_train.size, 1)` statement discarded its result and
# therefore never changed the shape, so it has been removed.
y_train = train_data[:,-1]
print(f'Shape for y_train: {y_train.shape}')

# Everything before the last column is pixel data.
X_train = train_data[:,:-1]
print(f'Shape for X_train: {X_train.shape}')

Shape for train_data: (11543, 50177)
Shape for val_data: (983, 50177)
Shape for y_train: (11543,)
Shape for X_train: (11543, 50176)


In [59]:
# Split the validation matrix: pixel columns first, label (last column) second.
X_val, y_val = val_data[:, :-1], val_data[:, -1]

In [60]:
# Display the shape of the validation feature matrix.
X_val.shape

(983, 50176)

In [63]:
# Save the combined arrays (pixel columns plus the label column) under
# 'npy/' in the working directory. exist_ok lets the cell be re-run to
# overwrite earlier files instead of failing on the existing folder.
os.makedirs('npy', exist_ok=True)
np.save('npy/train_data.npy', train_data)
np.save('npy/val_data.npy', val_data)