# Malaria Detection: ETL

In [1]:
# Remove the old resized-image folder so the ETL starts from a clean state.
# -f keeps the cell quiet (and idempotent) when the folder does not exist yet,
# e.g. on the very first run.
!rm -rf ./rCell_images
!ls

a2_m1.json			       img_7.jpg
a2_m1.json.zip			       img_8.jpg
a2_m1.json.zip.base64		       img_9.jpg
a2_m2.json			       malaria2.h5
a2_m2.json.zip			       malaria.h5
a2_m2.json.zip.base64		       malaria_history.pickle
a2_m3.json			       malaria_model.h5
a2_m3.json.zip			       mnist.onnx
a2_m3.json.zip.base64		       model.pickle
a2_m4.json			       my_modelx.h5
a2_m4.json.zip			       nix.png
a2_m4.json.zip.base64		       predData
a.png				       __pycache__
cell_images			       pytorch-mnist.zip
cell-images-for-detecting-malaria.zip  rklib.py
data				       rklib.pyc
dl4j-snapshot.jar		       saved_models.pth
img_1.jpg			       save.p
img_2.jpg			       scratch_space
img_3.jpg			       sonar.csv
img_4.jpg			       testData
img_5.jpg			       trainData
img_6.jpg


In [6]:
# Install python-resize-image (provides resizeimage.resize_contain used below).
# Pinned to the version this notebook was originally run with so that a
# re-run resolves the same package.
!pip install python-resize-image==1.1.18

Collecting python-resize-image
  Using cached https://files.pythonhosted.org/packages/c7/b5/01e49796187415278796d5c64f8fff750a2e27765155be20876dffaabce3/python_resize_image-1.1.18-py2.py3-none-any.whl
Collecting requests>=2.19.1 (from python-resize-image)
  Using cached https://files.pythonhosted.org/packages/7d/e3/20f3d364d6c8e5d2353c72a67778eb189176f08e873c9900e10c0287b84b/requests-2.21.0-py2.py3-none-any.whl
Collecting Pillow>=5.1.0 (from python-resize-image)
[?25l  Downloading https://files.pythonhosted.org/packages/0d/f3/421598450cb9503f4565d936860763b5af413a61009d87a5ab1e34139672/Pillow-5.4.1-cp27-cp27mu-manylinux1_x86_64.whl (2.0MB)
[K    100% |████████████████████████████████| 2.0MB 5.1MB/s eta 0:00:01
[31mtensorflow 1.2.1 has requirement bleach==1.5.0, but you'll have bleach 2.0.0 which is incompatible.[0m
[31mtensorflow 1.2.1 has requirement html5lib==0.9999999, but you'll have html5lib 0.999999999 which is incompatible.[0m
[31mibm-cos-sdk-core 2.0.1 has requirement re

In [7]:
# We will use a convolutional deep net for image classification.
# Based on data exploration I decided to resize each image to 64x64 px,
# which will be the input layer size of the net.
# We keep each image's aspect ratio and pad its background with 0s if needed.
#
# Every PNG in the zip archive is resized and written to
#   <resDataDir>/<class>/r<original file name>
# Images that cannot be processed are collected in `probImgs` so that the
# following cells can deal with them.

import os  # needed here: this cell must not rely on a later cell's import
import tqdm
import numpy as np
from PIL import Image
import zipfile
from resizeimage import resizeimage

# target size (width, height) of the resized images
imgSize = [64,64]

# output root directory and the class sub-directories it contains
resDataDir = './rCell_images'

sDirs = ['Parasitized', 'Uninfected']

if not os.path.isdir(resDataDir):
    os.mkdir(resDataDir)
    print('Created {}'.format(resDataDir))

for sD in sDirs:
    dir2make = os.path.join(resDataDir, sD)
    if not os.path.isdir(dir2make):
        os.mkdir(dir2make)
        print('Created {}'.format(dir2make))

# archive members that could not be resized
probImgs = []

# the context manager closes the archive even if the loop is interrupted
with zipfile.ZipFile('./cell-images-for-detecting-malaria.zip') as zipRef:
    cFiles = zipRef.namelist()
    # only real PNG files, not every path that merely contains "png"
    cImgs  = [s for s in cFiles if s.lower().endswith('.png')]

    for i in tqdm.trange(len(cImgs)):
        cImg = cImgs[i]

        # the class is encoded in the member path; never fall back to the
        # class of the previous image when it cannot be determined
        cDir = next((sD for sD in sDirs if sD in cImg), None)
        if cDir is None:
            print('Skipping {} (no known class in path)'.format(cImg))
            continue

        fImg = os.path.join(resDataDir, cDir, "r"+os.path.basename(cImg))

        # resize image
        try:
            with zipRef.open(cImg) as zImg:
                with Image.open(zImg) as srcImg:
                    imgObj = resizeimage.resize_contain(srcImg, imgSize, resample=Image.LANCZOS, bg_color=(0, 0, 0, 0))
            imgObj.save(fImg)
            imgObj.close()
        except Exception as err:
            # keep going, but record the file and show why it failed
            print('Could not work on {}: {}'.format(cImg, err))
            probImgs.append(cImg)

Created ./rCell_images
Created ./rCell_images/Parasitized
Created ./rCell_images/Uninfected


100%|██████████| 27558/27558 [03:28<00:00, 132.41it/s]


The following three cells are a workaround to resize a few files I had issues with. Due to a strange bug I was not able to loop over this subset of images and do the resizing automatically.


In [8]:
# Re-open every image that could not be resized in the bulk pass.
# The opened PIL images end up in `zs`, in the same order as `probImgs`.

import matplotlib.pyplot as plt

zs = []

if not probImgs:
    print('No problem images found -> skipping this step')
else:
    # the archive stays open on purpose: the images in `zs` are read from it
    zipRef = zipfile.ZipFile('./cell-images-for-detecting-malaria.zip')
    zs = [Image.open(zipRef.open(member)) for member in probImgs]

No problem images found -> skipping this step


The next step is curious indeed and likely caused by a bug in the PIL library. <br>
This step is skipped if all images were correctly resized above!
<br><br>
I had to alternate between the following two cells, which contain **identical** code, to get the last 12 images resized manually: increment k by hand and switch cells for each image ...<br><br> This must be a curious bug or interaction between the Jupyter notebook and the PIL library that results in different behaviour for **identical** code depending on the cell that executes it ...

In [9]:
# Manual resize of ONE problem image (first of two identical cells).
# The code is identical to the next cell; I had to run the two cells in an
# alternating way (incrementing k by hand each time) to work around a very
# strange bug.
# Relies on names defined in earlier cells: probImgs, zs, resDataDir, imgSize,
# resizeimage, Image, plt and os.
if len(probImgs) > 0:
    # index of the problem image to process -- edit by hand before each run
    # NOTE(review): raises IndexError if probImgs has k or fewer entries
    k = 10
    cImg = probImgs[k]
    # derive the class folder from the archive path
    # NOTE(review): if neither name occurs in cImg, cDir keeps whatever value
    # an earlier cell left behind (or is undefined)
    if "Parasitized" in cImg:
        # infected class
        cDir = 'Parasitized'

    elif "Uninfected" in cImg:
        # healthy class
        cDir = 'Uninfected'

    # output path: <resDataDir>/<class>/r<original file name>
    fImg = os.path.join(resDataDir, cDir, "r"+os.path.basename(cImg))  

    # resize the already opened image zs[k] and write the result to fImg

    imgObj = resizeimage.resize_contain(zs[k], imgSize, resample=Image.LANCZOS, bg_color=(0, 0, 0, 0))
    imgObj.save(fImg)
    imgObj.close()
    
    # show which archive member was processed and where it was written
    print(cImg)
    print(fImg)
        
    # display the original (not the resized) image for a visual check
    plt.imshow(zs[k])
    
else:
    print('No problem images found -> skipping this step')

No problem images found -> skipping this step


In [10]:
# Manual resize of ONE problem image (second of two identical cells).
# Same code as the previous cell apart from the value of k; the two cells are
# run alternately as a workaround.
# Relies on names defined in earlier cells: probImgs, zs, resDataDir, imgSize,
# resizeimage, Image, plt and os.
if len(probImgs) > 0:
    # index of the problem image to process -- edit by hand before each run
    # NOTE(review): raises IndexError if probImgs has k or fewer entries
    k = 12
    cImg = probImgs[k]
    # derive the class folder from the archive path
    # NOTE(review): if neither name occurs in cImg, cDir keeps whatever value
    # an earlier cell left behind (or is undefined)
    if "Parasitized" in cImg:
        # infected class
        cDir = 'Parasitized'

    elif "Uninfected" in cImg:
        # healthy class
        cDir = 'Uninfected'

    # output path: <resDataDir>/<class>/r<original file name>
    fImg = os.path.join(resDataDir, cDir, "r"+os.path.basename(cImg))  

    # resize the already opened image zs[k] and write the result to fImg

    imgObj = resizeimage.resize_contain(zs[k], imgSize, resample=Image.LANCZOS, bg_color=(0, 0, 0, 0))
    imgObj.save(fImg)
    imgObj.close()
    
    # show which archive member was processed and where it was written
    print(cImg)
    print(fImg)
        
    # display the original (not the resized) image for a visual check
    plt.imshow(zs[k])
    
else:
    print('No problem images found -> skipping this step')

No problem images found -> skipping this step


In [11]:
# Sanity check: count the resized PNGs of each class.
# Both classes should contain the same number of images (13779 each).
p = os.listdir('rCell_images/Parasitized')
u = os.listdir('rCell_images/Uninfected')

pI = list(filter(lambda fName: "png" in fName, p))
uI = list(filter(lambda fName: "png" in fName, u))

# uninfected count first, then parasitized
for imgList in (uI, pI):
    print( len(imgList) )

13779
13779


In [None]:
# We next split our images into three parts: training (70%), testing (20%)
# and evaluation (10%).
# Images are copied into three folders with one sub-directory per class;
# this allows us to load the data directly with the keras image processor
# library later on.

import os
import shutil
import random
from tqdm import trange
from random import shuffle
import collections

# define split ratios
# must sum up to 1
trData  = 0.7
teData  = 0.2
prData  = 0.1
assert abs(trData + teData + prData - 1.0) < 1e-9, 'split ratios must sum up to 1'

# fixed seed -> the same split is produced on every run
splitSeed = 42

# source directory with scaled images
dataDir = './rCell_images'

# target directories
trainDir = './trainData'
testDir  = './testData'
predDir  = './predData'

# Delete old dirs in case they exist.
# Each directory is checked on its own: a missing trainDir must not prevent
# stale testDir / predDir folders from being removed (os.mkdir below would
# fail on them otherwise).
oldDirs = [d for d in (trainDir, testDir, predDir) if os.path.isdir(d)]
if not oldDirs:
    print('No previous dirs found')
for oldDir in oldDirs:
    shutil.rmtree(oldDir)

# create new folders
os.mkdir(trainDir)
os.mkdir(testDir)
os.mkdir(predDir)

# private random generator so the split does not depend on global random state
rng = random.Random(splitSeed)

# for each image class: assign images to training, testing and evaluation data
# (sorted: os.listdir returns entries in arbitrary order)
for sDir in sorted(os.listdir(dataDir)):
    cDir = os.path.join(dataDir, sDir)
    if not os.path.isdir(cDir):
        # ignore stray files next to the class folders
        continue
    cFiles = sorted(os.listdir(cDir))

    allInds = list(range(len(cFiles)))
    rng.shuffle(allInds)

    # slice bounds must be ints; prediction data takes whatever is left over
    nTrain = int(round(len(cFiles)*trData))
    nTest  = int(round(len(cFiles)*teData))

    trainInds = allInds[:nTrain]
    testInds  = allInds[nTrain:nTrain+nTest]
    predInds  = allInds[nTrain+nTest:]

    combInds = trainInds + testInds + predInds
    nOverlap = sum(1 for count in collections.Counter(combInds).values() if count > 1)
    print('Working on {} images--------------------------'.format(sDir))
    print('Copying {} images to training dir'.format(len( trainInds )))
    print('Copying {} images to test dir'.format(len( testInds  )))
    print('Copying {} images to prediction dir'.format( len( predInds  )))
    print('Overlapping images between folders: {}'.format( nOverlap ))
    print('All copied images will sum up to {} (all images = {})'.format(len(combInds), len(allInds)))

    # copy training, testing and prediction data (in this order)
    for targetDir, inds in ((trainDir, trainInds), (testDir, testInds), (predDir, predInds)):
        classDir = os.path.join(targetDir, sDir)
        os.mkdir(classDir)
        # iterating over trange advances and finishes the progress bar itself
        for n in trange(len(inds)):
            shutil.copy(os.path.join(cDir, cFiles[inds[n]]), classDir)