In [8]:
import numpy as np
import pandas as pd

from skimage import io
from tqdm import tqdm_notebook
from scipy import ndimage, misc
from PIL import Image, ImageEnhance
import random
import imageio

In [2]:
# Load the training label table. The first CSV column becomes the index; later
# cells use that index as the image names and scan each row for the column
# whose value is 1 to get the class index.
TRAIN = pd.read_csv("TRAIN.csv", index_col=0)

## Split the data into 5 stratified folds so each fold can be augmented and validated separately

In [3]:
# Image names, taken from the index of the label table.
X = np.array(TRAIN.index)

# Class index per image: position of the first column equal to 1 in each row.
y = np.array([np.flatnonzero(row == 1)[0] for row in np.array(TRAIN)])

In [4]:
from sklearn.model_selection import StratifiedKFold
# Stratified 5-fold splitter; shuffling with a fixed random_state keeps the
# fold membership reproducible between runs.
# (The former `skf.get_n_splits(X, y)` call was removed: its return value was
# discarded, so it had no effect.)
skf = StratifiedKFold(n_splits=5, random_state=50, shuffle=True)
print(skf)

# Will hold one array of image names per fold (the held-out part of each split).
foldList = []

StratifiedKFold(n_splits=5, random_state=50, shuffle=True)


In [5]:
# Keep only the held-out ("test") names of each split: these five disjoint
# subsets are the folds. The list is rebuilt from scratch instead of appended
# to, so re-running this cell cannot duplicate folds.
foldList = [X[test_index] for _, test_index in skf.split(X, y)]

## Data Augmentation

In [6]:
import cv2

def cv2_clipped_zoom(img, zoom_factor):
    """
    Center zoom in/out of the given image, returning an enlarged/shrunken view of
    the image without changing its dimensions.

    When zooming in (zoom_factor > 1) the centre of the image is cropped and
    resized up to the original size. When zooming out (zoom_factor < 1) the
    whole image is resized down and zero-padded back to the original size.

    Args:
        img : Image array of shape (height, width) or (height, width, channels)
        zoom_factor : amount of zoom as a ratio (must be > 0)
    Returns:
        Array whose first two dimensions equal those of `img`.
    Raises:
        ValueError : if zoom_factor is not strictly positive
    """
    if zoom_factor <= 0:
        raise ValueError("zoom_factor must be > 0, got %r" % (zoom_factor,))

    height, width = img.shape[:2] # It's also the final desired shape
    new_height, new_width = int(height * zoom_factor), int(width * zoom_factor)

    ### Crop only the part that will remain in the result (more efficient)
    # Centered bbox of the final desired size in resized (larger/smaller) image coordinates
    y1, x1 = max(0, new_height - height) // 2, max(0, new_width - width) // 2
    y2, x2 = y1 + height, x1 + width
    bbox = np.array([y1,x1,y2,x2])
    # Map back to original image coordinates.
    # NOTE: the `np.int` alias was removed in NumPy 1.24; the builtin `int`
    # is the exact equivalent.
    bbox = (bbox / zoom_factor).astype(int)
    y1, x1, y2, x2 = bbox
    # Slicing clips to the image bounds, so when zooming out this is the whole image
    cropped_img = img[y1:y2, x1:x2]

    # Handle padding when downscaling
    resize_height, resize_width = min(new_height, height), min(new_width, width)
    pad_height1, pad_width1 = (height - resize_height) // 2, (width - resize_width) //2
    pad_height2, pad_width2 = (height - resize_height) - pad_height1, (width - resize_width) - pad_width1
    # Pad only the two spatial axes; any trailing axes (e.g. channels) are left untouched
    pad_spec = [(pad_height1, pad_height2), (pad_width1, pad_width2)] + [(0,0)] * (img.ndim - 2)

    # cv2.resize takes the target size as (width, height)
    result = cv2.resize(cropped_img, (resize_width, resize_height))
    result = np.pad(result, pad_spec, mode='constant')
    assert result.shape[0] == height and result.shape[1] == width
    return result

In [7]:
# Create dataframe to hold all image names and labels for augmented images.
# augmentFold() adds one column per augmented image (named "<image name>_<i>")
# whose values are that image's label row from TRAIN.
AUGMENTED = pd.DataFrame()

In [9]:
def augmentFold(foldNum, foldList, numCopies=4):
    """
    Create and save augmented copies of every image in one fold.

    For each image name in foldList[foldNum] the source JPEG is read from
    "Processed Images/TRAIN/", and `numCopies` augmented variants are produced
    (random 90/180/270 degree rotation, optional vertical or horizontal flip,
    small random contrast/brightness change, slight random zoom-out). Each
    variant is written to "Processed Images/AUGMENTED/<name>_<i>.jpg" and its
    label is recorded as column "<name>_<i>" of the module-level AUGMENTED
    dataframe.

    Args:
        foldNum : index into foldList of the fold to augment
        foldList : sequence of arrays of image names, one array per fold
        numCopies : number of augmented images to create per input image
    Returns:
        AUGMENTED.T, i.e. one row per augmented image. AUGMENTED is shared
        between calls, so the result also contains the rows added by
        earlier calls.
    """
    for name in tqdm_notebook(foldList[foldNum]):
        img = io.imread("Processed Images/TRAIN/"+name+".jpg")

        augs = [] # array of numpy array-images to be saved

        # make numCopies images per input image
        for i in range(numCopies):
            newImg = ndimage.rotate(img, random.randint(1,3)*90) # rotate

            choice = random.randint(1,3) # flip: 1 = none, 2 = vertical, 3 = horizontal
            if choice == 2:
                newImg = newImg[::-1, :, :]
            elif choice == 3:
                # Flip the already rotated image. Flipping `img` here (as the
                # code previously did) silently threw the rotation away.
                newImg = newImg[:, ::-1, :]

            # contrast and brightness augs, using Pillow (PIL fork)
            newImg = Image.fromarray(newImg.astype('uint8'), 'RGB')
            contrast = ImageEnhance.Contrast(newImg)
            newImg = contrast.enhance(random.uniform(0.9,1.1))
            brightness = ImageEnhance.Brightness(newImg)
            newImg = np.array(brightness.enhance(random.uniform(0.9,1.1)))
            # slight zoom-out; the border is zero-padded by cv2_clipped_zoom
            newImg = cv2_clipped_zoom(newImg, random.uniform(0.9, 1))
            augs.append(newImg)

        # add to AUGMENTED dataframe and save images
        label = np.array(TRAIN.loc[name]) # label for all augmented images from this "name"
        for i, aug in enumerate(augs):
            augName = name+"_"+str(i)
            AUGMENTED[augName] = label
            imageio.imwrite("Processed Images/AUGMENTED/"+augName+".jpg", aug)

    return AUGMENTED.T

In [10]:
# Augment the images of the first four folds. Every call adds columns to the
# shared AUGMENTED table and returns its transpose, so each FOLD_n_LABELS also
# contains the rows produced by the calls before it.
FOLD_1_LABELS = augmentFold(0, foldList)
FOLD_2_LABELS = augmentFold(1, foldList)
FOLD_3_LABELS = augmentFold(2, foldList)
FOLD_4_LABELS = augmentFold(3, foldList)
# The fifth fold is not augmented (call left disabled) -- presumably it is kept
# as an un-augmented hold-out set; TODO confirm.
#FOLD_5_LABELS = augmentFold(4, foldList)

HBox(children=(IntProgress(value=0, max=4056), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4055), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4053), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4051), HTML(value='')))




In [11]:
# Name the rows of the column-per-image table after the label columns of TRAIN,
# then write it out transposed so the CSV has one row per augmented image.
AUGMENTED.index = TRAIN.columns
AUGMENTED.transpose().to_csv("AUGMENTED.csv")

In [12]:
# The folds do not all have the same length, so store them as a 1-D object
# array with one entry per fold. Passing the raw list makes NumPy try to build
# a rectangular array, which raises ValueError on NumPy >= 1.24.
# Reload with np.load("kfold-splits.npy", allow_pickle=True).
foldArray = np.empty(len(foldList), dtype=object)
for foldIdx, foldNames in enumerate(foldList):
    foldArray[foldIdx] = foldNames
np.save("kfold-splits.npy", foldArray)

In [15]:
# Number of augmented images recorded (AUGMENTED has one column per image).
AUGMENTED.shape[1]

64860