In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from subprocess import check_output
import glob
import cv2 as cv
import multiprocessing as mp
from itertools import repeat

This notebook loads the training images and labels, preprocesses them, and dumps them in batches as pickled NumPy arrays.

In [2]:
# Root directory of the dataset. The trailing slash matters: other cells build
# paths from it by plain string concatenation.
base_path = "../../.kaggle/diabetic_retinopathy_detection/"
train_labels = pd.read_csv(os.path.join(base_path, "trainLabels.csv"))

In [3]:
# The id is the integer prefix of the image name, e.g. "10_left" -> 10.
train_labels["id"] = (
    train_labels["image"]
    .str.split("_")
    .str[0]
    .astype(int)
)
print(train_labels.shape)
train_labels.head()

(35126, 3)


Unnamed: 0,image,level,id
0,10_left,0,10
1,10_right,0,10
2,13_left,0,13
3,13_right,0,13
4,15_left,1,15


In [4]:
# Number of distinct participant ids in the label table.
print(train_labels["id"].nunique())

17563


In [5]:
# Sorted arrays of the JPEG paths found under the train/ and test/ folders.
train_files, test_files = (
    np.sort(glob.glob(os.path.join(base_path, split + "/*.jpeg")))
    for split in ("train", "test")
)

In [6]:
# (number of training images, number of test images)
tuple(map(len, (train_files, test_files)))

(35126, 53576)

In [7]:
def img_preprocess(img, row_size=700, col_size=700):
    """
    Resize an image and min-max scale its values to the range 0 to 1.

    Parameters
    ----------
    img : numpy.ndarray
        Image array, e.g. as returned by ``cv.imread``.
    row_size : int, optional
        Number of rows (height) of the output image. Default 700.
    col_size : int, optional
        Number of columns (width) of the output image. Default 700.

    Returns
    -------
    numpy.ndarray
        float32 array with ``row_size`` rows and ``col_size`` columns
        (the channel axis, if any, is kept), scaled with
        ``cv.NORM_MINMAX`` between 0 and 1.
    """
    # cv.resize expects dsize as (width, height), i.e. (columns, rows).
    # Passing (rows, columns) only goes unnoticed for square targets.
    img_new = cv.resize(img, (col_size, row_size),
                        interpolation=cv.INTER_CUBIC)
    # dtype=cv.CV_32F makes the output float32 so the 0..1 range is representable.
    img_new = cv.normalize(img_new, None, alpha=0, beta=1,
                           norm_type=cv.NORM_MINMAX,
                           dtype=cv.CV_32F)
    return img_new

def preprocess(arg):
    """
    Load one image, preprocess it and look up its label.

    Takes a single tuple so it can be used directly with ``Pool.map``.

    Parameters
    ----------
    arg : tuple
        ``(train_labels, file)`` where ``train_labels`` is a DataFrame with
        ``"image"`` and ``"level"`` columns and ``file`` is the path of the
        image to load.

    Returns
    -------
    tuple
        ``(img, img_nm, label)``: the preprocessed image, the file name
        without directory or extension, and the ``"level"`` value of the
        first row whose ``"image"`` equals ``img_nm``.

    Raises
    ------
    OSError
        If the image file cannot be read.
    IndexError
        If no row in ``train_labels`` matches the image name.
    """
    train_labels, file = arg
    img = cv.imread(file)
    if img is None:
        # cv.imread reports a missing or undecodable file by returning None
        # instead of raising; fail here with the offending path.
        raise OSError("Could not read image file: %s" % file)
    img = img_preprocess(img)

    # os.path handles the platform's separators, unlike splitting on "/".
    img_nm = os.path.splitext(os.path.basename(file))[0]
    levels = train_labels.loc[train_labels["image"] == img_nm, "level"]
    if levels.empty:
        raise IndexError("No label found for image: %s" % img_nm)
    label = levels.iloc[0]
    return img, img_nm, label

def dump_preprocess(train_files, train_labels, num_per_batch=2000):
    """
    Preprocess the images in parallel and pickle them to disk in batches.

    For batch ``i`` two files are written under
    ``base_path + "train_preprocessed/"``: ``images_<i>.pkl`` with the
    stacked preprocessed images, and ``lab_id_<i>.pkl`` with one
    ``(image name, label)`` row per image, in the same order.

    Parameters
    ----------
    train_files : sequence of str
        Paths of the images to process; must support ``len`` and slicing.
    train_labels : pandas.DataFrame
        Label table passed through to ``preprocess`` for every file.
    num_per_batch : int, optional
        Maximum number of images per output file. Default 2000.

    Raises
    ------
    ValueError
        If ``num_per_batch`` is smaller than 1.
    """
    if num_per_batch < 1:
        raise ValueError("num_per_batch must be a positive integer")

    # Ceiling division: a trailing partial batch still counts as a batch.
    num_batch = -(-len(train_files) // num_per_batch)
    print("Number of Batches:", num_batch)
    if num_batch == 0:
        return

    out_dir = base_path + "train_preprocessed/"
    os.makedirs(out_dir, exist_ok=True)

    # Leave two cores free, but never ask for fewer than one worker:
    # mp.Pool rejects processes <= 0, which happens on 1-2 core machines.
    num_workers = max(1, mp.cpu_count() - 2)

    # One pool serves every batch; the context manager shuts the workers
    # down even if a batch fails, so no processes are left behind.
    with mp.Pool(processes=num_workers) as pool:
        for batch_idx in range(num_batch):
            start = batch_idx * num_per_batch
            files_batch = train_files[start: start + num_per_batch]
            res = pool.map(preprocess,
                           zip(repeat(train_labels, len(files_batch)),
                               files_batch))
            images = np.array([r[0] for r in res])
            labels_ids = np.array([r[1:] for r in res])
            # Drop the per-image list before pickling to lower peak memory.
            del res
            pd.to_pickle(images, out_dir + "images_%i.pkl" % batch_idx)
            pd.to_pickle(labels_ids, out_dir + "lab_id_%i.pkl" % batch_idx)
            print("Batch ", batch_idx)

In [8]:
# Try the worker on the first training image before starting the full run.
arg = (train_labels, train_files[0])
a = preprocess(arg)

In [9]:
%%time
# Full run over every training image, using the default batch size.
dump_preprocess(train_files=train_files, train_labels=train_labels)

Number of Batches: 18
Batch  0
Batch  1
Batch  2
Batch  3
Batch  4
Batch  5
Batch  6
Batch  7
Batch  8
Batch  9
Batch  10
Batch  11
Batch  12
Batch  13
Batch  14
Batch  15
Batch  16
Batch  17
CPU times: user 17min 29s, sys: 31min 40s, total: 49min 9s
Wall time: 57min 27s
