In [1]:
import os
import os.path
import shutil
import glob
import time
from sklearn.model_selection import StratifiedKFold                                                                                                                       

import matplotlib.pyplot as plt
import matplotlib.cm as colormap
plt.rcParams['image.cmap'] = 'Pastel1'

import numpy as np
np.random.seed(1)

from keras.preprocessing.image import img_to_array
from keras.utils import np_utils
from keras.preprocessing import image
from keras.applications.resnet50 import ResNet50
from keras.applications.imagenet_utils import preprocess_input

Using TensorFlow backend.


In [4]:
imagedir = "Datasets/CG_Resized"

In [13]:
cur_dir = os.getcwd()
os.chdir(imagedir)  # the parent folder with sub-folders

# Get number of samples per family
list_fams = sorted(os.listdir(os.getcwd()), key=str.lower)  # vector of strings with family names
no_imgs = []  # No. of samples per family
for i in range(len(list_fams)):
    os.chdir(list_fams[i])
    len1 = len(glob.glob('*.jpg'))  # assuming the images are stored as 'jpg'
    no_imgs.append(len1)
    os.chdir('..')
num_samples = np.sum(no_imgs)  # total number of all samples

# Compute the labels
y = np.zeros(num_samples)
pos = 0
label = 0
for i in no_imgs:
    print ("Label:%2d\tFamily: %15s\tNumber of images: %d" % (label, list_fams[label], i))
    for j in range(i):
        y[pos] = label
        pos += 1
    label += 1
num_classes = label

# Compute the features
width, height,channels = (224,224,3)
X = np.zeros((num_samples, width, height, channels))
cnt = 0
list_paths = [] # List of image paths
print("Processing images ...")
for i in range(len(list_fams)):
    for img_file in glob.glob(list_fams[i]+'/*.jpg'):
        #print("[%d] Processing image: %s" % (cnt, img_file))
        list_paths.append(os.path.join(os.getcwd(),img_file))
        img = image.load_img(img_file, target_size=(224, 224))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)
        X[cnt] = x
        cnt += 1
print("Images processed: %d" %(cnt))

os.chdir(cur_dir)

Label: 0	Family:              CG	Number of images: 8394
Label: 1	Family:              PG	Number of images: 8002
Processing images ...
Images processed: 16396


In [14]:
# Create stratified k-fold subsets                                                                                                                                        
kfold = 5  # no. of folds                                                                 
skf = StratifiedKFold(kfold, shuffle=True, random_state=1)
skfind = [None] * kfold  # skfind[i][0] -> train indices, skfind[i][1] -> test indices
cnt = 0                                              
for index in skf.split(X, y):         
    skfind[cnt] = index                                                 
    cnt += 1 

In [15]:
for i in range(kfold):
    print(skfind[i][1])

[   21    25    39 ..., 16382 16391 16395]
[    4     5     6 ..., 16381 16384 16385]
[    1     3     9 ..., 16353 16360 16375]
[    0     8    10 ..., 16373 16377 16393]
[    2    14    15 ..., 16390 16392 16394]


In [16]:
l = np.array(list_paths)
for i in range(kfold):
    print(l[skfind[i][1]])

['/home/edmar/GIT/CG/Datasets/CG_Resized/CG/862.jpg'
 '/home/edmar/GIT/CG/Datasets/CG_Resized/CG/1240434411.jpg'
 '/home/edmar/GIT/CG/Datasets/CG_Resized/CG/3576.jpg' ...,
 '/home/edmar/GIT/CG/Datasets/CG_Resized/PG/21_56_33_prev.jpg'
 '/home/edmar/GIT/CG/Datasets/CG_Resized/PG/02_14_3_prev.jpg'
 '/home/edmar/GIT/CG/Datasets/CG_Resized/PG/19_02_86_prev.jpg']
['/home/edmar/GIT/CG/Datasets/CG_Resized/CG/04.jpg'
 '/home/edmar/GIT/CG/Datasets/CG_Resized/CG/web_1265551721.jpg'
 '/home/edmar/GIT/CG/Datasets/CG_Resized/CG/964.13305.jpg' ...,
 '/home/edmar/GIT/CG/Datasets/CG_Resized/PG/1218_24_52_prev.jpg'
 '/home/edmar/GIT/CG/Datasets/CG_Resized/PG/12_21_5_prev.jpg'
 '/home/edmar/GIT/CG/Datasets/CG_Resized/PG/23_64_61_prev.jpg']
['/home/edmar/GIT/CG/Datasets/CG_Resized/CG/3584.jpg'
 '/home/edmar/GIT/CG/Datasets/CG_Resized/CG/1258127203.jpg'
 '/home/edmar/GIT/CG/Datasets/CG_Resized/CG/web_1281339516.jpg' ...,
 '/home/edmar/GIT/CG/Datasets/CG_Resized/PG/810_06_5896_prev.jpg'
 '/home/edmar/GIT/C

In [17]:
foldsdir = 'Folds' 
if not os.path.exists(foldsdir):
    os.makedirs(foldsdir)

l = np.array(list_paths)
for i in range(kfold):
    fdir = os.path.join(foldsdir,'Fold'+str(i))
    if not os.path.exists(fdir):
        os.makedirs(fdir)
    for fname in l[skfind[i][1]]:
        shutil.copy(fname,fdir)