### Set dependencies

In [47]:
%reset -f

import os
import shutil 
from shutil import copyfile
import numpy as np
import random

import cv2
import glob

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Split images into train and validation subdirs

In [48]:
def img_train_test_split(img_source_dir, train_size, seed=None):
    """Copy images from class sub-folders into ``data/train`` and ``data/validation``.

    Every immediate sub-directory of ``img_source_dir`` is treated as one class.
    Each ``.jpg`` / ``.png`` file inside it is copied to either
    ``data/train/<class>/train<N>.<ext>`` or
    ``data/validation/<class>/valid<N>.<ext>`` (relative to the current working
    directory). One uniform random draw per file decides the destination.

    Parameters
    ----------
    img_source_dir : str
        Directory whose sub-directories hold the images of each class.
    train_size : float
        A file goes to the training split when a draw from ``uniform(0, 1)``
        is ``<= train_size``; otherwise it goes to the validation split.
    seed : int, optional
        When given, a private ``random.Random(seed)`` is used so the split is
        reproducible. When ``None`` (default) the global ``random`` module
        state is used, as before.
    """
    rng = random if seed is None else random.Random(seed)

    # Always create both split roots, whether or not 'data' existed beforehand.
    os.makedirs('data/train', exist_ok=True)
    os.makedirs('data/validation', exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for a stable run order.
    subdirs = sorted(
        subdir for subdir in os.listdir(img_source_dir)
        if os.path.isdir(os.path.join(img_source_dir, subdir))
    )

    for subdir in subdirs:
        subdir_fullpath = os.path.join(img_source_dir, subdir)
        filenames = sorted(os.listdir(subdir_fullpath))
        if len(filenames) == 0:
            print(subdir_fullpath + ' is empty')
            # Skip only this class; the remaining classes must still be split.
            continue

        train_subdir = os.path.join('data/train', subdir)
        validation_subdir = os.path.join('data/validation', subdir)

        # Create the per-class folders in both splits.
        os.makedirs(train_subdir, exist_ok=True)
        os.makedirs(validation_subdir, exist_ok=True)

        train_counter = 0
        validation_counter = 0

        # Randomly assign each image to the train or validation folder.
        for filename in filenames:
            if filename.endswith(".jpg") or filename.endswith(".png"):
                # splitext keeps the real extension (with its dot) even when
                # the file name itself contains additional dots.
                extension = os.path.splitext(filename)[1]
                source_path = os.path.join(subdir_fullpath, filename)

                if rng.uniform(0, 1) <= train_size:
                    copyfile(source_path, os.path.join(train_subdir, 'train' + str(train_counter) + extension))
                    train_counter += 1
                else:
                    copyfile(source_path, os.path.join(validation_subdir, 'valid' + str(validation_counter) + extension))
                    validation_counter += 1

        print('Copied ' + str(train_counter) + ' images to data/train/' + subdir)
        print('Copied ' + str(validation_counter) + ' images to data/validation/' + subdir)
        


In [49]:
# NOTE(review): the working directory is hard-coded to the filesystem root, so
# every relative path used later in this notebook (e.g. './data') resolves
# against "/". Point this at the project folder when running elsewhere.
os.chdir("/")
img_source_dir='images/'

# Get the subdirectories in the main image folder. os.listdir returns entries
# in arbitrary order, and the position in this list becomes the class label,
# so sort it to keep the class -> label mapping stable between runs.
subdirs = sorted(subdir for subdir in os.listdir(img_source_dir) if os.path.isdir(os.path.join(img_source_dir, subdir)))

print (subdirs)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']


In [50]:
# Remove any previous ./data tree so the split starts from an empty folder.
# A failure (for example when the folder does not exist yet) is reported
# and otherwise ignored.
try:
    shutil.rmtree('./data')
except OSError as err:
    print("Error: %s - %s." % (err.filename, err.strerror))

In [51]:
# Root folders of the two splits under ./data.
train_dir, validation_dir = (
    os.path.join('./data', split_name) for split_name in ('train', 'validation')
)

In [52]:
# Publish one module-level path variable per class and split,
# named train_<class>_dir and validation_<class>_dir.
for class_name in subdirs:
    globals()['train_'+class_name+'_dir'] = os.path.join(train_dir, class_name)
    globals()['validation_'+class_name+'_dir'] = os.path.join(validation_dir, class_name)

In [55]:
# Threshold handed to img_train_test_split: a file goes to the training
# split when its random draw is <= this value.
train_size = 0.8
# Target size used by im2npa when resizing every image.
IMG_HEIGHT = IMG_WIDTH = 200

In [56]:
# Copy the source images into data/train and data/validation.
# NOTE(review): the recorded output below reports 0 validation images for
# every class, which a train_size of 0.8 would not plausibly produce. It is
# presumably stale output from a run with a different train_size; re-run on
# a fresh kernel to confirm.
img_train_test_split(img_source_dir, train_size)

Copied 89 images to data/train/A
Copied 0 images to data/validation/A
Copied 149 images to data/train/B
Copied 0 images to data/validation/B
Copied 73 images to data/train/C
Copied 0 images to data/validation/C
Copied 66 images to data/train/D
Copied 0 images to data/validation/D
Copied 197 images to data/train/E
Copied 0 images to data/validation/E
Copied 104 images to data/train/F
Copied 0 images to data/validation/F
Copied 130 images to data/train/G
Copied 0 images to data/validation/G
Copied 26 images to data/train/H
Copied 0 images to data/validation/H
Copied 172 images to data/train/I
Copied 0 images to data/validation/I


### Create list of files from folders with corresponding classes

In [57]:
def label_files_indirs(subdirs, img_source_dir):
    """Build a label vector from the ``.png`` files stored in class folders.

    Parameters
    ----------
    subdirs : list of str
        Class folder names; the position of a name in this list is its label.
    img_source_dir : str
        Main directory containing the class folders.

    Returns
    -------
    labels
        One entry per regular ``.png`` file, grouped by class in ``subdirs``
        order (0 for the first class, 1 for the second, ...). The result is
        built with ``np.append`` starting from an empty list, so it is a
        float array; for an empty ``subdirs`` the empty list is returned.
    """
    labels = []
    for class_index, class_name in enumerate(subdirs):
        class_dir = str(img_source_dir + '/' + class_name)
        # Count regular files only; a directory whose name ends in .png is skipped.
        png_count = sum(
            1
            for entry in os.listdir(class_dir)
            if entry.endswith('.png') and os.path.isfile(os.path.join(class_dir, entry))
        )
        class_labels = np.full((1, png_count), class_index)
        labels = np.append(labels, class_labels)
    return labels

In [58]:
# Label vectors for the training and validation folders, in that order.
y_train, y_test = (
    label_files_indirs(subdirs, split_dir)
    for split_dir in (train_dir, validation_dir)
)

### Read images into numpy arrays

In [61]:
def im2npa(imagedir, subdirs, show):
    """Load every ``*.png`` found in the class folders into one numpy array.

    Parameters
    ----------
    imagedir : str
        Directory containing one sub-folder per class.
    subdirs : list of str
        Class sub-folder names; images are loaded class by class in this order.
    show : int
        When equal to 1, print each file name and plot the image.

    Returns
    -------
    numpy.ndarray
        All loaded images stacked along the first axis, each converted from
        BGR to RGB and resized to ``IMG_HEIGHT`` rows by ``IMG_WIDTH`` columns.
        When no image is found the array is empty with shape ``(0,)``.

    Raises
    ------
    OSError
        If a matched file cannot be decoded by ``cv2.imread``.
    """
    data = []
    for subdir in subdirs:
        pattern = os.path.join(imagedir, subdir, "*.png")
        # glob returns matches in arbitrary order; sort so the sample order is
        # the same on every run and machine.
        for myFile in sorted(glob.glob(pattern)):
            image = cv2.imread(myFile, cv2.IMREAD_UNCHANGED)
            if image is None:
                # imread signals an unreadable file by returning None; fail
                # here with the file name instead of deep inside cvtColor.
                raise OSError('Could not read image file: ' + myFile)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            # cv2.resize expects dsize as (width, height), not (height, width).
            image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))
            data.append(image)
            if show == 1:
                print(myFile)
                # NOTE(review): `plt` is not imported in the visible part of
                # this notebook; presumably matplotlib.pyplot. It must be
                # imported before calling with show=1.
                plt.imshow(image)
                plt.show()

    data = np.asarray(data)
    print('Data shape:', data.shape)
    return data

In [62]:
# Load the validation images first, then the training images (no plotting).
x_test = im2npa(imagedir='./data/validation/', subdirs=subdirs, show=0)
x_train = im2npa(imagedir='./data/train/', subdirs=subdirs, show=0)

Data shape: (1006, 200, 200, 3)


### Save to .npz

In [63]:
# Store the four arrays in vetdata.npz (relative to the current working
# directory) under the keys x_train, x_test, y_train and y_test.
np.savez('vetdata.npz', x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test)