In [1]:
from sklearn.datasets import load_files       
from keras.utils import np_utils
import numpy as np
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [27]:
# creation of train, validation and test folders

import os
import random
from shutil import copyfile

def img_train_test_split(img_source_dir, train_size, validation_size):
    """
    Randomly splits images over a train and validation folder, while preserving the folder structure
    
    Parameters
    ----------
    img_source_dir : string
        Path to the folder with the images to be split. Can be absolute or relative path   
        
    train_size : float
        Proportion of the original images that need to be copied in the subdirectory in the train folder
    """    
    if not (isinstance(img_source_dir, str)):
        raise AttributeError('img_source_dir must be a string')
        
    if not os.path.exists(img_source_dir):
        raise OSError('img_source_dir does not exist')
        
    if not (isinstance(train_size, float)):
        raise AttributeError('train_size must be a float')
        
    # Set up empty folder structure if not exists
    if not os.path.exists('data'):
        os.makedirs('data')
    else:
        if not os.path.exists('data/train'):
            os.makedirs('data/train')
        if not os.path.exists('data/validation'):
            os.makedirs('data/validation')
        if not os.path.exists('data/test'):
            os.makedirs('data/test')
            
    # Get the subdirectories in the main image folder
    subdirs = [subdir for subdir in os.listdir(img_source_dir) if os.path.isdir(os.path.join(img_source_dir, subdir))]

    for subdir in subdirs:
        subdir_fullpath = os.path.join(img_source_dir, subdir)
        if len(os.listdir(subdir_fullpath)) == 0:
            print(subdir_fullpath + ' is empty')
            break

        train_subdir = os.path.join('data/train', subdir)
        validation_subdir = os.path.join('data/validation', subdir)
        test_subdir = os.path.join('data/test', subdir)

        # Create subdirectories in train and validation folders
        if not os.path.exists(train_subdir):
            os.makedirs(train_subdir)

        if not os.path.exists(validation_subdir):
            os.makedirs(validation_subdir)
            
        if not os.path.exists(test_subdir):
            os.makedirs(test_subdir)

        train_counter = 0
        validation_counter = 0
        test_counter = 0

        # Randomly assign an image to train or validation folder
        for filename in os.listdir(subdir_fullpath):
            if filename.endswith(".jpg") or filename.endswith(".png"): 
                fileparts = filename.split('.')

                if random.uniform(0, 1) <= train_size:
                    copyfile(os.path.join(subdir_fullpath, filename),
                             os.path.join(train_subdir, str(train_counter) + '.' + fileparts[1]))
                    train_counter += 1
                elif random.uniform(0, 1) <= validation_size:
                    copyfile(os.path.join(subdir_fullpath, filename),
                             os.path.join(validation_subdir, str(validation_counter) + '.' + fileparts[1]))
                    validation_counter += 1
                else :
                    copyfile(os.path.join(subdir_fullpath, filename),
                             os.path.join(test_subdir, str(test_counter) + '.' + fileparts[1]))
                    test_counter += 1
                    
        print('Copied ' + str(train_counter) + ' images to data/train/' + subdir)
        print('Copied ' + str(validation_counter) + ' images to data/validation/' + subdir)

In [2]:
img_train_test_split('../data/Images', 0.7, 0.2)

NameError: name 'img_train_test_split' is not defined

In [3]:
# define function to load train, test, and validation datasets
def load_dataset(path):
    data = load_files(path)
    dog_files = np.array(data['filenames'])
    dog_targets = np_utils.to_categorical(np.array(data['target']))
    return dog_files, dog_targets

# load train, test, and validation datasets
train_files, train_targets = load_dataset('../data/train')
valid_files, valid_targets = load_dataset('../data/validation')
test_files, test_targets = load_dataset('../data/test')

# load list of dog names
# the [20:-1] portion simply removes the filepath and folder number
dog_names = [item[25:-1] for item in sorted(glob("../data/train/*/"))]

# print statistics about the dataset
print('There are %d total dog categories.' % len(dog_names))
print('There are %s total dog images.\n' % len(np.hstack([train_files, valid_files, test_files])))
print('There are %d training dog images.' % len(train_files))
print('There are %d validation dog images.' % len(valid_files))
print('There are %d test dog images.'% len(test_files))

There are 120 total dog categories.
There are 20580 total dog images.

There are 14444 training dog images.
There are 1255 validation dog images.
There are 4881 test dog images.


In [11]:
# load datasets
dog_img, dog_targets = load_dataset('../data/Images')

# load list of dog names
dog_breed = [item[25:-1] for item in sorted(glob("../data/Images/*/"))]

In [26]:
# training image augmentation
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras import optimizers
from keras.callbacks import History 

batch_size = 16

# this is the augmentation configuration I will use for training
train_datagen = ImageDataGenerator(rotation_range = 30,
                                   width_shift_range = 0.2,
                                   height_shift_range = 0.2,
                                   rescale = 1./255,
                                   shear_range = 0.2,
                                   zoom_range = 0.2,
                                   horizontal_flip = True,
                                   fill_mode = 'nearest')

# This is the augmentation configuration I will use for testing/validation... just a rescale
test_datagen = ImageDataGenerator(preprocessing_function = xception.preprocess_input,
                                  rescale=1./255)

# This is the generator which will read pictures found in my training subset
train_generator = train_datagen.flow_from_directory('../data/train/',
                                                    target_size = (224, 224),
                                                    batch_size = batch_size,
                                                    class_mode = 'categorical')

# This is the generator for validation data
validation_generator = test_datagen.flow_from_directory('../data/validation/',
                                                        target_size = (224, 224),
                                                        batch_size = batch_size,
                                                        class_mode = 'categorical')

Found 14444 images belonging to 120 classes.
Found 1255 images belonging to 120 classes.


In [25]:
shape(train_generator)

NameError: name 'shape' is not defined

In [19]:
from keras.preprocessing import image                  
from tqdm import tqdm

def path_to_tensor(img_path):
    # loads RGB image as PIL.Image.Image type
    img = image.load_img(img_path, target_size=(224, 224))
    # convert PIL.Image.Image type to 3D tensor with shape (224, 224, 3)
    x = image.img_to_array(img)
    # convert 3D tensor to 4D tensor with shape (1, 224, 224, 3) and return 4D tensor
    return np.expand_dims(x, axis=0)

def paths_to_tensor(img_paths):
    list_of_tensors = [path_to_tensor(img_path) for img_path in tqdm(img_paths)]
    return np.vstack(list_of_tensors)

ImportError: No module named 'tqdm'

In [None]:
train_tensors = paths_to_tensor(train_generator).astype('float32')
valid_tensors = paths_to_tensor(validation_generator).astype('float32')
test_tensors = paths_to_tensor(test_files).astype('float32')

In [32]:
from keras.applications.vgg16 import VGG16
from keras.layers import Input, Dense, Flatten
from keras.models import Model


# Charger VGG-16 pré-entraîné sur ImageNet et sans les couches fully-connected
model = VGG16(weights="imagenet", include_top=False, input_shape=(224, 224, 3))

for layer in model.layers:
    layer.trainable = False
    
# Récupérer la sortie de ce réseau
x = model.output

x = Flatten()(x)

# Ajouter la nouvelle couche fully-connected pour la classification à 133 classes
predictions = Dense(120, activation='softmax')(x)

# Définir le nouveau modèle
new_model = Model(inputs=model.input, outputs=predictions)

In [33]:
# Compiler le modèle 
new_model.compile(loss="categorical_crossentropy", 
                  optimizer=optimizers.SGD(lr=0.0001, momentum=0.9),
                  metrics=["accuracy"])

# Entraîner sur les données d'entraînement (X_train, y_train)



In [None]:
%%time
new_model.fit_generator(train_generator,
                        steps_per_epoch = 6680 // batch_size,
                        epochs = 50,
                        validation_data = validation_generator,
                        validation_steps = 835 // batch_size,
                        verbose=2)

Epoch 1/50
 - 1673s - loss: 4.9147 - accuracy: 0.0106 - val_loss: 4.7251 - val_accuracy: 0.0132
Epoch 2/50
 - 1631s - loss: 4.8240 - accuracy: 0.0157 - val_loss: 4.9377 - val_accuracy: 0.0085
Epoch 3/50
 - 5514s - loss: 4.7180 - accuracy: 0.0264 - val_loss: 4.7610 - val_accuracy: 0.0108
Epoch 4/50
