In [None]:
import pandas as pd
from os import path
from os import environ
import os
import random as random
from tensorflow.keras.preprocessing import image as imgproc
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from tensorflow.keras import models
from tensorflow.keras import callbacks
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import random

%matplotlib inline

### Initialisation

In [None]:
# Seed every random number generator the notebook relies on, so that repeated
# runs are as comparable as possible.
seed_value = 20212042
print("Using random seed: %d" % seed_value)

# NOTE: the hash seed of the already running kernel was fixed at interpreter
# start-up; exporting the variable here only reaches processes spawned later.
environ.update(PYTHONHASHSEED=str(seed_value))

# Python's `random`, NumPy's global RNG and TensorFlow 2.x, in that order
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(seed_value)

# Record the execution environment next to the results
print("GPUs Available:", tf.config.list_physical_devices('GPU'))
print("Tensorflow version:", tf.__version__)

### Constants

In [None]:
# --- Locations on disk ---
DATA_DIR = './imagewoof2-320'
TRAIN_DIR, VALIDATION_DIR, TEST_DIR = (
    path.join(DATA_DIR, subdir)
    for subdir in (
        'train',       # images the network is fitted on
        'validation',  # a split off 'train' used as validation set during NN training
        'val',         # the dataset's original 'val' folder, used here as the test set
    )
)
MODELS_DIR = './models'
LABELS_FILE = path.join(DATA_DIR, 'noisy_imagewoof.csv')

# --- Classes used in this notebook ---
# synset id -> breed name; mappings from:
# https://image-net.org/challenges/LSVRC/2014/browse-synsets.php
BREEDS = {
    'n02086240': 'Shih-Tzu',
    'n02087394': 'Rhodesian ridgeback',
    'n02088364': 'beagle',
    'n02089973': 'English foxhound',
    'n02093754': 'Border terrier',
}
# Synset ids in dict (insertion) order; the position in this list is the class index
CLASSES = list(BREEDS)

# --- Image handling ---
IMG_SIZE = (299, 299)      # size every image is resized to before entering the network
COLOUR_SCALE = 1.0 / 255   # factor applied to pixel values when loading images
BATCH_SIZE = 32

### Helper functions

In [None]:
def plot_model_history(history):
    """Draw training vs. validation curves for loss and categorical accuracy side by side.

    `history` is the object returned by `fit`; its `history` dict must contain the keys
    'loss', 'val_loss', 'categorical_accuracy' and 'val_categorical_accuracy'.
    The figure is shown immediately; nothing is returned.
    """
    # (training key, validation key, panel title, y-axis label): left panel first
    panels = (
        ('loss', 'val_loss', 'model loss', 'loss'),
        ('categorical_accuracy', 'val_categorical_accuracy', 'model accuracy', 'categorical_accuracy'),
    )
    for column, (train_key, val_key, title, ylabel) in enumerate(panels, start=1):
        plt.subplot(1, 2, column)
        for key in (train_key, val_key):
            plt.plot(history.history[key])
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('epoch')
        plt.legend(['train', 'validation'], loc='upper left')
    plt.tight_layout()
    plt.show()
    
def load_random_image(filepath):
    """Load one randomly chosen image file from the directory `filepath`.

    The image is resized to IMG_SIZE, converted with `img_to_array` and multiplied
    by COLOUR_SCALE, i.e. it receives the same scaling as the data generators apply.

    Raises IndexError if the directory contains no regular files.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the pick depend only
    # on the RNG state. Sub-directories are skipped because they cannot be loaded as images.
    candidates = sorted(
        entry for entry in os.listdir(filepath)
        if path.isfile(path.join(filepath, entry))
    )
    img_file = random.choice(candidates)
    img = imgproc.load_img(path.join(filepath, img_file))
    img = img.resize(IMG_SIZE)
    img_array = imgproc.img_to_array(img)
    return img_array * COLOUR_SCALE

# Data generation flow from train/validation/test directory
def create_flow(datagen, path, batch_size, shuffle=True):
    """Create a directory iterator that yields (images, one-hot labels) batches.

    Parameters
    ----------
    datagen : ImageDataGenerator
        Generator defining rescaling/augmentation of the loaded images.
    path : str
        Directory holding one sub-directory per class. (The name shadows `os.path`
        inside this function only; it is kept for compatibility with callers.)
    batch_size : int
        Number of images per batch.
    shuffle : bool, default True
        Whether the iterator shuffles the samples. Pass False when labels and
        predictions must be matched up across separate passes over the data.

    Choices
    -------
    * target_size=IMG_SIZE: every image is resized to the size the model input is built with.
    * classes=CLASSES: only these sub-directories are read, and their position in the
      list is the class index.
    * class_mode='categorical': labels are one-hot encoded, as required by the
      'categorical_crossentropy' loss the model is compiled with.
    """
    return datagen.flow_from_directory(
        path,
        target_size=IMG_SIZE,
        classes=CLASSES,
        class_mode='categorical',
        batch_size=batch_size,
        shuffle=shuffle
    )

def evaluate_model(model_file, data_path=TEST_DIR):
    """Load a saved model, print its loss/accuracy on `data_path` and plot a confusion matrix.

    Parameters
    ----------
    model_file : str
        Path of the saved Keras model to load.
    data_path : str
        Directory with one sub-directory per class (see `create_flow`).

    Raises
    ------
    ValueError
        If no images are found below `data_path`.
    """
    model = models.load_model(model_file)
    test_flow = create_flow(ImageDataGenerator(rescale=COLOUR_SCALE), data_path, 1)
    if test_flow.samples == 0:
        raise ValueError('No images found in %s' % data_path)

    loss_accuracy = model.evaluate(test_flow, steps=test_flow.samples, verbose=False)
    print('Evaluating against data in %s: Loss=%.2f, Accuracy=%.2f%%' % (data_path, loss_accuracy[0], loss_accuracy[1] * 100.))

    # The flow shuffles its samples and draws a new order after every reset(), so labels
    # and predictions must come from the *same* batches. Collecting them in two separate
    # passes would pair each prediction with the label of a different image.
    test_flow.reset()
    y_true, y_pred = [], []
    for _ in range(test_flow.samples):  # batch size is 1 -> one step per image
        x, y = next(test_flow)
        y_true.append(np.argmax(y, axis=1))
        y_pred.append(np.argmax(model.predict_on_batch(x), axis=1))
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)

    # Pin the label set so the matrix always has one row/column per class, even when
    # a class never occurs in `data_path` or is never predicted.
    cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(CLASSES)))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=CLASSES)
    disp.plot(cmap=plt.cm.Blues)
    plt.show()

### Make train/validation split and organize directory structure accordingly

In [None]:
# One-off preparation: split a validation set off 'train' and move its image files into
# VALIDATION_DIR. The directory check turns the cell into a no-op on later runs.
# NOTE: if moving the files is interrupted, the partially filled VALIDATION_DIR also
# disables this cell; restore the files manually before re-running in that case.
if not path.isdir(VALIDATION_DIR):
    ground_truth = pd.read_csv(LABELS_FILE)
    ground_truth = ground_truth[ground_truth['noisy_labels_0'].isin(CLASSES)]
    test_df = ground_truth[ground_truth['is_valid']==True]
    imagenette_train = ground_truth[ground_truth['is_valid']==False]

    # Fixed random_state: the split no longer depends on how much of the global RNG was
    # consumed before this cell ran. Stratifying keeps the class proportions of the
    # training data in the validation set.
    train_df, val_df = train_test_split(
        imagenette_train,
        test_size=0.2,
        random_state=seed_value,
        stratify=imagenette_train['noisy_labels_0']
    )

    # Rewrite only the first 'train/' of each relative path, as a literal (non-regex) match
    validation_paths = val_df['path'].str.replace('train/', 'validation/', n=1, regex=False)

    # Move the files; os.renames creates missing target directories on the way
    for src, dst in zip(val_df['path'], validation_paths):
        os.renames(path.join(DATA_DIR, src), path.join(DATA_DIR, dst))

    # val_df now refers to the new locations ('path' ends up as the last column)
    val_df = val_df.drop(columns='path').assign(path=validation_paths)

### Define and train InceptionV3-based CNN

In [None]:
model_file = path.join(MODELS_DIR, 'imagewoof', 'inceptionv3_based.h5')

# Training is expensive, so it only runs when no saved model exists yet.
# Delete `model_file` to force a retrain.
if not path.isfile(model_file):
    os.makedirs(path.dirname(model_file), exist_ok=True)

    # Create InceptionV3 model: ImageNet weights, without its classification head, frozen
    inceptionv3 = keras.applications.InceptionV3(weights='imagenet', include_top=False)
    inceptionv3.trainable = False

    # NOTE(review): images are fed scaled by COLOUR_SCALE, whereas Keras'
    # inception_v3.preprocess_input is documented to scale inputs to [-1, 1]. Left as is,
    # because evaluate_model() applies the same COLOUR_SCALE; confirm this is intended.

    # Add new top layers for classification
    inputs = keras.Input(shape=IMG_SIZE + (3,))
    x = inceptionv3(inputs, training=False)  # base is always called in inference mode
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(32, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
    output = layers.Dense(len(CLASSES), activation='softmax')(x)  # one output per class

    finetuned_model = keras.Model(inputs=inputs, outputs=output)
    finetuned_model.compile(
        optimizer=optimizers.RMSprop(learning_rate=1e-3),  # `lr` is a deprecated alias
        loss='categorical_crossentropy',
        metrics=[metrics.CategoricalAccuracy()]
    )
    finetuned_model.summary()

    # --- Phase 1: train only the new top layers ---
    epochs_count = 10
    train_gen = ImageDataGenerator( # augmentation is applied to training images only
        rescale=COLOUR_SCALE,
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )
    train_flow = create_flow(train_gen, TRAIN_DIR, BATCH_SIZE)
    val_flow = create_flow(ImageDataGenerator(rescale=COLOUR_SCALE), VALIDATION_DIR, BATCH_SIZE)

    # Keep only the model with the lowest validation loss on disk.
    # NOTE(review): the same callback instance is reused for phase 2, so presumably
    # phase 2 only overwrites the file when it beats the best phase-1 val_loss; confirm.
    save_best_cb = callbacks.ModelCheckpoint(filepath=model_file,
                                             monitor='val_loss', mode='min', save_best_only=True,
                                             verbose=False) # set to True to see best model's epoch

    # No explicit step counts: Keras derives them from len(flow), so the last partial
    # batch is included and val_loss is computed over every validation image each epoch.
    history = finetuned_model.fit(train_flow,
                       validation_data=val_flow,
                       epochs=epochs_count,
                       callbacks=[save_best_cb],
                       verbose=False)
    plot_model_history(history)
    evaluate_model(model_file)

    # --- Phase 2: fine tune the top of the InceptionV3 base ---
    # Start from the best phase-1 weights (loaded before the trainable flags change)
    finetuned_model.load_weights(model_file)

    # Freeze all layers up to the top two inception blocks. Setting `trainable` on the
    # base model unfreezes all of its layers, so only the lower ones need re-freezing.
    inceptionv3.trainable = True
    first_trainable_layer = 250
    for layer in inceptionv3.layers[:first_trainable_layer]:
        layer.trainable = False

    # Recompile so the changed trainable flags take effect; much lower learning rate
    epochs_count = 50
    finetuned_model.compile(
        optimizer=optimizers.RMSprop(learning_rate=1e-5),
        loss='categorical_crossentropy',
        metrics=[metrics.CategoricalAccuracy()]
    )
    finetuned_model.summary()
    history = finetuned_model.fit(train_flow,
                       validation_data=val_flow,
                       epochs=epochs_count,
                       callbacks=[save_best_cb],
                       verbose=False)
    plot_model_history(history)

# Evaluate against test dataset
evaluate_model(model_file)

### Evaluate against unseen images

In [None]:
# Same evaluation as for the test set, pointed at a separate folder of unseen images
in_the_wild_dir = './imagewoof2-in_the_wild'
evaluate_model(model_file, data_path=in_the_wild_dir)