# Import Essential Libraries and Data

In [None]:
import random
from numpy.random import seed
from tensorflow.random import set_seed

# Single seed shared by every random number generator used in this notebook.
seed_value = 42

# Seed Python's `random`, NumPy and TensorFlow so runs are repeatable.
random.seed(seed_value)
seed(seed_value)
set_seed(seed_value)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import random
import os
import cv2
import sys
from pylab import rcParams
from PIL import Image
from tqdm import tqdm
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers
from tensorflow.keras import Input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, Flatten, Dropout, Activation, Input, GlobalAveragePooling2D
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
#from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.mixed_precision import experimental as mixed_precision
from sklearn.model_selection import StratifiedKFold

In [None]:
# Switch Keras to the 'mixed_float16' policy before any layer is created.
# NOTE(review): this goes through the `experimental` mixed-precision module
# imported above; confirm it exists in the installed TensorFlow version.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)

In [None]:
# Load the training table and preview its first rows.
train_csv_path = 'E:/Kaggle.Challenges/cassava-leaf-disease-classification/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# Store the labels as strings, then print a summary of the frame.
df_train = df_train.astype({'label': str})
df_train.info()

# Image Augmentation

In [None]:
# Images are square: `image_size` pixels per side, fed in batches of `batch_size`.
image_size = 300
batch_size = 32

# (height, width) used when loading images, and the same with 3 channels
# appended for the model input.
target_size = (image_size,) * 2
input_shape = target_size + (3,)

In [None]:
# Training-time augmentation: random crop, flip, rotation and contrast change.
# The rotation factor is 0.25, matching the test-time pipeline further down;
# the previous `RandomRotation(0, 25)` passed 25 as the second positional
# argument instead of a rotation factor.
img_augmentation = tf.keras.Sequential([
    tf.keras.layers.experimental.preprocessing.RandomCrop(image_size, image_size),
    tf.keras.layers.experimental.preprocessing.RandomFlip('horizontal_and_vertical'),
    tf.keras.layers.experimental.preprocessing.RandomRotation(0.25),
    tf.keras.layers.experimental.preprocessing.RandomContrast(0.2),
])

In [None]:
def DataGenerator(train_set, val_set):
    """Build the training and validation image iterators for one fold.

    Both iterators read files from the competition's ``train_images``
    directory, taking file names from the ``image_id`` column and labels
    from the ``label`` column, and use the module-level ``target_size``,
    ``batch_size`` and ``seed_value`` settings.

    Args:
        train_set: DataFrame holding the training rows of the fold.
        val_set: DataFrame holding the validation rows of the fold.

    Returns:
        A ``(train_datagen, val_datagen)`` tuple. The training iterator
        shuffles its samples; the validation iterator keeps them in order.
    """
    # Arguments shared by both iterators, kept in one place so they cannot drift apart.
    shared_options = dict(
        directory='E:/Kaggle.Challenges/cassava-leaf-disease-classification/train_images',
        x_col='image_id',
        y_col='label',
        target_size=target_size,
        batch_size=batch_size,
        class_mode='sparse',
        seed=seed_value,
    )

    train_datagen = ImageDataGenerator().flow_from_dataframe(
        dataframe=train_set, shuffle=True, **shared_options)
    val_datagen = ImageDataGenerator().flow_from_dataframe(
        dataframe=val_set, shuffle=False, **shared_options)

    return train_datagen, val_datagen

# Model Building

In [None]:
epochs = 3

# Batches per epoch when 80% of the rows are used for training, then the
# number of optimizer steps over the whole run.
steps_per_epoch = int(len(df_train) * 0.8 / batch_size) + 1
total_steps = steps_per_epoch * epochs

# Cosine-decay schedule starting at 1e-3 and spanning `total_steps` steps.
lr = tf.keras.experimental.CosineDecay(
    initial_learning_rate=1e-3,
    decay_steps=total_steps,
)

In [None]:
def build_model():
    """Create and compile the EfficientNetB0-based 5-class classifier.

    The network is an ImageNet-initialised EfficientNetB0 backbone (without
    its top), global average pooling, and a 5-unit softmax head. It is
    compiled with Adam driven by the module-level ``lr`` schedule, sparse
    categorical cross-entropy loss and an accuracy metric.

    Returns:
        The compiled Keras ``Model``; its input shape is the module-level
        ``input_shape``.
    """
    # Imported here because the module-level import of EfficientNetB0 is
    # commented out in the import cell.
    from tensorflow.keras.applications import EfficientNetB0

    base_model = EfficientNetB0(include_top=False, weights='imagenet', input_shape=input_shape)

    inputs = Input(shape=input_shape)
    base = base_model(inputs)
    pooling = GlobalAveragePooling2D()(base)
    # The head is pinned to float32 regardless of the global dtype policy.
    outputs = Dense(5, activation='softmax', dtype='float32')(pooling)

    model = Model(inputs=inputs, outputs=outputs)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Stratified 5-Fold Cross Validation

In [None]:
n_splits = 5
oof_accuracy = []  # best validation accuracy reached in each fold

tf.keras.backend.clear_session()
# The folds are not shuffled, so no random_state is passed: it has no effect
# without shuffle=True, and recent scikit-learn releases reject that combination.
skf = StratifiedKFold(n_splits=n_splits)

# Stratify on the labels of the frame that is actually being split.
for fold_number, (train_index, val_index) in enumerate(
        skf.split(df_train['image_id'], df_train['label']), start=1):
    train_set = df_train.loc[train_index]
    val_set = df_train.loc[val_index]
    train_datagen, val_datagen = DataGenerator(train_set, val_set)
    model = build_model()
    print('Training fold no. :' + str(fold_number))

    # One checkpoint file per fold, keeping only the best-scoring epoch.
    model_name = 'effnetb0'
    fold_name = 'fold.h5'
    filepath = model_name + str(fold_number) + fold_name
    callbacks = [ModelCheckpoint(filepath=filepath, monitor='val_accuracy', save_best_only=True)]

    history = model.fit(train_datagen, epochs=epochs,
                        validation_data=val_datagen,
                        callbacks=callbacks)
    oof_accuracy.append(max(history.history['val_accuracy']))

print('Training finished!')

# Retrieving out-of-fold accuracy

After this, we can see what our average OOF accuracy is.

Now to retrieve our OOF predictions, we have to load each model and get them to predict the validation data from their fold.

In [None]:
# Reload the best checkpoint saved for each of the five folds, in fold order.
models = [load_model(f'./effnetb0{fold}fold.h5') for fold in range(1, 6)]

# Individual handles for the per-fold models.
model_one, models_two, models_three, models_four, models_five = models

In [None]:
df = pd.read_csv('E:/Kaggle.Challenges/cassava-leaf-disease-classification/train.csv')

# Recreate the 5-fold split to recover each fold's validation rows.
# The folds are not shuffled, so no random_state is passed: it has no effect
# without shuffle=True, and recent scikit-learn releases reject that combination.
skf = StratifiedKFold(n_splits=5)
val_list = [val_index for _, val_index in skf.split(df['image_id'], df['label'])]

# Validation rows of each fold, selected by row label.
one_fold = df.loc[val_list[0]]
two_fold = df.loc[val_list[1]]
three_fold = df.loc[val_list[2]]
four_fold = df.loc[val_list[3]]
five_fold = df.loc[val_list[4]]

In [None]:
# Test-time augmentation pipeline: random flip, rotation and contrast change.
_tta_layers = tf.keras.layers.experimental.preprocessing
tta = tf.keras.Sequential([
    _tta_layers.RandomFlip('horizontal_and_vertical'),
    _tta_layers.RandomRotation(0.25),
    _tta_layers.RandomContrast(0.2),
])

In [None]:
def duplicate_image(img_path, image_size=image_size, tta_runs=2):
    """Load an image, resize it to a square and stack identical copies of it.

    Args:
        img_path: path of the image file to open.
        image_size: side length, in pixels, of the resized square image.
        tta_runs: number of identical copies to return.

    Returns:
        A NumPy array holding ``tta_runs`` copies of the resized image
        along its first axis.
    """
    # The context manager closes the file once the pixels have been copied out.
    with Image.open(img_path) as img:
        pixels = np.array(img.resize((image_size, image_size)))

    return np.array([pixels] * tta_runs)

In [None]:
def predict_with_tta(image_filename, folder, tta_runs=2):
    """Classify one image by summing predictions over augmented copies.

    Uses the module-level ``model`` for inference and the module-level
    ``tta`` pipeline for augmentation. Each of the ``tta_runs`` copies of
    the image is augmented once and predicted, and the per-class outputs
    are summed.

    Args:
        image_filename: file name of the image; appended directly to ``folder``.
        folder: directory prefix. It is concatenated with the file name
            without a separator, so it must end with one.
        tta_runs: number of augmented passes to sum over.

    Returns:
        A list ``[final_prediction, max_value, global_predictions]``: the
        index of the highest summed score, that score, and the vector of
        summed per-class scores.

    Raises:
        ValueError: if ``tta_runs`` is smaller than 1.
    """
    if tta_runs < 1:
        raise ValueError("tta_runs must be at least 1")

    # One copy of the image per augmentation pass.
    local_image_list = duplicate_image(folder + image_filename, tta_runs=tta_runs)

    # Augment each copy once and predict it. Only one augmentation per copy
    # is ever used, so no extra ones are generated.
    localised_predictions = []
    for local_image in local_image_list:
        batch = tf.expand_dims(local_image, 0)
        augmented = tta(batch)
        predictions = model.predict(np.array(augmented))
        localised_predictions.append(np.sum(predictions, axis=0))

    # Sum the scores of all passes and take the index of the highest value.
    global_predictions = np.sum(np.array(localised_predictions), axis=0)
    max_value = max(global_predictions)
    final_prediction = np.argmax(global_predictions)

    return [final_prediction, max_value, global_predictions]

In [None]:
# Try the TTA prediction on a single training image.
train_folder = 'E:/Kaggle.Challenges/cassava-leaf-disease-classification/train_images/'
train_image = "1000015157.jpg"
predictions = predict_with_tta(train_image, train_folder)

# Print the three returned items, each next to its caption.
captions = ("Predicted Label: ", "Predicted Label Value: ", "Predicted One-Hot Label: ")
for caption, item in zip(captions, predictions):
    print(caption, item)

In [None]:
# Halve the summed score and express it as a percentage.
print(f"Confidence Level: {predictions[1]/2*100:.2f}", "%")

In [None]:
def predict_image_list(image_list, folder):
    """Run ``predict_with_tta`` on every image and collect labels and scores.

    Each image is predicted exactly once, so the label and the score stored
    for it come from the same set of augmentations.

    Args:
        image_list: file names of the images to predict.
        folder: directory prefix passed through to ``predict_with_tta``.

    Returns:
        A list ``[predictions, values]`` of two parallel lists: the
        predicted label and its summed score for each image, in input order.
    """
    predictions = []
    values = []
    # tqdm wraps the iterable to show progress over the images.
    for image_filename in tqdm(image_list, total=len(image_list)):
        label, value, _ = predict_with_tta(image_filename, folder)
        predictions.append(label)
        values.append(value)
    return [predictions, values]

In [None]:
# Cut-off applied to the summed TTA score stored in the "value" column.
# NOTE(review): the factor 2 appears to mirror the two passes summed by
# predict_with_tta by default (i.e. 80% average confidence) — confirm.
threshold = 2*0.8

In [None]:
# NOTE(review): each fold frame is expected to already carry "pred" and
# "value" columns; confirm they are filled in before this cell runs.
def _confident_mismatch(fold_frame):
    """Flag rows whose prediction differs from the label with a score at or above `threshold`."""
    disagrees = fold_frame["label"] != fold_frame["pred"]
    confident = fold_frame["value"] >= threshold
    return disagrees & confident


mask1 = _confident_mismatch(one_fold)
mask2 = _confident_mismatch(two_fold)
mask3 = _confident_mismatch(three_fold)
mask4 = _confident_mismatch(four_fold)
mask5 = _confident_mismatch(five_fold)

# Row labels of the flagged samples in each fold.
one_list = one_fold[mask1].index.to_list()
two_list = two_fold[mask2].index.to_list()
three_list = three_fold[mask3].index.to_list()
four_list = four_fold[mask4].index.to_list()
five_list = five_fold[mask5].index.to_list()

# Sorted, de-duplicated union of the flagged row labels.
all_flagged = one_list + two_list + three_list + four_list + five_list
combined_list = list(np.unique(all_flagged))

In [None]:
# Remove the flagged rows from the training table.
df = df_train.drop(index=combined_list)

In [None]:
# Renumber the rows from 0 and discard the old index.
df = df.reset_index(drop=True)

In [None]:
n_splits = 5
oof_accuracy = []  # best validation accuracy reached in each fold

tf.keras.backend.clear_session()
# The folds are not shuffled, so no random_state is passed: it has no effect
# without shuffle=True, and recent scikit-learn releases reject that combination.
skf = StratifiedKFold(n_splits=n_splits)

for fold_number, (train_index, val_index) in enumerate(
        skf.split(df["image_id"], df["label"]), start=1):
    train_set = df.loc[train_index]
    val_set = df.loc[val_index]
    train_datagen, val_datagen = DataGenerator(train_set, val_set)
    model = build_model()
    print("Training fold no.: " + str(fold_number))

    # One checkpoint file per fold, keeping only the best-scoring epoch.
    model_name = "denoised effnetb0 "
    fold_name = "fold.h5"
    filepath = model_name + str(fold_number) + fold_name
    callbacks = [ModelCheckpoint(filepath=filepath, monitor='val_accuracy', save_best_only=True)]

    history = model.fit(train_datagen, epochs=epochs, validation_data=val_datagen, callbacks=callbacks)
    oof_accuracy.append(max(history.history["val_accuracy"]))

print("Training finished!")