In [1]:
import numpy as np
import pandas as pd
from PIL import Image
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.utils import class_weight
from sklearn.preprocessing import minmax_scale
import random
import cv2
from imgaug import augmenters as iaa
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.experimental import CosineDecay
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.layers.experimental.preprocessing import RandomCrop,CenterCrop, RandomRotation


# ====================================================
# CFG
# ====================================================
class CFG:
    # Run-wide hyper-parameters, read as plain class attributes (never instantiated here).

    # When truthy, the statement following the config classes cuts `epochs` down to 1.
    debug = True

    # Data split / input geometry.
    training_percentage = 0.8   # fraction of the shuffled rows used for training
    image_size = 512            # side length (pixels) of the square model input
    batch_size = 4

    # Optimisation / regularisation.
    epochs = 8
    dropout_rate = 0.4          # used for both the head Dropout and drop_connect_rate

class PATH:
    # Filesystem locations used by the notebook, relative to the working directory.

    # Competition data.
    train_csv = '../input/cassava-leaf-disease-classification/train.csv'
    train_folder = '../input/cassava-leaf-disease-classification/train_images/'

    # Pre-trained EfficientNetB3 weights without the classification top.
    # NOTE(review): both entries point at the same file - confirm whether the
    # noisy-student weights were meant to live at a different path.
    ef_weight = '../model/efficientnetb3_notop.h5'
    ef_noisy_student_weight = '../model/efficientnetb3_notop.h5'

# Debug runs are limited to a single epoch; otherwise the configured value is kept.
CFG.epochs = 1 if CFG.debug else CFG.epochs


# ====================================================
# PREPARE DATA
# ====================================================
# Load the label table, shuffle it reproducibly and attach the full image path.
samples_df = pd.read_csv(PATH.train_csv)
samples_df = shuffle(samples_df, random_state=42)
samples_df["filepath"] = PATH.train_folder + samples_df["image_id"]

# Positional split: the first `training_percentage` of the shuffled rows are
# used for training, the remainder for validation.
training_item_count = int(len(samples_df) * CFG.training_percentage)
validation_item_count = len(samples_df) - training_item_count

# Take explicit copies so that columns can later be added to either frame
# without writing through a slice of `samples_df` (SettingWithCopy).
training_df = samples_df.iloc[:training_item_count].copy()
validation_df = samples_df.iloc[training_item_count:].copy()


# ====================================================
# BUILD GRAPH
# ====================================================

def load_image_and_label_from_path(image_path, label):
    '''
    Read the JPEG file at `image_path` and decode it to a 3-channel image.

    The label is passed through untouched so the function can be used with
    `Dataset.map` on (path, label) pairs.
    '''
    encoded = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(encoded, channels=3)
    return decoded, label

def adapt_mode(image_path):
    '''
    Read and decode the JPEG at `image_path` (3 channels) and rescale the
    pixel values by 1/255.
    '''
    rescale = layers.experimental.preprocessing.Rescaling(1.0 / 255)
    decoded = tf.image.decode_jpeg(tf.io.read_file(image_path), channels=3)
    return rescale(decoded)

AUTOTUNE = tf.data.experimental.AUTOTUNE

# (path, label) pairs are tiny, so shuffling happens on them *before* the images
# are decoded. Shuffling decoded images would keep a whole shuffle buffer of
# full-size image tensors in memory.
training_paths_and_labels = tf.data.Dataset.from_tensor_slices((training_df.filepath.values, training_df.label.values))
validation_paths_and_labels = tf.data.Dataset.from_tensor_slices((validation_df.filepath.values, validation_df.label.values))

# Decoded, unbatched datasets (kept under their original names).
training_data = training_paths_and_labels.map(load_image_and_label_from_path, num_parallel_calls=AUTOTUNE)
validation_data = validation_paths_and_labels.map(load_image_and_label_from_path, num_parallel_calls=AUTOTUNE)

# Training batches: full-dataset shuffle of the paths, then decode, batch, prefetch.
training_data_batches = (
    training_paths_and_labels
    .shuffle(buffer_size=max(1, len(training_df)))
    .map(load_image_and_label_from_path, num_parallel_calls=AUTOTUNE)
    .batch(CFG.batch_size)
    .prefetch(buffer_size=AUTOTUNE)
)

# Validation metrics do not depend on sample order, so no shuffle is needed.
validation_data_batches = validation_data.batch(CFG.batch_size).prefetch(buffer_size=AUTOTUNE)

# Rescaled training images used to adapt the backbone's normalization layer.
# The adapted statistics are order-independent, so this pipeline is not shuffled either.
adapt_data = tf.data.Dataset.from_tensor_slices(training_df.filepath.values)
adapt_data = adapt_data.map(adapt_mode, num_parallel_calls=AUTOTUNE)
adapt_data_batches = adapt_data.batch(CFG.batch_size).prefetch(buffer_size=AUTOTUNE)




def build_graph(init_weights=PATH.ef_weight):
    '''
    Build and compile the EfficientNetB3 classifier.

    Graph: input -> random augmentation -> EfficientNetB3 (no top) ->
    global average pooling -> dropout -> softmax over the label classes
    found in `training_df`.

    The backbone's normalization layer is adapted on `adapt_data_batches`
    and the model is compiled with Adam on a cosine-decayed learning rate.

    Args:
        init_weights: forwarded as `weights` to `EfficientNetB3`.

    Returns:
        The compiled `Model`.
    '''
    # Augmentation is part of the model so it runs on the accelerator with the rest of the graph.
    data_augmentation_layers = tf.keras.Sequential(
        [
            layers.experimental.preprocessing.RandomCrop(height=CFG.image_size, width=CFG.image_size),
            layers.experimental.preprocessing.RandomFlip("horizontal_and_vertical"),
            layers.experimental.preprocessing.RandomRotation(0.25),
            layers.experimental.preprocessing.RandomZoom((-0.2, 0)),
            layers.experimental.preprocessing.RandomContrast((0.2,0.2))
        ]
    )
    n_output = len(training_df.label.unique())

    # NOTE(review): the declared input is image_size x image_size while RandomCrop
    # is what crops to that size - confirm the dimensions of the images fed by
    # the training pipeline.
    input_shape = (CFG.image_size, CFG.image_size, 3)

    backbone = EfficientNetB3(
        weights=init_weights,
        include_top=False,
        input_shape=input_shape,
        drop_connect_rate=CFG.dropout_rate)

    inputs = Input(shape=input_shape)
    augmented = data_augmentation_layers(inputs)
    features = backbone(augmented)
    pooling = layers.GlobalAveragePooling2D()(features)
    dropout = layers.Dropout(CFG.dropout_rate)(pooling)
    outputs = Dense(n_output, activation="softmax")(dropout)
    model = Model(inputs=inputs, outputs=outputs)

    # Adapt the backbone's normalization layer on the training images.
    # Keras appends a numeric suffix to auto-generated layer names when the
    # graph is built more than once in a session, so match on the prefix
    # instead of the exact name 'normalization'.
    normalization_layer = next(
        layer for layer in backbone.layers if layer.name.startswith('normalization')
    )
    normalization_layer.adapt(adapt_data_batches)

    # Un-dropped batching yields ceil(n / batch_size) steps per epoch.
    steps_per_epoch = int(np.ceil(len(training_df) / CFG.batch_size))
    decay_steps = steps_per_epoch * CFG.epochs
    cosine_decay = CosineDecay(initial_learning_rate=1e-4, decay_steps=decay_steps, alpha=0.3)
    model.compile(loss="sparse_categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(cosine_decay), metrics=["accuracy"])

    return model

# Defined at module scope because the `model.fit` cells pass `callbacks=callbacks`;
# keeps the weights with the lowest validation loss in 'best_model.h5'.
callbacks = [ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]

model = build_graph()
model.summary()

  data = yaml.load(f.read()) or {}


Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 512, 512, 3)]     0         
_________________________________________________________________
sequential (Sequential)      (None, 512, 512, 3)       0         
_________________________________________________________________
efficientnetb3 (Functional)  (None, 16, 16, 1536)      10783535  
_________________________________________________________________
global_average_pooling2d (Gl (None, 1536)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1536)              0         
_________________________________________________________________
dense (Dense)                (None, 5)                 7685      
Total params: 10,791,220
Trainable params: 10,703,917
Non-trainable params: 87,303
_____________________________________

In [20]:
# Number of distinct label values present in the training split.
len(sorted(training_df["label"].unique()))

5

In [2]:
# ====================================================
# TRAIN
# ====================================================

In [None]:
N_LESSION = 1

# Fit on the training batches, evaluating on the validation batches after every epoch.
history = model.fit(
    training_data_batches,
    validation_data=validation_data_batches,
    epochs=CFG.epochs,
    callbacks=callbacks,
)

In [3]:
# NOTE(review): this is the same fit call as in the preceding cell; running both
# trains the model for a further CFG.epochs epochs and overwrites `history`.
history = model.fit(
    training_data_batches,
    validation_data=validation_data_batches,
    epochs=CFG.epochs,
    callbacks=callbacks,
)
# pd.DataFrame(history.history).to_csv('history')



In [6]:
# Per-epoch metrics recorded by the last `model.fit` call.
history.history

{'loss': [0.6297047734260559],
 'accuracy': [0.7858853936195374],
 'val_loss': [0.44493407011032104],
 'val_accuracy': [0.8429906368255615]}

In [14]:
# Random transforms applied to each crop at prediction time (flip, zoom-in, contrast).
test_time_augmentation_layers = tf.keras.Sequential()
test_time_augmentation_layers.add(layers.experimental.preprocessing.RandomFlip("horizontal_and_vertical"))
test_time_augmentation_layers.add(layers.experimental.preprocessing.RandomZoom((-0.2, 0)))
test_time_augmentation_layers.add(layers.experimental.preprocessing.RandomContrast((0.2,0.2)))

def scan_over_image(img_path, crop_size=512):
    '''
    Extract the four corner crops of an image.

    The image is loaded as RGB and four `crop_size` x `crop_size` windows
    are cut out, one anchored at each corner, so that neighbouring crops
    overlap whenever a side is shorter than twice `crop_size`.

    Args:
        img_path: path of the image file to open.
        crop_size: side length in pixels of each square crop.

    Returns:
        Array of shape (4, crop_size, crop_size, 3), ordered top-left,
        top-right, bottom-left, bottom-right.

    Raises:
        ValueError: if the image is smaller than `crop_size` on either side.
    '''

    # The context manager closes the underlying file once the pixels are copied out.
    with Image.open(img_path) as img:
        # PIL reports (width, height); the numpy array below is (height, width, channels).
        img_width, img_height = img.size
        pixels = np.array(img.convert("RGB"))

    if img_height < crop_size or img_width < crop_size:
        raise ValueError(
            "image %r is %dx%d, smaller than crop_size=%d"
            % (img_path, img_width, img_height, crop_size)
        )

    row_origins = [0, img_height - crop_size]
    col_origins = [0, img_width - crop_size]
    img_list = [
        pixels[row:row + crop_size, col:col + crop_size, :]
        for row in row_origins
        for col in col_origins
    ]

    return np.array(img_list)


def predict_and_vote(image_filename, folder, TTA_runs=4):
    '''
    Predict the class of one image from its corner crops with test-time
    augmentation.

    Every crop returned by `scan_over_image` is duplicated `TTA_runs`
    times, randomly augmented and passed through `model`. The predicted
    class scores are summed over all augmented copies of all crops, and the
    index of the largest total is returned.

    Args:
        image_filename: file name of the image inside `folder`.
        folder: directory containing the image.
        TTA_runs: number of augmented copies evaluated per crop.

    Returns:
        Index of the class with the highest summed score.
    '''

    # Sum the predictions of the augmented copies separately for each local crop.
    localised_predictions = []
    local_image_list = scan_over_image(os.path.join(folder, image_filename))
    for local_image in local_image_list:
        duplicated_local_image = tf.convert_to_tensor(
            np.repeat(local_image[np.newaxis, ...], TTA_runs, axis=0)
        )
        # training=True is passed explicitly: Keras random preprocessing layers
        # are only guaranteed to apply their transform in training mode, and
        # without it the TTA copies can all be identical.
        augmented_images = test_time_augmentation_layers(duplicated_local_image, training=True)

        predictions = model.predict(augmented_images)
        localised_predictions.append(np.sum(predictions, axis=0))

    # Sum over all crops and pick the class with the highest total score.
    global_predictions = np.sum(np.array(localised_predictions),axis=0)
    final_prediction = np.argmax(global_predictions)

    return final_prediction

def run_predictions_over_image_list(image_list, folder):
    '''
    Call `predict_and_vote` for every file name in `image_list` (looked up
    in `folder`) while showing a progress bar, and return the predictions
    as a list in input order.
    '''
    results = []
    progress = tqdm(total=len(image_list))
    with progress:
        for name in image_list:
            # The bar is advanced before the image is processed.
            progress.update(1)
            results.append(predict_and_vote(name, folder))
    return results

In [15]:
# Preview the validation split before predictions are attached.
validation_df.head()

Unnamed: 0,image_id,label,filepath
10234,2824543301.jpg,3,../input/cassava-leaf-disease-classification/t...
4763,184909120.jpg,3,../input/cassava-leaf-disease-classification/t...
9062,2602456265.jpg,3,../input/cassava-leaf-disease-classification/t...
1874,1331491784.jpg,3,../input/cassava-leaf-disease-classification/t...
17431,414363375.jpg,3,../input/cassava-leaf-disease-classification/t...


In [16]:
# Predict every validation image and attach the result as a new column.
# `.assign` returns a new frame, so this never writes into a slice of the
# original samples table and the cell can be re-run safely.
validation_df = validation_df.assign(
    results=run_predictions_over_image_list(validation_df["image_id"], PATH.train_folder)
)

100%|██████████| 4280/4280 [18:01<00:00,  3.96it/s]


In [17]:
# Preview the validation split again, now including the `results` column.
validation_df.head()

Unnamed: 0,image_id,label,filepath,results
10234,2824543301.jpg,3,../input/cassava-leaf-disease-classification/t...,3
4763,184909120.jpg,3,../input/cassava-leaf-disease-classification/t...,3
9062,2602456265.jpg,3,../input/cassava-leaf-disease-classification/t...,3
1874,1331491784.jpg,3,../input/cassava-leaf-disease-classification/t...,3
17431,414363375.jpg,3,../input/cassava-leaf-disease-classification/t...,3


In [19]:
# Fraction of validation rows whose predicted class matches the label.
(validation_df['label'] == validation_df['results']).mean()

0.858411214953271