In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, json, cv2
from PIL import Image
from sklearn.model_selection import train_test_split

#model imports (keras/tensorflow)
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
from keras import layers, models
from tensorflow.keras.applications import EfficientNetB0
from keras.optimizers import Adam
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D
from keras.layers import Dense, Dropout, Flatten
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

### Notes

- NORMALIZE THE LAYERS IT TAKES WAY TOO LONG TO TRAIN
- Best results 512 img_size, might want to use 224 to save GPU if toying with model
- Make Normalization Layers Non-Trainable for pretrained model
    - https://www.kaggle.com/bibhash123/cassava-classification-simple-overview-tf-keras
- (custom loss func) - Bi-Tempered Logistic Loss or other?
- (cross-validation strat) - 5 fold cross validation

Notebooks that helped me, and taught me some awesome things along the way.

[Maksym Shkliarevskyi's Notebook](https://www.kaggle.com/maksymshkliarevskyi/cassava-leaf-disease-best-keras-cnn)

[Chris Deotte's GPU/TPU Augmentation Notebook](https://www.kaggle.com/cdeotte/rotation-augmentation-gpu-tpu-0-96)

# Load Data

In [None]:
# Root folder of the competition data; every other path is built from it.
WORK_DIR = '../input/cassava-leaf-disease-classification'

# Count the files in the train_images folder.
train_image_dir = os.path.join(WORK_DIR, "train_images")
n_train_images = len(os.listdir(train_image_dir))
print('Train images: %d' % n_train_images)

In [None]:
# Pretty-print the label-number -> disease-name mapping shipped as JSON.
label_map_path = os.path.join(WORK_DIR, "label_num_to_disease_map.json")
with open(label_map_path) as label_map_file:
    label_map = json.load(label_map_file)
    print(json.dumps(label_map, indent=4))

The labels for each image are stored in a csv file. We can read these into a pandas DataFrame and view its first few rows.

In [None]:
# Read the training image labels (one row per image: image_id, label).
# The path is built from WORK_DIR so the data location is defined in one place only.
all_train_labels_df = pd.read_csv(os.path.join(WORK_DIR, "train.csv"))
all_train_labels_df.head(3)

# Visualizing Data

Although I find it quite difficult for the human eye to determine any major differences between the leaves in this dataset, I feel it is important to plot a few images to get a feel for the nature of the dataset. 

- I used .sample() to take three random images from each group. This is handy, as every time the notebook is run we see a different set of images.

###  "0" : Cassava Bacterial Blight (CBB)

In [None]:
# Show three randomly sampled images labelled 0 (CBB) side by side.
plt.figure(figsize=(40, 40))
sample_ids = all_train_labels_df.loc[all_train_labels_df.label==0].sample(3).image_id.values
for i, image_id in enumerate(sample_ids, start=1):
    img = cv2.imread(os.path.join(WORK_DIR, "train_images", image_id)) #using cv2 library to read imgs
    # cv2 returns channels in BGR order while matplotlib expects RGB;
    # convert so the leaves are displayed with their true colours.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.subplot(1, 3, i)
    plt.imshow(img)
    plt.title(image_id, fontsize=30)

plt.show()


###  "1" : Cassava Brown Streak Disease (CBSD)

In [None]:
# Show three randomly sampled images labelled 1 (CBSD) side by side.
plt.figure(figsize=(40, 40))
sample_ids = all_train_labels_df.loc[all_train_labels_df.label==1].sample(3).image_id.values
for i, image_id in enumerate(sample_ids, start=1):
    img = cv2.imread(os.path.join(WORK_DIR, "train_images", image_id)) #using cv2 library to read imgs
    # cv2 returns channels in BGR order while matplotlib expects RGB;
    # convert so the leaves are displayed with their true colours.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.subplot(1, 3, i)
    plt.imshow(img)
    plt.title(image_id, fontsize=30)

plt.show()

### "2" : Cassava Green Mottle (CGM)

In [None]:
# Show three randomly sampled images labelled 2 (CGM) side by side.
plt.figure(figsize=(40, 40))
sample_ids = all_train_labels_df.loc[all_train_labels_df.label==2].sample(3).image_id.values
for i, image_id in enumerate(sample_ids, start=1):
    img = cv2.imread(os.path.join(WORK_DIR, "train_images", image_id)) #using cv2 library to read imgs
    # cv2 returns channels in BGR order while matplotlib expects RGB;
    # convert so the leaves are displayed with their true colours.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.subplot(1, 3, i)
    plt.imshow(img)
    plt.title(image_id, fontsize=30)

plt.show()

### "3" : Cassava Mosaic Disease (CMD)

In [None]:
# Show three randomly sampled images labelled 3 (CMD) side by side.
plt.figure(figsize=(40, 40))
sample_ids = all_train_labels_df.loc[all_train_labels_df.label==3].sample(3).image_id.values
for i, image_id in enumerate(sample_ids, start=1):
    img = cv2.imread(os.path.join(WORK_DIR, "train_images", image_id)) #using cv2 library to read imgs
    # cv2 returns channels in BGR order while matplotlib expects RGB;
    # convert so the leaves are displayed with their true colours.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.subplot(1, 3, i)
    plt.imshow(img)
    plt.title(image_id, fontsize=30)

plt.show()

### "4" : Healthy

In [None]:
# Show three randomly sampled images labelled 4 (Healthy) side by side.
plt.figure(figsize=(40, 40))
sample_ids = all_train_labels_df.loc[all_train_labels_df.label==4].sample(3).image_id.values
for i, image_id in enumerate(sample_ids, start=1):
    img = cv2.imread(os.path.join(WORK_DIR, "train_images", image_id)) #using cv2 library to read imgs
    # cv2 returns channels in BGR order while matplotlib expects RGB;
    # convert so the leaves are displayed with their true colours.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.subplot(1, 3, i)
    plt.imshow(img)
    plt.title(image_id, fontsize=30)

plt.show()

# Data Augmentation + Variables

STEPS_PER_EPOCH - great parameter to use when making augmented data on the fly. This is basically the number of batch iterations before the epoch is considered finished.

VALIDATION_STEPS - Same as STEPS_PER_EPOCH but is used when we are testing the model on the validation dataset

In [None]:
#model parameters values used to aid in readability and debugging etc.
BATCH_SIZE = 16
EPOCHS = 15
TARGET_SIZE = 224 #parameter that specifies the image dimensions '224 for test, 512 for highest score'

# Keras documents steps_per_epoch / validation_steps as integers, so round the
# batch counts up to whole numbers; rounding up keeps the final partial batch.
# The 0.9 / 0.1 factors mirror the 90/10 train/validation split.
STEPS_PER_EPOCH = int(np.ceil(len(all_train_labels_df) * 0.9 / BATCH_SIZE)) #number of batch iterations before epoch is considered done
VALIDATION_STEPS = int(np.ceil(len(all_train_labels_df) * 0.1 / BATCH_SIZE))

I would like to do a stratified test split on the data so that I have both a training and validation dataset that is representative of the entire dataset. 

In [None]:
# Labels are cast to strings before splitting.
# NOTE(review): this rewrites the label column of all_train_labels_df in place,
# so earlier cells that compare label to an integer will no longer match after
# this cell has run.
all_train_labels_df["label"] = all_train_labels_df["label"].astype('str')

# Stratified 90/10 split: both parts keep the class proportions of the full set.
train_labels_df, valid_labels_df = train_test_split(
    all_train_labels_df,
    test_size=0.10,
    stratify=all_train_labels_df["label"],
    random_state=2718,
)

Just making sure we are getting an even distribution of labels across the train and test datasets

In [None]:
# Relative frequency of each class in the training split.
train_labels_df["label"].value_counts(normalize=True)

In [None]:
# Relative frequency of each class in the validation split.
valid_labels_df["label"].value_counts(normalize=True)

Using the very intuitive Keras image preprocessing framework called ImageDataGenerator.

- documentation and parameter guide found here -> [link](https://keras.io/api/preprocessing/image/#imagedatagenerator-class)

In [None]:
# Random augmentation settings applied on the fly as images are read.
augmentation_settings = dict(
    preprocessing_function=None,
    rotation_range=45,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest',  # set to default val
    shear_range=0.1,  # distortion along an axis, which changes the angle of the image
    height_shift_range=0.1,
    width_shift_range=0.1,
)
my_data_augmentor = ImageDataGenerator(**augmentation_settings)

This is how we read through different images in the work directory and apply the imagedatagenerator as they are read in.

Note that the train/validation split was already performed above with train_test_split; each generator below simply reads the images listed in its own dataframe.

In [None]:
# Training batches: images are read from disk and randomly augmented on the fly.
training_generator = my_data_augmentor.flow_from_dataframe(train_labels_df,
                         directory = os.path.join(WORK_DIR, "train_images"),
                         x_col = "image_id",
                         y_col = "label",
                         target_size = (TARGET_SIZE, TARGET_SIZE),
                         batch_size = BATCH_SIZE,
                         class_mode = "sparse")

# Validation batches use a plain generator: random augmentation here would make
# val_loss noisy and not comparable between epochs, which in turn would distort
# the checkpointing / early-stopping decisions that monitor it.
# shuffle=False keeps the evaluation order fixed from epoch to epoch.
validation_generator = ImageDataGenerator().flow_from_dataframe(valid_labels_df,
                         directory = os.path.join(WORK_DIR, "train_images"),
                         x_col = "image_id",
                         y_col = "label",
                         target_size = (TARGET_SIZE, TARGET_SIZE),
                         batch_size = BATCH_SIZE,
                         shuffle = False,
                         class_mode = "sparse") #Determines the type of label arrays that are returned

### Visualizing Augmented Data

The following subsection is pretty cool: it enables me to alter the parameters of the ImageDataGenerator and visualize how this affects an image before committing those parameters to a model to train.

In [None]:
# Generator over a single training row, used only to preview the augmentation.
generator = my_data_augmentor.flow_from_dataframe(
    train_labels_df.iloc[153:154],
    directory=os.path.join(WORK_DIR, "train_images"),
    x_col="image_id",
    y_col="label",
    target_size=(TARGET_SIZE, TARGET_SIZE),
    batch_size=BATCH_SIZE,
    class_mode="categorical",
)

# Fetch batch 0 ten times; each fetch is expected to return a freshly augmented
# copy of the same image. Dividing by 255 puts pixel values in the range imshow
# expects for float images.
aug_images = []
for _ in range(10):
    batch_images = generator[0][0]
    aug_images.append(batch_images[0] / 255)

# Lay the ten variants out on a 2 x 5 grid without axis decorations.
fig, axes = plt.subplots(2, 5, figsize=(20, 10))
axes = axes.flatten()
for ax, img in zip(axes, aug_images):
    ax.imshow(img)
    ax.axis('off')
plt.tight_layout()
plt.show()

# Creating the model

To start off, I am going to build on one of TensorFlow's pretrained models, EfficientNetB0. I will also be trying out other base models, i.e. VGG, Inception, and ResNet. I'm not entirely sure which model is best for this situation, but that's what the internet is for. I am looking into this topic!

The following cell builds and saves the model below. This has previously been saved to a dataset and we can load this back in with its optimal weights.

Note we are using "sparse_categorical_crossentropy" because we did not one hot encode our training labels. If we did one_hot encode labels we would use "categorical_crossentropy"

In [None]:
# !git clone https://github.com/Diulhio/bitemperedloss-tf.git
# import sys
# sys.path.append('./bitemperedloss-tf')
# from tf_bi_tempered_loss import BiTemperedLogisticLoss

# # loss=BiTemperedLogisticLoss(t1=1.0, t2=1.0, label_smoothing=0.1)
# loss=BiTemperedLogisticLoss(t1=1.0, t2=1.0)

In [None]:
#make sure internet is enabled in notebook so we can access efficientNetB0 model from google.storage.api
def create_model():
    """Build and compile the EfficientNetB0-based classifier.

    The network is an ImageNet-initialised EfficientNetB0 without its top,
    followed by global average pooling and a 5-unit softmax layer. It is
    compiled with the 'adam' optimizer, sparse categorical cross-entropy
    (labels are class indices, not one-hot vectors) and the "acc" metric.

    Returns:
        The compiled Sequential model.
    """
    backbone = EfficientNetB0(
        include_top=False,
        weights='imagenet',
        input_shape=(TARGET_SIZE, TARGET_SIZE, 3),
    )
    classifier_head = [
        layers.GlobalAveragePooling2D(),
        layers.Dense(5, activation="softmax"),  # 5 is the dimensionality of the output space "5 options"
    ]

    model = models.Sequential()
    for layer in [backbone, *classifier_head]:
        model.add(layer)

    model.compile(
        optimizer='adam',
        loss="sparse_categorical_crossentropy",
        metrics=["acc"],
    )
    return model

In [None]:
# Instantiate the network and report how many top-level layers it has.
model = create_model()
layer_count = len(model.layers)
print('CNN has {} layers'.format(layer_count))

In [None]:
# Save the freshly built (not yet fitted) model to the working directory.
# NOTE(review): this file name starts with 'EfNetB0' while the other saved
# files in this notebook use 'EffNetB0' - confirm which spelling is intended.
model.save('./EfNetB0_untrained_progress_model.h5')

# Callbacks

ModelCheckpoint docs - https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint

ReduceLROnPlateau docs - https://keras.io/api/callbacks/reduce_lr_on_plateau/

In [None]:
#callbacks
# Checkpoint: whenever val_loss improves, write the weights (weights only) to disk.
model_save = ModelCheckpoint(
    './EffNetB0_best_progress_weights_model.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    save_freq='epoch',
    verbose=1,
)

# Early stopping: give up after 5 epochs without a val_loss improvement of at
# least 0.001, and roll the model back to its best weights.
my_early_stopper = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.3, 
#                               patience = 2, min_delta = 0.001, 
#                               mode = 'min', verbose = 1)

### Custom LR Scheduler

The warm-up phase is important so that we do not have any catastrophic forgetting of model weights happening at the beginning of training on the pretrained model.

A custom LR scheduler is common practice for transfer learning. The learning rate starts near zero, then increases to a maximum, then decays over time.

[Chris Deotte's GPU/TPU Aug](https://www.kaggle.com/cdeotte/rotation-augmentation-gpu-tpu-0-96)

In [None]:
# Learning rate schedule for TPU, GPU and CPU.
# Using an LR ramp up because fine-tuning a pre-trained model.
# Starting with a high LR would break the pre-trained weights.

# Schedule shape: linear ramp from LR_START up to LR_MAX, optional plateau at
# LR_MAX, then exponential decay towards LR_MIN.
LR_START = 0.0005          # learning rate at epoch 0
LR_MAX = 0.001             # peak learning rate reached after the ramp-up
LR_MIN = 0.0005            # floor the decay converges to
LR_RAMPUP_EPOCHS = 7       # number of epochs spent ramping up
LR_SUSTAIN_EPOCHS = 0      # number of epochs held at LR_MAX
LR_EXP_DECAY = .8          # per-epoch multiplier applied to (lr - LR_MIN) during decay

def lrfn(epoch):
    """Return the learning rate to use for the given epoch index.

    Args:
        epoch: epoch index; 0 yields LR_START.

    Returns:
        The learning rate as a float.
    """
    # Phase 1: linear warm-up.
    if epoch < LR_RAMPUP_EPOCHS:
        return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START

    # Phase 2: hold at the peak.
    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX

    # Phase 3: exponential decay towards LR_MIN.
    epochs_into_decay = epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS
    return (LR_MAX - LR_MIN) * LR_EXP_DECAY**epochs_into_decay + LR_MIN
    
#setting verbose=True allows us to see LR in model training
# Wrap lrfn in a Keras callback so it can be passed to model.fit.
lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = True)

Visualizing the Custom LR Scheduler

In [None]:
# Plot the schedule over at least 25 epochs (or over EPOCHS if that is larger).
rng = list(range(max(EPOCHS, 25)))
y = list(map(lrfn, rng))
plt.plot(rng, y)
print("Learning rate schedule: {:.3g} to {:.3g} to {:.3g}".format(y[0], max(y), y[-1]))

In [None]:
# Print the layer-by-layer overview of the model before training.
model.summary()

# Fit Model

In [None]:
 # Train the model on the training generator and evaluate on the validation
 # generator after each epoch. The callbacks checkpoint the best weights, stop
 # early when val_loss stalls, and apply the custom learning-rate schedule.
 # The returned History object is kept for the loss/accuracy plots.
 history = model.fit(
     training_generator,
     steps_per_epoch = STEPS_PER_EPOCH,
     epochs = EPOCHS,
     validation_data = validation_generator,
     validation_steps = VALIDATION_STEPS,
     callbacks = [model_save, my_early_stopper, lr_callback]
 )

In [None]:
# Save the model in the state it is in once fitting has finished.
model.save('./EffNetB0_trained_progress_model.h5')

### Visualizing Model History

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots(figsize=(13, 5))
for history_key in ('loss', 'val_loss'):
    ax.plot(history.history[history_key])
ax.set_title("Model Loss")
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend(['Train', 'Test'])
ax.grid()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots(figsize=(13, 5))
for history_key in ('acc', 'val_acc'):
    ax.plot(history.history[history_key])
ax.set_title('Model Accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend(['Train','Test'])
ax.grid()
plt.show()

# Prediction

Making predictions in a separate notebook, just keeping code here as a guide.

In [None]:
# sample_submission = pd.read_csv('../input/cassava-leaf-disease-classification/sample_submission.csv')
# sample_submission

In [None]:
#making preds on all the test_images
# preds = []

# for image_id in sample_submission.image_id:
#     image = Image.open(os.path.join(WORK_DIR,  "test_images", image_id))
#     image = image.resize((TARGET_SIZE, TARGET_SIZE))
#     image = np.expand_dims(image, axis = 0)
#     preds.append(np.argmax(model.predict(image)))

# sample_submission['label'] = preds
# sample_submission