# Introduction

This notebook trains an image classifier for the Cassava Leaf Disease Classification Kaggle challenge.

In [None]:
import re
import numpy as np
import tensorflow as tf
from kaggle_datasets import KaggleDatasets
from keras.models import load_model

In [None]:
# Seed the NumPy and TensorFlow global random generators with one shared value
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)

In [None]:
# Root folder of the competition data. To read from the GCS bucket instead of
# the locally mounted copy, swap in the commented line below.
#GCS_DS_PATH = KaggleDatasets().get_gcs_path('cassava-leaf-disease-classification')
GCS_DS_PATH = '/kaggle/input/cassava-leaf-disease-classification/'

# Strip a trailing slash before joining so the glob pattern is well-formed
# whether or not the root ends with '/' (the local path above does; the
# GCS_DS_PATH alternative may not).
FILEPATHS = tf.io.gfile.glob(GCS_DS_PATH.rstrip('/') + '/train_tfrecords/*.tfrec')
# To debug on a single shard instead:
#FILEPATHS = [GCS_DS_PATH.rstrip('/') + '/train_tfrecords/ld_train00-1338.tfrec']

# Fail early: with no files, the image count and steps-per-epoch derived from
# FILEPATHS would both be 0 and training would fail later with a less clear error.
if not FILEPATHS:
    raise FileNotFoundError("No training TFRecord files found under " + GCS_DS_PATH)

## Constants

In [None]:
def count_data_items(filenames):
    """
    Sum the record counts that are embedded in the TFRecord file names.

    Each name is expected to contain "-<count>." (e.g. "ld_train00-1338.tfrec"
    contributes 1338); the first such match in the name is used.

    Input
    filenames - iterable of strings

    Output
    the numpy sum of the per-file counts
    """
    count_pattern = re.compile(r"-([0-9]*)\.")

    per_file_counts = []
    for name in filenames:
        match = count_pattern.search(name)
        per_file_counts.append(int(match.group(1)))

    return np.sum(per_file_counts)

# Dataset size, derived from the per-file counts embedded in the TFRecord names
NUM_TRAINING_IMAGES = int(count_data_items(FILEPATHS))
print("Number of training images:", NUM_TRAINING_IMAGES)

# Input resolution: images are down-scaled to DIM x DIM because the original
# size is too big and leads to an out-of-memory error.
#IMAGE_SIZE = [800,600] # Original image size
DIM = 128
IMAGE_SIZE = [DIM, DIM]

# Number of target classes (also the depth of the one-hot labels)
CLASSES = 5

# Training schedule
BATCH_SIZE = 64
EPOCHS = 10
# Number of full batches in one pass over the training images
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE

## Reading the Data

In [None]:
def decode_image(image_data):
    """
    Turn the JPEG bytes stored in the image column of a TFRecord into an image tensor.

    Input
    image_data - string Tensor holding one JPEG-encoded image

    Output
    float32 Tensor of shape (DIM, DIM, 3), computed from pixel values scaled by 1/255
    """
    target_size = [DIM, DIM]

    # Decode the JPEG bytes into a 3-channel tensor of integer pixel values
    pixels = tf.image.decode_jpeg(image_data, channels=3)

    # Convert to float32 and normalize the values so that they lie in the [0,1] range
    scaled = tf.cast(pixels, tf.float32) / 255.0

    # Resize the image to the desired dimension
    resized = tf.image.resize(scaled, target_size)

    # Reshape to (height, width, color_channel) so the tensor carries a fully defined shape
    return tf.reshape(resized, target_size + [3])

def read_labeled_tfrecord(example):
    """
    Parse one serialized record of a TFRecordDataset into an (image, label) pair.

    Input
    example - one serialized record (string Tensor) with an "image" and a "target" feature

    Output
    image - the decoded image, as returned by decode_image
    label - the "target" feature cast to int32
    """
    # The expected "columns" of a record: the encoded image bytes and an integer label
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "target": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)

    # The image column is stored as encoded bytes and must be decoded into a tensor
    image = decode_image(parsed['image'])
    label = tf.cast(parsed['target'], tf.int32)
    return image, label

def load_dataset(filenames, labeled = True, ordered = False):
    """
    This function will create a dataset of (image, label) pairs from the given TFRecord files

    Input
    filenames - list of strings, paths of the TFRecord files
    labeled - currently unused; records are always parsed with read_labeled_tfrecord.
              Kept so that existing callers keep working.
    ordered - if True, records are read file after file in a deterministic order.
              If False (default), files are read in parallel and the order of the
              records is allowed to vary in exchange for throughput.

    Output
    tf.data.Dataset yielding (image, label) pairs
    """
    autotune = tf.data.experimental.AUTOTUNE

    options = tf.data.Options()
    if ordered:
        # Sequential reads keep the file-after-file record order
        parallel_reads = None
    else:
        # Relaxing determinism only pays off when something actually runs in
        # parallel, so also read several files at once
        options.experimental_deterministic = False
        parallel_reads = autotune

    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=parallel_reads)
    dataset = dataset.with_options(options)
    # Decode records in parallel; with deterministic options the output order is still preserved
    dataset = dataset.map(read_labeled_tfrecord, num_parallel_calls=autotune)
    return dataset

def get_training_dataset(dataset):
    """
    This function turns a dataset of (image, label) pairs into an endless stream
    of shuffled training batches with one-hot encoded labels

    Input
    dataset - dataset of (image, label) pairs, e.g. the output of load_dataset

    Output
    tf.data.Dataset yielding batches of BATCH_SIZE (image, one-hot label) pairs.
    The dataset repeats forever, so the caller must bound each epoch
    (e.g. with steps_per_epoch in model.fit).
    """
    dataset = dataset.map(onehot)
    # Shuffle before repeating so each pass over the data is reshuffled on its
    # own and records from different passes are not mixed in the shuffle buffer
    dataset = dataset.shuffle(2048)
    dataset = dataset.repeat()
    dataset = dataset.batch(BATCH_SIZE)
    # Prepare the next batches while the current one is being trained on
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

def onehot(image, label):
    """
    Replace the integer label of an (image, label) pair by its one-hot encoding of depth CLASSES.
    The image is passed through unchanged.
    """
    encoded_label = tf.one_hot(label, CLASSES)
    return image, encoded_label

In [None]:
# Build the dataset of (image, label) pairs from all training TFRecord files
train_dataset = load_dataset(filenames=FILEPATHS)

## define and compile the model

In [None]:
# ResNet50 base model with ImageNet weights, without its original classification head
rnet = tf.keras.applications.ResNet50(
    input_shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3),
    weights='imagenet',
    include_top=False,
)
# Freeze the base model so that only the new layers on top of it are trained
rnet.trainable = False

# Create a new model: frozen base -> pooled features -> dropout -> class probabilities
model = tf.keras.Sequential([
    rnet,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dropout(0.2),
    # One output per class; CLASSES is also the depth of the one-hot labels,
    # so the two can never get out of sync
    tf.keras.layers.Dense(CLASSES, activation='softmax', dtype='float32'),
])

# Labels are one-hot encoded, hence the categorical loss and metric
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.05),
              metrics=['categorical_accuracy'])

In [None]:
MY_MODEL_FILE = '/kaggle/input/my-cassava-data/resnet50_model_v9.h5' # v5-9 model takes images of size (128, 128, 3)

# Load a previously saved model; this replaces any `model` built earlier.
# Loaded through tf.keras, the Keras API used everywhere else in this notebook,
# so that the returned layers are the same classes as the tf.keras.* ones
# referenced in other cells.
model = tf.keras.models.load_model(MY_MODEL_FILE, custom_objects=None)

In [None]:
# Print the layers and parameter counts of the current model
model.summary()

In [None]:
# Weights of the last layer (only displayed when this is the last expression of a cell)
model.layers[-1].get_weights()

# Update Dropout Rate
# In the architecture defined in this notebook the last layer is the Dense head,
# which has no dropout rate, so `model.layers[-1].rate` would change nothing.
# Set the rate on the Dropout layer(s) themselves instead.
dropout_layers = [layer for layer in model.layers
                  if isinstance(layer, tf.keras.layers.Dropout)]
if not dropout_layers:
    raise ValueError("model has no Dropout layer whose rate could be updated")
for dropout_layer in dropout_layers:
    dropout_layer.rate = 0.5

# Recompile so that the training function is rebuilt with the new rate
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.05),
              metrics=['categorical_accuracy'])


## Train the model

In [None]:
# Freeze the first layer (base ResNet50 model) so that only the layers on top of it are trained
model.layers[0].trainable = False

# A change of `trainable` is only guaranteed to be honoured after compile() is
# called again, so recompile with the same settings before training.
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.05),
              metrics=['categorical_accuracy'])

In [None]:
# Print the summary again to check the model after the base has been frozen
model.summary()

In [None]:
# First training stage: fit for a fixed 5 epochs (EPOCHS is not used here).
# The training dataset repeats forever, so steps_per_epoch bounds each epoch.
history = model.fit(
    x=get_training_dataset(train_dataset),
    epochs=5,
    steps_per_epoch=STEPS_PER_EPOCH,
    verbose=1,
)

In [None]:
# Fine-tune all the layers (base ResNet50 and top layers).
# Unfreeze through `model.layers[0]` rather than `rnet`: when `model` comes from
# load_model(), `rnet` is a different object that is not part of `model`, and
# unfreezing it would leave the model's own base frozen.
model.layers[0].trainable = True
model.summary()

# Recompile so the change of `trainable` takes effect, with a small learning
# rate so the pre-trained weights are only adjusted gently.
model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
              loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.05),
              metrics=['categorical_accuracy'])

In [None]:
# Second training stage: 10 more epochs with the most recently compiled settings.
# `history` is overwritten, so it only holds the metrics of this stage.
history = model.fit(
    x=get_training_dataset(train_dataset),
    epochs=10,
    steps_per_epoch=STEPS_PER_EPOCH,
    verbose=1,
)

In [None]:
# save model
# The file name is relative, so the model is written to the current working directory
model_filename = 'resnet50_model_v10.h5'
print("save model to", model_filename)
model.save(model_filename)

In [None]:
# save model's weights only
# The file name is derived from model_filename, so the "save model" cell must have been run first
model_weights_filename = 'weights_' + model_filename
print("save model's weights to", model_weights_filename)
model.save_weights(model_weights_filename)

## evaluate the model

In [None]:
# NOTE: these are the train_tfrecords files, i.e. the data the model was trained on.
# The metrics computed from this dataset therefore describe the fit on the
# training data, not the performance on held-out images.
FILENAMES =  tf.io.gfile.glob('/kaggle/input/cassava-leaf-disease-classification/train_tfrecords/*.tfrec')
train_dataset = load_dataset(FILENAMES)

def get_test_dataset(dataset):
    """
    Prepare a dataset for evaluation: one-hot encode the labels and group the
    records into batches of BATCH_SIZE, without shuffling or repeating.

    Input
    dataset - dataset of (image, label) pairs, e.g. the output of load_dataset
    """
    return dataset.map(onehot).batch(BATCH_SIZE)

# Run the model once over the evaluation batches, then pair each metric name with its value.
# The dict is the last expression of the cell, so the notebook displays it.
result = model.evaluate(get_test_dataset(train_dataset))
dict(zip(model.metrics_names, result))