In [49]:
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
import seaborn as sns
import cv2
import albumentations as A
import math
import tensorflow_hub as hub
import keras

from PIL import Image
from tensorflow import keras
from albumentations.core.composition import Compose, OneOf
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Add, Activation
from tensorflow.keras.layers import MaxPooling2D, AveragePooling2D, GlobalAveragePooling2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten

from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.activations import relu, softmax
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import load_img, img_to_array, array_to_img

### Config

In [2]:
# Dataset locations. The metadata path now matches the CSV that the notebook
# actually reads when it builds `meta`.
train_meta_data = '../input/paddy-disease-classification/train.csv'
train_data_dir = '../input/paddy-disease-classification/train_images'

# Training hyper-parameters.
epochs = 100
lr = 1e-4
valid_split = 0.2      # fraction of the training frame reserved for validation
input_size = 256       # images are resized to input_size x input_size
batch_size = 16
classes = 10
steps_in_epoch = 580   # NOTE(review): not referenced by any other visible cell
# NOTE(review): `initializer` is not referenced by any other visible cell.
initializer = tf.keras.initializers.HeUniform()
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
loss = tf.keras.losses.categorical_crossentropy

### Callback functions

In [3]:
# Stop once val_loss has not improved for 15 epochs and roll back to the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1,
)

# Multiply the learning rate by 0.75 after 3 epochs without val_loss improvement.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    patience=3,
    factor=0.75,
    verbose=1,
)

# Keep only the best model (by val_loss) on disk.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_checkpoint.hdf5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

### Pre-processing Pipeline

#### utility functions

In [4]:
def random_cut_out(images):
    """Apply `tfa.image.random_cutout` with a 32x32 mask filled with zeros.

    Args:
        images: image tensor forwarded unchanged to `tfa.image.random_cutout`
            (presumably a 4-D batch — TODO confirm against the caller).

    Returns:
        Whatever `tfa.image.random_cutout` returns for `images`.
    """
    mask_size = (32, 32)
    return tfa.image.random_cutout(images, mask_size, constant_values=0)


def cropping(arr):
    """Cut a centred window out of `arr`, keeping a randomly chosen fraction.

    One of three (height, width) fraction pairs -- 0.45, 0.65 or 0.85 on both
    axes -- is drawn with `random.choice`; the window is centred via floor
    division of the leftover margin.

    Args:
        arr: image whose first two dimensions are height and width.

    Returns:
        The result of `tf.image.crop_to_bounding_box` for the chosen window.
    """
    height, width = arr.shape[:2]

    # Both axes share the same candidate fractions, so aspect ratio is preserved.
    fractions = (0.45, 0.65, 0.85)
    candidate_pairs = list(zip(fractions, fractions))
    frac_h, frac_w = random.choice(candidate_pairs)

    top = int((height - frac_h * height) // 2)
    left = int((width - frac_w * width) // 2)
    crop_height = int(height * frac_h)
    crop_width = int(width * frac_w)

    return tf.image.crop_to_bounding_box(arr, top, left, crop_height, crop_width)


def center_crop_and_random_augmentations_fn(image):
    """Random centre crop followed by random photometric jitter.

    Intended as an `ImageDataGenerator(preprocessing_function=...)` hook.
    Keras calls that hook before `rescale` is applied, so `image` is assumed
    to hold pixel values in [0, 255] -- TODO confirm if reused elsewhere.

    Args:
        image: rank-3 image (height, width, channels).

    Returns:
        Image tensor with the same height/width as the input and pixel values
        clipped to [0, 255].
    """
    # Resize back to the caller's own size instead of a hard-coded 256x256 so
    # the output shape always matches the input shape.
    out_h, out_w = image.shape[:2]

    image = cropping(image)
    image = tf.image.resize(image, (out_h, out_w))

    # tf.image's float colour ops are specified for [0, 1] images: on a 0-255
    # scale a brightness delta of 0.2 would be an imperceptible shift.
    image = image / 255.0
    image = tf.image.random_brightness(image, 0.2)
    image = tf.image.random_contrast(image, 0.5, 2.0)
    image = tf.image.random_saturation(image, 0.75, 1.25)
    image = tf.image.random_hue(image, 0.1)

    # Contrast/brightness can push values out of range; clip before restoring
    # the 0-255 scale that the generator's `rescale` expects.
    image = tf.clip_by_value(image, 0.0, 1.0)

    return image * 255.0

### Config data loaders

In [5]:
# Collect one [file_path, label, image_id] record per image; each sub-folder
# of `train_data_dir` is named after its class label.
train_records = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/test split) reproducible across runs.
for folder in sorted(os.listdir(train_data_dir)):
    folder_path = os.path.join(train_data_dir, folder)

    # Skip stray files sitting next to the class folders.
    if not os.path.isdir(folder_path):
        continue

    for file in sorted(os.listdir(folder_path)):
        train_records.append([os.path.join(folder_path, file), folder, file])

train_data = pd.DataFrame(train_records, columns=['file_path', 'label', 'image_id'])
train_data

In [6]:
# Attach the per-image metadata columns; the CSV's own `label` column is
# dropped because the folder-derived label is already present.
meta = pd.read_csv('../input/paddy-disease-classification/train.csv')
meta_without_label = meta.drop(columns='label')
train_data = train_data.merge(meta_without_label, on='image_id')
train_data

In [44]:
# Hold out 5% as a test set. Stratifying on the label keeps class proportions
# equal in both subsets, which is what the class-balance plot checks.
train, test = train_test_split(train_data,
                               test_size=0.05,
                               random_state=48,
                               stratify=train_data['label'])

train.shape, test.shape

In [45]:
# Per-class share of each subset. Column names are set explicitly so the cell
# does not depend on how a given pandas version names value_counts() output.
tr = train['label'].value_counts(normalize=True).rename_axis('label').reset_index(name='fraction')
te = test['label'].value_counts(normalize=True).rename_axis('label').reset_index(name='fraction')

tr['subset'] = 'train'
te['subset'] = 'test'

temp = pd.concat([tr, te], ignore_index=True)

plt.figure(figsize=[24,6], dpi=300)
sns.barplot(x='label',
            y='fraction',
            hue='subset',
            data=temp)
plt.savefig('class-balance.jpg')
plt.show()

In [54]:
# One explicit label -> index mapping shared by every iterator, so the
# hold-out iterator cannot end up with different indices if a class happens
# to be missing from it. Sorted order matches Keras' default inference.
class_names = sorted(train['label'].unique())

# Augmenting generator: used for the training subset only.
# NOTE(review): channel_shift_range is applied on the un-rescaled pixel
# scale, so 0.65 is presumably a very small shift -- confirm intent.
generator = ImageDataGenerator(rescale=1 / 255,
                               rotation_range=10,
                               width_shift_range=0.25,
                               height_shift_range=0.25,
                               horizontal_flip=True,
                               vertical_flip=True,
                               channel_shift_range=0.65,
                               fill_mode='reflect',
                               validation_split=valid_split,
                               preprocessing_function=center_crop_and_random_augmentations_fn)

# Validation images must not be randomly augmented, otherwise val_loss (which
# drives early stopping / LR scheduling) is noisy. The same `validation_split`
# on the same frame selects the complementary rows to the training subset.
valid_generator = ImageDataGenerator(rescale=1 / 255,
                                     validation_split=valid_split)

train_datagen = generator.flow_from_dataframe(dataframe=train,
                                              x_col='file_path',
                                              y_col='label',
                                              classes=class_names,
                                              target_size=(input_size, input_size),
                                              batch_size=batch_size,
                                              subset='training')

valid_datagen = valid_generator.flow_from_dataframe(dataframe=train,
                                                    x_col='file_path',
                                                    y_col='label',
                                                    classes=class_names,
                                                    target_size=(input_size, input_size),
                                                    batch_size=batch_size,
                                                    subset='validation')

# Hold-out set: rescale only, fixed order so predictions line up with rows.
test_datagen = ImageDataGenerator(rescale=1.0/255).flow_from_dataframe(dataframe=test,
                                                                       x_col='file_path',
                                                                       y_col='label',
                                                                       classes=class_names,
                                                                       target_size=(input_size, input_size),
                                                                       batch_size=batch_size,
                                                                       shuffle=False)

In [47]:
# Sanity check: number of images in one training and one validation batch.
train_images = next(train_datagen)[0]
valid_images = next(valid_datagen)[0]
len(train_images), len(valid_images)

#### data loaders output

#### train mini batch

In [11]:
# Draw one training batch. The grid is sized from the batch itself, so a
# different batch size neither overflows the axes nor leaves empty frames.
batch_images = train_datagen.next()[0]
n_cols = 8
n_rows = math.ceil(len(batch_images) / n_cols)

fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=[32, 4 * n_rows], dpi=200, squeeze=False)
axes = axes.ravel()

for ax, arr in zip(axes, batch_images):
    ax.imshow(array_to_img(arr))

# Hide any grid cells that have no image.
for ax in axes[len(batch_images):]:
    ax.axis('off')

plt.savefig('train-mini-batch.jpg')
plt.show()

#### validation mini batch

In [12]:
# Draw one validation batch. The grid is sized from the batch itself, so a
# different batch size neither overflows the axes nor leaves empty frames.
batch_images = valid_datagen.next()[0]
n_cols = 8
n_rows = math.ceil(len(batch_images) / n_cols)

fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=[32, 4 * n_rows], dpi=200, squeeze=False)
axes = axes.ravel()

for ax, arr in zip(axes, batch_images):
    ax.imshow(array_to_img(arr))

# Hide any grid cells that have no image.
for ax in axes[len(batch_images):]:
    ax.axis('off')

plt.savefig('valid-mini-batch.jpg')
plt.show()

In [13]:
# Bar plot of `age` per `label`, split by `variety`, from the metadata frame.
age_plot_kwargs = dict(x='age', y='label', hue='variety', palette='OrRd_r')

plt.figure(dpi=200, figsize=[24,20])
sns.barplot(data=meta, **age_plot_kwargs)
plt.savefig('age-vs-label.jpg')
plt.show()

In [14]:
# Number of labelled rows for every (age, variety) combination; after the
# count, the `label` column holds that row count.
rows_per_age_variety = (
    meta
    .groupby(by=['age', 'variety'])[['label']]
    .count()
    .reset_index()
)

plt.figure(dpi=200, figsize=[12,6])
sns.barplot(data=rows_per_age_variety,
            x='age',
            y='label',
            hue='variety',
            palette='OrRd_r')
plt.savefig('class-sizes.jpg')
plt.show()

### Model

In [56]:
# ImageNet-pretrained EfficientNetB3 backbone without its classification head.
# The input shape follows `input_size` so it stays in sync with the data
# generators and the Input layer of the classifier.
# NOTE(review): tf.keras EfficientNet models include their own input
# rescaling and are documented to expect [0, 255] pixels, while the generators
# in this notebook feed 1/255-rescaled images -- confirm this is intended.
model_base = EfficientNetB3(include_top=False,
                            input_shape=(input_size, input_size, 3),
                            weights='imagenet')

In [51]:
# Render the backbone's layer graph to an image file.
tf.keras.utils.plot_model(model=model_base,
                          to_file='EfficientNetB3.png')

In [57]:
# Classifier = backbone -> global average pooling -> softmax over the classes.
input_layer = Input(shape=(input_size, input_size, 3))
x_ = model_base(input_layer)
x_ = GlobalAveragePooling2D()(x_)
# Output width comes from the `classes` config value rather than a literal,
# so it cannot drift from the configured number of labels.
output_layer = Dense(units=classes, activation='softmax')(x_)

model = Model(input_layer, output_layer)

model.compile(optimizer=optimizer,
              loss=loss,
              metrics=['accuracy'])

In [58]:
# Print the layer-by-layer summary of the assembled classifier.
model.summary()

In [None]:
# Train with validation monitoring.
# - `batch_size` is not passed: the iterators already yield batches, and Keras
#   documents that it must not be specified for generator / Sequence inputs.
# - `checkpoint` is included so the best model by val_loss is written to disk
#   (it was defined with the other callbacks but never used).
history = model.fit(train_datagen,
                    validation_data=valid_datagen,
                    epochs=epochs,
                    callbacks=[early_stop, reduce_lr, checkpoint])

In [20]:
# Score the model on the held-out frame split off by train_test_split;
# the reported values follow model.compile (loss, then accuracy).
model.evaluate(test_datagen)

### Evaluate

In [21]:
# Training vs. validation accuracy per epoch.
plt.figure(dpi=300, figsize=[12,6])
for history_key, curve_label in (('accuracy', 'train'),
                                 ('val_accuracy', 'validation')):
    curve = history.history[history_key]
    epoch_index = list(range(len(curve)))
    sns.lineplot(x=epoch_index, y=curve, label=curve_label)
plt.savefig('accuracy-hist.jpg')
plt.show()

In [22]:
# Training vs. validation loss per epoch.
plt.figure(dpi=300, figsize=[12,6])
for history_key, curve_label in (('loss', 'train'),
                                 ('val_loss', 'validation')):
    curve = history.history[history_key]
    epoch_index = list(range(len(curve)))
    sns.lineplot(x=epoch_index, y=curve, label=curve_label)
plt.savefig('loss-hist.jpg')
plt.show()

### Saving files

In [23]:
# Persist the per-epoch metrics recorded by model.fit.
# NOTE(review): the file name says "xception" although the backbone built in
# this notebook is EfficientNetB3 -- kept as-is to avoid changing outputs.
temp = pd.DataFrame.from_dict(history.history)
temp.to_csv('model_xception_history.csv', index=False)

In [24]:
# Best-effort save of the full model: a failure is reported with its cause
# instead of being silently reduced to a generic message. `Exception` (rather
# than a bare except) lets KeyboardInterrupt / SystemExit propagate.
try:
    model.save('model_xception.hdf5')
except Exception as err:
    print(f'file saving error: {err!r}')

In [25]:
# Best-effort save of the weights only: a failure is reported with its cause
# instead of being silently reduced to a generic message. `Exception` (rather
# than a bare except) lets KeyboardInterrupt / SystemExit propagate.
try:
    model.save_weights('model_xception_weights.hdf5')
except Exception as err:
    print(f'file saving error: {err!r}')

### Inference

In [26]:
test_loc = '../input/paddy-disease-classification/test_images'

# Inference inputs must be preprocessed like the data the model was trained
# and evaluated on: the training, validation and hold-out generators only
# rescale by 1/255. The samplewise centring / std-normalisation used here
# before gave the network inputs on a different scale, so it is removed.
# `classes=['.']` makes flow_from_directory treat the flat image folder as a
# single pseudo-class; `shuffle=False` keeps predictions aligned with
# `test_data.filenames`.
test_data = ImageDataGenerator(rescale=1.0/255).flow_from_directory(directory=test_loc,
                                                                    target_size=(input_size, input_size),
                                                                    batch_size=batch_size,
                                                                    classes=['.'],
                                                                    shuffle=False)

In [27]:
# Label-name -> class-index mapping of the training iterator; it is inverted
# further down to turn predicted indices back into label names.
train_datagen.class_indices

In [28]:
# Per-image class scores, reduced to the index of the highest-scoring class.
class_scores = model.predict(test_data, verbose=1)
predict_max = class_scores.argmax(axis=1)

In [29]:
# Invert label -> index into index -> label, then decode every prediction.
inverse_map = dict(
    (class_index, class_name)
    for class_name, class_index in train_datagen.class_indices.items()
)

predictions = []
for class_index in predict_max:
    predictions.append(inverse_map[class_index])

In [30]:
# Build the submission table. The iterator reports paths relative to the
# pseudo-class folder ('./<name>'); os.path.basename strips that prefix
# without relying on str.replace, whose default regex handling differs
# between pandas versions ('.' is a wildcard when treated as a regex).
files = [os.path.basename(name) for name in test_data.filenames]

temp = pd.DataFrame({"image_id": files,
                     "label": predictions})

temp.to_csv('model_submission_v6.csv', index=False)
temp

In [31]:
# Number of test images assigned to each predicted label.
temp.label.value_counts()