In [0]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import cv2
import os
import sys
import math
import random
import datetime
import seaborn as sn
import pickle
import gc
import scipy.io as sio

from sympy import *
from scipy import ndimage
from tensorflow.keras.layers import Input, Conv2D, ReLU, BatchNormalization,\
                                    Add, AveragePooling2D, GlobalAveragePooling2D,\
                                    Flatten, Dense, Dropout, MaxPooling2D, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.densenet import DenseNet121
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score

from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive', force_remount=True)
sys.path.append('/content/drive/My Drive/Colab Notebooks/Plant_Pathology/')

# custom moduls
import prepare_img
import prepare_data
import classifiers
import augmentation

  import pandas.util.testing as tm


Mounted at /content/drive


In [0]:
### ImageProcessing // OpenCV ###
# This cell makes the compressed version of the original dataset, run only once 
# and then load the data from [train_compressed] folder

load_path = '/content/drive/My Drive/Colab Notebooks/Plant_Pathology/images/train/'
save_path = '/content/drive/My Drive/Colab Notebooks/Plant_Pathology/images/train_compressed/'

data_ids = os.listdir(load_path)

limit = len(data_ids)
limit = 0 # skip cycle 

img_size_desired = (400,300)

for i in range(limit):
  img0 = cv2.imread(os.path.join(load_path,data_ids[i]))

  if img0 is not None:
    if i % 10 == 0:
      print(i)

    img2,ellipse = prepare_img.FitEllipse(img0)
    img3,rectangle = prepare_img.FitRectangleInsideEllipse(img2, ellipse, img_size_desired)
    img4 = prepare_img.CropAroundRectangle(img3, rectangle, img_size_desired)

    cv2.imwrite(os.path.join(save_path, data_ids[i]), img4)

In [0]:
### LOAD DATASET
# This cell loads the selected dataset in load_path, suggested to use transformed images
labels_path = '/content/drive/My Drive/Colab Notebooks/Plant_Pathology/train.csv'
test_path = '/content/drive/My Drive/Colab Notebooks/Plant_Pathology/images/test_comp_2/'
load_path = '/content/drive/My Drive/Colab Notebooks/Plant_Pathology/images/train_comp_2/'
faulty_path = '/content/drive/My Drive/Colab Notebooks/Plant_Pathology/exclude_from_experiment2.txt'

# load dataset
train = prepare_data.load(load_path, labels = labels_path)

### remove faulty images
train = prepare_data.remove_faulty(train, faulty_path)

# resample "multiple disease [1]" class 2 times because it is really small
train = prepare_data.resample(train, 1, 2)

# summarize training set
prepare_data.summarize(train)

In [0]:
parent_path =         "/content/drive/My Drive/Colab Notebooks/Plant_Pathology/checkpoints/"
model_flag = 0    # 0: ResNet18, 1: Densenet121 2: ResNet50
savemode_flag = 0   # 0: nothing, 1: only checkpoint, 2: only tensorboard 3: both
continue_training_from = False  # CV cycle can be selected to continue the training
special_tag = 'ENS_' # special information in the saved filename

# shuffle training set before CV
order = np.arange(0, len(train['x']), 1)
random.shuffle(order)
for key in train.keys():
  if type(train[key]) is not str:
    train[key] = train[key][order]

# unfold dictionary
train_x0 = train['x']
train_y0 = train['y']

# init variables
imgsize = train['x'][0].shape
num_splits = 4 
num_classes = 4
num_channels = 3
saved_weights = []
train_idxs = [] # used in training
val_idxs = []
train_idxs_str = [] # used in saving
val_idxs_str = []

# predefined training order
kf = KFold(n_splits=num_splits)
kf.get_n_splits(train_x0)

# flipped the val_idx-train_idx position since in bagging the training is on the smaller group and the validation is on the orig. training set
for val_idx, train_idx in kf.split(train_x0): 
  train_idxs.append(train_idx)
  val_idxs.append(val_idx)
  train_idxs_str.append(train['ids'][train_idx])
  val_idxs_str.append(train['ids'][val_idx])

# calculate mean and std over full training
_, mean, std = prepare_data.normalize(train_x0, False, False)

for CV in range(num_splits):

  begin = datetime.datetime.now()
  # KFold
  train_idx = train_idxs[CV]
  val_idx = val_idxs[CV]

  # apply augmentation on the training set
  train_x, train_y = augmentation.pipeline(train_x0[train_idx], train_y0[train_idx])
  val_x = train_x0[val_idx].copy()
  val_y = train_y0[val_idx].copy()

  # normalize augmented training + val dataset
  train_x, _, _ = prepare_data.normalize(train_x, mean, std)
  val_x, _, _ = prepare_data.normalize(val_x, mean, std)   

  # reset session
  tf.keras.backend.clear_session()

  ### model selection
  opt = 'adam' # default optimizer
  if model_flag == 0: 
    model = classifiers.ResNet18t(input_shape=imgsize,output_num=num_classes)
    #opt = Adam(lr=0.00005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False) # smaller steps
    tag = 'ResNet18_'
      
  elif model_flag == 1: 
    model = DenseNet121(include_top=True, 
                        weights=None, 
                        input_tensor=None, 
                        input_shape=imgsize, 
                        pooling=None, 
                        classes=num_classes)
    tag = 'Dense121_'

  elif model_flag == 2: 
    model = ResNet50(include_top=True, 
                        weights=None, 
                        input_tensor=None, 
                        input_shape=imgsize, 
                        pooling=None, 
                        classes=num_classes)
    tag = 'ResNet50_'

  model.compile(optimizer=opt,loss='categorical_crossentropy',metrics=['accuracy'])
    
  ### path for callbacks 
  timestr = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  name = tag+special_tag+timestr+'_CV'+str(CV)
  checkpoint_path = parent_path+'ckpt/'+name+"/cp-{epoch:04d}.ckpt"
  checkpoint_dir = os.path.dirname(checkpoint_path)
  os.system('mkdir {}'.format(checkpoint_dir))

  # checkpoint
  cp_callback = ModelCheckpoint(
      filepath=checkpoint_path,
      save_weights_only=True,
      verbose=1)
  
  # tensorboard
  tensorboard_callback = TensorBoard(
      log_dir= parent_path+'tensorboard_logs/'+name,
      histogram_freq=1)

  # learning rate 
  lr_callback = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, verbose=1, mode='min')
  
  # saving method selection
  if savemode_flag == 0:
    savemode = [lr_callback]
  elif savemode_flag == 1:
    savemode = [lr_callback, cp_callback]
  elif savemode_flag == 2:
    savemode = [lr_callback, tensorboard_callback]
  elif savemode_flag == 3:
    savemode = [lr_callback, cp_callback, tensorboard_callback]

  # for custom resnet18 must force garbage collection
  if model_flag == 0: 
      savemode = savemode + [classifiers.garbage_cb()]

  # training
  history = model.fit(
      x=train_x,
      y=train_y,
      epochs=35,
      verbose=1,
      validation_data=(val_x, val_y),
      batch_size=32,
      callbacks=savemode)
  
  # save trained weights of current model
  saved_weights.append(model.get_weights())
 
  # save variables which are necessary to continue interrupted experiments
  os.makedirs(parent_path + 'variables/' + name + '/')
  f = open(parent_path + 'variables/' + name + '/info.pckl', 'wb')
  pickle.dump([predictions_cv, train_y, train_idxs_str, val_idxs_str, CV, mean, std], f)
  f.close()

  print("Time spent in CV cycle:", datetime.datetime.now() - begin)


# save weights from all trained models ~ 170mb
f = open(parent_path + 'variables/' + name + '/weights.pckl', 'wb')
pickle.dump([saved_weights], f)
f.close()

Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 00017: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 00020: ReduceLROnPlateau reducing learning rate to 8.000000525498762e-06.
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 00023: ReduceLROnPlateau reducing learning rate to 1.6000001778593287e-06.
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 00026: ReduceLROnPlateau reducing learning rate to 3.200000264769187e-07.
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 00029: ReduceLROnPlateau reducing learning rate to 6.400000529538374e-08.
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 00032: ReduceLROnPlateau reducing learning rate to 1.2800001059076749e-08.
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 00035: ReduceLROnPl

In [0]:
### Evaluate model
# must use model.predict() separately, because causing memory error in the training loop

# load separate testing set 
test = prepare_data.load(test_path) 

model_auc_ens = np.zeros((num_splits))

for CV in range(num_splits):
  # reorganize training set to reproduce training time validation
  val_idx = np.zeros((len(val_idxs_str[CV])), dtype='uint8')
  for i in range(len(val_idx)):
      cur_image_id = val_idxs_str[CV][i]
      pos = np.where(cur_image_id == train['ids'])[0]
      if len(pos) > 1: # multiple disease class is resampled
          pos = pos[0]
      val_idx[i] = int(pos)
  val_x = train['x'][val_idx]
  val_y = train['y'][val_idx]

  val_x, _, _ = prepare_data.normalize(val_x, mean, std)

  model.set_weights(saved_weights[CV])
  model_auc_ens[CV] = roc_auc_score(val_y, model.predict(val_x))

print(model_auc_ens)

# make predictions based on 2 best performing model
bestmodels = np.argsort(model_auc_ens)[2:] 
predictions_ens = np.zeros((np.append(len(bestmodels),test['y'].shape)))

# normalize testing set
test_x = test['x'].copy()
for i in range(num_channels):
    test_x[:,:,:,i] = (test_x[:,:,:,i] - mean[i]) / std[i]

for CV in range(len(bestmodels)):
  model.set_weights(saved_weights[bestmodels[CV]])
  predictions_ens[CV] = model.predict(test_x)

[0.94448604 0.95160749 0.94225005 0.90526368]


In [0]:
# make prediction based on 2 best performing model
bestmodels = np.argsort(model_auc_ens)[2:] 
predictions_ens = np.zeros((np.append(len(bestmodels),test['y'].shape)))

test_x = test['x'].copy()
for i in range(num_channels):
    test_x[:,:,:,i] = (test_x[:,:,:,i] - mean[i]) / std[i]

for CV in range(len(bestmodels)):
  
  model.set_weights(saved_weights[bestmodels[CV]])
  predictions_ens[CV] = model.predict(test_x)

In [0]:
savedir = load_variable_path.replace('info.pckl', 'final_preds.mat')
sio.savemat(savedir, {'name': load_variable_path.split('/')[-2],
                      'train_idxs': train_idxs_str, # train idx of CV
                      'val_idxs': val_idxs_str, # val idx of CV
                      'train_y': train['y'], # train set labels
                      'train_ids': train['ids'], # train set names
                      'CV': CV, # current CV cycle
                      'test_y': test['y'], # test set labels
                      'test_ids': test['ids'], # test set names
                      'predictions_ens': predictions_ens, # predictions of current model on testing set (bagging)
                      'model_auc_ens': model_auc_ens # AUC of current model based on val set   
                      })

subm_path =  '/content/drive/My Drive/Colab Notebooks/Plant_Pathology/sample_submission.csv'
sample_submission = pd.read_csv(subm_path)

# Test_1818.jpg -> Test_1818
for i in range(len(test['ids'])):
    test['ids'][i] = test['ids'][i].split('.')[0]

# sample submission is in alphabetic order, preds are not
order = np.zeros((len(sample_submission['image_id'])), dtype='uint8')
for i in range(len(order)):
    cur_image_id = sample_submission['image_id'][i]
    order[i] = int(np.where(cur_image_id == test['ids'])[0])

predictions_ens_soft = np.average(predictions_ens, axis = 0)
sample_submission.iloc[:,1:] = predictions_ens_soft[order]
sample_submission.to_csv(load_variable_path.replace('info.pckl', 'submission.csv'), index=False)