<a href="https://colab.research.google.com/github/bmanikan/projects/blob/master/PlantPythology.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!kaggle datasets list -s pathology

ref                                                        title                                               size  lastUpdated          downloadCount  
---------------------------------------------------------  -------------------------------------------------  -----  -------------------  -------------  
paultimothymooney/breast-histopathology-images             Breast Histopathology Images                         3GB  2017-12-19 05:46:40          18261  
kmader/colorectal-histology-mnist                          Colorectal Histology MNIST                           2GB  2018-09-19 14:20:49           2393  
skeef79/plant-pathology-more-data-no-background            plant pathology more data no background            261MB  2020-05-21 13:03:40             24  
piantic/plantpathology-apple-dataset                       PlantPathology Apple Dataset                       813MB  2020-04-24 13:45:22            104  
ambarish/kimia-path-960                                    KIMIA_Path_960   

In [1]:
!pip install kaggle
#from google.colab import files
#files.upload()
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets list -s pathology
!kaggle datasets download -d ianmoone0617/plant-pathology-resized-512-256 
!unzip \*.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: plant_pathology_small_256/Train_1417.jpg  
  inflating: plant_pathology_small_256/Train_1418.jpg  
  inflating: plant_pathology_small_256/Train_1419.jpg  
  inflating: plant_pathology_small_256/Train_142.jpg  
  inflating: plant_pathology_small_256/Train_1420.jpg  
  inflating: plant_pathology_small_256/Train_1421.jpg  
  inflating: plant_pathology_small_256/Train_1422.jpg  
  inflating: plant_pathology_small_256/Train_1423.jpg  
  inflating: plant_pathology_small_256/Train_1424.jpg  
  inflating: plant_pathology_small_256/Train_1425.jpg  
  inflating: plant_pathology_small_256/Train_1426.jpg  
  inflating: plant_pathology_small_256/Train_1427.jpg  
  inflating: plant_pathology_small_256/Train_1428.jpg  
  inflating: plant_pathology_small_256/Train_1429.jpg  
  inflating: plant_pathology_small_256/Train_143.jpg  
  inflating: plant_pathology_small_256/Train_1430.jpg  
  inflating: plant_pathology_small_256/Tr

In [2]:
#@title Import dependencies
import os
import shutil
import glob
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random



import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [3]:
#Set seed value
seed_value = 43

os.environ['PYTHONHASHSEED'] = str(seed_value)

random.seed(seed_value)

#numpy seed
np.random.seed(seed_value)

#Tf seed
tf.random.set_seed(seed_value)

#Configure new global tensorflow session
from tensorflow.compat.v1.keras import backend as k
session_conf = tf.compat.v1.ConfigProto(
    intra_op_parallelism_threads = 1,
    inter_op_parallelism_threads = 1
)

sess = tf.compat.v1.Session(graph = tf.compat.v1.get_default_graph(), config = session_conf)
k.set_session(sess)


In [4]:
base_dir = os.path.join(os.getcwd(),'plant_pathology_small_512')
images = glob.glob(base_dir + '/*.jpg')
print(len(images))
df = pd.read_csv('train.csv')
df['healthy'] = df['healthy'].astype(float)
df['multiple_diseases'] = df['multiple_diseases'].astype(float)
df['rust'] = df['rust'].astype(float)
df['scab'] = df['scab'].astype(float)
df.head()

3642


Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Train_0,0.0,0.0,0.0,1.0
1,Train_1,0.0,1.0,0.0,0.0
2,Train_2,1.0,0.0,0.0,0.0
3,Train_3,0.0,0.0,1.0,0.0
4,Train_4,1.0,0.0,0.0,0.0


In [5]:

classes = ['healthy', 'multiple_diseases', 'rust', 'scab']

#Creating directories
for cl in classes:
    image_list = [i for i in df[df[cl] == 1]['image_id'] if 'Test_' not in i]
    if not os.path.exists(os.path.join(base_dir, cl)):
        os.makedirs(os.path.join(base_dir, cl))
    for i in range(len(image_list)):
        image = os.path.join(base_dir, image_list[i] + '.jpg')
        shutil.move(image, os.path.join(base_dir, cl))

#Creating  Test dataset
os.makedirs(os.path.join(base_dir, 'test'))
test_dir = os.path.join(base_dir, 'test')
test_dir
images = glob.glob(base_dir + '/*.jpg')
for t in images:
    shutil.move(t, test_dir)

#Creating Train & Validation dataset
for cl in classes:
    image_dir = os.path.join(base_dir, cl)
    images = glob.glob(image_dir + '/*.jpg')
    print(f'the number of images in {cl} class is {len(images)} images.')
    num_rec = round(len(images)*0.8)
    train, val = images[:num_rec], images[num_rec:]
    
    for t in train:
        if not os.path.exists(os.path.join(base_dir, 'train', cl)):
            os.makedirs(os.path.join(base_dir, 'train', cl))
        train_dir = os.path.join(base_dir, 'train', cl)
        shutil.move(t, train_dir)
        
    for t in val:
        if not os.path.exists(os.path.join(base_dir, 'val', cl)):
            os.makedirs(os.path.join(base_dir, 'val', cl))
        val_dir = os.path.join(base_dir, 'val', cl)
        shutil.move(t, val_dir)


the number of images in healthy class is 516 images.
the number of images in multiple_diseases class is 91 images.
the number of images in rust class is 622 images.
the number of images in scab class is 592 images.


In [6]:
#Specifying directories
train_dir = os.path.join(base_dir, 'train')
val_dir = os.path.join(base_dir, 'val')
test_dir = os.path.join(base_dir, 'test')

#Parameters
batch_size = 32
IMG_SHAPE = 224
classes = ['healthy', 'multiple_diseases', 'rust', 'scab']

#Train dataset & Generator
image_gen_train = ImageDataGenerator(
    rotation_range = 90,
    width_shift_range = 0.2,
    shear_range = 0.1,
    zoom_range = 0.1,
    rescale = 1./255,
    horizontal_flip = True,
    vertical_flip = True,
    brightness_range = (0.1, 1.5),
    channel_shift_range = 0.1,
    validation_split = 0.2,
    dtype=float
)


#old method of splitting files and creating Generator

train_data_gen = image_gen_train.flow_from_directory(
    batch_size = batch_size,
    directory = train_dir,
    shuffle = True,
    target_size = (IMG_SHAPE,IMG_SHAPE),
    class_mode = 'categorical',
    seed=seed_value
)

#Validation dataset & Generator
image_gen_val = ImageDataGenerator(rescale=1./255)

val_data_gen = image_gen_train.flow_from_directory(batch_size=batch_size,
                                                 directory=val_dir,
                                                 target_size=(IMG_SHAPE, IMG_SHAPE),
                                                 class_mode='categorical',
                                                 seed=seed_value)

'''
test_generator = image_gen_val.flow_from_dataframe(
    test,
    directory = base_dir,
    target_size = (IMG_SHAPE,IMG_SHAPE),
    x_col = 'image_id',
    y_col = None,
    class_mode = None,
    shuffle = False,
    batch_size = batch_size
)


#forming train & validation set with Flow_from_dataframe
train = pd.read_csv('train.csv')
train['image_id'] = train['image_id'] + '.jpg'

train_gen = image_gen_train.flow_from_dataframe(
    train,
    directory = base_dir,
    target_size = (IMG_SHAPE, IMG_SHAPE),
    x_col = 'image_id',
    y_col = classes,
    class_mode = 'raw',
    batch_size = batch_size,
    subset = 'training'
)

val_gen = image_gen_train.flow_from_dataframe(
    train,
    directory = base_dir,
    target_size = (IMG_SHAPE, IMG_SHAPE),
    x_col = 'image_id',
    y_col = classes,
    class_mode = 'raw',
    shuffle = False,
    batch_size = batch_size,
    subset = 'validation'
)

'''
#Forming test dataset & Generator
sub_path = os.path.join(os.getcwd(), 'sample_submission.csv')

test = pd.read_csv(sub_path)
test['image_id'] = test['image_id'] + '.jpg'

test_generator = image_gen_train.flow_from_dataframe(
    test,
    directory = test_dir,
    target_size = (IMG_SHAPE,IMG_SHAPE),
    x_col = 'image_id',
    y_col = None,
    class_mode = None,
    shuffle = False,
    batch_size = batch_size,
    seed=seed_value
)


Found 1458 images belonging to 4 classes.
Found 363 images belonging to 4 classes.
Found 1821 validated image filenames.


In [7]:
'''
import numpy as np
dataiter = iter(train_data_gen)
images, labels = dataiter.next()

fig = plt.figure(figsize=(40,8))

for idx in np.arange(20):
  ax = fig.add_subplot(2,20/2, idx+1, xticks=[], yticks=[])
  plt.imshow(images[idx])
  ax.set_title(classes[int(labels[idx])])
'''

'\nimport numpy as np\ndataiter = iter(train_data_gen)\nimages, labels = dataiter.next()\n\nfig = plt.figure(figsize=(40,8))\n\nfor idx in np.arange(20):\n  ax = fig.add_subplot(2,20/2, idx+1, xticks=[], yticks=[])\n  plt.imshow(images[idx])\n  ax.set_title(classes[int(labels[idx])])\n'

In [8]:
he_initializer = tf.keras.initializers.HeUniform(seed = 43)

In [9]:
'''
#@title VGG16 Model Base_model of acc 82%
vgg16 = tf.keras.applications.VGG16(weights='imagenet', include_top=False)
resnet = tf.keras.applications.ResNet50V2(weights='imagenet', include_top=False, pooling = 'avg')

#for l in range(len(vgg16.layers)):
#  vgg16.layers[l].trainable = False
model = Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=(224,224,3),name = 'image_input'))
model.add(vgg16)
model.add(tf.keras.layers.GlobalAveragePooling2D())
model.add(tf.keras.layers.Flatten(name='flatten'))
model.add(Dense(4096, activation='relu', name='fc1'))
model.add(Dense(2048, activation='relu', name='fc2'))
model.add(Dense(4, activation='softmax', name='predictions'))
model.summary()
'''

"\n#@title VGG16 Model Base_model of acc 82%\nvgg16 = tf.keras.applications.VGG16(weights='imagenet', include_top=False)\nresnet = tf.keras.applications.ResNet50V2(weights='imagenet', include_top=False, pooling = 'avg')\n\n#for l in range(len(vgg16.layers)):\n#  vgg16.layers[l].trainable = False\nmodel = Sequential()\nmodel.add(tf.keras.layers.InputLayer(input_shape=(224,224,3),name = 'image_input'))\nmodel.add(vgg16)\nmodel.add(tf.keras.layers.GlobalAveragePooling2D())\nmodel.add(tf.keras.layers.Flatten(name='flatten'))\nmodel.add(Dense(4096, activation='relu', name='fc1'))\nmodel.add(Dense(2048, activation='relu', name='fc2'))\nmodel.add(Dense(4, activation='softmax', name='predictions'))\nmodel.summary()\n"

In [10]:
'''
resnet = hub.KerasLayer('https://tfhub.dev/google/imagenet/resnet_v2_50/feature_vector/4',
                        trainable = True)
def build_model(trained_model):
  model = Sequential()
  model.add(trained_model)
  #model.add(tf.keras.layers.GlobalAveragePooling2D())
  model.add(Dense(1024, activation='relu', name='fc1'))
  model.add(tf.keras.layers.Dropout(0.2))
  model.add(Dense(512, activation='relu', name='fc2'))
  model.add(tf.keras.layers.Dropout(0.2))
  model.add(Dense(256, activation='relu', name='fc3'))
  model.add(tf.keras.layers.Dropout(0.2))
  model.add(Dense(4, activation='softmax', name='predictions'))
  model.build([None,IMG_SHAPE,IMG_SHAPE,3])
  return model

model = build_model(resnet)
model.summary()
'''

"\nresnet = hub.KerasLayer('https://tfhub.dev/google/imagenet/resnet_v2_50/feature_vector/4',\n                        trainable = True)\ndef build_model(trained_model):\n  model = Sequential()\n  model.add(trained_model)\n  #model.add(tf.keras.layers.GlobalAveragePooling2D())\n  model.add(Dense(1024, activation='relu', name='fc1'))\n  model.add(tf.keras.layers.Dropout(0.2))\n  model.add(Dense(512, activation='relu', name='fc2'))\n  model.add(tf.keras.layers.Dropout(0.2))\n  model.add(Dense(256, activation='relu', name='fc3'))\n  model.add(tf.keras.layers.Dropout(0.2))\n  model.add(Dense(4, activation='softmax', name='predictions'))\n  model.build([None,IMG_SHAPE,IMG_SHAPE,3])\n  return model\n\nmodel = build_model(resnet)\nmodel.summary()\n"

In [11]:
import keras
from keras.models import Model
model = tf.keras.applications.ResNet50V2(include_top=False, weights='imagenet', input_shape=(IMG_SHAPE,IMG_SHAPE,3))
x = model.output
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = Dense(128, activation = 'relu', kernel_initializer=he_initializer)(x)
x = Dense(64, activation = 'relu', kernel_initializer=he_initializer)(x)
predictions = Dense(4, activation = 'softmax')(x)

model = Model(inputs = model.input, outputs = predictions)

#model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50v2_weights_tf_dim_ordering_tf_kernels_notop.h5


In [12]:
#@title Calculating class weights

#alternative way - to be tested
# from sklearn.utils import class_weight
# class_weight = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)

count_h = np.sum(df['healthy'] == 1 )
count_m = np.sum(df['multiple_diseases'] == 1)
count_r = np.sum(df['rust'] == 1)
count_s = np.sum(df['scab'] == 1)
total = count_h + count_m + count_r +count_s
print(f'Example: \n     Total:{total} \n     Healthy:{count_h} ({100*count_h/total})\n     Multiple:{count_m} ({100*count_m/total})\n     Rust:{count_r} ({100*count_r/total})\n     Scab:{count_s} ({100*count_s/total})')

weights_for_0 = (1/count_h)*(total)/2.0
weights_for_1 = (1/count_m)*(total)/2.0
weights_for_2 = (1/count_r)*(total)/2.0
weights_for_3 = (1/count_s)*(total)/2.0

class_weight = {0: weights_for_0, 1: weights_for_1, 2: weights_for_2, 3: weights_for_3}

print('Weight for class 0: {:.2f}'.format(weights_for_0))
print('Weight for class 1: {:.2f}'.format(weights_for_1))
print('Weight for class 2: {:.2f}'.format(weights_for_2))
print('Weight for class 3: {:.2f}'.format(weights_for_3))

Example: 
     Total:1821 
     Healthy:516 (28.336079077429982)
     Multiple:91 (4.99725425590335)
     Rust:622 (34.15705656232839)
     Scab:592 (32.50961010433828)
Weight for class 0: 1.76
Weight for class 1: 10.01
Weight for class 2: 1.46
Weight for class 3: 1.54


In [13]:
#stop it early
earlystopping = tf.keras.callbacks.EarlyStopping(
    monitor = 'val_loss',
    patience = 20,
    restore_best_weights=True
)

#model checkpoint
checkpointfile = os.path.join(os.getcwd(), 'drive/My Drive/project/plantpathology/ResNet50_08_23/0')

checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpointfile,
    monitor = 'val_accuracy',
    save_best_only = True
)
  
#Learning rate scheduler
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor = 'val_accuracy',
    factor = 0.1,
    patience = 7,
    cooldown = 1,
    min_lr = 0.0000001,
    verbose=1
)



In [14]:
#Focal loss:
def focal_loss(gamma=2., alpha=0.25):
  gamma = float(gamma)
  alpha = float(alpha)

  def focal_loss_fixed(y_true, y_pred):
    epsilon = 1.e-9
    y_true = tf.convert_to_tensor(y_true, tf.float32)
    y_pred = tf.convert_to_tensor(y_pred, tf.float32)

    model_out = tf.add(y_pred, epsilon)
    ce = tf.multiply(y_true, -tf.math.log(model_out))
    weight = tf.multiply(y_true, tf.pow(tf.subtract(1., model_out), gamma))
    f1 = tf.multiply(alpha, tf.multiply(weight, ce))
    reduced_f1 = tf.reduce_max(f1, axis=1)
    return tf.reduce_mean(reduced_f1)
  return focal_loss_fixed

In [15]:
epochs = 250

adam = tf.keras.optimizers.Adam(learning_rate=0.001)
sgd = tf.keras.optimizers.SGD(momentum=0.9, nesterov=True)

model.compile(optimizer = sgd,
              loss = 'categorical_crossentropy',
              metrics=['accuracy'])


history = model.fit(train_data_gen,
          epochs = epochs,
          validation_data = val_data_gen,
          callbacks = [earlystopping, checkpoint, reduce_lr])

Epoch 1/250
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: /content/drive/My Drive/project/plantpathology/ResNet50_08_23/0.00/assets
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epo

KeyboardInterrupt: ignored

In [16]:
train_loss = history.history['loss']
train_acc = history.history['accuracy']

val_loss = history.history['val_loss']
val_acc = history.history['val_accuracy']

epochs_range = range(len(train_acc))

plt.figure(figsize=(8,8))
plt.subplot(1,2,1)
plt.plot(epochs_range, train_acc, label = 'training accuracy')
plt.plot(epochs_range, val_acc, label = 'validation accuracy')
plt.legend(loc = 'lower right')
plt.title('training and validation accuracy')

plt.subplot(1,2,2)
plt.plot(epochs_range, train_loss, label='training loss')
plt.plot(epochs_range, val_loss, label='validation loss')
plt.legend()
plt.title('training and validation loss')

plt.show()


NameError: ignored

In [None]:
'''
images, labels = next(iter(train_data_gen))
class_pred = np.array(classes)

predicted_batch = model.predict(images)
predicted_batch = tf.squeeze(predicted_batch).numpy()

predicted_ids = np.argmax(predicted_batch, axis=-1)
predicted_class_names = class_pred[predicted_ids]
'''

In [None]:
#loading saved model from checkpointfile
imported = tf.keras.models.load_model(os.path.join(os.getcwd(), 'drive/My Drive/project/plantpathology/ResNet50_08_21/0.00'))
imported.layers[0].trainable = False
imported.summary()

In [None]:
#Creating Submission file

sub = pd.read_csv(sub_path)
probs_nansnet = model.predict(test_generator)
sub.loc[:,'healthy':] = probs_nansnet

sub.to_csv('submission_nasnet.csv', index=False)
sub.head()

In [None]:
!kaggle competitions submit -c plant-pathology-2020-fgvc7 -f submission_nasnet.csv -m "SGD"

In [None]:
from tqdm import tqdm

In [None]:
#Test Time Augmentation
tta_steps = 10
predictions = []

for i in tqdm(range(tta_steps)):
  preds = imported.predict_generator(test_generator)
  predictions.append(preds)

pred = np.mean(predictions, axis = 0)
sub.loc[:,'healthy':] = pred

sub.to_csv('submission_nasnet.csv', index=False)
sub.head()

In [None]:
!kaggle competitions submit -c plant-pathology-2020-fgvc7 -f submission_nasnet.csv -m "trial of nb"