In [1]:
# basics
import os
import time
import numpy as np

# EDA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# Data preprocessing
import tensorflow as tf
import tensorflow_io as tfio
from sklearn.model_selection import train_test_split

# Deep learning
from tensorflow.keras.applications import ResNet50, VGG16, InceptionV3
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

# utilities
from tensorflow.keras.models import load_model

In [2]:
# Define the directory path.
# Defaults to the original Google Drive location; set the PLANT_DATA_DIR
# environment variable to run the notebook where the data lives elsewhere.
directory_path = os.environ.get('PLANT_DATA_DIR'
                                , '/content/drive/MyDrive/Akshay /plant')

# Fail early with an actionable message instead of a bare read_csv error
if not os.path.isdir(directory_path):
    raise FileNotFoundError(
        "Dataset directory not found: {!r}. Mount the drive or set the "
        "PLANT_DATA_DIR environment variable.".format(directory_path))

# Construct file paths based on the directory path
sample_data_path = os.path.join(directory_path, 'sample_submission.csv')
train_data_path = os.path.join(directory_path, 'train.csv')
test_data_path = os.path.join(directory_path, 'test.csv')
model_path = os.path.join(directory_path, 'paddy_disease_model.h5')
notebook_copy1_path = os.path.join(directory_path, 'Untitled6-Copy1.ipynb')
notebook_path = os.path.join(directory_path, 'Untitled6.ipynb')
images_dir = os.path.join(directory_path, 'images')
checkpoints_dir = os.path.join(directory_path, '.ipynb_checkpoints')

# Read the CSV files
sample_data = pd.read_csv(sample_data_path)
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Set the image directory (with trailing separator, because later cells
# build image paths by plain string concatenation)
image_dir = images_dir + '/'

# Output the constructed file paths and directory paths
print("Sample Data Path:", sample_data_path)
print("Train Data Path:", train_data_path)
print("Test Data Path:", test_data_path)
print("Model Path:", model_path)
print("Notebook Copy1 Path:", notebook_copy1_path)
print("Notebook Path:", notebook_path)
print("Images Directory:", images_dir)
print("Checkpoints Directory:", checkpoints_dir)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Akshay /plant\\sample_submission.csv'

In [None]:
def print_short_summary(name, data):
    """
    Print a three-part overview of a dataset: head, shape and info.

    Args:
        name (str): title printed before the overview
        data (dataframe): dataset in a pd.DataFrame format
    """
    # Title, section header and the first rows, one per line
    print(name, '\n1. Data head:', data.head(), sep='\n')
    # Shape line followed by the header of the info section
    print('\n2. Data shape: {}'.format(data.shape), '\n3. Data info:', sep='\n')
    # DataFrame.info() writes its report itself and returns nothing
    data.info()

def print_number_files(dirpath):
    """Print `dirpath` together with the number of entries os.listdir finds in it."""
    n_entries = len(os.listdir(dirpath))
    print('{}: {} files'.format(dirpath, n_entries))

In [None]:
# Overview of the training table
print_short_summary(name='Train data', data=train_data)

In [None]:
# Overview of the test table
print_short_summary(name='Test data', data=test_data)

In [None]:
# Overview of the sample submission table
print_short_summary(name='Sample data', data=sample_data)

In [None]:
# Count the image files. Use the configured images directory instead of
# repeating the absolute path, so this cell follows the path configuration.
print_number_files(images_dir)

In [None]:
# Cleaning: remove the summary helpers from the notebook namespace
del print_short_summary
del print_number_files

In [None]:
# Horizontal bar chart of the number of records per label.
# Every column after the first is summed, then sorted from most to least frequent.
label_totals = train_data.iloc[:, 1:].sum().values
tmp = pd.DataFrame(label_totals, train_data.columns[1:]).sort_values(by=[0], ascending = False)

fig, ax = plt.subplots(figsize=(16, 9))
sns.barplot(y = tmp.index, x = np.ravel(tmp.values), orient='h', ax=ax)
ax.set_xlabel('Number of records')
ax.set_ylabel('Label')
ax.set_title('Number of records per label')
plt.show()

In [None]:
def get_images_to_plot(file_names):
    """
    Return list of fully loaded image objects.

    Image.open is lazy and keeps the underlying file open until the pixel
    data has been read. Each image is therefore loaded inside a ``with``
    block, so its file handle is released right away instead of staying
    open until the image happens to be displayed.

    Args:
        file_names: iterable of image file paths
    Returns:
        list: list of image objects with their pixel data in memory
    """
    images = []
    for file_name in file_names:
        with Image.open(file_name) as image:
            # Read the pixels now; the object stays usable after the file closes
            image.load()
        images.append(image)
    return images

def get_image_label(dirname, data, labels, n = 5):
    """
    Return the first `n` images of every label.

    Args:
        dirname: prefix prepended to each image id to build the file path
        data: dataframe with an 'image_id' column and one column per label,
            where the value 1 marks rows belonging to that label
        labels: iterable of label column names
        n (opt): number of images per label
    Returns:
        dict: label -> list of images as returned by get_images_to_plot
    """
    images_by_label = {}
    for label in labels:
        # First n rows flagged with this label
        rows = data.loc[data[label] == 1].head(n)
        paths = (dirname + rows['image_id'] + '.jpg').values
        images_by_label[label] = get_images_to_plot(paths)

    return images_by_label

In [None]:
# Print the size of the image in the row labelled 0 of the training table
first_image_id = train_data.loc[0, 'image_id']
img_path = image_dir + first_image_id + '.jpg'
img = Image.open(img_path)
print('Original image size: {}'.format(img.size))

In [None]:
def print_short_summary(name, data):
    """
    Print a three-part overview of a dataset: head, shape and info.

    Args:
        name (str): title printed before the overview
        data (dataframe): dataset in a pd.DataFrame format
    """
    # Title, section header and the first rows, one per line
    print(name, '\n1. Data head:', data.head(), sep='\n')
    # Shape line followed by the header of the info section
    print('\n2. Data shape: {}'.format(data.shape), '\n3. Data info:', sep='\n')
    # DataFrame.info() writes its report itself and returns nothing
    data.info()

def print_number_files(dirpath):
    """Print `dirpath` together with the number of entries os.listdir finds in it."""
    n_entries = len(os.listdir(dirpath))
    print('{}: {} files'.format(dirpath, n_entries))

In [None]:
# Overview of the training table
print_short_summary(name='Train data', data=train_data)

In [None]:
# Overview of the test table
print_short_summary(name='Test data', data=test_data)

In [None]:
# Overview of the sample submission table
print_short_summary(name='Sample data', data=sample_data)

In [None]:
# Count the entries of the image directory
print_number_files(dirpath=image_dir)

In [None]:
# Cleaning: remove the summary helpers from the notebook namespace
del print_short_summary
del print_number_files

In [None]:
# Horizontal bar chart of the number of records per label.
# Every column after the first is summed, then sorted from most to least frequent.
# `tmp` is kept as a notebook-level name because later cells read tmp.index.
label_totals = train_data.iloc[:, 1:].sum().values
tmp = pd.DataFrame(label_totals, train_data.columns[1:]).sort_values(by=[0], ascending = False)

fig, ax = plt.subplots(figsize=(16, 9))
sns.barplot(y = tmp.index, x = np.ravel(tmp.values), orient='h', ax=ax)
ax.set_xlabel('Number of records')
ax.set_ylabel('Label')
ax.set_title('Number of records per label')
plt.show()


In [None]:
def get_images_to_plot(file_names):
    """
    Return list of fully loaded image objects.

    Image.open is lazy and keeps the underlying file open until the pixel
    data has been read. Each image is therefore loaded inside a ``with``
    block, so its file handle is released right away instead of staying
    open until the image happens to be displayed.

    Args:
        file_names: iterable of image file paths
    Returns:
        list: list of image objects with their pixel data in memory
    """
    images = []
    for file_name in file_names:
        with Image.open(file_name) as image:
            # Read the pixels now; the object stays usable after the file closes
            image.load()
        images.append(image)
    return images

def get_image_label(dirname, data, labels, n = 5):
    """
    Return the first `n` images of every label.

    Args:
        dirname: prefix prepended to each image id to build the file path
        data: dataframe with an 'image_id' column and one column per label,
            where the value 1 marks rows belonging to that label
        labels: iterable of label column names
        n (opt): number of images per label
    Returns:
        dict: label -> list of images as returned by get_images_to_plot
    """
    images_by_label = {}
    for label in labels:
        # First n rows flagged with this label
        rows = data.loc[data[label] == 1].head(n)
        paths = (dirname + rows['image_id'] + '.jpg').values
        images_by_label[label] = get_images_to_plot(paths)

    return images_by_label

In [None]:
# Print the size of the image in the row labelled 0 of the training table
first_image_id = train_data.loc[0, 'image_id']
img_path = image_dir + first_image_id + '.jpg'
img = Image.open(img_path)
print('Original image size: {}'.format(img.size))

In [None]:
# Open the first 5 images (the default n) of every label, in the label order of the bar chart
data = get_image_label(dirname=image_dir, data=train_data, labels=tmp.index)

In [None]:
# One subplot row per label, five example images in every row
labels = tmp.index
fig, axes = plt.subplots(nrows=len(labels), ncols=5, figsize=(16, 9))

# Fill the grid row by row; each cell is titled with the label of its row
for row, label in enumerate(labels):
    for col in range(5):
        cell = axes[row, col]
        cell.imshow(data[label][col])
        cell.set_title(label)
        cell.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Cleaning: drop the image helpers and everything the preview cells created
del get_images_to_plot, get_image_label
del img_path, img
del data, labels
del fig, axes, row, col

In [None]:
# Global configuration
BATCH_SIZE = 32        # elements per batch of the tf.data pipeline
NUM_CLASSES = 4        # units of the final softmax layer
NUM_EPOCHS = 15        # epochs passed to model.fit
# Size every image is resized to before it reaches a model
IMAGE_HEIGHT = 224
IMAGE_WIDTH = 224

In [None]:
# Records per class (every column after the first is a label column)
class_counts = train_data.iloc[:, 1:].sum()

# Size of the largest class; every class is resampled to this many rows
max_count = class_counts.max()

# Draw max_count rows with replacement from the rows of each class
balanced_data = [
    train_data[train_data[class_name] == 1].sample(max_count
                                                     , replace = True
                                                     , random_state = 0)
    for class_name in class_counts.index
]

# Stack the per-class samples and show the resulting class distribution
train_data_balanced = pd.concat(balanced_data, axis=0, ignore_index=True)
train_data_balanced.iloc[:, 1:].sum()

In [None]:
def get_augmented_image(image):
    """
    Apply a fixed sequence of random tf.image transformations.

    Order: horizontal flip, vertical flip, hue, brightness, contrast,
    saturation.

    Args:
        image: image tensor
    Returns:
        image: transformed image tensor
    """
    # (transformation, keyword arguments) pairs, applied top to bottom
    augmentations = (
        (tf.image.random_flip_left_right, {}),
        (tf.image.random_flip_up_down, {}),
        (tf.image.random_hue, {'max_delta': 0.1}),
        (tf.image.random_brightness, {'max_delta': 0.2}),
        (tf.image.random_contrast, {'lower': 0.8, 'upper': 1.2}),
        (tf.image.random_saturation, {'lower': 0.8, 'upper': 1.2}),
    )
    for transform, params in augmentations:
        image = transform(image, **params)

    return image

In [None]:
# Get image paths and labels
image_paths = image_dir + train_data_balanced['image_id'] + '.jpg'
image_paths = image_paths.values
labels = train_data_balanced.loc[:, 'healthy':]

# Split into train and test sets for training accuracy.
# The balanced frame was built by sampling with replacement, so it holds many
# copies of the same image. A plain row-wise split would put copies of one
# image on both sides and inflate the test score. Splitting the *unique*
# image ids and mapping the rows back keeps every image on exactly one side.
unique_ids = train_data_balanced['image_id'].drop_duplicates()
train_ids, test_ids = train_test_split(unique_ids
                                       , test_size = 0.25
                                       , shuffle = True
                                       , random_state = 43)
is_test_row = train_data_balanced['image_id'].isin(test_ids).values

# The balanced frame is ordered class by class, so shuffle the row order
# (seeded for reproducibility) before selecting each side.
row_order = np.random.RandomState(43).permutation(len(train_data_balanced))
train_rows = row_order[~is_test_row[row_order]]
test_rows = row_order[is_test_row[row_order]]

# Same names and types as before: path arrays and label dataframes
X_train, X_test = image_paths[train_rows], image_paths[test_rows]
y_train, y_test = labels.iloc[train_rows], labels.iloc[test_rows]

In [None]:
# The split results are kept; the full path array and label frame are no longer needed
del image_paths
del labels

In [None]:
def get_decoded_image(image_path, label = None):
    """
    Load and preprocess images using TensorFlow I/O
    and Image Generator transformation.

    Args:
        image_path: path to JPEG image
    Returns:
        image or tuple: decoded image or (image, label)
    """
    image = tf.io.read_file(image_path)
    # Set 3 channels, RGB
    image = tf.image.decode_image(image, channels=3)
    image.set_shape([None, None, 3])
    # Set size to convert to compatible with models input
    image = tf.image.resize(image, [IMAGE_HEIGHT, IMAGE_WIDTH])
    image = tf.cast(image, tf.float32) / 255.0
    image = get_augmented_image(image)

    return image if label is None else (image, label)


def get_prefetched_data(data):
    """
    Create a TensorFlow dataset from image paths.
    Execution in parallel.
    Load, preprocess images and batch the data.
    Prefetch batches to improve training performance.

    Args:
        data: ndarray of image paths + labels (opt)
    Returns:
        tf.data.Dataset: preprocessed and preloaded TensorFlow dataset for keras NN
    """
    # Autotune the degree of parallelism during training
    AUTOTUNE = tf.data.experimental.AUTOTUNE

    # Create dataset from image paths
    dataset = tf.data.Dataset.from_tensor_slices(data)

    # Apply parallel processing to load and preprocess images
    dataset = dataset.map(get_decoded_image, num_parallel_calls=AUTOTUNE)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)

    return dataset

In [None]:
# Get prefetched tf.data.DataSet for subsequent models
train_dataset = get_prefetched_data((X_train, y_train))
test_dataset = get_prefetched_data((X_test, y_test))

In [None]:
def get_model(Model):
    """
    Return Model architecture without its classification top,
    initialised with ImageNet weights.

    Args:
        Model: model class / constructor (e.g. ResNet50)
    Returns:
        obj: model architecture
    """
    # Keras expects input_shape as (height, width, channels)
    model = Model(weights='imagenet'
                     , include_top=False
                     , input_shape=(IMAGE_HEIGHT, IMAGE_WIDTH, 3))

    return model

def get_model_resnet():
    """Return the ResNet50 architecture built by get_model."""
    return get_model(Model=ResNet50)

def get_model_vggnet():
    """Return the VGG16 architecture built by get_model."""
    return get_model(Model=VGG16)

def get_model_inceptionnet():
    """Return the InceptionV3 architecture built by get_model."""
    return get_model(Model=InceptionV3)

In [None]:
def get_compiled_model(func):
    """
    Build and compile a classifier on top of the architecture returned by `func`.

    Uses tf.distribute.MirroredStrategy when TensorFlow lists at least one
    GPU and a one-device CPU strategy otherwise. The model is created and
    compiled inside the scope of the chosen strategy.

    Args:
        func: function to get model architecture
    Returns:
        compiled_model: tensorflow model
    """
    # Pick the distribution strategy from the available hardware
    if tf.config.experimental.list_physical_devices('GPU'):
        strategy = tf.distribute.MirroredStrategy()
        print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
    else:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
        print('No GPU available, falling back to CPU.')

    with strategy.scope():
        backbone = func()
        # Custom classification head for our task
        head = [
            layers.GlobalAveragePooling2D(),
            # Dense layers with 128 and 64 units
            layers.Dense(128, activation="relu"),
            layers.Dense(64, activation="relu"),
            # Regularization with dropout rate 30%
            layers.Dropout(0.3),
            layers.Dense(NUM_CLASSES, activation='softmax'),
        ]
        compiled_model = Sequential([backbone] + head)
        compiled_model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['categorical_accuracy'],
        )

    return compiled_model

In [None]:
def plot_model_scores(scores, model_name):
    """
    Plot train and test accuracy scores of a model by epoch.

    Args:
        scores: pair (train_scores, test_scores), one value per epoch each
        model_name: name inserted into the plot title
    """
    train_scores, test_scores = scores
    # Epochs are numbered from 1 on the x axis
    epochs = range(1, len(train_scores) + 1)

    fig, ax = plt.subplots(figsize=(16, 9))
    for series, legend_label in ((train_scores, 'Train score'),
                                 (test_scores, 'Test score')):
        ax.plot(epochs, series, label=legend_label)
    ax.set_title('Train and test accuracy scores of the {}'.format(model_name))
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy Score')
    ax.legend()
    ax.grid(True)
    plt.show()


def get_model_results(model_name, model):
    """
    Compile, fit and save a model; return its runtime and accuracy history.

    Args:
        model_name: base name of the saved file ('<model_name>.h5')
        model: function returning the model architecture
            (passed on to get_compiled_model)
    Returns:
        (runtime, (train_scores, test_scores) ), where runtime is the
        wall-clock duration of fit in seconds and the scores are the
        per-epoch categorical accuracies on train_dataset and test_dataset
    """
    compiled = get_compiled_model(model)

    # Time the fit call only
    started = time.time()
    history = compiled.fit(train_dataset
                           , epochs = NUM_EPOCHS
                           , validation_data=test_dataset)
    runtime = time.time() - started

    compiled.save('{}.h5'.format(model_name))

    scores = (history.history['categorical_accuracy']
              , history.history['val_categorical_accuracy'])

    # Reset the global Keras state before the next model is built
    tf.keras.backend.clear_session()

    return (runtime, scores)

In [None]:
# Train ResNet50: runtime plus train and test scores of every epoch
runtime_resnet, scores_resnet = get_model_results(
    model_name='model_resnet',
    model=get_model_resnet,
)

In [None]:
# Plot ResNet scores by epoch
plot_model_scores(scores=scores_resnet, model_name='ResNet')

In [None]:
# Train VGG16: runtime plus train and test scores of every epoch
runtime_vggnet, scores_vggnet = get_model_results(
    model_name='model_vggnet',
    model=get_model_vggnet,
)

In [None]:
# Plot VGG-Net scores by epoch
plot_model_scores(scores=scores_vggnet, model_name='VGG-Net')

In [None]:
# Train InceptionV3: runtime plus train and test scores of every epoch
runtime_inceptionnet, scores_inceptionnet = get_model_results(
    model_name='model_inceptionnet',
    model=get_model_inceptionnet,
)

In [None]:
# Plot InceptionNet scores by epoch
plot_model_scores(scores=scores_inceptionnet, model_name='InceptionNet')

In [None]:
# Results table: one row per model with runtime and last-epoch scores
results = [
    ('ResNet', runtime_resnet, scores_resnet),
    ('VGG-Net', runtime_vggnet, scores_vggnet),
    ('InceptionNet', runtime_inceptionnet, scores_inceptionnet),
]

table = []
for model_label, runtime_sec, (train_scores, test_scores) in results:
    # [-1] picks the score of the final epoch
    tmp = {
        'model': model_label,
        'runtime (sec)': runtime_sec,
        'train_score (cat. accuracy)': train_scores[-1],
        'test_score (cat. accuracy)': test_scores[-1],
    }
    table.append(tmp)

# Best test score first; ties are broken by the shorter runtime
sort_columns = ['test_score (cat. accuracy)', 'runtime (sec)']
pd.DataFrame(table).sort_values(by=sort_columns, ascending=[False, True]).reset_index(drop=True)

In [None]:
# Cleaning: drop the table intermediates and the training helpers
del results, table, tmp
del get_compiled_model
del get_model_results
del plot_model_scores

In [None]:
# Load top model.
# NOTE(review): the file name is hard-coded to the InceptionV3 run saved by
# get_model_results; it is not derived from the results table above.
model = load_model(filepath='model_inceptionnet.h5')

In [None]:
# Create prefetched dataset of images to classify:
# one file path per image id of the sample submission, no labels
submit_data = (image_dir + sample_data['image_id'] + '.jpg').values

submit_dataset = get_prefetched_data(submit_data)

In [None]:
# Run the loaded model on the submission dataset
results = model.predict(x=submit_dataset)

In [None]:
# Merge results with sample submission: every column from 'healthy' to the
# last one is overwritten in place with the output of model.predict.
# NOTE(review): assumes the submission columns are in the same order as the
# label columns the model was trained on — confirm.
sample_data.loc[:, 'healthy':] = results

TypeError: Interface.__init__() missing 2 required positional arguments: 'inputs' and 'outputs'