In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from pytorch_lightning import Trainer
from copy import copy
import seaborn as sns
from torch import nn
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from tqdm.auto import tqdm
import pandas as pd
import os
from tqdm import tqdm
import logging
from argparse import Namespace
from torchvision import transforms
import torch
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from matplotlib import pyplot as plt

In [None]:

import sys

# Directory of the EfficientNet PyTorch package attached as a Kaggle input.
efficientnet_path='/kaggle/input/efficientnet-pytorch'

# Make that directory importable from this notebook.
sys.path.append(efficientnet_path)

# List the pretrained-weights input, create the torch hub checkpoint cache
# directory, and copy the EfficientNet-B0 weights file into it.
# NOTE(review): presumably this lets timm.create_model(..., pretrained=True)
# pick the weights up from the local cache without network access -- confirm.
!ls /kaggle/input/timm-pretrained-efficientnet
!mkdir -p /root/.cache/torch/hub/checkpoints/
!cp /kaggle/input/timm-pretrained-efficientnet/efficientnet/efficientnet_b0_ra-3dd342df.pth /root/.cache/torch/hub/checkpoints/efficientnet_b0_ra-3dd342df.pth


In [None]:
%matplotlib inline

# Functions

In [None]:
# file transforms.py


import albumentations as A
from albumentations.pytorch import ToTensorV2
from torchvision import transforms


def get_train_transforms():
    """Build the albumentations pipeline applied to training images.

    Steps, in order: hue/saturation/value jitter, float conversion, random
    resized crop to 256x256, random horizontal and vertical flips, per-channel
    normalization, and conversion to a torch tensor.

    Returns:
        A.Compose: the composed training transform.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    colour_jitter = A.HueSaturationValue(hue_shift_limit=5, sat_shift_limit=5, val_shift_limit=5, p=1)
    steps = [
        colour_jitter,
        A.ToFloat(max_value=1.0),
        A.RandomResizedCrop(256, 256, scale=(0.3, 0.9)),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.Normalize(mean=channel_mean, std=channel_std),
        ToTensorV2(),
    ]
    return A.Compose(steps)


def get_test_transforms():
    """Build the deterministic albumentations pipeline for validation/test images.

    Steps, in order: float conversion, resize to 400x400, centre crop to
    256x256, per-channel normalization, and conversion to a torch tensor.

    Returns:
        A.Compose: the composed evaluation transform.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = [
        A.ToFloat(max_value=1.0),
        A.Resize(400, 400),
        A.CenterCrop(256, 256),
        A.Normalize(mean=channel_mean, std=channel_std),
        ToTensorV2(),
    ]
    return A.Compose(steps)



In [None]:
# file utils.py

import numpy as np
import pandas as pd
import seaborn as sns
import os
from skimage import io
from torch.utils.data import Dataset
import torch
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid


class Unnormalize:
    """Callable that reverts a per-channel ``(x - mean) / std`` normalization."""

    def __init__(self, mean, std):
        # Per-channel statistics; paired with the channels of the tensor in order.
        self.mean, self.std = mean, std

    def __call__(self, tensor):
        """Scale each channel by its std and shift it by its mean, in place.

        Args:
            tensor (Tensor): image tensor expected to be laid out as (C, H, W).
                It is modified in place.
        Returns:
            Tensor: the same tensor object, now un-normalized.
        """
        per_channel = zip(tensor, self.mean, self.std)
        for channel, channel_mean, channel_std in per_channel:
            # Inverse of the normalize step ``channel.sub_(mean).div_(std)``.
            channel.mul_(channel_std)
            channel.add_(channel_mean)
        return tensor


def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    '''
    Plot a confusion matrix as an annotated Seaborn heatmap on a new matplotlib figure.

    Arguments
    ---------
    cf:            2-D confusion matrix (rows = true labels, columns = predicted labels).
                   Anything accepted by ``np.asarray`` works.

    group_names:   List of strings that represent the labels row by row to be shown in each square.
                   Ignored unless its length equals the number of cells.

    categories:    List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'

    count:         If True, show the raw number in the confusion matrix. Default is True.

    percent:       If True, show each cell as a share of all observations. Default is True.

    cbar:          If True, show the color bar. The cbar values are based off the values in the confusion matrix.
                   Default is True.

    xyticks:       If True, show x and y ticks. Default is True.

    xyplotlabels:  If True, show 'True label' and 'Predicted label' on the figure. Default is True.

    sum_stats:     If True, display summary statistics below the figure. Default is True.
                   Ratios with a zero denominator are reported as 0 instead of NaN.

    figsize:       Tuple representing the figure size. Default will be the matplotlib rcParams value.

    cmap:          Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'
                   See http://matplotlib.org/examples/color/colormaps_reference.html

    title:         Title for the heatmap. Default is None.

    '''
    cf = np.asarray(cf)
    total = cf.sum()

    def _ratio(numerator, denominator):
        # Guarded division: an empty row/column/matrix yields 0 rather than NaN.
        return numerator / denominator if denominator else 0.0

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = [''] * cf.size

    if group_names is not None and len(group_names) == cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(_ratio(value, total)) for value in cf.flatten()]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])

    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        # Accuracy is sum of diagonal divided by total observations
        accuracy = _ratio(np.trace(cf), float(total))

        # if it is a binary confusion matrix, show some more stats
        if len(cf) == 2:
            # Metrics for Binary Confusion Matrices (class 1 is the positive class)
            precision = _ratio(cf[1, 1], cf[:, 1].sum())
            recall = _ratio(cf[1, 1], cf[1, :].sum())
            # Named ``f1`` so the sklearn ``f1_score`` import is not shadowed.
            f1 = _ratio(2 * precision * recall, precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy, precision, recall, f1)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        # Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Do not show categories if xyticks is False
        categories = False

    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar, xticklabels=categories, yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)

    if title:
        plt.title(title)


def plot_image(img, label=None, ax=None):
    """Draw a channel-first image on a matplotlib axes, optionally with a title.

    Args:
        img: image laid out as (C, H, W); anything ``np.array`` can convert
            (e.g. a torch tensor).
        label: optional title. Integer labels (Python or NumPy integers) are
            translated to the disease name; unknown integers and any other
            value are shown as-is.
        ax: axes to draw on. Defaults to the current axes (``plt.gca()``).
    """
    label_num_to_disease_map = {0: 'Cassava Bacterial Blight (CBB)',
                                1: 'Cassava Brown Streak Disease (CBSD)',
                                2: 'Cassava Green Mottle (CGM)',
                                3: 'Cassava Mosaic Disease (CMD)',
                                4: 'Healthy'}
    img = torch.Tensor(np.array(img))

    if ax is None:
        ax = plt.gca()

    # imshow expects (H, W, C). The previous permute(2, 1, 0) produced
    # (W, H, C), i.e. the picture was drawn transposed.
    ax.imshow(img.permute(1, 2, 0))
    ax.axis('off')

    if label is not None:
        # Labels read from NumPy arrays are np.integer, not int.
        if isinstance(label, (int, np.integer)):
            label = label_num_to_disease_map.get(int(label), label)
        ax.set_title(f'{label}')


def plot_label_examples(dataset, targets, target_label):
    """Show up to six distinct random examples of one class in a 2x3 image grid.

    Args:
        dataset: indexable returning ``(image, label)`` pairs.
        targets: per-sample labels aligned with ``dataset`` (list or array).
        target_label: the class whose examples should be displayed.

    Raises:
        ValueError: if no sample in ``targets`` has ``target_label``.
    """
    # np.asarray makes the comparison element-wise even for plain lists.
    label_indices = np.where(np.asarray(targets) == target_label)[0]
    if len(label_indices) == 0:
        raise ValueError(f'No examples with label {target_label!r} found in targets')

    # Sample without replacement so the same image is never shown twice;
    # cap the sample size for classes with fewer than six examples.
    n_examples = min(6, len(label_indices))
    sample = np.random.choice(label_indices, n_examples, replace=False)

    fig = plt.figure(figsize=(20, 10))

    grid = ImageGrid(fig, 111,  # similar to subplot(111)
                     nrows_ncols=(2, 3),  # creates 2x3 grid of axes
                     axes_pad=0.1,  # pad between axes in inch.
                     )

    for ax, idx in zip(grid, sample):
        img, label = dataset[idx]
        assert label == target_label
        plot_image(img, ax=ax)

    # Hide the frames of any grid cells left without an image.
    for ax in list(grid)[n_examples:]:
        ax.axis('off')

    plt.suptitle(f'Label {target_label}')
    plt.show()


class DatasetFromSubset(Dataset):
    """Wrap an indexable of ``(x, y)`` pairs and optionally transform ``x`` on access."""

    def __init__(self, subset, transform=None):
        # ``transform`` is called as transform(image=x) and must return a
        # mapping with an 'image' entry.
        self.subset = subset
        self.transform = transform

    def __len__(self):
        """Number of samples in the wrapped subset."""
        return len(self.subset)

    def __getitem__(self, index):
        """Return ``(x, y)`` for ``index``, with ``x`` transformed when a transform is set."""
        sample, target = self.subset[index]
        if not self.transform:
            return sample, target
        transformed = self.transform(image=sample)
        return transformed['image'], target


class CassavaDataset(Dataset):
    """Dataset that reads images from ``root`` by file name and pairs them with labels."""

    def __init__(self, root, image_ids, labels, transform=None):
        super().__init__()
        self.root = root
        self.image_ids = image_ids
        self.transform = transform
        # ``targets`` is an alias of ``labels`` (same object).
        self.labels = self.targets = labels

    def __getitem__(self, idx):
        """Read image ``idx`` from disk and return ``(image, label)``.

        When a transform is set it is called as ``transform(image=img)`` and
        its 'image' entry is returned in place of the raw image.
        """
        target = self.labels[idx]
        image_path = os.path.join(self.root, self.image_ids[idx])
        image = io.imread(image_path)

        if not self.transform:
            return image, target
        return self.transform(image=image)['image'], target

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.image_ids)


In [None]:
# file models/model.py

from argparse import Namespace

import torch
from pytorch_lightning.metrics.functional import accuracy
from torch import nn
import timm
import pytorch_lightning as pl
import torch.nn.functional as F


class LeafDoctorModel(pl.LightningModule):
    """EfficientNet-B0 (timm, pretrained) classifier with a 5-class head.

    Hyperparameters read from ``hparams``: ``lr`` (or ``learning_rate`` as a
    fallback), ``weight_decay`` and ``reduce_lr_on_pleteau_patience``.
    """

    def __init__(self, hparams = None):
        super().__init__()
        # An empty Namespace lets the model be built for inference only.
        self.hparams = hparams or Namespace()

        self.trunk = timm.create_model('efficientnet_b0', pretrained=True, num_classes=5)

    def forward(self, x):
        """Return the raw class logits for a batch of images."""
        return self.trunk(x)

    def predict_proba(self, x):
        """Return class probabilities (softmax over dim 1 of the logits)."""
        probabilities = nn.functional.softmax(self.forward(x), dim=1)
        return probabilities

    def predict(self, x):
        """Return the index of the highest-scoring class for each sample."""
        return torch.max(self.forward(x), 1)[1]

    def configure_optimizers(self):
        """Create the AdamW optimizer and a ReduceLROnPlateau scheduler watching ``val_loss``."""
        # getattr: a missing ``lr`` must fall through to ``learning_rate``
        # instead of raising AttributeError before the ``or`` is evaluated.
        learning_rate = getattr(self.hparams, 'lr', None) or self.hparams.learning_rate
        optimizer = torch.optim.AdamW(self.parameters(),
                                      lr=learning_rate,
                                      weight_decay=self.hparams.weight_decay)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                                  patience=self.hparams.reduce_lr_on_pleteau_patience)
        return {
            'optimizer': optimizer,
            'lr_scheduler': lr_scheduler,
            'monitor': 'val_loss',
            'interval': 'epoch',
            'frequency': 1
        }

    def _loss_and_accuracy(self, batch):
        """Compute cross-entropy loss and accuracy for one ``(x, y)`` batch."""
        x, y = batch
        y_hat = self(x)
        return F.cross_entropy(y_hat, y), accuracy(y_hat, y)

    def training_step(self, batch, batch_idx):
        """Log epoch-level ``train_acc`` / ``train_loss`` and return the loss."""
        loss, acc = self._loss_and_accuracy(batch)
        self.log("train_acc", acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_acc`` and ``val_loss`` for one validation batch."""
        loss, acc = self._loss_and_accuracy(batch)
        self.log("val_acc", acc, prog_bar=True, logger=True)
        self.log("val_loss", loss, prog_bar=True, logger=True)


In [None]:
# file node_helpers.py

def score(predictions, labels):
    """Compute accuracy and weighted F1 of ``predictions`` against ``labels``.

    Args:
        predictions: predicted class labels.
        labels: ground-truth class labels, aligned with ``predictions``.

    Returns:
        dict with keys 'accuracy' and 'f1_score'.
    """
    # sklearn metrics take (y_true, y_pred). Accuracy is symmetric, but the
    # weighted F1 is not, so the ground truth must come first.
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1_score': f1_score(labels, predictions, average='weighted'),
    }


def predict(model, dataset, indices, batch_size=10, num_workers=4, transform=None):
    """Run ``model`` over the selected samples of ``dataset``.

    Args:
        model: model exposing ``eval()``, ``freeze()`` and ``predict_proba(images)``.
        dataset: dataset yielding ``(image, label)`` pairs.
        indices: positions in ``dataset`` to predict on, in output order.
        batch_size: DataLoader batch size.
        num_workers: DataLoader worker count.
        transform: transform applied to each image; defaults to
            ``get_test_transforms()``.

    Returns:
        tuple ``(predictions, probas)``: a list of predicted class indices and
        a list of per-class probability lists, one entry per index.
    """
    transform = transform or get_test_transforms()
    subset = DatasetFromSubset(
        torch.utils.data.Subset(dataset, indices=indices),
        transform=transform)

    loader = torch.utils.data.DataLoader(subset,
                                         batch_size=batch_size,
                                         num_workers=num_workers,
                                         shuffle=False)

    predictions = []
    probas = []
    model.eval()
    model.freeze()
    device = getattr(model, 'device', None)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        for images, _ in tqdm(loader):
            if device is not None:
                images = images.to(device)
            # One forward pass per batch; the predicted class is the argmax of
            # the probabilities (softmax preserves the ordering of the logits).
            batch_probas = model.predict_proba(images)
            predictions += batch_probas.argmax(dim=1).tolist()
            probas += batch_probas.tolist()
    return predictions, probas


In [None]:
#Pipeline train

def split_data(train_labels, parameters):
    """Split the training data into stratified train and validation index sets.

    Positions ``0 .. len(train_labels) - 1`` are split with stratification on
    ``train_labels.label``, using ``parameters['seed']`` as the random state
    and ``parameters['validation_size']`` as the validation share.

    Returns:
        tuple ``(train_indices, val_indices)``.
    """
    all_positions = range(len(train_labels))
    train_indices, val_indices = train_test_split(
        all_positions,
        test_size=parameters['validation_size'],
        random_state=parameters['seed'],
        stratify=train_labels.label,
    )
    return train_indices, val_indices


def train_model(train_images_torch, train_indices, val_indices, parameters):
    """Train a ``LeafDoctorModel`` and return the best checkpoint by ``val_loss``.

    Args:
        train_images_torch: dataset yielding untransformed ``(image, label)`` pairs.
        train_indices: positions used for training (with train transforms).
        val_indices: positions used for validation (with test transforms).
        parameters: dict of settings. Read here: 'batch_size',
            'data_loader_workers', 'checkpoints_dir', 'save_top_k_checkpoints'
            and 'early_stop_patience'. The whole dict is also turned into a
            Namespace and handed to ``Trainer.from_argparse_args`` and to the model.

    Returns:
        LeafDoctorModel restored from the best checkpoint.
    """
    train_transform, val_transform = get_train_transforms(), get_test_transforms()

    train_dataset = DatasetFromSubset(torch.utils.data.Subset(train_images_torch, indices=train_indices),
                                      transform=train_transform)

    val_dataset = DatasetFromSubset(torch.utils.data.Subset(train_images_torch, indices=val_indices),
                                    transform=val_transform)

    train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                    batch_size=parameters['batch_size'],
                                                    num_workers=parameters['data_loader_workers'],
                                                    shuffle=True)

    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  num_workers=parameters['data_loader_workers'],
                                                  batch_size=parameters['batch_size'])

    # Callbacks
    model_checkpoint = ModelCheckpoint(monitor="val_loss",
                                       verbose=True,
                                       dirpath=parameters['checkpoints_dir'],
                                       filename="{epoch}_{val_loss:.4f}",
                                       save_top_k=parameters['save_top_k_checkpoints'])
    early_stopping = EarlyStopping('val_loss',
                                   patience=parameters['early_stop_patience'],
                                   verbose=True,
                                   )

    hparams = Namespace(**parameters)

    trainer = Trainer.from_argparse_args(
        hparams,
        reload_dataloaders_every_epoch = True,
        callbacks=[model_checkpoint, early_stopping],
    )

    # Model (the learning rate comes from ``parameters``; no LR finder is run)
    model = LeafDoctorModel(hparams)

    # Training
    trainer.fit(model, train_data_loader, val_data_loader)
    logging.info('Training finished')

    # Restore the best checkpoint. ``load_from_checkpoint`` is called on the
    # class so no throwaway pretrained model is built just to reach it.
    best_checkpoint = model_checkpoint.best_model_path
    model = LeafDoctorModel.load_from_checkpoint(checkpoint_path=best_checkpoint)
    return model


def score_model(model, train_images_torch, indices, parameters):
    """Predict on the given samples and score the predictions against their labels.

    Args:
        model: trained model passed on to ``predict``.
        train_images_torch: dataset with a ``labels`` array indexable by ``indices``.
        indices: positions of the samples to score.
        parameters: dict providing 'batch_size' and 'data_loader_workers'.

    Returns:
        tuple ``(scores, predictions)``: the dict returned by ``score`` and the
        list of predicted class indices.
    """
    logging.info('Scoring model')
    labels = train_images_torch.labels[indices]
    # ``predict`` returns (predictions, probas); only the hard labels are
    # scored. Passing the whole tuple to ``score`` was a bug.
    predictions, _ = predict(model,
                             dataset=train_images_torch,
                             indices=indices,
                             batch_size=parameters['batch_size'],
                             num_workers=parameters['data_loader_workers'],
                             transform=get_test_transforms())

    scores = score(predictions, labels)

    logging.info(f'Validation scores:\n{scores}')
    return scores, predictions


In [None]:
#Pipeline predict

def predict_submission(cv_results, test_images_torch, sample_submission, parameters):
    """Average the fold models' probabilities on the test set and fill in the labels.

    Args:
        cv_results: mapping of fold name to a dict with a 'model_path' entry;
            the 'summary' key is skipped.
        test_images_torch: test dataset.
        sample_submission: DataFrame with a ``label`` column; it is not modified.
        parameters: dict providing 'cv_models_dir', 'batch_size' and
            'data_loader_workers'.

    Returns:
        A copy of ``sample_submission`` whose ``label`` column holds the
        argmax of the fold-averaged class probabilities.

    Raises:
        ValueError: if ``cv_results`` contains no fold entries.
    """
    logging.debug('Predicting on test with model')

    fold_model_paths = [cv_results[fold]['model_path'] for fold in cv_results if fold != 'summary']
    if not fold_model_paths:
        raise ValueError('cv_results contains no fold models to predict with')

    test_indices = list(range(len(test_images_torch)))

    all_probas = []
    for stored_path in fold_model_paths:
        # cross_validation stores paths that already include cv_models_dir;
        # joining the directory again produced a non-existent path. Bare file
        # names are still resolved relative to cv_models_dir.
        if os.path.dirname(stored_path):
            model_path = stored_path
        else:
            model_path = os.path.join(parameters['cv_models_dir'], stored_path)

        model = LeafDoctorModel()
        # map_location keeps loading working on machines without a GPU.
        model.load_state_dict(torch.load(model_path, map_location='cpu'))

        _, probas = predict(model,
                            dataset=test_images_torch,
                            indices=test_indices,
                            batch_size=parameters['batch_size'],
                            num_workers=parameters['data_loader_workers'],
                            transform=get_test_transforms())

        all_probas.append(probas)

    # Mean over folds -> (n_samples, n_classes), then pick the best class.
    aggregated_probas = np.mean(all_probas, axis=0)
    pred_labels = np.argmax(aggregated_probas, 1)

    # Work on a copy so the caller's frame is left untouched.
    submission = sample_submission.copy()
    submission['label'] = pred_labels
    return submission


In [None]:
#Pipeline cv

def cross_validation(train_images_torch, parameters):
    """Train and score one model per stratified fold and save each fold's weights.

    Args:
        train_images_torch: dataset with a ``labels`` attribute used for stratification.
        parameters: dict providing 'cv_models_dir', 'cv_splits' and 'seed', plus
            everything ``train_model`` and ``score_model`` read.

    Returns:
        dict with one entry per fold ('fold_<n>': model_path, scores,
        val_indices, oof_predictions) and a 'summary' entry holding the mean
        and standard deviation of every score across folds.

    Raises:
        FileExistsError: if ``parameters['cv_models_dir']`` already exists.
    """
    cv_results = {}
    score_values = {}

    models_dir = parameters['cv_models_dir']
    if os.path.exists(models_dir):
        raise FileExistsError('CV models path already exists, please delete it explicitly to overwrite')
    os.makedirs(models_dir)

    # random_state only has an effect (and is only accepted by recent
    # scikit-learn) when shuffle=True.
    cv = StratifiedKFold(n_splits=parameters['cv_splits'], shuffle=True, random_state=parameters['seed'])
    indices = np.arange(len(train_images_torch))
    labels = train_images_torch.labels
    for fold_num, (train_idx, val_idx) in enumerate(cv.split(indices, labels)):
        logging.info('Fitting CV fold %d', fold_num)
        model_path = os.path.join(models_dir, f'model_fold_{fold_num}.pt')
        fold_parameters = copy(parameters)
        model = train_model(train_images_torch, train_idx, val_idx, fold_parameters)
        torch.save(model.state_dict(), model_path)
        scores, oof_predictions = score_model(model, train_images_torch, val_idx, fold_parameters)
        cv_results[f'fold_{fold_num}'] = {
            'model_path': model_path,
            'scores': scores,
            'val_indices': val_idx,
            'oof_predictions': oof_predictions,
        }

        for score_name, score_value in scores.items():
            score_values.setdefault(score_name, []).append(score_value)

    summary = {}
    for score_name, fold_scores in score_values.items():
        summary[f'{score_name}_mean'] = np.mean(fold_scores)
        summary[f'{score_name}_std'] = np.std(fold_scores)
    cv_results['summary'] = summary

    # The format string previously had no argument, so '%s' was logged literally.
    logging.info('Cross-validation results %s', summary)
    return cv_results


# Parameters

In [None]:
# Settings for the whole pipeline. Besides the keys read explicitly by the
# functions above, the entire dict is converted to a Namespace in train_model
# and passed to Trainer.from_argparse_args and to LeafDoctorModel.
parameters = {
    "seed": 42,  # random_state for train_test_split and StratifiedKFold
    "validation_size": 0.15,  # test_size used by split_data
    # Not read by name in this notebook; presumably consumed by the Trainer
    # through the Namespace -- TODO confirm the meaning of -1 / 0 there.
    "gpus": -1,
    "data_loader_workers": 6,  # num_workers for every DataLoader
    "batch_size": 10,  # batch size for training, validation and prediction
    "max_epochs": 100,  # not read by name here; presumably a Trainer argument
    "max_steps": 0,  # not read by name here; presumably a Trainer argument
    "auto_lr_find": 0,  # not read by name here; presumably a Trainer argument
    "lr": 0.001,  # AdamW learning rate (LeafDoctorModel.configure_optimizers)
    "weight_decay": 0.0001,  # AdamW weight decay
    "early_stop_patience": 4,  # patience of the EarlyStopping callback on val_loss
    # Patience of ReduceLROnPlateau. The key is spelled "pleteau" and the model
    # looks it up under exactly this name, so do not "fix" it in one place only.
    "reduce_lr_on_pleteau_patience": 2,
    "save_top_k_checkpoints": 1,  # save_top_k of the ModelCheckpoint callback
    "checkpoints_dir": "data/06_models/checkpoints",  # dirpath of ModelCheckpoint
    "cv_splits": 3,  # n_splits of StratifiedKFold
    "cv_models_dir": "data/06_models/cv_folds"  # per-fold state dicts; must not exist before cross_validation runs
}

In [None]:

# Root of the competition data.
DATA_DIR = '/kaggle/input/cassava-leaf-disease-classification'

train_labels = pd.read_csv(f'{DATA_DIR}/train.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
# The label map is a JSON file, not a CSV; read it as a Series.
label_num_to_disease_map = pd.read_json(f'{DATA_DIR}/label_num_to_disease_map.json', typ='series')

# Datasets without transforms; the pipeline functions attach transforms later.
train_images_torch = CassavaDataset(image_ids=train_labels.image_id.values, labels=train_labels.label.values, root=f'{DATA_DIR}/train_images')
test_images_torch = CassavaDataset(image_ids=sample_submission.image_id.values, labels=sample_submission.label.values, root=f'{DATA_DIR}/test_images')

# Placeholder submission; replaced by the model predictions in the execution cells.
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')


# Execution

In [None]:
submission = predict_submission(cv_results, test_images_torch, sample_submission, parameters)

In [None]:
cv_results = cross_validation(train_images_torch, parameters)

In [None]:

# Show the cross-validation summary (the nested print also emitted a stray "None").
print(cv_results['summary'])

# Write the submission file.
submission.to_csv('submission.csv', index=False)
