In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from torch import nn
import torch
from tqdm import tqdm
import pandas as pd
import logging
import numpy as np
from torchvision import transforms

In [None]:
# file transforms.py


import albumentations as A
from albumentations.pytorch import ToTensorV2
from torchvision import transforms


def get_train_transforms():
    """Build the albumentations pipeline used for training images.

    The steps run in this order: float cast, resize to 256x256, random
    resized crop to 227x227, random horizontal flip, random vertical flip,
    per-channel normalization and finally ``ToTensorV2``.

    Returns:
        The composed pipeline; the datasets in this notebook call it as
        ``pipeline(image=img)['image']``.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = [
        A.ToFloat(max_value=1.0),
        A.Resize(256, 256),
        # Random crop covering 40-100% of the area, aspect ratio 3:4 to 4:3.
        A.RandomResizedCrop(227, 227, scale=(0.4, 1), ratio=(0.75, 1.33)),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.Normalize(mean=channel_mean, std=channel_std),
        ToTensorV2(),
    ]
    return A.Compose(steps)


def get_test_transforms():
    """Build the deterministic albumentations pipeline for validation/test images.

    The steps run in this order: float cast, resize to 256x256, centre crop
    to 227x227, per-channel normalization and finally ``ToTensorV2``.

    Returns:
        The composed pipeline; the datasets in this notebook call it as
        ``pipeline(image=img)['image']``.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = [
        A.ToFloat(max_value=1.0),
        A.Resize(256, 256),
        A.CenterCrop(227, 227),
        A.Normalize(mean=channel_mean, std=channel_std),
        ToTensorV2(),
    ]
    return A.Compose(steps)



In [None]:
# file utils.py

import numpy as np
import pandas as pd
import os
from skimage import io
from torch.utils.data import Dataset
import torch
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid


class Unnormalize:
    """Callable that reverses a per-channel ``(x - mean) / std`` normalization."""

    def __init__(self, mean, std):
        # One entry per channel; consumed in lockstep with the tensor's first dimension.
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        """Apply ``x * std + mean`` to each channel of ``tensor`` in place.

        Args:
            tensor (Tensor): Tensor image of size (C, H, W) to be un-normalized.
        Returns:
            Tensor: The same tensor object, modified in place.
        """
        for channel, mu, sigma in zip(tensor, self.mean, self.std):
            # Inverse of the normalize step, which is channel.sub_(mu).div_(sigma).
            channel.mul_(sigma)
            channel.add_(mu)
        return tensor


def plot_image(img, label=None, ax=None):
    """Draw a channel-first image on a matplotlib axis, optionally with a title.

    Args:
        img: Image in (C, H, W) layout; anything ``np.array`` can convert.
        label: Optional title. Integer class ids (Python or NumPy integers)
            are translated to the disease name; ids that are not in the map
            and non-integer labels are shown as given.
        ax: Axis to draw on. Defaults to the current axis (``plt.gca()``).
    """
    img = torch.Tensor(np.array(img))
    label_num_to_disease_map = {0: 'Cassava Bacterial Blight (CBB)',
                                1: 'Cassava Brown Streak Disease (CBSD)',
                                2: 'Cassava Green Mottle (CGM)',
                                3: 'Cassava Mosaic Disease (CMD)',
                                4: 'Healthy'}

    # Explicit None check so that any axis object passed in is always used.
    if ax is None:
        ax = plt.gca()
    # imshow expects (H, W, C). permute(1, 2, 0) moves the channels last
    # without swapping height and width; permute(2, 1, 0) would transpose
    # the picture.
    ax.imshow(img.permute(1, 2, 0))
    ax.axis('off')
    if label is not None:
        # Labels taken from a NumPy array are np.integer, not int.
        if isinstance(label, (int, np.integer)):
            # Fall back to the id itself rather than a misleading "0".
            label = label_num_to_disease_map.get(int(label), label)
        ax.set_title(f'{label}')


def plot_label_examples(dataset, targets, target_label):
    """Show up to six distinct random examples of one class in a 2x3 grid.

    Args:
        dataset: Indexable returning ``(image, label)`` pairs; each image is
            handed to ``plot_image``.
        targets: Array-like of labels, aligned with the indices of ``dataset``.
        target_label: The class to display.

    Raises:
        ValueError: If no entry of ``targets`` equals ``target_label``.
    """
    # np.asarray makes the element-wise comparison work for plain lists too.
    label_indices = np.where(np.asarray(targets) == target_label)[0]
    if len(label_indices) == 0:
        raise ValueError(f'No samples with label {target_label}')

    # Draw without replacement so the same image never appears twice, and
    # cap the sample size for classes with fewer than six examples.
    sample = np.random.choice(label_indices, min(6, len(label_indices)), replace=False)

    fig = plt.figure(figsize=(20, 10))

    grid = ImageGrid(fig, 111,  # similar to subplot(111)
                     nrows_ncols=(2, 3),  # creates 2x3 grid of axes
                     axes_pad=0.1,  # pad between axes in inch.
                     )

    for ax, idx in zip(grid, sample):
        img, label = dataset[idx]
        assert label == target_label
        plot_image(img, ax=ax)
    # Hide the frames of any grid cells left without an image.
    for ax in list(grid)[len(sample):]:
        ax.axis('off')
    plt.suptitle(f'Label {target_label}')
    plt.show()


class DatasetFromSubset(Dataset):
    """Wrap an indexable of ``(image, label)`` pairs and transform the images.

    The transform is called as ``transform(image=x)`` and its ``'image'``
    entry replaces the original image; labels pass through untouched.
    """

    def __init__(self, subset, transform=None):
        self.subset, self.transform = subset, transform

    def __len__(self):
        """Number of items in the wrapped subset."""
        return len(self.subset)

    def __getitem__(self, index):
        """Return ``(image, label)`` for ``index``, transformed if a transform is set."""
        image, target = self.subset[index]
        if not self.transform:
            return image, target
        transformed = self.transform(image=image)
        return transformed['image'], target


class CassavaDataset(Dataset):
    """Dataset that reads images from ``root`` by file name and pairs them with labels."""

    def __init__(self, root, image_ids, labels, transform=None):
        super().__init__()
        self.root = root
        self.image_ids = image_ids
        self.transform = transform
        # `targets` is a second name for the very same labels object.
        self.labels = self.targets = labels

    def __len__(self):
        """Number of images, taken from the length of ``image_ids``."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Load image ``idx`` from disk and return ``(image, label)``.

        If a transform is set it is called as ``transform(image=img)`` and
        its ``'image'`` entry is returned in place of the raw image.
        """
        target = self.labels[idx]
        image_path = os.path.join(self.root, self.image_ids[idx])
        image = io.imread(image_path)

        if self.transform:
            image = self.transform(image=image)['image']

        return image, target


In [None]:
# file models/resnet50.py (despite the file name, the trunk built below is torchvision's resnet18)

import torch
from torch import nn
import torchvision.models as models


class ResnetModel(nn.Module):
    """Pretrained torchvision ResNet-18 whose final layer is replaced by a 5-way linear head."""

    def __init__(self):
        super().__init__()
        trunk = models.resnet18(pretrained=True)
        head = nn.Linear(trunk.fc.in_features, 5)

        self.trunk = trunk
        self.trunk.fc = head
        # Alias of trunk.fc (the same module object), kept for direct access to the head.
        self.head = self.trunk.fc

    def forward(self, x):
        """Return the raw class logits for the batch ``x``."""
        # Call the module itself rather than .forward() so registered hooks run.
        return self.trunk(x)

    def predict(self, x):
        """Return class probabilities: softmax over dimension 1 of the logits."""
        logits = self(x)
        probabilities = nn.functional.softmax(logits, dim=1)
        return probabilities

    def predict_label(self, x):
        """Return, for each row of ``x``, the index of the most probable class."""
        # Only integer indices leave this method, so no autograd graph is
        # needed; skipping it saves memory during inference.
        with torch.no_grad():
            return torch.max(self.predict(x), 1)[1]


In [None]:
#Pipeline train

def split_data(train_labels, parameters):
    """Split the training rows into a train part and a validation part.

    Row positions ``0 .. len(train_labels) - 1`` are split, stratified on
    ``train_labels.label``.

    Args:
        train_labels: Table with a ``label`` attribute, one row per image.
        parameters: Dict read for the keys ``seed`` and ``validation_size``.

    Returns:
        tuple: ``(train_indices, val_indices)``.
    """
    row_positions = range(len(train_labels))
    split_options = {
        'stratify': train_labels.label,
        'random_state': parameters['seed'],
        'test_size': parameters['validation_size'],
    }
    train_indices, val_indices = train_test_split(row_positions, **split_options)
    return train_indices, val_indices


def train_model(train_images_torch, train_indices, val_indices, parameters):
    """Fine-tune a ``ResnetModel`` with early stopping and return it with its loss history.

    Args:
        train_images_torch: Dataset yielding ``(image, label)`` pairs. It is
            wrapped once per split so that training and validation use
            different transforms.
        train_indices: Positions in ``train_images_torch`` used for training.
        val_indices: Positions in ``train_images_torch`` used for validation.
        parameters: Dict read for the keys ``batch_size``,
            ``data_loader_workers``, ``learning_rate``, ``weight_decay``,
            ``reduce_lr_on_pleteau_patience``, ``device``,
            ``early_stop_patience``, ``train_epochs`` and
            ``batches_per_epoch``.

    Returns:
        tuple: ``(model, metrics)``. ``metrics`` holds the per-batch losses
        (``train_losses``, ``validation_losses``), the per-epoch mean losses
        (``train_epoch_losses``, ``validation_epoch_losses``) and
        ``last_epoch``, the zero-based index of the last epoch that ran
        (``None`` when ``train_epochs`` is 0).

    Raises:
        ValueError: If ``parameters['batches_per_epoch']`` is smaller than 1.
    """
    batches_per_epoch = parameters['batches_per_epoch']
    if batches_per_epoch < 1:
        # Zero batches would leave the epoch means undefined.
        raise ValueError('batches_per_epoch must be at least 1')

    train_transform, val_transform = get_train_transforms(), get_test_transforms()

    train_dataset = DatasetFromSubset(torch.utils.data.Subset(train_images_torch, indices=train_indices),
                                      transform=train_transform)

    val_dataset = DatasetFromSubset(torch.utils.data.Subset(train_images_torch, indices=val_indices),
                                    transform=val_transform)

    train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                    batch_size=parameters['batch_size'],
                                                    num_workers=parameters['data_loader_workers'],
                                                    shuffle=True)

    val_data_loader = torch.utils.data.DataLoader(val_dataset, num_workers=parameters['data_loader_workers'], batch_size=parameters['batch_size'])

    model = ResnetModel()

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=parameters['learning_rate'], weight_decay=parameters['weight_decay'])
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=parameters['reduce_lr_on_pleteau_patience'], verbose=True)

    model = model.to(parameters['device'])
    criterion = criterion.to(parameters['device'])

    early_stop_patience = parameters['early_stop_patience']
    early_stop_counter = 0
    previous_min_val_loss = None

    # Per-batch losses, concatenated over all epochs.
    train_losses = []
    validation_losses = []

    # One mean loss per epoch.
    train_epoch_losses = []
    validation_epoch_losses = []

    # Stays None when the epoch loop never runs (train_epochs == 0).
    last_epoch = None

    logging.info('Training model')
    epoch_pbar = tqdm(range(parameters['train_epochs']))
    for epoch in epoch_pbar:
        last_epoch = epoch
        model.train()

        logging.debug("Epoch %d", epoch)
        epoch_train_losses = []
        epoch_val_losses = []

        pbar = tqdm(enumerate(train_data_loader),
                    total=min(len(train_data_loader), batches_per_epoch))
        for i, batch in pbar:
            # i is zero-based, so `>=` processes exactly batches_per_epoch batches.
            if i >= batches_per_epoch:
                break
            inputs, labels = batch
            inputs = inputs.to(parameters['device'])
            labels = labels.to(parameters['device'])

            optimizer.zero_grad()

            # Call the module (not .forward) so that registered hooks run.
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            epoch_train_losses.append(loss.item())
            pbar.set_postfix({'batch loss': round(loss.item(), 4)})

        model.eval()
        # No gradients are needed anywhere in the validation pass.
        with torch.no_grad():
            val_pbar = tqdm(enumerate(val_data_loader),
                            total=min(len(val_data_loader), batches_per_epoch))
            for i, batch in val_pbar:
                if i >= batches_per_epoch:
                    break
                inputs, labels = batch
                inputs = inputs.to(parameters['device'])
                labels = labels.to(parameters['device'])

                outputs = model(inputs)
                val_loss = criterion(outputs, labels)

                epoch_val_losses.append(val_loss.item())

        epoch_mean_val_loss = sum(epoch_val_losses)/len(epoch_val_losses)
        epoch_mean_train_loss = sum(epoch_train_losses)/len(epoch_train_losses)
        if previous_min_val_loss is None:
            # First epoch: it only establishes the baseline.
            previous_min_val_loss = epoch_mean_val_loss
        elif epoch_mean_val_loss < previous_min_val_loss:
            previous_min_val_loss = epoch_mean_val_loss
            early_stop_counter = 0
            logging.debug('New minimum val loss %f, early stopping reset', previous_min_val_loss)
        else:
            early_stop_counter += 1
            logging.debug('Early stop counter now %d', early_stop_counter)

        # NOTE(review): the scheduler watches the summed *training* loss while
        # early stopping watches the mean validation loss; confirm this is intended.
        lr_scheduler.step(sum(epoch_train_losses))

        train_epoch_losses.append(epoch_mean_train_loss)
        validation_epoch_losses.append(epoch_mean_val_loss)
        logging.info("Epoch mean train loss %f", epoch_mean_train_loss)
        logging.info("Epoch mean val loss %f", epoch_mean_val_loss)

        epoch_pbar.set_postfix({
            'train loss': epoch_mean_train_loss,
            'val loss': epoch_mean_val_loss,
        })

        train_losses += epoch_train_losses
        validation_losses += epoch_val_losses

        if early_stop_counter >= early_stop_patience:
            logging.info('Early stopped.')
            break

    logging.info('Training finished')

    metrics = {
        'train_losses': train_losses,
        'validation_losses': validation_losses,
        'train_epoch_losses': train_epoch_losses,
        'validation_epoch_losses': validation_epoch_losses,
        'last_epoch': last_epoch,
    }

    return model, metrics


def score_model(model, train_images_torch, indices, parameters):
    """Evaluate ``model`` on a subset of a labelled dataset.

    Args:
        model: Model exposing ``predict_label``; it is put in eval mode and
            moved to ``parameters['device']``.
        train_images_torch: Dataset yielding ``(image, label)`` pairs.
        indices: Positions in ``train_images_torch`` to score; the test-time
            transforms are applied to them.
        parameters: Dict read for the keys ``device``,
            ``data_loader_workers`` and ``batch_size``.

    Returns:
        dict: ``accuracy``, ``confusion_matrix`` and the weighted
        ``f1_score``, all computed with the true labels as the reference.
    """
    logging.debug('Scoring model')

    device = parameters['device']

    dataset = DatasetFromSubset(torch.utils.data.Subset(train_images_torch, indices=indices),
                                transform=get_test_transforms())
    loader = torch.utils.data.DataLoader(dataset, num_workers=parameters['data_loader_workers'], batch_size=parameters['batch_size'])

    predictions = []
    true_labels = []
    model.eval()
    model = model.to(device)
    # Inference only: skip building autograd graphs to save memory.
    with torch.no_grad():
        for images, labels in tqdm(loader):
            batch_preds = model.predict_label(images.to(device))
            predictions += batch_preds.tolist()
            true_labels += labels.tolist()

    # scikit-learn metrics take (y_true, y_pred). With the arguments swapped
    # the confusion matrix is transposed and the weighted F1 is weighted by
    # predicted-class counts instead of true-class counts.
    return {
        'accuracy': accuracy_score(true_labels, predictions),
        'confusion_matrix': confusion_matrix(true_labels, predictions),
        'f1_score': f1_score(true_labels, predictions, average='weighted'),
    }


In [None]:
#Pipeline predict

def predict(model, test_images_torch, sample_submission, parameters):
    """Predict a label for every test image and store it in the submission table.

    Args:
        model: Model exposing ``predict_label``; it is put in eval mode and
            moved to ``parameters['device']``.
        test_images_torch: Dataset of test images. Its ``transform``
            attribute is overwritten with the test-time transforms.
        sample_submission: Table whose ``label`` column receives the
            predictions; it is modified in place.
        parameters: Dict read for the keys ``device`` and ``batch_size``.

    Returns:
        The same ``sample_submission`` object with ``label`` filled in.
    """
    logging.debug('Predicting with model')

    device = parameters['device']

    test_images_torch.transform = get_test_transforms()
    loader = torch.utils.data.DataLoader(test_images_torch, batch_size=parameters['batch_size'])

    predictions = []
    model.eval()
    model = model.to(device)
    # Inference only: skip building autograd graphs to save memory.
    with torch.no_grad():
        # The labels that come with the test dataset are placeholders and are ignored.
        for images, _ in tqdm(loader):
            batch_preds = model.predict_label(images.to(device))
            predictions += batch_preds.tolist()

    # Item assignment always targets the column; attribute assignment would
    # silently create a plain attribute if the column were missing.
    sample_submission['label'] = predictions

    return sample_submission


In [None]:
# Run configuration read by split_data, train_model, score_model and predict.
parameters = {
    # Fall back to the CPU when no CUDA device is visible so the notebook still runs.
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "seed": 42,                   # random_state of the train/validation split
    "validation_size": 0.2,       # test_size of the train/validation split
    "data_loader_workers": 6,
    "batch_size": 10,
    "train_epochs": 20,           # upper bound; early stopping may end training sooner
    "learning_rate": 0.001,
    "weight_decay": 0.0001,
    "early_stop_patience": 4,     # epochs without a new best validation loss before stopping
    # The key spelling matches what train_model reads.
    "reduce_lr_on_pleteau_patience": 3,
    "batches_per_epoch": 999999999  # cap on batches per epoch; this value is effectively "no cap"
}

In [None]:
!cp ../input/resnet18/resnet18.pth /root/.cache/torch/hub/checkpoints/resnet18-5c106cde.pth

In [None]:


DATA_DIR = '../data/01_raw'

train_labels = pd.read_csv(f'{DATA_DIR}/train.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
# The label map is a JSON file, so parse it as JSON rather than as CSV.
# typ='series' assumes a flat {label id: disease name} object - TODO confirm against the file.
label_num_to_disease_map = pd.read_json(f'{DATA_DIR}/label_num_to_disease_map.json', typ='series')

# Datasets are created without a transform; the pipeline functions attach one later.
train_images_torch = CassavaDataset(image_ids=train_labels.image_id.values, labels=train_labels.label.values, root=f'{DATA_DIR}/train_images')
test_images_torch = CassavaDataset(image_ids=sample_submission.image_id.values, labels=sample_submission.label.values, root=f'{DATA_DIR}/test_images')

submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')



In [None]:
train_indices, val_indices = split_data(train_labels, parameters)
# Show the split sizes instead of echoing every index into the cell output.
len(train_indices), len(val_indices)

In [None]:
model, train_metrics = train_model(train_images_torch, train_indices, val_indices, parameters)
# Summarise the run as one row per epoch; echoing `model, train_metrics`
# would print the whole network and every per-batch loss.
pd.DataFrame({
    'train_loss': train_metrics['train_epoch_losses'],
    'val_loss': train_metrics['validation_epoch_losses'],
})

In [None]:
# Score the trained model on the validation indices.
val_scores = score_model(
    model=model,
    train_images_torch=train_images_torch,
    indices=val_indices,
    parameters=parameters,
)
val_scores

In [None]:
# Fill the sample submission with the model's predictions for the test images.
submission = predict(
    model=model,
    test_images_torch=test_images_torch,
    sample_submission=sample_submission,
    parameters=parameters,
)
submission

In [None]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)