In [1]:
from google.colab import drive
drive.mount("/content/drive")
!mkdir -p drive/MyDrive/bayer

Mounted at /content/drive


In [2]:
!unzip drive/MyDrive/bfg-dataset.zip -d data/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: data/tomato/yellow_leaf_curl_virus/e3332e6d-d444-4f60-9e7c-571ffaea6c82___UF.GRC_YLCV_Lab 01989.JPG  
  inflating: data/tomato/yellow_leaf_curl_virus/e346fbf7-5888-423a-b2c6-ff67dd1e3b3f___YLCV_GCREC 2166.JPG  
  inflating: data/tomato/yellow_leaf_curl_virus/e3479b46-d8bb-4c56-ad22-3364f19761cb___YLCV_NREC 2850.JPG  
  inflating: data/tomato/yellow_leaf_curl_virus/e34c6884-cf13-49cb-aac4-06d77e4c1155___YLCV_GCREC 2350.JPG  
  inflating: data/tomato/yellow_leaf_curl_virus/e35801a0-5554-4e50-8e86-4a9f599411cb___YLCV_NREC 0265.JPG  
  inflating: data/tomato/yellow_leaf_curl_virus/e3650c26-1c04-4ed1-bd66-a77da126216f___UF.GRC_YLCV_Lab 01344.JPG  
  inflating: data/tomato/yellow_leaf_curl_virus/e37881e6-7e58-4e4d-b07e-0b90102d5fdc___YLCV_GCREC 5188.JPG  
  inflating: data/tomato/yellow_leaf_curl_virus/e37a58dc-6b69-4955-8dda-1561f8b0db77___UF.GRC_YLCV_Lab 02025.JPG  
  inflating: data/tomato/yellow_leaf_curl_virus

In [3]:
!pip install https://github.com/ufoym/imbalanced-dataset-sampler/archive/master.zip

Collecting https://github.com/ufoym/imbalanced-dataset-sampler/archive/master.zip
[?25l  Downloading https://github.com/ufoym/imbalanced-dataset-sampler/archive/master.zip
[K     \ 348kB 809kB/s
Building wheels for collected packages: torchsampler
  Building wheel for torchsampler (setup.py) ... [?25l[?25hdone
  Created wheel for torchsampler: filename=torchsampler-0.1.1-cp37-none-any.whl size=3801 sha256=ab711249fa9197abae8b7fc2d957dc4b577152efa97ec2fdbc503a21ec91a072
  Stored in directory: /tmp/pip-ephem-wheel-cache-51zuusd3/wheels/c1/f1/dc/587588aa0a5f0dc76673d98a554ff4db2575aaca3984240ec1
Successfully built torchsampler
Installing collected packages: torchsampler
Successfully installed torchsampler-0.1.1


In [4]:
!pip install efficientnet_pytorch

Collecting efficientnet_pytorch
  Downloading https://files.pythonhosted.org/packages/2e/a0/dd40b50aebf0028054b6b35062948da01123d7be38d08b6b1e5435df6363/efficientnet_pytorch-0.7.1.tar.gz
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.7.1-cp37-none-any.whl size=16443 sha256=af87868d8a405842ff8cbda16ddcc754b5295ec342cda587b6b2801ce175660c
  Stored in directory: /root/.cache/pip/wheels/84/27/aa/c46d23c4e8cc72d41283862b1437e0b3ad318417e8ed7d5921
Successfully built efficientnet-pytorch
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.7.1


In [5]:
!pip install --upgrade albumentations

Collecting albumentations
[?25l  Downloading https://files.pythonhosted.org/packages/03/58/63fb1d742dc42d9ba2800ea741de1f2bc6bb05548d8724aa84794042eaf2/albumentations-0.5.2-py3-none-any.whl (72kB)
[K     |████▌                           | 10kB 22.2MB/s eta 0:00:01[K     |█████████                       | 20kB 16.0MB/s eta 0:00:01[K     |█████████████▋                  | 30kB 14.1MB/s eta 0:00:01[K     |██████████████████▏             | 40kB 13.2MB/s eta 0:00:01[K     |██████████████████████▊         | 51kB 7.6MB/s eta 0:00:01[K     |███████████████████████████▏    | 61kB 8.9MB/s eta 0:00:01[K     |███████████████████████████████▊| 71kB 8.4MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 5.6MB/s 
Collecting opencv-python-headless>=4.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/c8/84/72ec52fbac4775c2a5bf0ee5573c922a0cac35eb841907edf56493a5e313/opencv_python_headless-4.5.2.52-cp37-cp37m-manylinux2014_x86_64.whl (38.2MB)
[K     |██████████

In [1]:
import os
import random

import albumentations as A
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr
from efficientnet_pytorch import EfficientNet
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader, Dataset
from torchsampler import ImbalancedDatasetSampler
from tqdm.notebook import tqdm

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

TRAIN_TRANSFORM = A.Compose([
    A.Flip(p=0.5),
    A.RandomRotate90(p=0.5),
    A.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.3, hue=0, p=0.5),
    A.Blur(blur_limit=4, p=0.5),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

TEST_TRANSFORM = A.Compose([
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

TARGETS = ["bacterial_spot", "black_measles", "black_mold", "black_rot", "black_spot", "blast", "blight", "brown_spot", "canker", "dot", "early_blight", "gray_spot", "greening", "healthy", "late_blight", "leaf_mold", "leaf_scorch", "melanose", "miner", "mosaic_virus", "mummification", "powdery_mildew", "rust", "scab", "scald", "septoria_leaf_spot", "spot", "target_spot", "tungro", "two_spotted_spider_mite", "virus", "yellow_leaf_curl_virus"]

In [4]:
class MyDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        filepath = "data/" + self.df.loc[idx, "img_id"].replace("\\", "/")

        img = cv2.imread(filepath)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            img = self.transform(image=img)["image"]

        return torch.tensor(img.transpose((2, 0, 1))), torch.tensor(self.df.loc[idx, TARGETS])

    def get_label(self, idx):
        return self.df.loc[idx, "condition"]

In [5]:
def get_lr(optimizer):
    for p in optimizer.param_groups:
        return p["lr"]

In [6]:
# Source: https://github.com/clovaai/CutMix-PyTorch
def rand_bbox(size, lambd):
    W = size[2]
    H = size[3]

    cut_rat = np.sqrt(1 - lambd)
    cut_w = np.int(W * cut_rat)
    cut_h = np.int(H * cut_rat)

    cx = np.random.randint(W)
    cy = np.random.randint(H)

    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)

    return bbx1, bby1, bbx2, bby2


def cutmix(x, y, model, criterion):
    lambd = np.random.beta(1.0, 1.0)

    rand_index = torch.randperm(x.size()[0]).to(DEVICE)

    y_a = y
    y_b = y[rand_index]

    bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), lambd)
    x[:, :, bbx1:bbx2, bby1:bby2] = x[rand_index, :, bbx1:bbx2, bby1:bby2]
    lambd = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (x.size()[-1] * x.size()[-2]))

    y_pred = model(x)

    return criterion(y_pred, y_a) * lambd + criterion(y_pred, y_b) * (1 - lambd)

In [7]:
def main(args):
    seed_everything(args["seed"])

    df = pd.read_csv("data/data_additional2.csv")
    df = df[df["diseased"] == 0] # Removes examples with unknown disease

    dataset = {
        "train": MyDataset(df[df["split"] == "train"], transform=TRAIN_TRANSFORM),
        "dev": MyDataset(df[df["split"] == "dev"], transform=TEST_TRANSFORM)
    }

    # Source: https://github.com/ufoym/imbalanced-dataset-sampler
    dataloader = {
        "train": DataLoader(dataset["train"], sampler=ImbalancedDatasetSampler(dataset["train"]), batch_size=args["batch_size"], num_workers=2, pin_memory=True),
        "dev": DataLoader(dataset["dev"], batch_size=args["batch_size"], num_workers=2, pin_memory=True)
    }

    model = EfficientNet.from_pretrained("efficientnet-b5", num_classes=1).to(DEVICE)
    model.load_state_dict(torch.load("drive/MyDrive/b5-best.pth", map_location=DEVICE))
    model._fc = nn.Linear(in_features=2048, out_features=len(TARGETS), bias=True).to(DEVICE)

    optimizer = optim.AdamW(model.parameters(), lr=args["lr"], weight_decay=args["weight_decay"])
    scheduler = lr.CosineAnnealingLR(optimizer, T_max=args["epochs"] + 1, eta_min=args["min_lr"])
    pos_weight = torch.tensor([10.123738421125397, 57.176428054953, 1017.4556962025316, 43.6740699611327, 3830.3333333333335, 52.14266842800529, 20.23462655054104, 46.636471284783894, 518.083870967742, 1057.657894736842, 27.02438174851968, 956.8333333333334, 13.570445490764216, 3.2321813686812897, 15.34328661385334, 45.941656942823805, 71.55004508566276, 6188.076923076923, 312.0661478599222, 100.33249370277078, 968.3734939759037, 25.431668856767413, 43.159165751920966, 116.45693430656934, 1086.2702702702702, 24.23776662484316, 144.49367088607596, 30.826740506329113, 57.134393063583815, 25.668213457076565, 278.36805555555554, 7.248718474472011]).to(DEVICE)
    criterion = nn.BCEWithLogitsLoss(pos_weight)

    log = pd.DataFrame(columns=["epoch", "train loss", "dev loss", "learning rate"])

    pbar_epoch = tqdm(range(args["epochs"]))

    best = {
        "dev": {
            "epoch": 0,
            "loss": np.inf,
        }
    }

    for epoch in pbar_epoch:
        stats = {
            "train": {
                "loss": 0,
            },
            "dev": {
                "loss": 0,
            }
        }

        model.train()

        for x, y in tqdm(dataloader["train"], desc="train", leave=False):
            x = x.to(DEVICE)
            y = y.to(DEVICE, torch.float)

            if np.random.rand() > 0.5:
                loss = cutmix(x, y, model, criterion)
            else:
                loss = criterion(model(x), y)

            stats["train"]["loss"] += loss.item() / len(dataloader["train"])

            loss.backward()
            clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()

        model.eval()

        with torch.no_grad():
            for x, y in tqdm(dataloader["dev"], desc="dev", leave=False):
                x = x.to(DEVICE)
                y = y.to(DEVICE, torch.float)

                loss = criterion(model(x), y)
                stats["dev"]["loss"] += loss.item() / len(dataloader["dev"])

        if stats["dev"]["loss"] < best["dev"]["loss"]:
            best["dev"]["epoch"] = epoch
            best["dev"]["loss"] = stats["dev"]["loss"]

            torch.save(model.state_dict(), "drive/MyDrive/bayer/best.pth")

        pbar_epoch.set_description("best: {:.4f} curr: {:.4f} lr: {}".format(best["dev"]["loss"], stats["dev"]["loss"], get_lr(optimizer)))

        log = log.append({
            "epoch": epoch,
            "train loss": stats["train"]["loss"],
            "dev loss": stats["dev"]["loss"],
            "learning rate": get_lr(optimizer)
        }, ignore_index=True)

        log.to_csv("drive/MyDrive/bayer/log.csv", index=False)

        scheduler.step()

        torch.save({
            "epoch": epoch,
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
        }, "drive/MyDrive/bayer/checkpoint.pth")

In [8]:
if __name__ == "__main__":
    main({
        "seed": 42,
        "epochs": 10,
        "batch_size": 32,
        "lr": 1e-3,
        "min_lr": 1e-5,
        "weight_decay": 1e-5,
    })

Loaded pretrained weights for efficientnet-b5


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='train', max=1509.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='dev', max=503.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='train', max=1509.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='dev', max=503.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='train', max=1509.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='dev', max=503.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='train', max=1509.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='dev', max=503.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='train', max=1509.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='dev', max=503.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='train', max=1509.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='dev', max=503.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='train', max=1509.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='dev', max=503.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='train', max=1509.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='dev', max=503.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='train', max=1509.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='dev', max=503.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='train', max=1509.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='dev', max=503.0, style=ProgressStyle(description_width='i…


