In [1]:
import pandas as pd
import numpy as np
import os 

import matplotlib.pyplot as plt

import wandb

import cv2 as cv

from sklearn.model_selection import train_test_split, KFold

import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn
from torch.optim.lr_scheduler import StepLR
from torch.nn.functional import conv2d, relu, pad

from pytorch_lightning import metrics
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor, EarlyStopping

from transformers import AdamW

from tqdm.notebook import tqdm

import albumentations as A
from albumentations.pytorch import ToTensorV2

import random

In [2]:
! pip install knockknock --quiet

# Importation
from knockknock import telegram_sender 
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
token = user_secrets.get_secret("token")
chat_id = user_secrets.get_secret("chat_id")

@telegram_sender(token=token, chat_id=int(chat_id))
def send_msg(msg):
    print()
    return msg
send_msg('End')

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
caip-notebooks-serverextension 1.0.0 requires pyjwt>=2.0.0requests>=2.22.0, but you have pyjwt 1.7.1 which is incompatible.[0m



'End'

In [3]:
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()

# I have saved my API token with "wandb_api" as Label. 
# If you use some other Label make sure to change the same below. 
wandb_api = user_secrets.get_secret("wandb_api") 

wandb.login(key=wandb_api)

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
# seed_everything()

## Reading dataset

In [5]:
DATA_PATH='/kaggle/input/plant-pathology-2021-fgvc8/'

train_df=pd.read_csv(DATA_PATH+'train.csv').sample(frac=1.)

train_df['image_path']=train_df.image.apply(lambda x: DATA_PATH+'train_images/'+x)

## Label extraction

In [6]:
class AdaptLabels:
    
    def five_cats(self, X, N=-1):
        X = X.copy()
        labels=list(set([u for t in X.labels.unique() for u in t.split()]))
        labels.sort()
        
        for label in labels:
            X[label] = X.labels.apply(lambda x: 1*(label in x))

        X.drop(['image','labels'],axis=1,inplace=True)

        inds=[]
        for label in labels:
            inds.extend(X[X[label]==1].iloc[:N].index)
        inds=list(set(inds))

        X = X.iloc[inds].reset_index(drop=True)

        X, y = X.image_path.to_numpy(), X.drop(['image_path','healthy'],axis=1).to_numpy()
        
        return X, y
    
    def six_cats(self, X, N=-1):
        X = X.copy()
        labels=list(set([u for t in X.labels.unique() for u in t.split()]))
        labels.sort()
        
        for label in labels:
            X[label] = X.labels.apply(lambda x: 1*(label in x))

        X.drop(['image','labels'],axis=1,inplace=True)

        inds=[]
        for label in labels:
            inds.extend(X[X[label]==1].iloc[:N].index)
        inds=list(set(inds))

        X = X.iloc[inds].reset_index(drop=True)

        X, y = X.image_path.to_numpy(), X.drop('image_path',axis=1).to_numpy()
        
        return X, y
    
    def twelve_cats(self,X):
        X = X.copy()
        labels = list(X.labels.unique())
        labels.sort()
        
        encoder = {k:v for v,k in enumerate(labels)}
        
        X.labels = X.labels.apply(lambda x: encoder[x])
        
        X, y = X.image_path.to_numpy(), X.labels.to_numpy()
        
        return X, y
    
    def all_cats(self,X):
        X = X.copy()
        labels=list(set([u for t in X.labels.unique() for u in t.split()]))
        labels.remove('healthy')
        labels.sort()
        
        def get_indx(x):
            indx = 0
            for i,label in enumerate(labels):
                if label in x:
                    indx+=2**i
            return indx
        
        X.labels = X.labels.apply(get_indx)
        
        X, y = X.image_path.to_numpy(), X.labels.to_numpy()
        
        return X, y

        
        
adapter = AdaptLabels()
# X, y_five = adapter.five_cats(train_df)
X, y = adapter.six_cats(train_df)
# _, y_twelve = adapter.twelve_cats(train_df)
# _, y_thirty = adapter.all_cats(train_df)

## torch Dataset

In [7]:
class LeafDataset(Dataset):
    
    def __init__(self, X = None, y = None, begin = 0, end = -1, data = None):
        super().__init__()
                
        if data is not None:
            self.n = len(data)
            self.data = data
            return 
        
        self.transforms=A.Compose([
                A.Resize(428, 428),
                ToTensorV2()
        ])
                        

        self.data=[]
        with tqdm(total=X.shape[0]) as pbar:
            for x, label in zip(X,y):
                img=self.load_img(x)
                self.data.append([img,torch.tensor(label)])
                pbar.update(1)  
    
    def change_labels(self,y):
        for i in range(len(self.data)):
            self.data[i][1] = y[i]
    
    
    def load_img(self,x):
        img = cv.imread(x, cv.IMREAD_UNCHANGED)
        img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
        img = self.transforms(image = img)['image']
        return img
        
    def __len__(self):
        return len(self.data)
    
    def get_dataset(self, data):
        return LeafDataset(data=data)
    
    def slice(self, i):
        if isinstance(i, slice):
            indices = range(*i.indices(len(self.data)))
            t = [self.data[ii] for ii in indices]
        if isinstance(i, list):
            t = [self.data[ii] for ii in i]
        return self.get_dataset(t)
    
    def __getitem__(self,i):
        return self.data[i]
    
train_dataset = LeafDataset(X, y)
send_msg('End')

  0%|          | 0/18627 [00:00<?, ?it/s]




'End'

## PL  Datamodule

In [18]:
from torch.utils.data import random_split

class LeafDM(LightningDataModule):
    
    def __init__(self,train = None, val = None, data = None, y=None, batch_size=16):
        super().__init__()
                
        if data:
            self.data = data
            if y is not None:
                self.data.change_labels(y)
        else:
            self.train = train
            self.val = val
            if y is not None:
                self.train.change_labels(y)
                self.val.change_labels(y)
            
        self.batch_size = batch_size
        
    def setup(self, stage=None):
        if hasattr(self, 'data'):
            n = len(self.data)
            val, test = 512, 4 # int(n*.2), int(n*.2)
            train = n - (val + test)
            self.train, self.val, self.test = random_split(self.data, [train, val ,test] )
        else:
            n = len(self.train)
            test = int(n*.1)
            train = n - test
            self.train, self.test = random_split(self.train, [train, test] )
        
    def train_dataloader(self):
        return DataLoader(self.train, batch_size=self.batch_size, num_workers=2, prefetch_factor=4,
                         pin_memory=True, shuffle=True)
    
    def val_dataloader(self):
        return DataLoader(self.val, batch_size=self.batch_size, num_workers=2, prefetch_factor=4,
                         pin_memory=True)
    
    def test_dataloader(self):
        return DataLoader(self.test, batch_size=self.batch_size, num_workers=2, prefetch_factor=4,
                         pin_memory=True)




'End'

## Preprocessing

In [20]:
from torch import nn
from torchvision import transforms as T

class Preprocess(nn.Module):
    
    def __init__(self):
        # input should be a 256 by 256 tensor.
        super().__init__()
        
        self.augment = nn.Sequential(
            T.RandomCrop((400, 400)),
            T.RandomApply(nn.ModuleList([T.CenterCrop((385,385))]),p=.5),
            T.RandomHorizontalFlip(.5),
            T.RandomVerticalFlip(.5),
            T.RandomErasing(p=.2,scale=(0.02, 0.02)), 
            T.RandomApply(nn.ModuleList([T.RandomRotation(170)]),p=.2),
        )
        
        self.normalize = nn.Sequential(
            T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        )
        
    def forward(self,x):
        x = x / 255
        x = self.augment(x)
        return self.normalize(x)

## Model

In [21]:
! pip install efficientnet_pytorch --quiet
from efficientnet_pytorch import EfficientNet

class Model(LightningModule):
    
    def __init__(self, lr=5e-4, betas=(.8, .9),weight_decay=10.,drop1=.3, drop2=.1 ):
        super().__init__()
        
        self.save_hyperparameters()
        
        self.cost = nn.BCEWithLogitsLoss()
                        
        self.preprocess = Preprocess()
        
        def load_effnet(i,out):
            effnet = EfficientNet.from_pretrained('efficientnet-b'+str(i))
            n_features = effnet._fc.in_features
            effnet._dropout = nn.Dropout(self.hparams.drop1)
            effnet._fc = nn.Sequential(
                            nn.Linear(n_features, 1000),nn.ReLU(),
                            nn.Dropout(self.hparams.drop2),
                            nn.Linear(1000, out)
            )
            return effnet
        
        self.effnet = load_effnet(5,6)
        
        
    def forward(self,x):
        x = self.preprocess(x)
        x = self.effnet(x)
        return x
    
    def calc_cost(self, preds, labels):
        cost = self.cost(preds, labels)
        return cost
                
    def configure_optimizers(self):
        opt = AdamW(self.parameters(), lr = self.hparams.lr,
                  betas = self.hparams.betas, weight_decay = self.hparams.weight_decay)
        lr_scheduler = StepLR(opt, step_size=1, gamma=.5)

        return [opt], [lr_scheduler]
    
    def training_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self(imgs)
        cost = self.calc_cost(preds, labels.float())
        self.log('train_loss', cost, logger=True, on_epoch=True, prog_bar=True)

        return cost
    
    def validation_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self(imgs)
        cost = self.calc_cost(preds, labels.float())
        self.log('val_loss', cost, prog_bar=True)

        return cost
    
    def test_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self(imgs)
        cost = self.calc_cost(preds, labels.float())
        self.log('test_loss', cost, prog_bar=True, logger=True)

## Training

In [22]:
dm = LeafDM(data=train_dataset, batch_size=8)
dm.setup()

model = Model()

checkpoint_callback = ModelCheckpoint(
                                  monitor='val_loss',
                                  save_top_k = 10, mode = 'min', save_weights_only=True,
                                  filename = 'Model-{val_loss:.2f}',
                                  dirpath = '/kaggle/working/Models',
                                  save_last = False
                                 )

lr_monitor = LearningRateMonitor(logging_interval='epoch')

wandb_logger = WandbLogger(project='PP21', entity='aita3ssis')

trainer = Trainer(
                logger = wandb_logger,
                callbacks = [checkpoint_callback, lr_monitor],
                gpus = -1, max_epochs = 12,
                accumulate_grad_batches = 64,
                check_val_every_n_epoch = 1,
#                 limit_train_batches=512,
#                 limit_val_batches = 512,
)
trainer.fit(model, dm)
trainer.test(ckpt_path = checkpoint_callback.best_model_path)


wandb.finish()
send_msg('End')

Loaded pretrained weights for efficientnet-b5


[34m[1mwandb[0m: wandb version 0.10.30 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Exception in thread Thread-55:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 113, in get
    return _ForkingPickler.loads(res)
  File "/opt/conda/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
    fd = df.detach()
  File "/opt/conda/lib/python3.7/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/opt/conda/lib/python3.7/multiprocessing/resource_sharer.py", line 87, in get_connection
    c = Client(address, authkey=process.current_process().a

Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.041360363364219666}
--------------------------------------------------------------------------------


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
lr-AdamW,0.0
trainer/global_step,329.0
_runtime,9223.0
_timestamp,1621206362.0
_step,34.0
train_loss_epoch,0.08343
epoch,9.0
val_loss,0.06469
train_loss_step,0.0238
test_loss,0.04136


0,1
lr-AdamW,█▄▃▂▁▁▁▁▁▁
trainer/global_step,▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇████
_runtime,▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇████
_timestamp,▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇████
_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train_loss_epoch,█▃▂▁▁▁▁▁▁
epoch,▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▆▆▆▆▆▇▇▇█
val_loss,█▃▂▂▂▁▁▁▁
train_loss_step,█▇▂▇▁▃
test_loss,▁





'End'

In [None]:
wandb.finish()

## Cross validation

In [None]:
# yss = [ y_five, y_six, y_twelve, y_thirty]
# outs = [5, 6, 12, 32]
# costs = [ nn.BCEWithLogitsLoss, nn.BCEWithLogitsLoss, nn.CrossEntropyLoss, nn.CrossEntropyLoss ]

# kfold = KFold(n_splits = len(yss), shuffle = False)

# for i, (train_inds, test_inds) in enumerate(kfold.split(train_dataset)):
    
#     train = train_dataset.slice(train_inds.tolist())
#     val = train_dataset.slice(test_inds.tolist())
    
#     dm = LeafDM(train = train, val = val, y = yss[i])    
    
#     model = Model(out=outs[i], cost=costs[i])
    
#     checkpoint_callback = ModelCheckpoint(
#                                       monitor='val_loss',
#                                       save_top_k=1, mode='min', save_weights_only=True,
#                                       filename=str(i)+'-{val_loss:.2f}',
#                                       dirpath='/kaggle/working/Modelssss',
#                                       save_last=False
#                                      )
#     lr_monitor = LearningRateMonitor(logging_interval='epoch')

#     wandb_logger = WandbLogger(project='PP21', entity='aita3ssis')

#     trainer = Trainer(
#                     logger = wandb_logger,
#                     callbacks = [checkpoint_callback, lr_monitor],
#                     gpus = -1, max_epochs = 10,
#                     accumulate_grad_batches = 32,
#                     check_val_every_n_epoch = 1,
# #                     limit_train_batches=51,
# #                     limit_val_batches = 51,
# #                     log_every_n_steps=40,
# #                     flush_logs_every_n_steps=40,
#     )
#     trainer.fit(model, datamodule=dm)
#     trainer.test(ckpt_path = checkpoint_callback.best_model_path)
    

#     a = '009e1e49ab176827fce64df70c2ccafe103866fc'
#     wandb.finish()

In [None]:
# wandb.finish()

In [None]:
# model = Model.load_from_checkpoint(checkpoint_path='./Modelssss/2-val_loss=0.11.ckpt')

## Postprocessing

In [None]:
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score

class Postprocess:
    
    def __init__(self):
        pass
    
    def get_predictions(self,model, dataset):
        loader = DataLoader(dataset, batch_size=8, shuffle=False)
        
        device='cuda' if torch.cuda.is_available() else 'cpu'
        
        model.to(device)
        model.eval()
        
        # Generate predictions
        preds = []
        truth=[]
        with tqdm(total=len(loader)) as pbar:
            for i,(x, label) in enumerate(loader):
                y_hat=model(x.to(device)).sigmoid().detach().cpu()
                preds.extend(list(y_hat.numpy()))
                truth.extend(list(label.numpy()))
                pbar.update(1)
                
        preds=np.vstack(preds)
        truth=np.vstack(truth)
        
        return truth, preds
    
    def find_thresholds(self,truth, preds, N=10):        
        # Find optimal threshold
        thresholds = np.random.rand(1,truth.shape[1])
        best = thresholds
        best_score = 0
        with tqdm(total=N) as pbar:
            for i in range(N):
                tmp_preds = (preds >= thresholds)*1
                score = f1_score(truth, tmp_preds, average='macro')
                if best_score < score:
                    best_score = score
                    best = thresholds
                thresholds = np.random.rand(1,truth.shape[1])
                pbar.update(1)
                pbar.set_postfix({'best_score':best_score})
            
        labels = (preds >= best)*1
        
        return labels, best, best_score
    
postprocess = Postprocess()

In [24]:
checkpoint_callback.best_model_path

'/kaggle/working/Models/Model-val_loss=0.06.ckpt'

In [25]:
model = Model.load_from_checkpoint(checkpoint_path = checkpoint_callback.best_model_path)
truth, preds = postprocess.get_predictions(model, train_dataset[2::10])
labels, threshold, score = postprocess.find_thresholds(truth, preds, N=2000)
print(threshold)

Loaded pretrained weights for efficientnet-b5


  0%|          | 0/233 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

[[0.43978631 0.44070179 0.35533386 0.73467215 0.68395072 0.3390692 ]]


In [26]:
labels, threshold, score = postprocess.find_thresholds(truth, preds, N=2000)
print(threshold)

  0%|          | 0/2000 [00:00<?, ?it/s]

[[0.51392244 0.49471279 0.16051167 0.68841642 0.34478749 0.52866761]]


In [27]:
torch.save(model.state_dict(), 'credible_model.pth')

## Evaluation

In [28]:
from sklearn.metrics import f1_score, multilabel_confusion_matrix, accuracy_score, ConfusionMatrixDisplay

class Evaluation:
    
    def f1_score(self, pred, truth):
        return f1_score(truth,pred,average='macro')
    
    def accuracy(self, pred, truth):
        return accuracy_score(truth,pred)
    
    def confusion_matrix(self, pred, truth):
        matrices = multilabel_confusion_matrix(truth, pred)
        
        return matrices

evaluate = Evaluation()

In [29]:
evaluate.confusion_matrix(labels, truth)

array([[[1625,   36],
        [  52,  150]],

       [[1417,   35],
        [  32,  379]],

       [[1345,    9],
        [   3,  506]],

       [[1736,    6],
        [   3,  118]],

       [[1635,   24],
        [   3,  201]],

       [[1282,   25],
        [  60,  496]]])

## Visualization
> * Pick some images and check their predictions
> * Visualize some submodule's output map
> * Check convolutions used in the first Conv block.

In [41]:
for count, filename in enumerate(os.listdir("Models")):
    src = 'Models/'+filename
    dst = src.replace('a','.')
          
        # rename() function will
        # rename all the files
    os.rename(src, dst)

In [42]:
! ls Models

0.06.ckpt     0.06bv2.ckpt  0.07bv1.ckpt  0.07bv3.ckpt	0.08.ckpt  Model.ckpt
0.06bv1.ckpt  0.07.ckpt     0.07bv2.ckpt  0.07bv4.ckpt	0.11.ckpt
