### import library

In [10]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pytorch_lightning as pl

from tqdm.notebook import tqdm
from torch.optim import lr_scheduler
from torch.autograd import Variable
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
from torchmetrics.functional import accuracy
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning import seed_everything, LightningModule, Trainer
from pytorch_lightning.loggers import TensorBoardLogger

  from .autonotebook import tqdm as notebook_tqdm


### custom dataset

In [11]:
class AddGaussianNoise(object):
    def __init__(self, mean=0., std=1.):
        self.std = std
        self.mean = mean
        
    def __call__(self, tensor):
        return tensor + torch.randn(tensor.size()) * self.std + self.mean
    
    def __repr__(self):
        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)

In [12]:
# augmentation use transformer
dataset_transforms = {
    'train': transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((300,300)),
        transforms.RandomRotation(50,expand=True),  
        transforms.Resize((300,300)),
        transforms.RandomCrop((120,120)),
        transforms.RandomVerticalFlip(0.4),
        transforms.RandomHorizontalFlip(0.4),                     
        transforms.ColorJitter(brightness=0.1, contrast=0.2, saturation=0, hue=0), 
        transforms.ToTensor(),
        AddGaussianNoise(0.1, 0.08),
        transforms.RandomErasing(),
    ]),
    'valid': transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((300, 300)),
        transforms.ToTensor()])
}

In [13]:
class PetDataset(Dataset):
    def __init__(self, file_txt, root_dir, transform=None):
        with open(file_txt, 'r') as f:
            self.img_infos = f.readlines()
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.img_infos)

    def __getitem__(self, idx):
        img_info = self.img_infos[idx]
        img_info = img_info.split(' ')
        img_name = img_info[0]
        
        img_label = torch.zeros(37)
        img_label[int(img_info[1]) - 1] = 1.
        img_path = os.path.join(self.root_dir, img_name + '.jpg')
        img = cv2.imread(img_path)
        if self.transform:
            img = self.transform(img)

        return img, img_label

In [14]:
cd ..

/home/hoahoang


In [15]:
cd /home/hoahoang/training/data/oxford-iiit-pet

/home/hoahoang/training/data/oxford-iiit-pet


In [16]:
dataset_path = {
    'train': PetDataset(file_txt='./annotations/trainval.txt', root_dir='./images', transform=dataset_transforms['train']),
    'valid': PetDataset(file_txt='./annotations/test.txt', root_dir='./images', transform=dataset_transforms['valid']),
}

### build a model

In [17]:
class OurModel(pl.LightningModule):
    def __init__(self, learning_rate=1e-3, batch_size=16):
        super().__init__()
        self.base_model = models.resnet50(pretrained=True)
        self.base_model.fc = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.ReLU(),
            nn.Linear(2048, 37)
        )
        self.batch_size = batch_size
    
    def forward(self, x):
        x = self.base_model(x)
        return x

    def configure_optimizers(self):
        optimizer = optim.Adam([   
                {'params': list(model.parameters())[:-1], 'lr': 1e-4},
                {'params': list(model.parameters())[-1], 'lr': 5e-2}
                ])
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)
        return [optimizer], [scheduler]

    def configure_callbacks(self):
        early_stopping = EarlyStopping(monitor='train_loss', mode='min', patience=5, verbose=True, min_delta=0.001)
        checkpoint_callback = ModelCheckpoint(monitor='val_acc', mode='max', dirpath='./', save_top_k=1)
        return [early_stopping, checkpoint_callback]
    
    def train_dataloader(self):
        return DataLoader(dataset_path['train'], batch_size=self.batch_size, shuffle=True, num_workers=16)


    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.CrossEntropyLoss(reduction='mean')(y_hat, y)
        self.log('train_loss', loss, prog_bar=True, on_epoch=True)
        return {'loss': loss}

    def _shared_eval_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.CrossEntropyLoss(reduction='mean')(y_hat, y)
        pred = torch.softmax(y_hat, dim=1)
        y = y.int()
        acc = accuracy(pred, y)
        return loss, acc
    
    def val_dataloader(self):
        return DataLoader(dataset_path['valid'], batch_size=self.batch_size, shuffle=False, num_workers=16)
    
    def validation_step(self, batch, batch_idx):
        loss, acc = self._shared_eval_step(batch, batch_idx)
        metrics = {"val_acc": acc, "val_loss": loss}
        self.log_dict(metrics, prog_bar=True, on_epoch=True)
        return metrics
    
    # def validation_step(self, batch, batch_idx):
    #     x, y = batch
    #     loss = F.cross_entropy(self(x), y)
    #     return {'val_loss': loss, 'log': {'val_loss': loss}}
    
    def validation_epoch_end(self, outputs):
        val_loss_mean = sum([o['val_loss'] for o in outputs]) / len(outputs)
        # show val_acc in progress bar but only log val_loss
        results = {'progress_bar': {'val_loss': val_loss_mean.item()}, 'log': {'val_loss': val_loss_mean.item()},
                   'val_loss': val_loss_mean.item()}
        return results

    def load_state_dict(self, state_dict):
        self.base_model.load_state_dict(state_dict)

    def state_dict(self):
        return self.base_model.state_dict()

### trainer

In [18]:
logger = TensorBoardLogger("runs", name="resnet50")

In [53]:
# few functions of trainer: auto_lr_find, accumulate_grad_batches, limit_training_batches, num_sanity_val_steps

In [55]:
if __name__ == '__main__':
    early_stop_callback = EarlyStopping(monitor='val_loss', min_delta=0.00, patience=5, verbose=True)
    checkpoint_callback = ModelCheckpoint(monitor='val_acc', mode='max', dirpath='./', save_top_k=1)
    model = OurModel()
    trainer = Trainer(max_epochs=10, min_epochs=1, auto_lr_find=False, auto_scale_batch_size=False,logger=logger,
                      progress_bar_refresh_rate=10, callbacks=[early_stop_callback], checkpoint_callback=[checkpoint_callback],)
    trainer.tune(model)
    trainer.fit(model)
    save(model.state_dict(), 'Saved model')

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
The following callbacks returned in `LightningModule.configure_callbacks` will override existing callbacks passed to Trainer: EarlyStopping, ModelCheckpoint
Missing logger folder: runs/resnet50

  | Name       | Type   | Params
--------------------------------------
0 | base_model | ResNet | 23.6 M
--------------------------------------
23.6 M    Trainable params
0         Non-trainable params
23.6 M    Total params
94.335    Total estimated model params size (MB)


Epoch 0:  30%|███████████████████████████                                                              | 140/460 [01:18<03:00,  1.77it/s, loss=3.12, v_num=0, train_loss_step=3.280]

Corrupt JPEG data: premature end of data segment


Epoch 0:  37%|████████████████████████████████▉                                                        | 170/460 [01:34<02:42,  1.79it/s, loss=2.99, v_num=0, train_loss_step=2.710]

Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


Epoch 0:  50%|█████████████████████████████████████████████                                             | 230/460 [02:06<02:06,  1.82it/s, loss=2.8, v_num=0, train_loss_step=2.850]
Validation: 0it [00:00, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                              | 0/230 [00:01<?, ?it/s][A
Epoch 0:  52%|██████████████████████████████████████████████▉                                           | 240/460 [02:18<02:06,  1.74it/s, loss=2.8, v_num=0, train_loss_step=2.850][A
Epoch 0:  54%|████████████████████████████████████████████████▉                                         | 250/460 [02:27<02:04,  1.69it/s, loss=2.8, v_num=0, train_loss_step=2.850][A
Epoch 0:  57%|██████████████████████████████████████████████████▊                                       | 260/460 [02:36<02:00,  1.66it/s, loss=2.8, v_num=0, train_loss_step=2.850][A
Epoch 0:  59%|███████████████████████████████████

Metric train_loss improved. New best score: 3.241


Epoch 1:  20%|██████▊                            | 90/460 [06:27<26:34,  4.31s/it, loss=2.59, v_num=0, train_loss_step=2.940, val_acc=0.973, val_loss=2.850, train_loss_epoch=3.240]

Corrupt JPEG data: premature end of data segment


Epoch 1:  35%|███████████▊                      | 160/460 [07:02<13:12,  2.64s/it, loss=2.67, v_num=0, train_loss_step=2.660, val_acc=0.973, val_loss=2.850, train_loss_epoch=3.240]

Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


Epoch 1:  50%|█████████████████                 | 230/460 [07:37<07:37,  1.99s/it, loss=2.68, v_num=0, train_loss_step=2.600, val_acc=0.973, val_loss=2.850, train_loss_epoch=3.240]
Validation: 0it [00:00, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                              | 0/230 [00:01<?, ?it/s][A
Epoch 1:  52%|█████████████████▋                | 240/460 [07:49<07:10,  1.96s/it, loss=2.68, v_num=0, train_loss_step=2.600, val_acc=0.973, val_loss=2.850, train_loss_epoch=3.240][A
Epoch 1:  54%|██████████████████▍               | 250/460 [07:58<06:42,  1.92s/it, loss=2.68, v_num=0, train_loss_step=2.600, val_acc=0.973, val_loss=2.850, train_loss_epoch=3.240][A
Epoch 1:  57%|███████████████████▏              | 260/460 [08:08<06:15,  1.88s/it, loss=2.68, v_num=0, train_loss_step=2.600, val_acc=0.973, val_loss=2.850, train_loss_epoch=3.240][A
Epoch 1:  59%|███████████████████▉              |

Metric train_loss improved by 0.559 >= min_delta = 0.001. New best score: 2.682


Epoch 2:  11%|███▌                             | 50/460 [11:44<1:36:14, 14.08s/it, loss=2.56, v_num=0, train_loss_step=2.290, val_acc=0.973, val_loss=2.860, train_loss_epoch=2.680]

Corrupt JPEG data: premature end of data segment


Epoch 2:  26%|█████████▏                         | 120/460 [12:16<34:48,  6.14s/it, loss=2.5, v_num=0, train_loss_step=2.380, val_acc=0.973, val_loss=2.860, train_loss_epoch=2.680]

Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


Epoch 2:  50%|█████████████████                 | 230/460 [13:08<13:08,  3.43s/it, loss=2.39, v_num=0, train_loss_step=1.970, val_acc=0.973, val_loss=2.860, train_loss_epoch=2.680]
Validation: 0it [00:00, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                              | 0/230 [00:01<?, ?it/s][A
Epoch 2:  52%|█████████████████▋                | 240/460 [13:20<12:13,  3.33s/it, loss=2.39, v_num=0, train_loss_step=1.970, val_acc=0.973, val_loss=2.860, train_loss_epoch=2.680][A
Epoch 2:  54%|██████████████████▍               | 250/460 [13:31<11:21,  3.25s/it, loss=2.39, v_num=0, train_loss_step=1.970, val_acc=0.973, val_loss=2.860, train_loss_epoch=2.680][A
Epoch 2:  57%|███████████████████▏              | 260/460 [13:43<10:33,  3.17s/it, loss=2.39, v_num=0, train_loss_step=1.970, val_acc=0.973, val_loss=2.860, train_loss_epoch=2.680][A
Epoch 2:  59%|███████████████████▉              |

Metric train_loss improved by 0.190 >= min_delta = 0.001. New best score: 2.492


Epoch 3:   2%|▋                              | 10/460 [16:59<12:44:33, 101.94s/it, loss=2.35, v_num=0, train_loss_step=1.980, val_acc=0.973, val_loss=2.650, train_loss_epoch=2.490]

Corrupt JPEG data: premature end of data segment


Epoch 3:  30%|██████████▋                        | 140/460 [18:04<41:18,  7.75s/it, loss=2.3, v_num=0, train_loss_step=2.990, val_acc=0.973, val_loss=2.650, train_loss_epoch=2.490]

Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


Epoch 3:  50%|█████████████████                 | 230/460 [18:49<18:49,  4.91s/it, loss=2.22, v_num=0, train_loss_step=1.680, val_acc=0.973, val_loss=2.650, train_loss_epoch=2.490]
Validation: 0it [00:00, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                              | 0/230 [00:01<?, ?it/s][A
Epoch 3:  52%|█████████████████▋                | 240/460 [19:01<17:26,  4.76s/it, loss=2.22, v_num=0, train_loss_step=1.680, val_acc=0.973, val_loss=2.650, train_loss_epoch=2.490][A
Epoch 3:  54%|██████████████████▍               | 250/460 [19:11<16:06,  4.60s/it, loss=2.22, v_num=0, train_loss_step=1.680, val_acc=0.973, val_loss=2.650, train_loss_epoch=2.490][A
Epoch 3:  57%|███████████████████▏              | 260/460 [19:20<14:52,  4.46s/it, loss=2.22, v_num=0, train_loss_step=1.680, val_acc=0.973, val_loss=2.650, train_loss_epoch=2.490][A
Epoch 3:  59%|███████████████████▉              |

Metric train_loss improved by 0.164 >= min_delta = 0.001. New best score: 2.328


Epoch 4:  20%|██████▋                           | 90/460 [23:15<1:35:36, 15.50s/it, loss=2.2, v_num=0, train_loss_step=2.660, val_acc=0.973, val_loss=2.700, train_loss_epoch=2.330]

Corrupt JPEG data: premature end of data segment


Epoch 4:  28%|█████████▌                        | 130/460 [23:34<59:50, 10.88s/it, loss=2.21, v_num=0, train_loss_step=1.970, val_acc=0.973, val_loss=2.700, train_loss_epoch=2.330]

Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


Epoch 4:  50%|█████████████████                 | 230/460 [24:22<24:22,  6.36s/it, loss=2.23, v_num=0, train_loss_step=2.750, val_acc=0.973, val_loss=2.700, train_loss_epoch=2.330]
Validation: 0it [00:00, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                              | 0/230 [00:01<?, ?it/s][A
Epoch 4:  52%|█████████████████▋                | 240/460 [24:34<22:31,  6.14s/it, loss=2.23, v_num=0, train_loss_step=2.750, val_acc=0.973, val_loss=2.700, train_loss_epoch=2.330][A
Epoch 4:  54%|██████████████████▍               | 250/460 [24:43<20:46,  5.93s/it, loss=2.23, v_num=0, train_loss_step=2.750, val_acc=0.973, val_loss=2.700, train_loss_epoch=2.330][A
Epoch 4:  57%|███████████████████▏              | 260/460 [24:52<19:08,  5.74s/it, loss=2.23, v_num=0, train_loss_step=2.750, val_acc=0.973, val_loss=2.700, train_loss_epoch=2.330][A
Epoch 4:  59%|███████████████████▉              |

Metric train_loss improved by 0.102 >= min_delta = 0.001. New best score: 2.226


Epoch 5:   4%|█▍                              | 20/460 [28:02<10:16:46, 84.10s/it, loss=2.12, v_num=0, train_loss_step=2.270, val_acc=0.974, val_loss=2.440, train_loss_epoch=2.230]

Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


Epoch 5:  33%|██████████▊                      | 150/460 [29:03<1:00:03, 11.62s/it, loss=2.3, v_num=0, train_loss_step=2.550, val_acc=0.974, val_loss=2.440, train_loss_epoch=2.230]

Corrupt JPEG data: premature end of data segment


Epoch 5:  50%|█████████████████                 | 230/460 [29:41<29:41,  7.75s/it, loss=2.17, v_num=0, train_loss_step=2.000, val_acc=0.974, val_loss=2.440, train_loss_epoch=2.230]
Validation: 0it [00:00, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                              | 0/230 [00:01<?, ?it/s][A
Epoch 5:  52%|█████████████████▋                | 240/460 [29:53<27:24,  7.47s/it, loss=2.17, v_num=0, train_loss_step=2.000, val_acc=0.974, val_loss=2.440, train_loss_epoch=2.230][A
Epoch 5:  54%|██████████████████▍               | 250/460 [30:03<25:14,  7.21s/it, loss=2.17, v_num=0, train_loss_step=2.000, val_acc=0.974, val_loss=2.440, train_loss_epoch=2.230][A
Epoch 5:  57%|███████████████████▏              | 260/460 [30:12<23:14,  6.97s/it, loss=2.17, v_num=0, train_loss_step=2.000, val_acc=0.974, val_loss=2.440, train_loss_epoch=2.230][A
Epoch 5:  59%|███████████████████▉              |

Metric train_loss improved by 0.045 >= min_delta = 0.001. New best score: 2.181


Epoch 6:   4%|█▎                             | 20/460 [33:30<12:17:00, 100.50s/it, loss=2.19, v_num=0, train_loss_step=2.320, val_acc=0.974, val_loss=2.440, train_loss_epoch=2.180]

Corrupt JPEG data: premature end of data segment


Epoch 6:  26%|████████▎                       | 120/460 [34:17<1:37:08, 17.14s/it, loss=2.11, v_num=0, train_loss_step=2.740, val_acc=0.974, val_loss=2.440, train_loss_epoch=2.180]

Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


Epoch 6:  50%|█████████████████                 | 230/460 [35:08<35:08,  9.17s/it, loss=2.03, v_num=0, train_loss_step=2.490, val_acc=0.974, val_loss=2.440, train_loss_epoch=2.180]
Validation: 0it [00:00, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                              | 0/230 [00:01<?, ?it/s][A
Epoch 6:  52%|█████████████████▋                | 240/460 [35:20<32:24,  8.84s/it, loss=2.03, v_num=0, train_loss_step=2.490, val_acc=0.974, val_loss=2.440, train_loss_epoch=2.180][A
Epoch 6:  54%|██████████████████▍               | 250/460 [35:29<29:49,  8.52s/it, loss=2.03, v_num=0, train_loss_step=2.490, val_acc=0.974, val_loss=2.440, train_loss_epoch=2.180][A
Epoch 6:  57%|███████████████████▏              | 260/460 [35:38<27:25,  8.23s/it, loss=2.03, v_num=0, train_loss_step=2.490, val_acc=0.974, val_loss=2.440, train_loss_epoch=2.180][A
Epoch 6:  59%|███████████████████▉              |

Metric train_loss improved by 0.091 >= min_delta = 0.001. New best score: 2.090


Epoch 7:  33%|██████████▍                     | 150/460 [39:52<1:22:23, 15.95s/it, loss=2.23, v_num=0, train_loss_step=2.160, val_acc=0.974, val_loss=2.160, train_loss_epoch=2.090]

Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


Epoch 7:  37%|███████████▊                    | 170/460 [40:01<1:08:16, 14.13s/it, loss=1.95, v_num=0, train_loss_step=2.370, val_acc=0.974, val_loss=2.160, train_loss_epoch=2.090]

Corrupt JPEG data: premature end of data segment


Epoch 7:  50%|█████████████████▌                 | 230/460 [40:29<40:29, 10.56s/it, loss=1.9, v_num=0, train_loss_step=1.740, val_acc=0.974, val_loss=2.160, train_loss_epoch=2.090]
Validation: 0it [00:00, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                              | 0/230 [00:01<?, ?it/s][A
Epoch 7:  52%|██████████████████▎                | 240/460 [40:41<37:18, 10.17s/it, loss=1.9, v_num=0, train_loss_step=1.740, val_acc=0.974, val_loss=2.160, train_loss_epoch=2.090][A
Epoch 7:  54%|███████████████████                | 250/460 [40:51<34:18,  9.80s/it, loss=1.9, v_num=0, train_loss_step=1.740, val_acc=0.974, val_loss=2.160, train_loss_epoch=2.090][A
Epoch 7:  57%|███████████████████▊               | 260/460 [41:00<31:32,  9.46s/it, loss=1.9, v_num=0, train_loss_step=1.740, val_acc=0.974, val_loss=2.160, train_loss_epoch=2.090][A
Epoch 7:  59%|████████████████████▌              

Metric train_loss improved by 0.037 >= min_delta = 0.001. New best score: 2.053


Epoch 8:  26%|████████▎                       | 120/460 [45:08<2:07:53, 22.57s/it, loss=2.07, v_num=0, train_loss_step=2.400, val_acc=0.973, val_loss=2.740, train_loss_epoch=2.050]

Corrupt JPEG data: premature end of data segment


Epoch 8:  37%|███████████▊                    | 170/460 [45:31<1:17:39, 16.07s/it, loss=2.07, v_num=0, train_loss_step=2.330, val_acc=0.973, val_loss=2.740, train_loss_epoch=2.050]

Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


Epoch 8:  50%|█████████████████                 | 230/460 [45:59<45:59, 12.00s/it, loss=1.93, v_num=0, train_loss_step=2.000, val_acc=0.973, val_loss=2.740, train_loss_epoch=2.050]
Validation: 0it [00:00, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                              | 0/230 [00:01<?, ?it/s][A
Epoch 8:  52%|█████████████████▋                | 240/460 [46:11<42:20, 11.55s/it, loss=1.93, v_num=0, train_loss_step=2.000, val_acc=0.973, val_loss=2.740, train_loss_epoch=2.050][A
Epoch 8:  54%|██████████████████▍               | 250/460 [46:20<38:55, 11.12s/it, loss=1.93, v_num=0, train_loss_step=2.000, val_acc=0.973, val_loss=2.740, train_loss_epoch=2.050][A
Epoch 8:  57%|███████████████████▏              | 260/460 [46:30<35:46, 10.73s/it, loss=1.93, v_num=0, train_loss_step=2.000, val_acc=0.973, val_loss=2.740, train_loss_epoch=2.050][A
Epoch 8:  59%|███████████████████▉              |

Metric train_loss improved by 0.037 >= min_delta = 0.001. New best score: 2.017


Epoch 9:  11%|███▌                             | 50/460 [50:05<6:50:48, 60.12s/it, loss=1.94, v_num=0, train_loss_step=2.950, val_acc=0.974, val_loss=2.460, train_loss_epoch=2.020]

Corrupt JPEG data: premature end of data segment


Epoch 9:  33%|██████████▍                     | 150/460 [50:57<1:45:18, 20.38s/it, loss=2.07, v_num=0, train_loss_step=2.170, val_acc=0.974, val_loss=2.460, train_loss_epoch=2.020]

Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


Epoch 9:  50%|█████████████████                 | 230/460 [51:35<51:35, 13.46s/it, loss=2.04, v_num=0, train_loss_step=2.060, val_acc=0.974, val_loss=2.460, train_loss_epoch=2.020]
Validation: 0it [00:00, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                              | 0/230 [00:01<?, ?it/s][A
Epoch 9:  52%|█████████████████▋                | 240/460 [51:47<47:28, 12.95s/it, loss=2.04, v_num=0, train_loss_step=2.060, val_acc=0.974, val_loss=2.460, train_loss_epoch=2.020][A
Epoch 9:  54%|██████████████████▍               | 250/460 [51:55<43:37, 12.46s/it, loss=2.04, v_num=0, train_loss_step=2.060, val_acc=0.974, val_loss=2.460, train_loss_epoch=2.020][A
Epoch 9:  57%|███████████████████▏              | 260/460 [52:04<40:03, 12.02s/it, loss=2.04, v_num=0, train_loss_step=2.060, val_acc=0.974, val_loss=2.460, train_loss_epoch=2.020][A
Epoch 9:  59%|███████████████████▉              |

Metric train_loss improved by 0.031 >= min_delta = 0.001. New best score: 1.986


Epoch 9: 100%|██████████████████████████████████| 460/460 [54:57<00:00,  7.17s/it, loss=2.04, v_num=0, train_loss_step=2.060, val_acc=0.971, val_loss=3.130, train_loss_epoch=1.990]


NameError: name 'save' is not defined

In [57]:
trainer.validate()

  f"`.{fn}(ckpt_path=None)` was called without a model."
The following callbacks returned in `LightningModule.configure_callbacks` will override existing callbacks passed to Trainer: EarlyStopping, ModelCheckpoint
Restoring states from the checkpoint path at /home/hoahoang/training/data/oxford-iiit-pet/epoch=6-step=1610-v1.ckpt
Loaded model weights from checkpoint at /home/hoahoang/training/data/oxford-iiit-pet/epoch=6-step=1610-v1.ckpt


Validation DataLoader 0: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [03:27<00:00,  1.11it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         val_acc            0.9744840264320374
        val_loss             2.155937433242798
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_acc': 0.9744840264320374, 'val_loss': 2.155937433242798}]

### visualization

In [56]:
! tensorboard --logdir=runs --load_fast=false --bind_all

TensorFlow installation not found - running with reduced feature set.
TensorBoard 2.8.0 at http://labserver:6009/ (Press CTRL+C to quit)
^C
