In [9]:
!pip install timm
!pip install ttach

Collecting timm
  Downloading timm-0.6.7-py3-none-any.whl (509 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.0/510.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: timm
Successfully installed timm-0.6.7
[0mCollecting ttach
  Downloading ttach-0.0.3-py3-none-any.whl (9.8 kB)
Installing collected packages: ttach
Successfully installed ttach-0.0.3
[0m

In [10]:
import os
import cv2
import time
import math
import torch
import torch.nn as nn
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, StratifiedKFold
from torch.optim import Adam, AdamW
from torch.nn.parameter import Parameter
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from sklearn import metrics
import urllib
import pickle
import torch.nn.functional as F
import seaborn as sns
import random
import sys
import gc
import shutil
from tqdm.autonotebook import tqdm
import albumentations
from albumentations import pytorch as AT
from matplotlib import pyplot as plt
import scipy.special
sigmoid = lambda x: scipy.special.expit(x)
from scipy.special import softmax
from sklearn.preprocessing import LabelEncoder
#import torch.utils as tu 
import timm
import warnings
warnings.filterwarnings("ignore")
import ttach as tta

In [11]:
'''
从timm扒来的抽象代码，知道了怎么用，不过具体的还没有怎么看。作用是mixup和cutmix
'''
def one_hot(x, num_classes, on_value=1., off_value=0., device='cuda'):
    x = x.long().view(-1, 1)
    return torch.full((x.size()[0], num_classes), off_value, device=device).scatter_(1, x, on_value)


def mixup_target(target, num_classes, lam=1., smoothing=0.0, device='cuda'):
    off_value = smoothing / num_classes
    on_value = 1. - smoothing + off_value
    y1 = one_hot(target, num_classes, on_value=on_value, off_value=off_value, device=device)
    y2 = one_hot(target.flip(0), num_classes, on_value=on_value, off_value=off_value, device=device)
    return y1 * lam + y2 * (1. - lam)


def mixup_target_multi_binary(target, lam=1., smoothing=0.0, device='cuda'):
    target = target * (1. - smoothing) + smoothing / 2.
    y1 = target.to(device)
    y2 = target.flip(0).to(device)
    return y1 * lam + y2 * (1. - lam)


def rand_bbox(img_shape, lam, margin=0., count=None):
    """ Standard CutMix bounding-box
    Generates a random square bbox based on lambda value. This impl includes
    support for enforcing a border margin as percent of bbox dimensions.
    Args:
        img_shape (tuple): Image shape as tuple
        lam (float): Cutmix lambda value
        margin (float): Percentage of bbox dimension to enforce as margin (reduce amount of box outside image)
        count (int): Number of bbox to generate
    """
    ratio = np.sqrt(1 - lam)
    img_h, img_w = img_shape[-2:]
    cut_h, cut_w = int(img_h * ratio), int(img_w * ratio)
    margin_y, margin_x = int(margin * cut_h), int(margin * cut_w)
    cy = np.random.randint(0 + margin_y, img_h - margin_y, size=count)
    cx = np.random.randint(0 + margin_x, img_w - margin_x, size=count)
    yl = np.clip(cy - cut_h // 2, 0, img_h)
    yh = np.clip(cy + cut_h // 2, 0, img_h)
    xl = np.clip(cx - cut_w // 2, 0, img_w)
    xh = np.clip(cx + cut_w // 2, 0, img_w)
    return yl, yh, xl, xh


def rand_bbox_minmax(img_shape, minmax, count=None):
    """ Min-Max CutMix bounding-box
    Inspired by Darknet cutmix impl, generates a random rectangular bbox
    based on min/max percent values applied to each dimension of the input image.
    Typical defaults for minmax are usually in the  .2-.3 for min and .8-.9 range for max.
    Args:
        img_shape (tuple): Image shape as tuple
        minmax (tuple or list): Min and max bbox ratios (as percent of image size)
        count (int): Number of bbox to generate
    """
    assert len(minmax) == 2
    img_h, img_w = img_shape[-2:]
    cut_h = np.random.randint(int(img_h * minmax[0]), int(img_h * minmax[1]), size=count)
    cut_w = np.random.randint(int(img_w * minmax[0]), int(img_w * minmax[1]), size=count)
    yl = np.random.randint(0, img_h - cut_h, size=count)
    xl = np.random.randint(0, img_w - cut_w, size=count)
    yu = yl + cut_h
    xu = xl + cut_w
    return yl, yu, xl, xu


def cutmix_bbox_and_lam(img_shape, lam, ratio_minmax=None, correct_lam=True, count=None):
    """ Generate bbox and apply lambda correction.
    """
    if ratio_minmax is not None:
        yl, yu, xl, xu = rand_bbox_minmax(img_shape, ratio_minmax, count=count)
    else:
        yl, yu, xl, xu = rand_bbox(img_shape, lam, count=count)
    if correct_lam or ratio_minmax is not None:
        bbox_area = (yu - yl) * (xu - xl)
        lam = 1. - bbox_area / float(img_shape[-2] * img_shape[-1])
    return (yl, yu, xl, xu), lam


class Mixup:
    """ 
    Mixup/Cutmix that applies different params to each element or whole batch
    Args:
        mixup_alpha (float): mixup alpha value, mixup is active if > 0.
        cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0.
        cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None.
        prob (float): probability of applying mixup or cutmix per batch or element
        switch_prob (float): probability of switching to cutmix instead of mixup when both are active
        mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element)
        correct_lam (bool): apply lambda correction when cutmix bbox clipped by image borders
        onehot (bool): whether one hot dtype Long label input or float multi-hot or soft label
        label_smoothing (float): apply label smoothing to the mixed target tensor
        num_classes (int): number of classes for target
    Examples::
        >>> mixup, cutmix = 0.35, 0.15
        >>> prob = mixup + cutmix
        >>> switch_prob = cutmix / prob
        >>> mixup_fn = Mixup(prob=prob, switch_prob=switch_prob, onthot=False, label_smoothing=0.0)
        >>> for batch_idx, (input, target) in enumerate(loader):
        >>>     input, target = input.cuda(), target.cuda()
        >>>     input, target = mixup_fn(input, target)
    """

    def __init__(self, mixup_alpha=0.2, cutmix_alpha=1.0, cutmix_minmax=None, prob=0.2, switch_prob=0.3,
                 mode='elem', correct_lam=True, onehot=True, label_smoothing=0.0, num_classes=1000):
        self.mixup_alpha = mixup_alpha
        self.cutmix_alpha = cutmix_alpha
        self.cutmix_minmax = cutmix_minmax
        if self.cutmix_minmax is not None:
            assert len(self.cutmix_minmax) == 2
            # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe
            self.cutmix_alpha = 1.0
        self.mix_prob = prob
        self.switch_prob = switch_prob
        self.onehot = onehot
        self.label_smoothing = label_smoothing
        self.num_classes = num_classes
        self.mode = mode
        self.correct_lam = correct_lam  # correct lambda based on clipped area for cutmix
        self.mixup_enabled = True  # set to false to disable mixing (intended tp be set by train loop)

    def _params_per_elem(self, batch_size):
        lam = np.ones(batch_size, dtype=np.float32)
        use_cutmix = np.zeros(batch_size, dtype=np.bool)
        if self.mixup_enabled:
            if self.mixup_alpha > 0. and self.cutmix_alpha > 0.:
                use_cutmix = np.random.rand(batch_size) < self.switch_prob
                lam_mix = np.where(
                    use_cutmix,
                    np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size),
                    np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size))
            elif self.mixup_alpha > 0.:
                lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size)
            elif self.cutmix_alpha > 0.:
                use_cutmix = np.ones(batch_size, dtype=np.bool)
                lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size)
            else:
                assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true."
            lam = np.where(np.random.rand(batch_size) < self.mix_prob, lam_mix.astype(np.float32), lam)
        return lam, use_cutmix

    def _params_per_batch(self):
        lam = 1.
        use_cutmix = False
        if self.mixup_enabled and np.random.rand() < self.mix_prob:
            if self.mixup_alpha > 0. and self.cutmix_alpha > 0.:
                use_cutmix = np.random.rand() < self.switch_prob
                lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) if use_cutmix else \
                    np.random.beta(self.mixup_alpha, self.mixup_alpha)
            elif self.mixup_alpha > 0.:
                lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha)
            elif self.cutmix_alpha > 0.:
                use_cutmix = True
                lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha)
            else:
                assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true."
            lam = float(lam_mix)
        return lam, use_cutmix

    def _mix_elem(self, x):
        batch_size = len(x)
        lam_batch, use_cutmix = self._params_per_elem(batch_size)
        x_orig = x.clone()  # need to keep an unmodified original for mixing source
        for i in range(batch_size):
            j = batch_size - i - 1
            lam = lam_batch[i]
            if lam != 1.:
                if use_cutmix[i]:
                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                        x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
                    x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh]
                    lam_batch[i] = lam
                else:
                    x[i] = x[i] * lam + x_orig[j] * (1 - lam)
        return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1)

    def _mix_pair(self, x):
        batch_size = len(x)
        lam_batch, use_cutmix = self._params_per_elem(batch_size // 2)
        x_orig = x.clone()  # need to keep an unmodified original for mixing source
        for i in range(batch_size // 2):
            j = batch_size - i - 1
            lam = lam_batch[i]
            if lam != 1.:
                if use_cutmix[i]:
                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                        x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
                    x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh]
                    x[j][:, yl:yh, xl:xh] = x_orig[i][:, yl:yh, xl:xh]
                    lam_batch[i] = lam
                else:
                    x[i] = x[i] * lam + x_orig[j] * (1 - lam)
                    x[j] = x[j] * lam + x_orig[i] * (1 - lam)
        lam_batch = np.concatenate((lam_batch, lam_batch[::-1]))
        return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1)

    def _mix_batch(self, x):
        lam, use_cutmix = self._params_per_batch()
        if lam == 1.:
            return 1.
        if use_cutmix:
            (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                x.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
            x[:, :, yl:yh, xl:xh] = x.flip(0)[:, :, yl:yh, xl:xh]
        else:
            x_flipped = x.flip(0).mul_(1. - lam)
            x.mul_(lam).add_(x_flipped)
        return lam

    def __call__(self, x, target):
        assert len(x) % 2 == 0, 'Batch size should be even when using this'
        if self.mode == 'elem':
            lam = self._mix_elem(x)
        elif self.mode == 'pair':
            lam = self._mix_pair(x)
        else:
            lam = self._mix_batch(x)
        if self.onehot:
            target = mixup_target(target, self.num_classes, lam, self.label_smoothing, device=x.device)
        else:
            target = mixup_target_multi_binary(target, lam, self.label_smoothing, device=x.device)
        return x, target


In [12]:
'''
随机旋转90°，转置，翻转，平移缩放旋转
'''
train_transform = albumentations.Compose([
    albumentations.RandomRotate90(p=0.5),
    albumentations.Transpose(p=0.5),
    albumentations.Flip(p=0.5),
    albumentations.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.0625, rotate_limit=45, border_mode=1, p=0.5),
#     albumentations.OneOf([
#             albumentations.Blur(blur_limit=4, p=1),
#             albumentations.MotionBlur(blur_limit=4, p=1),
#             albumentations.MedianBlur(blur_limit=4, p=1)
#         ], p=0.5),
    albumentations.Normalize(),
    AT.ToTensorV2(),
    ])
    
test_transform = albumentations.Compose([
    albumentations.Normalize(),
    AT.ToTensorV2(),
    ])


class LeavesDataset(Dataset):
    '''
    读取并创建训练集、测试集
    '''
    def __init__(self, csv, transform = train_transform):
        self.csv = csv
        self.transform = transform
  
    def __len__(self):
        return len(self.csv['image'])
  
    def __getitem__(self, idx):
        img = cv2.imread('../input/classify-leaves/' + self.csv['image'][idx])
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        label = self.csv['label'][idx]
        if self.transform:
            img = self.transform(image = img)['image']
        return img, torch.tensor(label).type(torch.LongTensor)

    
class LeavesTestDataset(Dataset):
    '''
    读取并创建测试集
    '''
    def __init__(self, csv, transform = test_transform):
        self.csv = csv
        self.transform = transform
  
    def __len__(self):
        return len(self.csv['image'])
    
    def __getitem__(self, idx):
        img = cv2.imread('../input/classify-leaves/' + self.csv['image'][idx])
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if self.transform:
            img = self.transform(image = img)['image']
        return img


In [34]:
'''
hyperparameter
'''
SEED = 42
CLASSES = 176
FOLD = 5
EPOCH = 10
MIXUP = 0.1 # 0 to 1
batch_size = 64
test_batch_size = 8
#设置种子 3407 is all you need！
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
random.seed(SEED)
torch.backends.cudnn.deterministic = True

base_dir = '../input/classify-leaves'
train_df = pd.read_csv(os.path.join(base_dir, 'train.csv'))
train_df.head()

Unnamed: 0,image,label
0,images/0.jpg,maclura_pomifera
1,images/1.jpg,maclura_pomifera
2,images/2.jpg,maclura_pomifera
3,images/3.jpg,maclura_pomifera
4,images/4.jpg,maclura_pomifera


In [14]:
'''
随机划分训练集和验证集
'''
csv = pd.read_csv('../input/classify-leaves/train.csv')
#保证训练集验证集集类别比例相同的kfold
labelencoder = LabelEncoder()
origin_label = csv['label']
labelencoder.fit(origin_label)
transform_label = labelencoder.transform(origin_label)
csv['label'] = transform_label
sfolder = StratifiedKFold(n_splits=FOLD,random_state=SEED,shuffle=True)
train_folds = []
val_folds = []
for train_idx, val_idx in sfolder.split(csv['image'], transform_label):
    train_folds.append(train_idx)
    val_folds.append(val_idx)
    print(len(train_idx), len(val_idx))

14682 3671
14682 3671
14682 3671
14683 3670
14683 3670


In [15]:
'''
如果不用mixup的话就使用label-smoothing就可以了，这个函数(以及nn自带的crossentropy）默认导入的target是int不是one-hot编码。
如果用了mixup，那么在其中的mixup_target（timm包）会自动加上label-smoothing。之后被混合的图片的target会由两个数据的target混合在一起。
'''
class SoftTargetCrossEntropy(nn.Module):

    def __init__(self):
        super(SoftTargetCrossEntropy, self).__init__()

    def forward(self, x, target):
        loss = torch.sum(-target * F.log_softmax(x, dim=-1), dim=-1)
        return loss.mean()


class LabelSmoothing(nn.Module):
    """NLL loss with label smoothing.
    """
    def __init__(self, smoothing=0.0):
        """Constructor for the LabelSmoothing module.
        :param smoothing: label smoothing factor
        """
        super(LabelSmoothing, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing

    def forward(self, x, target):
        logprobs = torch.nn.functional.log_softmax(x, dim=-1)
        nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
        nll_loss = nll_loss.squeeze(1)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
        return loss.mean()
    

In [9]:
'''
首先设置非常小的学习率，然后训练一个batch，看看loss是多少。之后逐渐增大学习率，查看loss变化。因为较小的lr对网络中参数的影响对于更大的lr而言可以忽略不记，所以每个batch都近似于用该学习率进行初始的训练。
详见Cyclical Learning Rates for Training Neural Networks中的3.3节。
'''

def find_lr(model, factor, train_dl, optimizer, loss_fn, device, init_lr=1e-8, final_lr=1e-1, beta=0.98, plot=True, save_dir=None):
    num = len(train_dl) - 1
    mult = (final_lr / init_lr) ** (1/num)
    lr = init_lr
    optimizer.param_groups[0]['lr'] = lr
    avg_loss = 0.
    best_loss = 0.
    batch_num = 0
    losses = []
    log_lrs = []
    scaler = torch.cuda.amp.GradScaler() # for 0

    if 1:
          for x, y in train_dl:
            x, y = x.to(device), y.to(device)
            batch_num += 1
            optimizer.zero_grad()
            #混合精度计算
            with torch.cuda.amp.autocast():
                out = model(x)
                loss = loss_fn(out, y)
              #smoothen the loss
            avg_loss = beta * avg_loss + (1-beta) * loss.data.item() #check
            smoothed_loss = avg_loss / (1 - beta**batch_num) #bias correction
              #stop if loss explodes
            if batch_num > 1 and smoothed_loss > 4 * best_loss: #prevents explosion
                  break
              #record the best loss
            if smoothed_loss < best_loss or batch_num == 1:
                  best_loss = smoothed_loss
              #store the values
            losses.append(smoothed_loss)
            log_lrs.append(math.log10(lr))
              #sgd 算法来调整learning rate
              #loss.backward()
              #optimizer.step()
            scaler.scale(loss).backward() #计算梯度
            scaler.step(optimizer) #调整lr
            scaler.update() #更新梯度
              #update the lr for the next step
            lr *= mult
            optimizer.param_groups[0]['lr'] = lr
    #Suggest a learning rate
    log_lrs, losses = np.array(log_lrs), np.array(losses)
    idx_min = np.argmin(losses)
    min_log_lr = log_lrs[idx_min]
    lr_auto = (10 ** (min_log_lr)) /factor
    if plot:
        selected = [np.argmin(np.abs(log_lrs - (min_log_lr-1)))] #highlight the suggested lr 但是这suggested lr是怎么来的，很令人困惑。
        plt.figure()
        plt.plot(log_lrs, losses,'-gD', markevery=selected)
        plt.xlabel('log_lrs')
        plt.ylabel('loss')
        plt.title('LR Range Test')
        if save_dir is not None:
            plt.savefig(f'{save_dir}/lr_range_test.png')
        else:
            plt.savefig(f'lr_range_test.png')
    return lr_auto

In [10]:
# lr_suggested = find_lr(model, 100, train_dataloader, optimizer, loss_fn, 'cuda', init_lr=1e-10, final_lr=1.) #run if u want suggestion from autolr

In [35]:
'''
一个外挂的储存超参数的函数。lr_scheduler 是在训练过程中可以对lr进行调整的类（比我那个lr函数不知道高到哪里去了）。
'''
def get_learner(lr, nb, epochs, model_name='resnet50d', MIXUP=0.1):
    mixup_fn = Mixup(prob=MIXUP, switch_prob=0.0, onehot=True, label_smoothing=0.05, num_classes=len(set(transform_label)))
    model = timm.create_model(model_name, pretrained=True)
    ###预训练模型也可以改每一层的结构和参数。其中最后的一层fc输出数量和本项目不一样，所以进行改动
    model.fc = nn.Linear(model.fc.in_features, len(set(transform_label)))
    #当网络变化较大时，初始化会使模型更加稳定，xavier_uniform 均匀分布
    nn.init.xavier_uniform_(model.fc.weight)
    model.cuda()

    params_1x = [param for name, param in model.named_parameters()
              if name not in ["fc.weight", "fc.bias"]]

    optimizer = torch.optim.AdamW([{'params': params_1x},
                                    {'params': model.fc.parameters(),
                                      'lr': lr*10}],
                                  lr=lr, weight_decay=2e-4)

    loss_fn = SoftTargetCrossEntropy() if MIXUP else LabelSmoothing(0.1)
    loss_fn_test = F.cross_entropy
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs*nb, eta_min=lr/20)
    return model, optimizer, loss_fn, loss_fn_test, lr_scheduler, mixup_fn

#model = torchvision.models.resnet50(pretrained=True)
#model = torchvision.models.resnext101_32x8d(pretrained=True)
#model = timm.create_model('seresnext50_32x4d', pretrained=True)
#model = timm.create_model('resnet50d', pretrained=True)
#model = timm.create_model('resnest50d', pretrained=True)
#model = timm.create_model('tf_efficientnetv2_l_in21ft1k', pretrained=True)

In [36]:
'''
五折交叉验证，其中GradScaler和autocast是为了AMP（混合精度训练），这个之后随便用用就不研究了。
TTA（test time augmentation）是在测试时对测试集进行augmentation，然后对某个样本的所有图像的预测取均值或最多值。常用于图像分类，以此来增强模型的泛用性和robust等。
其可以通过缩放或裁剪放大提取出图片的纹理或者特征来加强预测效果。
然而，粗暴地对其进行augmentation并取均值可能会在某些类上取到反效果（corruption），因为裁剪或者缩放可能导致一些关键特征难以出现在该样本的大多数图像中。
对此Better Aggregation in Test-Time Augmentation提出了一种更加体面的aggregation方式来聚合这些augmentation之后的测试集结果，也即为其赋予一个权重并进行学习，被称为Class-Weighted TTA
但是搜了一下好像没有现成的库，kaggle服务器这周也用到期了，就下次一定...
（更详细的介绍可以见推送）
'''
device = 'cuda'
save_dir = './'
 
scaler = torch.cuda.amp.GradScaler() # for AMP training 
test_csv = pd.read_csv('../input/classify-leaves/test.csv')

for fold in range(FOLD):
    print(f'Start Fold{fold}...')
    train_csv = csv.iloc[train_folds[fold]].reset_index()
    val_csv = csv.iloc[val_folds[fold]].reset_index()
    train_dataset = LeavesDataset(train_csv, train_transform)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
    val_dataset = LeavesDataset(val_csv, train_transform)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, drop_last=False)
    model, optimizer, loss_fn, loss_fn_test, lr_scheduler, mixup_fn = get_learner(3e-4, len(train_dataloader), EPOCHS, model_name='resnet50d', MIXUP=MIXUP)
    model_name = f'5fold_test_fold{fold}'
    train_losses = [] 
    val_losses = []
    train_accus = []
    val_accus = []
    best_accu = 0
    best_loss = float('inf')
    lrs = []
    for epoch in range(EPOCHS):
        t1 = time.time()
        val_accu = 0
        train_accu = 0
        train_losses_tmp = []
        #Train
        model.train()
        t_inf = 0
        for x, y in train_dataloader:
            if MIXUP:
                x, y = mixup_fn(x, y)
                x, y = x.to(device), y.to(device)
            #Forward
            with torch.cuda.amp.autocast():
                x = x.to(device)
                pred = model(x)
                loss = loss_fn(pred, y)
            #Backward
            #loss.backward()
            #optimizer.step()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            lr_scheduler.step()
            optimizer.zero_grad()
            #Statistics
            lrs.append(optimizer.param_groups[0]['lr']) #group 0,1,2 share the learning rate
            train_losses_tmp.append(loss.data.item())
            pred_labels = torch.argmax(pred.data, dim=1)
            y_labels = torch.argmax(y.data, dim=1) if MIXUP else y.data
            train_accu += (pred_labels==y_labels).float().sum()
        t_inf /= len(train_dataloader)
        train_losses.append(np.mean(np.array(train_losses_tmp)))
        train_accu /= len(train_dataset)
        train_accus.append(train_accu.data.item())

        t2 = time.time()
        #Validation
        val_losses_tmp = []
        model.eval()
        with torch.no_grad():
            for x, y in val_dataloader:
                x, y = x.to(device), y.to(device)
                logit = model(x)
                val_loss = loss_fn_test(logit, y) 
                val_losses_tmp.append(val_loss.data.item())
                pred = torch.argmax(logit.data, dim=1)
                val_accu += (pred==y.data).float().sum()
        t3 = time.time()
        val_loss = np.mean(np.array(val_losses_tmp))
        val_losses.append(val_loss)
        val_accu /= len(val_dataset)
        val_accus.append(val_accu.data.item())
        #print('fold', fold, 'epoch', epoch, 'train_loss', train_losses[epoch], 'val_loss', val_losses[epoch], 'val_accu', val_accu, 'train_accu', train_accu, 'train time', t2-t1, 'val time', t3-t2, 'lr[0]', lrs[-1])
        if save_dir is not None:
            if val_accu == best_accu:
                if val_loss < best_loss: #never satisfied
                    checkpoint = {"model": model.state_dict()}
                    torch.save(checkpoint, os.path.join(save_dir,f'{model_name}_best.pth'))
                    print(f'Stored a new best model in {save_dir}')
                    best_loss = val_loss
            elif val_accu > best_accu:
                checkpoint = {"model": model.state_dict()}
                torch.save(checkpoint, os.path.join(save_dir,f'{model_name}_best.pth'))
                print(f'Stored a new best model in {save_dir}')
                best_accu = val_accu
    
    #test time
    tta_model = tta.ClassificationTTAWrapper(model, tta.aliases.flip_transform(),  merge_mode='mean')
    tta_model.eval()
    preds = []
    for x in test_dataloader:
        x = x.to(device)
        logit = tta_model(x)
        pred = torch.argmax(logit.data, dim=1).cpu().numpy()
        preds += list(pred)
    res = labelencoder.inverse_transform(preds)
    test_csv.insert(1, 'label', res)
    test_csv.to_csv(f'submission_e50{model_name}_fold{fold}.csv', index=False)
    print('test cvs is saved')

Start Fold0...


RuntimeError: gather(): Expected dtype int64 for index