In [1]:
!nvidia-smi

Sat Apr 22 08:03:34 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   43C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Defaul

# Directory settings

In [2]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [3]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=True # CPMP
    competition='AMC23'
    _wandb_kernel='a-rakshit'
    debug=False
    apex=True
    print_freq=20
    num_workers=1 # CPMP
    model="DenilsenAxel/nlp-text-classification"
    gradient_checkpointing=False # CPMP 
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=16e-5
    decoder_lr=16e-5
    num_classes = 1425
    min_lr=1e-6
    eps=1e-6
    s=30.0
    m=0.5
    ls_eps=0.0
    betas=(0.9, 0.999)
    batch_size=96 # CPMP: not using checkpointing
    max_len=64
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=["encoded"]
    seed=42
    n_fold=5
    GeM = False
    arcface = True
    trn_fold=[0]
    partial_data = True
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [4]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("WANDB_API_KEY")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='AMC23', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33ma-rakshit[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Library

In [5]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch import linalg
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# os.system('pip uninstall -y transformers')
# os.system('pip uninstall -y tokenizers')
# os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels transformers')
# os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

# CPMP: declare the two GPUs
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"

# CPMP: avoids some issues when using more than one worker
os.environ["TOKENIZERS_PARALLELISM"] = "false"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Collecting iterative-stratification==0.1.7
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7




tokenizers.__version__: 0.12.1
transformers.__version__: 4.20.1
env: TOKENIZERS_PARALLELISM=true


# Utils

In [6]:
# ====================================================
# Utils
# ====================================================
le = LabelEncoder()
le.classes_ = np.load('/kaggle/input/amc23-filtered/classes.npy')

def metric(y_trues, y_preds):
    score = max( 0 , 100*(1-metrics.mean_absolute_percentage_error(y_trues, y_preds)))
    return score


def get_score(y_trues, y_preds):
    score = metric(y_trues, y_preds)
    return score


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [7]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('/kaggle/input/amc23-filtered/filtered_train.csv')
train = train.dropna(subset = ["TITLE"]).reset_index(drop = True)
test = pd.read_csv('/kaggle/input/amc-2023/dataset/test.csv')
submission = pd.read_csv('/kaggle/input/amc-2023/dataset/sample_submission.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

train.shape: (2212993, 7)


Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH,encoded
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.0,1068
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.0,270
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.0,574
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.0,597
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.0,447


test.shape: (734736, 5)


Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID
0,604373,Manuel d'Héliogravure Et de Photogravure En Re...,,,6142
1,1729783,DCGARING Microfiber Throw Blanket Warm Fuzzy P...,[QUALITY GUARANTEED: Luxury cozy plush polyest...,<b>DCGARING Throw Blanket</b><br><br> <b>Size ...,1622
2,1871949,I-Match Auto Parts Front License Plate Bracket...,"[Front License Plate Bracket Made Of Plastic,D...",Replacement for The Following Vehicles:2020 LE...,7540
3,1107571,PinMart Gold Plated Excellence in Service 1 Ye...,[Available as a single item or bulk packed. Se...,Our Excellence in Service Lapel Pins feature a...,12442
4,624253,"Visual Mathematics, Illustrated by the TI-92 a...",,,6318


submission.shape: (734736, 2)


Unnamed: 0,PRODUCT_ID,PRODUCT_LENGTH
0,604373,701.093794
1,1729783,734.506163
2,1871949,741.360258
3,1107571,730.327767
4,624253,666.847946


# CV split

In [8]:
def get_folds(df):
    gkf = StratifiedKFold(n_splits=CFG.n_fold)
    for fold, ( _, val_) in enumerate(gkf.split(X=df, y=df.encoded)):
        df.loc[val_ , "fold"] = int(fold)
        
    df["fold"] = df["fold"].astype(int)
    return df

In [9]:
if CFG.debug:
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)

In [10]:
# ====================================================
# CV split
# ====================================================
train = get_folds(train)
if CFG.partial_data:
    train = train.loc[train["fold"] == 0].reset_index(drop = True)
    train = get_folds(train)
display(train.groupby('fold').size())

fold
0    88520
1    88520
2    88520
3    88520
4    88519
dtype: int64

# tokenizer

In [11]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

# Dataset

In [12]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['TITLE'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths)
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/442599 [00:00<?, ?it/s]

max_len: 298


In [13]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['TITLE'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.long)
        # CPMP: return a single dictionary containing all inputs to avoid issues when using DP
        return {'input_ids' : inputs['input_ids'], 
                'attention_mask' : inputs['attention_mask'], 
                'labels' : label}
    
def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        # CPMP: no need to truncate labels
        if k != 'labels':
            inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [14]:
class GeM(nn.Module):
    def __init__(self, p=3, eps=1e-6):
        super(GeM, self).__init__()
        self.p = nn.Parameter(torch.ones(1)*p)
        self.eps = eps

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)
        
    def gem(self, x, p=3, eps=1e-6):
        return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1./p)
        
    def __repr__(self):
        return self.__class__.__name__ + \
                '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + \
                ', ' + 'eps=' + str(self.eps) + ')'

In [15]:
class ArcFace(nn.Module):
    def __init__(self, cin, cout, s=8, m=0.5):
        super().__init__()
        self.s = s
        self.sin_m = torch.sin(torch.tensor(m))
        self.cos_m = torch.cos(torch.tensor(m))
        self.cout = cout
        self.fc = nn.Linear(cin, cout, bias=False)

    def forward(self, x, label=None):
        w_L2 = linalg.norm(self.fc.weight.detach(), dim=1, keepdim=True).T
        x_L2 = linalg.norm(x, dim=1, keepdim=True)
        cos = self.fc(x) / (x_L2 * w_L2)

        if label is not None:
            sin_m, cos_m = self.sin_m, self.cos_m
            one_hot = F.one_hot(label, num_classes=self.cout)
            sin = (1 - cos ** 2) ** 0.5
            angle_sum = cos * cos_m - sin * sin_m
            cos = angle_sum * one_hot + cos * (1 - one_hot)
            cos = cos * self.s
            cos = torch.mean(cos, axis = 0)            
        return cos

In [16]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.GeM = GeM()
        self.pool = MeanPooling()
        
        self.fc = ArcFace(self.config.hidden_size, CFG.num_classes, s=CFG.s, m=CFG.m)
        self.fc1 = nn.Linear(self.config.hidden_size, 1024)
        self.fc2 = nn.Linear(1024, CFG.num_classes)
        self._init_weights(self.fc1)
        self._init_weights(self.fc2)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    # CPMP: pass inputs explicitly
    def feature(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, attention_mask)
        if CFG.GeM:
            feature = self.GeM(feature).flatten(1)
        return feature

    # CPMP: extract features from batch dicitonary
    def forward(self, batch):
        feature = self.feature(batch['input_ids'], batch['attention_mask'])
        if CFG.arcface:
            output = self.fc(feature, batch['labels'])
        else:
            output = self.fc1(feature)
            output = self.fc2(output)
        return output

# Loss

In [17]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

# Helpler functions

In [18]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    # CPMP: iterating on batch dictionaries
    for step, batch in enumerate(train_loader):
        inputs = collate(batch)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = inputs['labels'].view(-1)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    # CPMP: iterating on batch dictionaries
    for step, batch in enumerate(valid_loader):
        inputs = collate(batch)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = inputs['labels'].view(-1)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.argmax(predictions, axis = 1)
    predictions = le.inverse_transform(predictions)
    return losses.avg, predictions

# train loop

In [19]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values
    valid_labels = le.inverse_transform(valid_labels)
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    # CPMP: wrap the model to use all GPUs
    model = nn.DataParallel(model)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        # CPMP: when using DataParallel the original model is now the module attribute
        optimizer_parameters = [
            {'params': [p for n, p in model.module.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.module.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.module.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.CrossEntropyLoss() # RMSELoss(reduction="mean")
    
    best_score = -np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            # CPMP: save the original model. It is stored as the module attribute of the DP model.
            torch.save({'model': model.module.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds["preds"] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [20]:
if __name__ == '__main__':
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in CFG.trn_fold:
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()



Downloading:   0%|          | 0.00/957 [00:00<?, ?B/s]

BertConfig {
  "_name_or_path": "DenilsenAxel/nlp-text-classification",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Some weights of the model checkpoint at DenilsenAxel/nlp-text-classification were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/3688] Elapsed 0m 4s (remain 286m 8s) Loss: 8.2122(8.2122) Grad: 606956.5000  LR: 0.00016000  
Epoch: [1][20/3688] Elapsed 0m 10s (remain 29m 26s) Loss: 5.8507(6.5243) Grad: 326470.7500  LR: 0.00016000  
Epoch: [1][40/3688] Elapsed 0m 15s (remain 23m 26s) Loss: 5.6898(6.1551) Grad: 279214.2500  LR: 0.00016000  
Epoch: [1][60/3688] Elapsed 0m 21s (remain 21m 6s) Loss: 5.9152(5.9702) Grad: 307325.1250  LR: 0.00015999  
Epoch: [1][80/3688] Elapsed 0m 26s (remain 19m 46s) Loss: 5.9121(5.8676) Grad: 253129.8750  LR: 0.00015999  
Epoch: [1][100/3688] Elapsed 0m 32s (remain 19m 6s) Loss: 5.2108(5.7787) Grad: 267537.9062  LR: 0.00015998  
Epoch: [1][120/3688] Elapsed 0m 37s (remain 18m 36s) Loss: 5.5877(5.7192) Grad: 324533.2188  LR: 0.00015997  
Epoch: [1][140/3688] Elapsed 0m 43s (remain 18m 15s) Loss: 5.5438(5.6649) Grad: 266326.3125  LR: 0.00015996  
Epoch: [1][160/3688] Elapsed 0m 49s (remain 17m 54s) Loss: 5.3901(5.6132) Grad: 221326.2344  LR: 0.00015995  
Epoch: [1][180/3688

Epoch 1 - avg_train_loss: 4.4721  avg_val_loss: 3.8635  time: 1301s
Epoch 1 - Score: 0.0000
Epoch 1 - Save Best Score: 0.0000 Model


Epoch: [2][0/3688] Elapsed 0m 0s (remain 26m 25s) Loss: 3.4247(3.4247) Grad: nan  LR: 0.00013656  
Epoch: [2][20/3688] Elapsed 0m 6s (remain 17m 39s) Loss: 4.2379(3.7483) Grad: 135706.0469  LR: 0.00013632  
Epoch: [2][40/3688] Elapsed 0m 11s (remain 17m 40s) Loss: 3.9771(3.7080) Grad: 147587.7969  LR: 0.00013608  
Epoch: [2][60/3688] Elapsed 0m 17s (remain 17m 16s) Loss: 4.1173(3.7401) Grad: 115023.6328  LR: 0.00013583  
Epoch: [2][80/3688] Elapsed 0m 23s (remain 17m 24s) Loss: 4.1517(3.7409) Grad: 122207.1875  LR: 0.00013559  
Epoch: [2][100/3688] Elapsed 0m 29s (remain 17m 13s) Loss: 3.4776(3.7316) Grad: 54307.7695  LR: 0.00013534  
Epoch: [2][120/3688] Elapsed 0m 34s (remain 17m 1s) Loss: 4.3973(3.7392) Grad: 70323.6406  LR: 0.00013510  
Epoch: [2][140/3688] Elapsed 0m 40s (remain 16m 55s) Loss: 4.2561(3.7317) Grad: 69512.1094  LR: 0.00013485  
Epoch: [2][160/3688] Elapsed 0m 46s (remain 16m 58s) Loss: 3.7805(3.7412) Grad: 66209.8203  LR: 0.00013460  
Epoch: [2][180/3688] Elapsed 0m

Epoch 2 - avg_train_loss: 3.7377  avg_val_loss: 3.6107  time: 1299s
Epoch 2 - Score: 0.0000


Epoch: [3][0/3688] Elapsed 0m 0s (remain 24m 6s) Loss: 3.1723(3.1723) Grad: nan  LR: 0.00007999  
Epoch: [3][20/3688] Elapsed 0m 6s (remain 17m 53s) Loss: 3.4496(3.0869) Grad: 138420.9688  LR: 0.00007965  
Epoch: [3][40/3688] Elapsed 0m 12s (remain 18m 8s) Loss: 3.3812(3.0657) Grad: 135170.9844  LR: 0.00007931  
Epoch: [3][60/3688] Elapsed 0m 17s (remain 17m 49s) Loss: 2.8918(3.0539) Grad: 123747.7109  LR: 0.00007897  
Epoch: [3][80/3688] Elapsed 0m 23s (remain 17m 40s) Loss: 3.1679(3.0539) Grad: 123958.2344  LR: 0.00007863  
Epoch: [3][100/3688] Elapsed 0m 29s (remain 17m 35s) Loss: 3.0786(3.0397) Grad: 142183.3281  LR: 0.00007829  
Epoch: [3][120/3688] Elapsed 0m 35s (remain 17m 21s) Loss: 2.9082(3.0220) Grad: 141184.2969  LR: 0.00007795  
Epoch: [3][140/3688] Elapsed 0m 41s (remain 17m 17s) Loss: 3.0580(3.0178) Grad: 134618.5312  LR: 0.00007761  
Epoch: [3][160/3688] Elapsed 0m 47s (remain 17m 10s) Loss: 3.1190(3.0150) Grad: 156639.1562  LR: 0.00007727  
Epoch: [3][180/3688] Elapsed

Epoch 3 - avg_train_loss: 2.9112  avg_val_loss: 3.6683  time: 1302s
Epoch 3 - Score: 0.0000


Epoch: [4][0/3688] Elapsed 0m 0s (remain 23m 18s) Loss: 2.1966(2.1966) Grad: inf  LR: 0.00002343  
Epoch: [4][20/3688] Elapsed 0m 6s (remain 17m 33s) Loss: 1.9832(2.2645) Grad: 141999.9844  LR: 0.00002319  
Epoch: [4][40/3688] Elapsed 0m 11s (remain 17m 19s) Loss: 1.5977(2.1958) Grad: 135390.8125  LR: 0.00002295  
Epoch: [4][60/3688] Elapsed 0m 17s (remain 17m 21s) Loss: 2.1483(2.1989) Grad: 155375.7500  LR: 0.00002271  
Epoch: [4][80/3688] Elapsed 0m 23s (remain 17m 30s) Loss: 2.1598(2.1889) Grad: 152395.6719  LR: 0.00002247  
Epoch: [4][100/3688] Elapsed 0m 29s (remain 17m 28s) Loss: 2.1261(2.1878) Grad: 149832.4844  LR: 0.00002224  
Epoch: [4][120/3688] Elapsed 0m 34s (remain 17m 11s) Loss: 2.3303(2.1640) Grad: 160326.7500  LR: 0.00002200  
Epoch: [4][140/3688] Elapsed 0m 40s (remain 16m 59s) Loss: 1.6740(2.1537) Grad: 144311.6250  LR: 0.00002177  
Epoch: [4][160/3688] Elapsed 0m 46s (remain 16m 51s) Loss: 2.3127(2.1482) Grad: 179241.1719  LR: 0.00002153  
Epoch: [4][180/3688] Elaps

Epoch 4 - avg_train_loss: 2.0207  avg_val_loss: 4.0101  time: 1301s
Epoch 4 - Score: 0.0000


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
[fold0] avg_train_loss,█▆▄▁
[fold0] avg_val_loss,▅▁▂█
[fold0] epoch,▁▃▆█
[fold0] loss,█▇▆▇▅▅▆▅▅▅▆▄▄▅▅▄▅▄▄▄▄▃▄▃▃▃▄▄▄▃▁▂▂▁▂▂▂▂▂▂
[fold0] lr,███████▇▇▇▇▇▆▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[fold0] score,▁▁▁▁

0,1
[fold0] avg_train_loss,2.02074
[fold0] avg_val_loss,4.01014
[fold0] epoch,4.0
[fold0] loss,2.01956
[fold0] lr,0.0
[fold0] score,0.0
