In [1]:
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import nvidia_smi
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
from sklearn.model_selection import StratifiedGroupKFold
%env TOKENIZERS_PARALLELISM=true

# Use CUDA when PyTorch reports a GPU is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=true


In [2]:
# Custom imports.
from utils import f2_score, seed_everything, AverageMeter, timeSince, get_vram, get_param_counts
from model_utils import MeanPooling
from train_utils import get_model_fold_paths, save_best_models, select_optimizer, select_scheduler

In [3]:
import wandb
# Authenticate with Weights & Biases up front; the training cells log metrics to a wandb run.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mvincenttu[0m (use `wandb login --relogin` to force relogin)


True

In [4]:
class CFG:
    """Run configuration: every setting is a class attribute read by the cells below."""

    # --- reproducibility / cross-validation (used by cv_split) ---
    seed = 42
    n_folds = 5

    # --- backbone and tokenisation ---
    model = "xlm-roberta-base"
    # Loaded once, while the class body executes; shared by every dataset and helper.
    tokenizer = AutoTokenizer.from_pretrained(model)
    # Placeholder: get_max_length() overwrites this once the corpus has been scanned.
    max_len = 512
    gradient_checkpointing = False

    # --- data loading / logging ---
    batch_size = 32
    num_workers = 4
    print_freq = 500  # train_fn / valid_fn print a progress line every `print_freq` steps

    # --- optimiser (AdamW) ---
    encoder_lr = 1e-5  # learning rate for the transformer backbone
    decoder_lr = 1e-4  # learning rate for everything outside the backbone
    weight_decay = 0.01
    betas = (0.9, 0.999)
    eps = 1e-6
    max_grad_norm = 0.012  # passed to clip_grad_norm_ in train_fn

    # --- schedule (cosine with warm-up) ---
    epochs = 5
    warmup_ratio = 0.1
    num_cycles = 0.5

In [5]:
def read_data(train, sep = '[SEP]', missing_title = "Title does not exist"):
    """Fill missing titles and build the `text` feature column.

    Args:
        train: DataFrame with `title1` and `title2` columns. It is modified
            in place (both title columns are filled, `text` is added).
        sep: String inserted between the two titles. Defaults to '[SEP]'.
            NOTE(review): for xlm-roberta the separator special token is
            `</s>`, so '[SEP]' is likely tokenised as ordinary text — confirm
            this is intended before changing the default.
        missing_title: Placeholder used where a title is missing.

    Returns:
        The same DataFrame object, for call chaining.
    """
    # Assign the filled column back instead of `train[col].fillna(..., inplace=True)`:
    # an in-place fill on a column selection is deprecated and does not update
    # the frame when pandas Copy-on-Write is active.
    for column in ('title1', 'title2'):
        train[column] = train[column].fillna(missing_title)

    # Create feature column
    train['text'] = train['title1'] + sep + train['title2']

    return train

In [6]:
def cv_split(train, cfg):
    """Assign each row a validation fold with StratifiedGroupKFold.

    The split is stratified on `target` and grouped on `topics_ids`, using
    `cfg.n_folds` folds and `cfg.seed` as the random state.

    Args:
        train: DataFrame with `target` and `topics_ids` columns. An integer
            `fold` column is added in place.
        cfg: Object exposing `n_folds` and `seed`.

    Returns:
        The same DataFrame object.
    """
    kfold = StratifiedGroupKFold(n_splits = cfg.n_folds, shuffle = True, random_state = cfg.seed)
    # kfold.split yields *positional* indices. Collect them in a positional array
    # rather than writing through label-based `.loc`, which would hit the wrong
    # rows (or append new ones) whenever the frame does not carry a default RangeIndex.
    fold_of_row = np.full(len(train), -1, dtype = int)
    for num, (_, val_index) in enumerate(kfold.split(train, train['target'], train['topics_ids'])):
        fold_of_row[val_index] = num
    train['fold'] = fold_of_row

    return train

In [7]:
def get_max_length(train, cfg):
    """Set `cfg.max_len` to the longest tokenised `text` plus two special tokens.

    Every text (missing values are treated as "") is tokenised individually
    with `cfg.tokenizer`, without special tokens. The result is written to
    `cfg.max_len` and printed; nothing is returned.
    """
    texts = train['text'].fillna("").values
    longest = max([
        len(cfg.tokenizer(text, add_special_tokens = False)['input_ids'])
        for text in tqdm(texts, total = len(train))
    ])
    cfg.max_len = longest + 2 # cls & sep
    print(f"max_len: {cfg.max_len}")

In [8]:
def prepare_input(text, cfg):
    """Tokenise one text into fixed-length `torch.long` tensors.

    Args:
        text: The string to encode.
        cfg: Object exposing `tokenizer` (a Hugging Face tokenizer) and
            `max_len` (the length every sequence is padded/truncated to).

    Returns:
        The tokenizer's output mapping with every value converted to a
        `torch.long` tensor of length `cfg.max_len`.
    """
    # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`,
    # and calling the tokenizer directly replaces the legacy `encode_plus`.
    inputs = cfg.tokenizer(
        text,
        return_tensors = None,
        add_special_tokens = True,
        max_length = cfg.max_len,
        padding = "max_length",
        truncation = True
    )
    # Convert in place so the tokenizer's own mapping type is what gets returned.
    for key in list(inputs.keys()):
        inputs[key] = torch.tensor(inputs[key], dtype = torch.long)
    return inputs

class custom_dataset(Dataset):
    """Map-style dataset yielding `(tokenised inputs, float label)` per DataFrame row."""

    def __init__(self, df, cfg):
        # Keep plain value arrays so rows are addressed by position, not by index label.
        self.labels = df['target'].values
        self.texts = df['text'].values
        self.cfg = cfg

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        # Tokenisation happens lazily, one sample at a time.
        encoded = prepare_input(self.texts[item], self.cfg)
        target = torch.tensor(self.labels[item], dtype = torch.float)
        return encoded, target
    
def collate(inputs):
    """Trim a padded batch to the longest real sequence it contains.

    The longest sequence is found from the row sums of `attention_mask`;
    every entry of `inputs` is then cut to that many columns. The mapping is
    modified in place and returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

In [9]:
class custom_model(nn.Module):
    """Pretrained transformer backbone, a pooling layer and a one-logit linear head."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states = True)
        # Zero every dropout setting on the config. Several spellings are set,
        # presumably because config classes name these fields differently.
        for dropout_field in (
            'hidden_dropout',
            'hidden_dropout_prob',
            'attention_dropout',
            'attention_probs_dropout_prob',
        ):
            setattr(self.config, dropout_field, 0.0)
        self.model = AutoModel.from_pretrained(cfg.model, config = self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 1)
        # Only the freshly created head is re-initialised; the backbone keeps its weights.
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module; other types are left alone."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                # The padding row is zeroed after the random initialisation.
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and pool its last hidden state using the attention mask."""
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])

    def forward(self, inputs):
        """Return the head's output (one value per sample) for a batch of tokenised inputs."""
        return self.fc(self.feature(inputs))

In [10]:
# =========================================================================================
# Train function loop
# =========================================================================================
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg):
    """Train `model` for one epoch with CUDA mixed precision.

    Args:
        train_loader: Iterable of `(inputs, target)` batches; `inputs` is a
            mapping of tensors that includes `attention_mask`.
        model: Module called as `model(inputs)`.
        criterion: Loss called as `criterion(y_preds.view(-1), target)`.
        optimizer: Optimizer stepped once per batch through the GradScaler.
        epoch: Zero-based epoch index, used only in the progress line.
        scheduler: LR scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
        cfg: Object exposing `max_grad_norm` and `print_freq`.

    Returns:
        `losses.avg` from the AverageMeter updated with each batch loss and size.
    """
    model.train()
    # A new GradScaler is created on every call, so the loss scale restarts each epoch.
    scaler = torch.cuda.amp.GradScaler(enabled = True)
    losses = AverageMeter()
    start = time.time()
    num_steps = len(train_loader)
    for step, (inputs, target) in enumerate(train_loader):
        # Drop padding columns no sample in this batch uses before moving to the device.
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        target = target.to(device)
        batch_size = target.size(0)
        with torch.cuda.amp.autocast(enabled = True):
            y_preds = model(inputs)
            loss = criterion(y_preds.view(-1), target)
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        # Unscale first so clipping operates on the true gradient magnitudes.
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        # NOTE(review): the scheduler advances even when GradScaler skips the
        # optimizer step because of inf/NaN gradients (e.g. "Grad: inf" on step 0).
        scheduler.step()
        if step % cfg.print_freq == 0 or step == (num_steps - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1,
                          step,
                          num_steps,
                          remain = timeSince(start, float(step + 1) / num_steps),
                          loss = losses,
                          grad_norm = grad_norm,
                          # get_last_lr() is the supported accessor; get_lr() is only
                          # meant to be called from inside scheduler.step().
                          lr = scheduler.get_last_lr()[0]))
            get_vram()

    return losses.avg

# =========================================================================================
# Valid function loop
# =========================================================================================
def valid_fn(valid_loader, model, criterion, device, cfg):
    """Evaluate `model` on `valid_loader` without gradient tracking.

    Returns:
        A tuple `(average loss, predictions)`, where `predictions` is a 1-D
        NumPy array of sigmoid outputs concatenated in loader order.
    """
    model.eval()
    loss_meter = AverageMeter()
    batch_probs = []
    started = time.time()
    n_batches = len(valid_loader)
    for step, (inputs, target) in enumerate(valid_loader):
        # Trim unused padding, then move the whole batch to the device.
        inputs = collate(inputs)
        for name in list(inputs.keys()):
            inputs[name] = inputs[name].to(device)
        target = target.to(device)
        with torch.no_grad():
            logits = model(inputs)
        loss = criterion(logits.view(-1), target)
        loss_meter.update(loss.item(), target.size(0))
        # reshape(-1) keeps a 1-D array even when squeeze() collapses a single-sample batch.
        probs = logits.sigmoid().squeeze().to('cpu').numpy().reshape(-1)
        batch_probs.append(probs)
        is_last_step = step == (n_batches - 1)
        if step % cfg.print_freq == 0 or is_last_step:
            elapsed = timeSince(started, float(step + 1) / n_batches)
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step,
                          n_batches,
                          loss = loss_meter,
                          remain = elapsed))
            get_vram()

    return loss_meter.avg, np.concatenate(batch_probs, axis = 0)

In [11]:
def get_best_threshold(x_val, val_predictions, correlations, thresholds = None):
    """Search for the probability cut-off that maximises the F2 score.

    For each threshold, rows with a prediction above it are grouped into a
    space-joined string of unique `content_ids` per topic. Topics with no
    surviving row get an empty prediction string. The result is merged with
    `correlations` on `topic_id` and scored with `f2_score`.

    Args:
        x_val: Validation rows with `topics_ids` and `content_ids` columns.
            It is only read; no column is added to it.
        val_predictions: One probability per row of `x_val`, in row order.
        correlations: DataFrame with `topic_id` and the ground-truth `content_ids`.
        thresholds: Optional iterable of cut-offs to try. Defaults to
            `np.arange(0.001, 0.1, 0.001)`.

    Returns:
        `(best_score, best_threshold)`. The first threshold is always taken
        as the initial best, so `best_threshold` is `None` only when
        `thresholds` is empty.
    """
    if thresholds is None:
        thresholds = np.arange(0.001, 0.1, 0.001)

    # Loop-invariant pieces, computed once. Masking a two-column view also avoids
    # writing a `predictions` column into the caller's (sliced) DataFrame.
    scores = np.asarray(val_predictions)
    pairs = x_val[['topics_ids', 'content_ids']]
    all_topics = pd.Series(x_val['topics_ids'].unique())

    best_score = 0
    best_threshold = None
    for thres in thresholds:
        kept = pairs.loc[scores > thres]
        with_preds = kept.groupby(['topics_ids'])['content_ids'].unique().reset_index()
        with_preds['content_ids'] = with_preds['content_ids'].apply(lambda x: ' '.join(x))
        with_preds.columns = ['topic_id', 'predictions']
        # Topics whose every candidate fell below the threshold still need a row.
        missing = all_topics[~all_topics.isin(with_preds['topic_id'])]
        without_preds = pd.DataFrame({'topic_id': missing.values, 'predictions': ""})
        merged = pd.concat([with_preds, without_preds], axis = 0, ignore_index = True)
        merged = merged.merge(correlations, how = 'left', on = 'topic_id')
        score = f2_score(merged['content_ids'], merged['predictions'])
        # Accept the first threshold unconditionally so callers that format the
        # threshold as a float never receive None when every score is 0.
        if best_threshold is None or score > best_score:
            best_score = score
            best_threshold = thres
    return best_score, best_threshold

In [12]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay = 0.0):
    """Build three optimizer parameter groups for `model`.

    Groups, in order:
      1. backbone (`model.model`) parameters that receive weight decay,
      2. backbone biases and LayerNorm parameters (no weight decay),
      3. every remaining parameter of `model` (the head), at `decoder_lr`
         with no weight decay.

    Args:
        model: Module exposing its backbone as `model.model`.
        encoder_lr: Learning rate for both backbone groups.
        decoder_lr: Learning rate for the head group.
        weight_decay: Weight decay applied to group 1 only.

    Returns:
        A list of three dicts suitable for a torch optimizer.
    """
    no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    decayed, undecayed = [], []
    for name, param in model.model.named_parameters():
        if any(nd in name for nd in no_decay):
            undecayed.append(param)
        else:
            decayed.append(param)

    # Identify head parameters by identity rather than by the substring test
    # `"model" not in name`: that test silently dropped any head parameter whose
    # name merely contains "model" (e.g. `model_head.weight`) from every group.
    backbone_ids = {id(param) for param in model.model.parameters()}
    head = [param for param in model.parameters() if id(param) not in backbone_ids]

    optimizer_parameters = [
        {'params': decayed, 'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': undecayed, 'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': head, 'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters

In [13]:
# Seed everything
# The CFG class itself is passed to the helper imported from utils
# (presumably it reads CFG.seed — TODO confirm against utils.seed_everything).
seed_everything(CFG)

# Read data
# correlations is merged on `topic_id` during threshold search to supply the ground-truth `content_ids`.
correlations = pd.read_csv("../input/correlations.csv")
train = read_data(pd.read_csv("../input/train.csv"))

# Split data
# cv_split adds the `fold` column to `train` in place, so its return value is not needed here.
cv_split(train, CFG)

# Get max length
# Overwrites the class attribute CFG.max_len with the longest tokenised text + 2.
get_max_length(train, CFG)

  0%|          | 0/615170 [00:00<?, ?it/s]

max_len: 172


In [14]:
# Fold held out for validation in this run; all other folds are used for training.
fold = 0
# CFG defines only class attributes, so this instance reads whatever is currently set
# on the class (including the max_len written by get_max_length).
cfg = CFG()

In [15]:
# Hold out the selected fold for validation; every other fold is training data.
x_val = train.loc[train['fold'] == fold]
x_train = train.loc[train['fold'] != fold]
valid_labels = x_val['target'].values

# Training batches: reshuffled each epoch, trailing partial batch dropped.
train_dataset = custom_dataset(x_train, cfg)
train_loader = DataLoader(
    train_dataset,
    batch_size = cfg.batch_size, num_workers = cfg.num_workers,
    shuffle = True, drop_last = True, pin_memory = True,
)

# Validation batches: fixed order and no dropped rows, so predictions line up with x_val.
valid_dataset = custom_dataset(x_val, cfg)
valid_loader = DataLoader(
    valid_dataset,
    batch_size = cfg.batch_size, num_workers = cfg.num_workers,
    shuffle = False, drop_last = False, pin_memory = True,
)

In [16]:
# Build the model and move it to the selected device in one step
# (nn.Module.to returns the module itself, so nothing is echoed as cell output).
model = custom_model(cfg).to(device)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# Optimizer: separate learning rates for the backbone and the head.
optimizer_parameters = get_optimizer_params(
    model, 
    encoder_lr = cfg.encoder_lr, 
    decoder_lr = cfg.decoder_lr,
    weight_decay = cfg.weight_decay
)
optimizer = AdamW(
    optimizer_parameters, 
    lr = cfg.encoder_lr, 
    eps = cfg.eps, 
    betas = cfg.betas
)

# Scheduler
# Count the steps the loop will really take: train_loader uses drop_last=True, so
# len(x_train) / batch_size over-counts by the dropped partial batch each epoch and
# the cosine schedule would never reach its end.
num_train_steps = len(train_loader) * cfg.epochs
# The scheduler expects an integer number of warm-up steps.
num_warmup_steps = int(num_train_steps * cfg.warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, 
    num_warmup_steps = num_warmup_steps, 
    num_training_steps = num_train_steps, 
    num_cycles = cfg.num_cycles
    )

# Criterion
# The model emits raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss(reduction = "mean")

In [18]:
# Report GPU memory after the model has been moved to the device and before training starts.
print("Printing GPU stats...")
get_vram()

Printing GPU stats...
Device 0: b'NVIDIA A40', Memory : (81.13% free): 17.995312768034637 (total), 14.598828393034637 (free), 3.396484375 (used)


In [19]:
project = "LECR_0.297_baseline"

# Checkpoints for this experiment are written under save_root.
save_root = "../models/0297_baseline/"
os.makedirs(save_root, exist_ok=True)

# Snapshot every non-dunder attribute of cfg as the run config.
# NOTE(review): this includes the `tokenizer` object itself — confirm wandb stores it sensibly.
cfg_params = {name: getattr(cfg, name) for name in dir(cfg) if "__" not in name}

# Record the model size next to the hyper-parameters.
total_params, trainable_params, nontrainable_params = get_param_counts(model)
cfg_params["total_params"] = total_params
cfg_params["trainable_params"] = trainable_params
cfg_params["nontrainable_params"] = nontrainable_params

run = wandb.init(project=project, config=cfg_params, name=f"fold{fold}", dir="/tmp")

[34m[1mwandb[0m: wandb version 0.13.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [None]:
best_score = 0
best_threshold = None
# Predictions of the best epoch so far; stays None if no epoch beats a score of 0.
val_predictions = None

# try/finally guarantees the wandb run is closed even if training is interrupted.
try:
    for epoch in range(cfg.epochs):
        start_time = time.time()

        # Train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg)

        # Validation
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device, cfg)

        # Compute f2_score
        score, threshold = get_best_threshold(x_val, predictions, correlations)

        elapsed = time.time() - start_time

        # The threshold can be None when no cut-off scored above 0; avoid a format crash.
        threshold_text = 'n/a' if threshold is None else f'{threshold:.5f}'
        print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        print(f'Epoch {epoch+1} - Score: {score:.4f} - Threshold: {threshold_text}')

        run.log({
            "epoch": epoch,
            "avg_train_loss": avg_loss,
            "avg_val_loss": avg_val_loss,
            "f2_score": score,
            "threshold": threshold
        })

        if score > best_score:
            best_score = score
            best_threshold = threshold
            print(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            save_p = os.path.join(save_root, f"{cfg.model.replace('/', '-')}_fold{fold}_ep{epoch}.pth")
            torch.save(model.state_dict(), save_p)
            val_predictions = predictions

    torch.cuda.empty_cache()
    gc.collect()

    # Report the best epoch. Its score and threshold were already computed inside the
    # loop, so the threshold search is not repeated on the same predictions.
    if val_predictions is None:
        print('No epoch reached a score above 0; no checkpoint was saved.')
    else:
        print(f'Our CV score is {best_score} using a threshold of {best_threshold}')
finally:
    run.finish()

Epoch: [1][0/15381] Elapsed 0m 0s (remain 119m 58s) Loss: 0.8532(0.8532) Grad: inf  LR: 0.00000000  
Device 0: b'NVIDIA A40', Memory : (71.05% free): 17.995312768034637 (total), 12.786328393034637 (free), 5.208984375 (used)
Epoch: [1][500/15381] Elapsed 0m 46s (remain 22m 53s) Loss: 0.5878(0.7907) Grad: 6.3956  LR: 0.00000065  
Device 0: b'NVIDIA A40', Memory : (53.26% free): 17.995312768034637 (total), 9.585156518034637 (free), 8.41015625 (used)
Epoch: [1][1000/15381] Elapsed 1m 32s (remain 22m 3s) Loss: 0.2023(0.5462) Grad: 4.6545  LR: 0.00000130  
Device 0: b'NVIDIA A40', Memory : (52.92% free): 17.995312768034637 (total), 9.522656518034637 (free), 8.47265625 (used)
Epoch: [1][1500/15381] Elapsed 2m 18s (remain 21m 19s) Loss: 0.1050(0.4402) Grad: 9.4569  LR: 0.00000195  
Device 0: b'NVIDIA A40', Memory : (52.81% free): 17.995312768034637 (total), 9.503125268034637 (free), 8.4921875 (used)
Epoch: [1][2000/15381] Elapsed 3m 4s (remain 20m 32s) Loss: 0.2976(0.3828) Grad: 7.6789  LR: 0.

In [21]:
# Close the W&B run. The training cell already calls run.finish() when it completes,
# so this cell only matters when that cell was interrupted before reaching it.
run.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…