## CV Split

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from multiprocesspandas import applyparallel
from tqdm import tqdm
import regex as re
import string

In [3]:
N_SPLITS = 5

In [4]:
topic_df = pd.read_csv('topics.csv')
content_df = pd.read_csv('content.csv')
corr_df = pd.read_csv('correlations.csv')
# topic_df = topic_df.rename(columns={'id': 'topic_id'}).merge(corr_df)
topic_df_non_source = topic_df[topic_df['category']!='source'].reset_index(drop=True)
topic_df_non_source['stratify'] = topic_df_non_source['category'] + \
topic_df_non_source['language'] + topic_df_non_source['description'].apply(lambda x: str(isinstance(x, str))) + \
topic_df_non_source['has_content'].apply(str)

In [5]:
kf = StratifiedGroupKFold(n_splits=N_SPLITS)
folds = list(kf.split(topic_df_non_source, y=topic_df_non_source["stratify"], groups=topic_df_non_source["channel"]))
topic_df_non_source['fold'] = -1

for fold, (train_idx, val_idx) in enumerate(folds):
    topic_df_non_source.loc[val_idx, "fold"] = fold




In [6]:
fold_df =  topic_df.merge(topic_df_non_source[['id', 'fold']], on='id', how='left').reset_index(drop=True)[['id', 'fold']].fillna(-1).rename(columns={'id': 'topic_id'})
fold_df['fold'] = fold_df['fold'].astype(int)

In [7]:
corr_df['content_ids'] = corr_df['content_ids'].apply(lambda x:x.split())
corr_df = corr_df.explode('content_ids').reset_index(drop=True)

In [8]:
topic_df = topic_df.fillna('')
topic_df['topic_full_text'] =  topic_df['title'] + ' [SEP] ' + topic_df['description']
topic_df = topic_df[['id', 'topic_full_text', 'language']]
df = corr_df.merge(topic_df, left_on='topic_id', right_on='id', how='left')
df = df[['topic_id','content_ids','topic_full_text','language']]
df = df.rename(columns={'language':'topic_language'})

In [9]:
content_df = content_df.fillna('')

content_df['content_full_text'] =  content_df['title'] + ' [SEP] ' + content_df['description'] + ' [SEP] ' + content_df['text']
content_df = content_df[['id', 'content_full_text', 'language']]
df = df.merge(content_df, left_on='content_ids', right_on='id', how='left')
df = df.rename(columns={'language':'content_language'})
df['label'] = 1

In [10]:
df.head()

Unnamed: 0,topic_id,content_ids,topic_full_text,topic_language,id,content_full_text,content_language,label
0,t_00004da3a1b2,c_1108dd0c7a5d,Откриването на резисторите [SEP] Изследване на...,bg,c_1108dd0c7a5d,Молив като резистор [SEP] Моливът причинява пр...,bg,1
1,t_00004da3a1b2,c_376c5a8eb028,Откриването на резисторите [SEP] Изследване на...,bg,c_376c5a8eb028,Да чуем променливото съпротивление [SEP] Тук ч...,bg,1
2,t_00004da3a1b2,c_5bc0e1e2cba0,Откриването на резисторите [SEP] Изследване на...,bg,c_5bc0e1e2cba0,Променлив резистор (реостат) с графит от молив...,bg,1
3,t_00004da3a1b2,c_76231f9d0b5e,Откриването на резисторите [SEP] Изследване на...,bg,c_76231f9d0b5e,Последователно свързване на галваничен елемент...,bg,1
4,t_00068291e9a4,c_639ea2ef9c95,Entradas e saídas de uma função [SEP] Entenda ...,pt,c_639ea2ef9c95,Dados e resultados de funções: gráficos [SEP] ...,pt,1


In [11]:
df = df[['topic_id', 'topic_full_text', 'content_full_text', 'label']]
df = pd.concat([df])
df = df.drop_duplicates()

In [12]:
df = df.merge(fold_df, left_on='topic_id', right_on='topic_id', how='left')
df = df[['topic_full_text', 'content_full_text', 'label' ,'fold']]

#df = df[df['fold'].isin([0, 1, 2, 3, 4])]

In [13]:
#df.to_csv('train_folds.csv', index=None)

In [14]:
def clean_text(text):
    for punctuation in list(string.punctuation): text = text.replace(punctuation, '')
    output = re.sub('\r+', ' ', text)
    output = re.sub('\n+', ' ', output)
    
    return output
df['topic_full_text'] = df['topic_full_text'].apply(lambda x:clean_text(x))
df['content_full_text'] = df['content_full_text'].apply(lambda x:clean_text(x))

In [15]:
#df = pd.read_csv('train_folds.csv')
df = df[df['fold'].isin([0, 1, 2, 3, 4])]

## create CFG

In [16]:
import numpy as np
import pandas as pd
import time
import math
from sklearn.metrics import f1_score
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
from transformers import BertTokenizer,AutoModel,AdamW,AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
import torch.nn.functional as F
from tqdm import tqdm
import copy
import torch.nn as nn
import os
import json
import gc
import random
from torch.cuda.amp import autocast, GradScaler

In [17]:
class CFG:
    input_path = '/root/autodl-tmp/'
    model_path = 'xlm-roberta-base' 
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5  # 1.5
    num_warmup_steps = 0
    max_input_length = 300
    epochs = 5  # 5
    encoder_lr = 20e-6
    decoder_lr = 1e-3
    min_lr = 0.5e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0
    num_fold = 5
    batch_size = 100
    seed = 1006
    OUTPUT_DIR = '/root/autodl-tmp/'
    num_workers = 2
    device='cuda'
    print_freq = 100
    apex=False
    start_awp_epoch = 2 # 开始AWP epoch
    adv_lr = 1e-5 # AWP学习率
    adv_eps = 1e-3 # AWP epsilon
    adv_step = 1 # AWP step

In [18]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(CFG.seed)

In [19]:
class TrainDataset(Dataset):
    def __init__(self,df,tokenizer):
        self.topic = df['topic_full_text'].values
        self.content = df['content_full_text'].values
        self.label = df['label'].values
        self.tokenizer = tokenizer
        self.sep_token = tokenizer.sep_token
    def __len__(self):
        return len(self.topic)
    def __getitem__(self, item):
        topic = self.topic[item].replace('[SEP]', self.sep_token)
        content = self.content[item].replace('[SEP]', self.sep_token)
        label = int(self.label[item])

        
        inputs_topic = self.tokenizer(topic, truncation=True, max_length=CFG.max_input_length, padding='max_length')
        inputs_content = self.tokenizer(content, truncation=True, max_length=CFG.max_input_length, padding='max_length')
        return torch.as_tensor(inputs_topic['input_ids'], dtype=torch.long), \
            torch.as_tensor(inputs_topic['attention_mask'], dtype=torch.long), \
            torch.as_tensor(inputs_content['input_ids'], dtype=torch.long), \
            torch.as_tensor(inputs_content['attention_mask'], dtype=torch.long), \
            torch.as_tensor(label, dtype=torch.float)

## build model

In [20]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.modeling_outputs import SequenceClassifierOutput

class Custom_Bert_Simple(nn.Module):
    def __init__(self):
        super().__init__()

        self.base = AutoModel.from_pretrained(CFG.model_path)
        self.config = AutoConfig.from_pretrained(CFG.model_path)
        self.linear = nn.Linear(self.config.hidden_size*3, 1)

    def forward(self,
        topic_input_ids,
        content_input_ids,
        topic_attention_mask=None,
        content_attention_mask=None, 
        labels=None):
        topic_output = self.base(input_ids=topic_input_ids,attention_mask=topic_attention_mask)
        topic_output = topic_output.last_hidden_state
        topic_output = torch.mean(topic_output, dim=1)

        content_output = self.base(input_ids=content_input_ids,attention_mask=content_attention_mask)
        content_output = content_output.last_hidden_state
        content_output = torch.mean(content_output, dim=1)

        diff = torch.abs(topic_output-content_output)

        sentence_embedding = torch.cat([topic_output, content_output, diff], 1)

        output = self.linear(sentence_embedding)
        
        loss = None
        if labels is not None:
            loss = F.binary_cross_entropy_with_logits(output.view(-1), labels.view(-1))
        
        return loss, sentence_embedding
    


## build logger

In [22]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [23]:
def get_logger(filename=CFG.OUTPUT_DIR+ 'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()
LOGGER.info('===============lr_{}==============='.format(CFG.encoder_lr))
LOGGER.info('===============seed_{}==============='.format(CFG.seed))
LOGGER.info('===============total_epochs_{}==============='.format(CFG.epochs))
LOGGER.info('===============num_warmup_steps_{}==============='.format(CFG.num_warmup_steps))



## build pipeline

### adversarial attacks

In [None]:
class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}
 
    def attack(self, epsilon=.01, emb_name='word_embedding'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)
 
    def restore(self, emb_name='word_embedding'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}

def train_fn_adv(train_loader, model, optimizer, epoch, scheduler, device):
    model.train()
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    fgm = FGM(model)
    for step, batch in enumerate(train_loader):
        batch = [i.to(device) for i in batch]
        topic_input_ids, topic_attention_mask, content_input_ids, content_attention_mask, label = batch
        batch_size = label.size(0)
        loss = model(topic_input_ids, content_input_ids, topic_attention_mask, content_attention_mask, label)[0]
        losses.update(loss.item(), batch_size)
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 500)
        # 对抗训练
        fgm.attack() # embedding被修改了
        loss_adv =model(topic_input_ids, content_input_ids, topic_attention_mask, content_attention_mask, label)[0]
        loss_adv.backward() # 反向传播，在正常的grad基础上，累加对抗训练的梯度
        fgm.restore() # 恢复Embedding的参数
        # 梯度下降，更新参数
        optimizer.step()
        optimizer.zero_grad()
        global_step += 1
        scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, step, len(train_loader),
                          remain=timeSince(start, float(step + 1) / len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    return losses.avg

## R-drop

In [None]:
def compute_kl_loss(p, q, pad_mask=None):
    
    p_loss = F.kl_div(F.log_softmax(p, dim=-1), F.softmax(q, dim=-1), reduction='none') # b, 36
    q_loss = F.kl_div(F.log_softmax(q, dim=-1), F.softmax(p, dim=-1), reduction='none')
    
    # pad_mask is for seq-level tasks
    if pad_mask is not None:
        p_loss.masked_fill_(pad_mask, 0.)
        q_loss.masked_fill_(pad_mask, 0.)

    p_loss = p_loss.sum()
    q_loss = q_loss.sum()

    loss = (p_loss + q_loss) / 2
    return loss

def train_fn_r_drop(train_loader, model, optimizer, epoch, scheduler, device):
    model.train()
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, batch in enumerate(train_loader):
        batch = [i.to(device) for i in batch]
        topic_input_ids, topic_attention_mask, content_input_ids, content_attention_mask, label = batch
        batch_size = label.size(0)
        loss_0,  = model(topic_input_ids, content_input_ids, topic_attention_mask, content_attention_mask, label)
        loss_0 = output_0.loss
        logits_0 = output_0.logits # batch , num_labels
        output_1 = model(input_ids, mask, labels=label)
        loss_1 = output_1.loss
        logits_1 = output_1.logits
        ce_loss = 0.5 * (loss_0 + loss_1)
        kl_loss = compute_kl_loss(logits_0, logits_1)
        loss = ce_loss + 0.5 * kl_loss
        losses.update(loss.item(), batch_size)
        optimizer.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 500)
        optimizer.step()
        global_step += 1
        scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, step, len(train_loader),
                          remain=timeSince(start, float(step + 1) / len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    return losses.avg

In [24]:
def train_fn(train_loader, model, optimizer, epoch, scheduler, device):
    model.train()
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, batch in enumerate(train_loader):
        batch = [i.to(device) for i in batch]
        topic_input_ids, topic_attention_mask, content_input_ids, content_attention_mask, label = batch
        batch_size = label.size(0)
        loss = model(topic_input_ids, content_input_ids, topic_attention_mask, content_attention_mask, label)
        losses.update(loss.item(), batch_size)
        optimizer.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 50000)
        optimizer.step()
        global_step += 1
        scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, step, len(train_loader),
                          remain=timeSince(start, float(step + 1) / len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    return losses.avg


def valid_fn(valid_loader, model, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    labels = []
    start = end = time.time()
    for step, batch in enumerate(valid_loader):
        label = batch[2].to(device)
        mask = batch[1].to(device)
        input_ids = batch[0].to(device)
        batch_size = label.size(0)
        with torch.no_grad():
            output = model(input_ids, mask, labels=label)
        loss = output.loss
        y_preds = output.logits.argmax(dim=-1)
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        labels.append(label.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader) - 1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step + 1) / len(valid_loader))))
    predictions = np.concatenate(preds)
    labels = np.concatenate(labels)
    #print(predictions)
    return losses.avg, predictions, labels

def train_loop(fold, model, train_dataset, valid_dataset):
    LOGGER.info(f"========== training ==========")

    # ====================================================
    # loader
    # ====================================================

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    #model = Custom_Bert_Simple()
    #model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
    model.to(CFG.device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
        ]
        return optimizer_parameters

    def get_optimizer(model):

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                'lr': CFG.encoder_lr, 'weight_decay': CFG.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                'lr': CFG.encoder_lr, 'weight_decay': 0.0}
            
        ]
        optimizer = AdamW(optimizer_parameters, lr = CFG.encoder_lr, eps = CFG.eps, betas = CFG.betas)
        return optimizer

    
    optimizer = get_optimizer(model)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        cfg.num_warmup_steps = cfg.num_warmup_steps * num_train_steps
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps,
                num_cycles=cfg.num_cycles
            )
        return scheduler

    num_train_steps = int(len(train_dataset) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    # criterion = torch.nn.CrossEntropyLoss(ignore_index=- 1)

    # criterion = LabelSmoothingLoss()
    best_score = float('inf')

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        #avg_loss = train_fn_awp(train_loader, model, optimizer, epoch, scheduler, CFG.device)
        
        avg_loss = train_fn(train_loader, model, optimizer, epoch, scheduler, CFG.device)
        # eval
        #avg_val_loss, predictions, valid_labels = valid_fn(valid_loader, model, CFG.device)

        # scoring
        #score = get_score(predictions, valid_labels)

        elapsed = time.time() - start_time

        LOGGER.info(
            f'Epoch {epoch + 1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s')
        #LOGGER.info(f'Epoch {epoch + 1} - Score: {score:.4f}')


        if best_score > avg_loss:
            best_score = avg_loss
            #best_predictions = predictions
            LOGGER.info(f'Epoch {epoch + 1} - Save Best Score: {best_score:.4f} Model')
            torch.save(model.state_dict(),
                       CFG.OUTPUT_DIR + "{}_best{}.pth".format(CFG.model_path.replace('/', '_'),fold))



    torch.cuda.empty_cache()
    gc.collect()
    del scheduler, optimizer, model
    return 

In [25]:
model = Custom_Bert_Simple()
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)
fold = 0
tr_data = df[df['fold']!=fold].reset_index(drop=True)
va_data = df[df['fold']==fold].reset_index(drop=True)
tr_dataset = TrainDataset(tr_data,tokenizer)
va_dataset = TrainDataset(va_data,tokenizer)
val_result = train_loop(fold, model,tr_dataset, va_dataset)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/831] Elapsed 0m 1s (remain 26m 51s) Loss: 806.6549(806.6549) Grad: 87543.5391  LR: 0.00002000  
Epoch: [1][100/831] Elapsed 1m 12s (remain 8m 43s) Loss: 20.1461(73.3689) Grad: 155.8936  LR: 0.00001997  
Epoch: [1][200/831] Elapsed 2m 23s (remain 7m 28s) Loss: 17.0593(45.8183) Grad: 40.3874  LR: 0.00001988  
Epoch: [1][300/831] Elapsed 3m 33s (remain 6m 16s) Loss: 15.6878(36.1321) Grad: 18.2892  LR: 0.00001974  
Epoch: [1][400/831] Elapsed 4m 44s (remain 5m 4s) Loss: 15.8164(31.1323) Grad: 18.4240  LR: 0.00001954  
Epoch: [1][500/831] Elapsed 5m 54s (remain 3m 53s) Loss: 15.6834(28.0694) Grad: 14.4285  LR: 0.00001929  
Epoch: [1][600/831] Elapsed 7m 5s (remain 2m 42s) Loss: 15.3419(25.9890) Grad: 10.3700  LR: 0.00001899  
Epoch: [1][700/831] Elapsed 8m 15s (remain 1m 31s) Loss: 16.0209(24.5034) Grad: 15.0206  LR: 0.00001863  
Epoch: [1][800/831] Elapsed 9m 26s (remain 0m 21s) Loss: 15.5440(23.3798) Grad: 15.3436  LR: 0.00001822  
Epoch: [1][830/831] Elapsed 9m 47s (remain 0

Epoch 1 - avg_train_loss: 23.0953  time: 587s
Epoch 1 - Save Best Score: 23.0953 Model


Epoch: [2][0/831] Elapsed 0m 1s (remain 20m 15s) Loss: 14.9996(14.9996) Grad: 16.3990  LR: 0.00001809  
Epoch: [2][100/831] Elapsed 1m 11s (remain 8m 37s) Loss: 15.8836(15.4350) Grad: 31.9335  LR: 0.00001762  
Epoch: [2][200/831] Elapsed 2m 21s (remain 7m 25s) Loss: 15.2005(15.4547) Grad: 30.5830  LR: 0.00001711  
Epoch: [2][300/831] Elapsed 3m 32s (remain 6m 13s) Loss: 15.5030(15.4238) Grad: 25.4832  LR: 0.00001656  
Epoch: [2][400/831] Elapsed 4m 42s (remain 5m 2s) Loss: 16.0732(15.3795) Grad: 21.1309  LR: 0.00001597  
Epoch: [2][500/831] Elapsed 5m 52s (remain 3m 52s) Loss: 14.7536(15.3508) Grad: 23.9177  LR: 0.00001534  
Epoch: [2][600/831] Elapsed 7m 2s (remain 2m 41s) Loss: 15.5580(15.3261) Grad: 38.9713  LR: 0.00001469  
Epoch: [2][700/831] Elapsed 8m 13s (remain 1m 31s) Loss: 14.9598(15.3139) Grad: 34.0624  LR: 0.00001401  
Epoch: [2][800/831] Elapsed 9m 23s (remain 0m 21s) Loss: 15.2251(15.3329) Grad: 35.1238  LR: 0.00001331  
Epoch: [2][830/831] Elapsed 9m 44s (remain 0m 0s) 

Epoch 2 - avg_train_loss: 15.3361  time: 585s
Epoch 2 - Save Best Score: 15.3361 Model


Epoch: [3][0/831] Elapsed 0m 1s (remain 19m 37s) Loss: 15.2550(15.2550) Grad: 47.4542  LR: 0.00001309  
Epoch: [3][100/831] Elapsed 1m 11s (remain 8m 38s) Loss: 14.2830(14.9392) Grad: 35.0368  LR: 0.00001236  
Epoch: [3][200/831] Elapsed 2m 22s (remain 7m 25s) Loss: 14.2871(14.6457) Grad: 44.5427  LR: 0.00001162  
Epoch: [3][300/831] Elapsed 3m 32s (remain 6m 13s) Loss: 12.1099(14.2348) Grad: 136.3542  LR: 0.00001087  
Epoch: [3][400/831] Elapsed 4m 42s (remain 5m 2s) Loss: 10.0546(13.5060) Grad: 130.6047  LR: 0.00001011  
Epoch: [3][500/831] Elapsed 5m 52s (remain 3m 52s) Loss: 10.9827(12.7933) Grad: 123.9030  LR: 0.00000936  
Epoch: [3][600/831] Elapsed 7m 2s (remain 2m 41s) Loss: 10.2919(12.1848) Grad: 311.5224  LR: 0.00000861  
Epoch: [3][700/831] Elapsed 8m 12s (remain 1m 31s) Loss: 8.0424(11.7302) Grad: 130.8241  LR: 0.00000786  
Epoch: [3][800/831] Elapsed 9m 22s (remain 0m 21s) Loss: 7.8816(11.3165) Grad: 91.4000  LR: 0.00000713  
Epoch: [3][830/831] Elapsed 9m 44s (remain 0m 0

Epoch 3 - avg_train_loss: 11.2065  time: 584s
Epoch 3 - Save Best Score: 11.2065 Model


Epoch: [4][0/831] Elapsed 0m 1s (remain 20m 46s) Loss: 8.6012(8.6012) Grad: 93.6070  LR: 0.00000691  
Epoch: [4][100/831] Elapsed 1m 11s (remain 8m 37s) Loss: 7.4083(8.0515) Grad: 117.4740  LR: 0.00000620  
Epoch: [4][200/831] Elapsed 2m 21s (remain 7m 24s) Loss: 6.1776(7.8014) Grad: 79.3413  LR: 0.00000551  
Epoch: [4][300/831] Elapsed 3m 32s (remain 6m 13s) Loss: 7.4478(7.6976) Grad: 115.0042  LR: 0.00000485  
Epoch: [4][400/831] Elapsed 4m 42s (remain 5m 2s) Loss: 6.8173(7.5749) Grad: 143.3564  LR: 0.00000422  
Epoch: [4][500/831] Elapsed 5m 52s (remain 3m 52s) Loss: 7.3648(7.5072) Grad: 178.0390  LR: 0.00000362  
Epoch: [4][600/831] Elapsed 7m 3s (remain 2m 41s) Loss: 8.3927(7.4817) Grad: 107.7244  LR: 0.00000305  
Epoch: [4][700/831] Elapsed 8m 13s (remain 1m 31s) Loss: 8.4091(7.4298) Grad: 112.7753  LR: 0.00000253  
Epoch: [4][800/831] Elapsed 9m 23s (remain 0m 21s) Loss: 7.3314(7.3719) Grad: 104.4535  LR: 0.00000205  
Epoch: [4][830/831] Elapsed 9m 44s (remain 0m 0s) Loss: 7.181

Epoch 4 - avg_train_loss: 7.3528  time: 585s
Epoch 4 - Save Best Score: 7.3528 Model


Epoch: [5][0/831] Elapsed 0m 1s (remain 21m 0s) Loss: 8.5523(8.5523) Grad: 102.4305  LR: 0.00000191  
Epoch: [5][100/831] Elapsed 1m 11s (remain 8m 38s) Loss: 5.7938(6.7671) Grad: 120.0356  LR: 0.00000149  
Epoch: [5][200/831] Elapsed 2m 21s (remain 7m 24s) Loss: 6.6514(6.7210) Grad: 91.3494  LR: 0.00000112  
Epoch: [5][300/831] Elapsed 3m 32s (remain 6m 13s) Loss: 7.1213(6.7228) Grad: 134.2628  LR: 0.00000079  
Epoch: [5][400/831] Elapsed 4m 42s (remain 5m 3s) Loss: 6.6185(6.7335) Grad: 96.8177  LR: 0.00000053  
Epoch: [5][500/831] Elapsed 5m 53s (remain 3m 52s) Loss: 5.8534(6.7059) Grad: 102.6544  LR: 0.00000031  
Epoch: [5][600/831] Elapsed 7m 3s (remain 2m 41s) Loss: 7.4289(6.6983) Grad: 108.3129  LR: 0.00000015  
Epoch: [5][700/831] Elapsed 8m 13s (remain 1m 31s) Loss: 6.4916(6.6908) Grad: 107.8180  LR: 0.00000005  
Epoch: [5][800/831] Elapsed 9m 23s (remain 0m 21s) Loss: 7.0968(6.6988) Grad: 105.1216  LR: 0.00000000  
Epoch: [5][830/831] Elapsed 9m 45s (remain 0m 0s) Loss: 7.4129

Epoch 5 - avg_train_loss: 6.6916  time: 585s
Epoch 5 - Save Best Score: 6.6916 Model


## Inference

In [14]:
import numpy as np
import pandas as pd
import time
import math
from sklearn.metrics import f1_score
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
from transformers import BertTokenizer,AutoModel,AdamW,AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
import torch.nn.functional as F
from tqdm import tqdm
import copy
import torch.nn as nn
import os
import json
import gc
import random
from torch.cuda.amp import autocast, GradScaler
import hnswlib

ModuleNotFoundError: No module named 'hnswlib'

In [None]:
class CFG:
    input_path = '/media/will/data/LECR'
    model_path = 'microsoft/mdeberta-v3-base' 
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5  # 1.5
    num_warmup_steps = 0
    max_input_length = 124
    epochs = 5  # 5
    encoder_lr = 20e-6
    decoder_lr = 1e-3
    min_lr = 0.5e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0
    num_fold = 5
    batch_size = 32
    seed = 1006
    OUTPUT_DIR = '/media/will/data/LECR'
    num_workers = 2
    device='cuda'
    print_freq = 100
    apex=False
    start_awp_epoch = 2 # 开始AWP epoch
    adv_lr = 1e-5 # AWP学习率
    adv_eps = 1e-3 # AWP epsilon
    adv_step = 1 # AWP step

In [None]:
class Custom_Bert_Simple(nn.Module):
    def __init__(self):
        super().__init__()

        self.base = AutoModel.from_pretrained(CFG.model_path)
        self.config = AutoConfig.from_pretrained(CFG.model_path)

    def forward(self,
        input_ids,
        attention_mask=None):
        output = self.base(input_ids=input_ids,attention_mask=attention_mask)
        output = output.last_hidden_state
        output = torch.mean(output, dim=1)
        return output

In [None]:
model = Custom_Bert_Simple()
model.load_state_dict(torch.load('LECRmicrosoft_mdeberta-v3-base_best0.pth'),strict=False)
model.to(CFG.device)
model.eval()

In [None]:
content_df = pd.read_csv('content.csv')
correlations_df = pd.read_csv('correlations.csv')
topics_df = pd.read_csv('topics.csv')
#topics_df = topics_df[topics_df['category']!='source'].reset_index(drop=True)
sub_df = pd.read_csv('sample_submission.csv')

In [None]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

In [None]:
class Testataset(Dataset):
    def __init__(self,df,tokenizer):
        self.title = df['title'].values
        self.description = df['description'].values
        self.text = None
        if 'text' in df.columns:
            self.text = df['text'].values
        
        self.tokenizer = tokenizer
        self.sep_token = tokenizer.sep_token
    def __len__(self):
        return len(self.title)
    def __getitem__(self, item):
        
        input_text = self.title[item]
        if isinstance(input_text, float):
            input_text = ''
        if not isinstance(self.description[item], float):
            #print(self.description[item])
            input_text += ' ' + self.sep_token + ' ' + self.description[item]
        
        if self.text is not None and not isinstance(self.text[item], float):
            input_text += ' ' + self.sep_token + self.text[item]
            
        output = self.tokenizer(input_text, truncation=True, max_length=CFG.max_input_length, padding='max_length')
        
        return torch.as_tensor(output['input_ids'], dtype=torch.long), \
            torch.as_tensor(output['attention_mask'], dtype=torch.long)

In [None]:
topic_dataset = Testataset(topics_df[topics_df['id'].isin(sub_df['topic_id'])], tokenizer)
content_dataset = Testataset(content_df, tokenizer)
topic_loader = DataLoader(topic_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
content_loader = DataLoader(content_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

In [None]:
def infer(model, dataloader):
    res = []
    for batch in tqdm(dataloader):
        input_ids, attention_mask = [i.to(CFG.device) for i in batch]
        with torch.no_grad():
            output = model(input_ids, attention_mask)
            res.append(output.cpu().numpy())
    
    return np.vstack(res)

In [79]:
topic_result = infer(model, topic_loader)

100%|██████████| 1/1 [00:00<00:00,  2.51it/s]


In [35]:
content_result = infer(model, content_loader)

100%|██████████| 2407/2407 [12:43<00:00,  3.15it/s]


In [46]:
content_ids = [i for i in range(len(content_df))]

In [47]:
def build_index(embeddings, ids):

    index = hnswlib.Index(space="cosine", dim=embeddings.shape[-1])

    # Initializing index
    # max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
    # during insertion of an element.
    # The capacity can be increased by saving/loading the index, see below.
    #
    # ef_construction - controls index search speed/build speed tradeoff
    #
    # M - is tightly connected with internal dimensionality of the data. Strongly affects memory consumption (~M)
    # Higher M leads to higher accuracy/run_time at fixed ef/efConstruction
    index.init_index(max_elements=embeddings.shape[0], ef_construction=200, M=160)

    # Controlling the recall by setting ef:
    # higher ef leads to better accuracy, but slower search
    index.set_ef(50)

    # Set number of threads used during batch search/construction
    # By default using all available cores
    index.set_num_threads(16)

    
    index.add_items(embeddings, ids)


    return index

In [48]:
content_index = build_index(content_result, content_ids)

In [80]:
results = content_index.knn_query(topic_result, k = 5, num_threads = -1)

In [81]:
pred = []
conten_uid = content_df['id']
for result in tqdm(results[0]):
    top_same = ' '.join(conten_uid[result].to_list())
    pred.append(top_same)

100%|██████████| 5/5 [00:00<00:00, 107.23it/s]


In [82]:
pred[0]

'c_9d61ca64065c c_7c38160748ad c_b922de5db068 c_5c0cfe8772fe c_88b54048c6ae'

In [83]:
sub_df['content_ids'] = pred
sub_df.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_9d61ca64065c c_7c38160748ad c_b922de5db068 c...
1,t_00068291e9a4,c_207bb2e7346f c_42c8b513508c c_3a9fabe1f4e0 c...
2,t_00069b63a70a,c_487defefd442 c_b7e629d2a6d0 c_74fc8d315563 c...
3,t_0006d41a73a8,c_96c5ae7cd9f9 c_aaac446c7b8b c_6953a88de9f6 c...
4,t_4054df11a74e,c_542e610aa1e1 c_4cc5d89eb9e3 c_2fc11e484b99 c...


In [84]:
sub_df.to_csv('submission.csv', index=None)