In [1]:
import sys
print(sys.version)
print(sys.executable)

3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]
/root/miniconda3/bin/python


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from multiprocesspandas import applyparallel
from tqdm import tqdm
from transformers import AutoTokenizer
import regex as re
import string

In [3]:
N_SPLITS = 5
topic_df = pd.read_csv('topics.csv')
content_df = pd.read_csv('content.csv')
corr_df = pd.read_csv('correlations.csv')
# topic_df = topic_df.rename(columns={'id': 'topic_id'}).merge(corr_df)
topic_df_non_source = topic_df[topic_df['category']!='source'].reset_index(drop=True)
topic_df_non_source['stratify'] = topic_df_non_source['category'] + \
topic_df_non_source['language'] + topic_df_non_source['description'].apply(lambda x: str(isinstance(x, str))) + \
topic_df_non_source['has_content'].apply(str)

In [4]:
kf = StratifiedGroupKFold(n_splits=N_SPLITS)
folds = list(kf.split(topic_df_non_source, y=topic_df_non_source["stratify"], groups=topic_df_non_source["channel"]))
topic_df_non_source['fold'] = -1

for fold, (train_idx, val_idx) in enumerate(folds):
    topic_df_non_source.loc[val_idx, "fold"] = fold



In [5]:
fold_df =  topic_df.merge(topic_df_non_source[['id', 'fold']], on='id', how='left').reset_index(drop=True)[['id', 'fold']].fillna(-1).rename(columns={'id': 'topic_id'})
fold_df['fold'] = fold_df['fold'].astype(int)
corr_df['content_ids'] = corr_df['content_ids'].apply(lambda x:x.split())
corr_df = corr_df.explode('content_ids').reset_index(drop=True)

In [6]:
topic_df = topic_df.fillna('')
topic_df['topic_full_text'] =  topic_df['title'] + ' [SEP] ' + topic_df['description']
topic_df = topic_df[['id','title' ,'topic_full_text', 'language']]
df = corr_df.merge(topic_df, left_on='topic_id', right_on='id', how='left')
df = df[['topic_id','content_ids','topic_full_text','language', 'title']]
df = df.rename(columns={'language':'topic_language', 'title': 'topic_title'})

In [7]:
content_df = content_df.fillna('')
content_df['content_full_text'] =  content_df['title'] + ' [SEP] ' + content_df['description'] + ' [SEP] ' + content_df['text']
content_df = content_df[['id', 'title', 'content_full_text', 'language']]
df = df.merge(content_df, left_on='content_ids', right_on='id', how='left')
df = df.rename(columns={'language':'content_language', 'title': 'content_title'})
df['label'] = 1

In [8]:
df = df.merge(fold_df, on='topic_id', how='left')

In [9]:
df.head()

Unnamed: 0,topic_id,content_ids,topic_full_text,topic_language,topic_title,id,content_title,content_full_text,content_language,label,fold
0,t_00004da3a1b2,c_1108dd0c7a5d,Откриването на резисторите [SEP] Изследване на...,bg,Откриването на резисторите,c_1108dd0c7a5d,Молив като резистор,Молив като резистор [SEP] Моливът причинява пр...,bg,1,-1
1,t_00004da3a1b2,c_376c5a8eb028,Откриването на резисторите [SEP] Изследване на...,bg,Откриването на резисторите,c_376c5a8eb028,Да чуем променливото съпротивление,Да чуем променливото съпротивление [SEP] Тук ч...,bg,1,-1
2,t_00004da3a1b2,c_5bc0e1e2cba0,Откриването на резисторите [SEP] Изследване на...,bg,Откриването на резисторите,c_5bc0e1e2cba0,Променлив резистор (реостат) с графит от молив,Променлив резистор (реостат) с графит от молив...,bg,1,-1
3,t_00004da3a1b2,c_76231f9d0b5e,Откриването на резисторите [SEP] Изследване на...,bg,Откриването на резисторите,c_76231f9d0b5e,Последователно свързване на галваничен елемент...,Последователно свързване на галваничен елемент...,bg,1,-1
4,t_00068291e9a4,c_639ea2ef9c95,Entradas e saídas de uma função [SEP] Entenda ...,pt,Entradas e saídas de uma função,c_639ea2ef9c95,Dados e resultados de funções: gráficos,Dados e resultados de funções: gráficos [SEP] ...,pt,1,-1


In [10]:
neg_df = pd.read_parquet('random_negative_for_recall_exp4.parquet')

In [11]:
neg_df = neg_df.merge(fold_df, on='topic_id', how='left')

In [12]:
neg_df = neg_df[['topic_full_text', 'content_full_text', 'topic_id','label', 'fold']]

In [13]:
df = df[['topic_full_text', 'content_full_text', 'topic_id','label', 'fold']]
df = pd.concat([df, neg_df])
df.head()

Unnamed: 0,topic_full_text,content_full_text,topic_id,label,fold
0,Откриването на резисторите [SEP] Изследване на...,Молив като резистор [SEP] Моливът причинява пр...,t_00004da3a1b2,1,-1
1,Откриването на резисторите [SEP] Изследване на...,Да чуем променливото съпротивление [SEP] Тук ч...,t_00004da3a1b2,1,-1
2,Откриването на резисторите [SEP] Изследване на...,Променлив резистор (реостат) с графит от молив...,t_00004da3a1b2,1,-1
3,Откриването на резисторите [SEP] Изследване на...,Последователно свързване на галваничен елемент...,t_00004da3a1b2,1,-1
4,Entradas e saídas de uma função [SEP] Entenda ...,Dados e resultados de funções: gráficos [SEP] ...,t_00068291e9a4,1,-1


## clean text

In [14]:
def clean_text(text):
    for punctuation in list(string.punctuation): text = text.replace(punctuation, '')
    output = re.sub('\r+', ' ', text)
    output = re.sub('\n+', ' ', output)
    
    return output

In [15]:
#df = pd.read_parquet('train_fold_recall_exp1.parquet')

In [16]:
#df.to_parquet('train_fold_recall_exp3.parquet')

In [17]:
#neg_df.to_parquet('random_negative_for_recall_exp1.parquet')

## create CFG

In [18]:
import numpy as np
import pandas as pd
import time
import math
from sklearn.metrics import f1_score
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
from transformers import BertTokenizer,AutoModel,AdamW,AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
import torch.nn.functional as F
from tqdm import tqdm
import copy
import torch.nn as nn
import os
import json
import gc
import random
from torch.cuda.amp import autocast, GradScaler
import hnswlib

In [19]:
class CFG:
    input_path = '/root/autodl-tmp/'
    model_path = 'xlm-roberta-base' 
    exp_name = 'recall_exp4'
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5  # 1.5
    num_warmup_steps = 0.1
    max_input_length = 256
    epochs = 5  # 5
    encoder_lr = 20e-6
    decoder_lr = 1e-3
    min_lr = 0.5e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0
    num_fold = 5
    batch_size = 128
    seed = 1006
    OUTPUT_DIR = '/root/autodl-tmp/'
    num_workers = 2
    device='cuda'
    print_freq = 100
    apex=False
    start_awp_epoch = 2 # 开始AWP epoch
    adv_lr = 1e-5 # AWP学习率
    adv_eps = 1e-3 # AWP epsilon
    adv_step = 1 # AWP step

In [20]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(CFG.seed)

In [21]:
class TrainDataset(Dataset):
    def __init__(self,df,tokenizer):
        self.topic = df['topic_full_text'].values
        self.content = df['content_full_text'].values
        self.label = df['label'].values
        self.tokenizer = tokenizer
        self.sep_token = tokenizer.sep_token
    def __len__(self):
        return len(self.topic)
    def __getitem__(self, item):
        topic = self.topic[item].replace('[SEP]', self.sep_token)
        content = self.content[item].replace('[SEP]', self.sep_token)
        label = int(self.label[item])

        
        inputs_topic = self.tokenizer(topic, truncation=True, max_length=CFG.max_input_length, padding='max_length')
        inputs_content = self.tokenizer(content, truncation=True, max_length=CFG.max_input_length, padding='max_length')
        return torch.as_tensor(inputs_topic['input_ids'], dtype=torch.long), \
            torch.as_tensor(inputs_topic['attention_mask'], dtype=torch.long), \
            torch.as_tensor(inputs_content['input_ids'], dtype=torch.long), \
            torch.as_tensor(inputs_content['attention_mask'], dtype=torch.long), \
            torch.as_tensor(label, dtype=torch.float)
    
    
class TopicTestDataset(Dataset):
    def __init__(self,df,tokenizer):
        self.input = df['topic_full_text'].values
        self.tokenizer = tokenizer
        self.sep_token = tokenizer.sep_token
    def __len__(self):
        return len(self.input)
    def __getitem__(self, item):
        
        input_text = self.input[item]
        output = self.tokenizer(input_text, truncation=True, max_length=CFG.max_input_length, padding='max_length')
        
        return torch.as_tensor(output['input_ids'], dtype=torch.long), \
            torch.as_tensor(output['attention_mask'], dtype=torch.long)
    
class ContentTestDataset(Dataset):
    def __init__(self,df,tokenizer):
        self.input = df['content_full_text'].values
        self.tokenizer = tokenizer
        self.sep_token = tokenizer.sep_token
    def __len__(self):
        return len(self.input)
    def __getitem__(self, item):
        
        input_text = self.input[item]
        output = self.tokenizer(input_text, truncation=True, max_length=CFG.max_input_length, padding='max_length')
        
        return torch.as_tensor(output['input_ids'], dtype=torch.long), \
            torch.as_tensor(output['attention_mask'], dtype=torch.long)

## build model

In [22]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.modeling_outputs import SequenceClassifierOutput

class Custom_Bert_Simple(nn.Module):
    def __init__(self):
        super().__init__()

        self.base = AutoModel.from_pretrained(CFG.model_path)
        self.config = AutoConfig.from_pretrained(CFG.model_path)
        self.drop = nn.Dropout(0.1)
        self.linear = nn.Linear(self.config.hidden_size*3, 1)

    def forward(self,
        topic_input_ids,
        content_input_ids,
        topic_attention_mask=None,
        content_attention_mask=None, 
        labels=None):
        topic_output = self.base(input_ids=topic_input_ids,attention_mask=topic_attention_mask)
        topic_output = self.drop(topic_output.last_hidden_state)
        topic_output_mask = torch.unsqueeze(topic_attention_mask, 2)
        #print(topic_output_mask)
        topic_output *= topic_output_mask
        topic_output = torch.sum(topic_output, dim=1)
        topic_output_mask = torch.sum(topic_output_mask, dim=1)
        topic_output /= topic_output_mask
        #print(topic_output)
        
        content_output = self.base(input_ids=content_input_ids,attention_mask=content_attention_mask)
        content_output = self.drop(content_output.last_hidden_state)
        content_output_mask = torch.unsqueeze(content_attention_mask, 2)
        content_output *= content_output_mask
        content_output = torch.sum(content_output, dim=1)
        content_output_mask = torch.sum(content_output_mask, dim=1)
        content_output /= content_output_mask
        
        
        diff = torch.abs(topic_output-content_output)
        sentence_embedding = torch.cat([topic_output, content_output, diff], 1)

        output = self.linear(sentence_embedding)
        
        loss = None
        if labels is not None:
            loss = F.binary_cross_entropy_with_logits(output.view(-1), labels.view(-1))
        
        return loss

In [23]:
class Custom_Bert_SimCSE(nn.Module):
    def __init__(self, margin=0.3, scale=30):
        super().__init__()

        self.base = AutoModel.from_pretrained(CFG.model_path)
        self.config = AutoConfig.from_pretrained(CFG.model_path)
        self.margin = margin
        # Used scaling cosine similarity to ease converge
        self.sacle = scale

    def forward(self,
        topic_input_ids,
        content_input_ids,
        topic_attention_mask=None,
        content_attention_mask=None, 
        labels=None):
        topic_output = self.base(input_ids=topic_input_ids,attention_mask=topic_attention_mask)
        topic_output = topic_output.last_hidden_state
        topic_output = torch.mean(topic_output, dim=1)

        content_output = self.base(input_ids=content_input_ids,attention_mask=content_attention_mask)
        content_output = content_output.last_hidden_state
        content_output = torch.mean(content_output, dim=1)
        
        # topic: 64 * 768
        # content: 64 * 768
        cosine_sim = torch.matmul(topic_output, content_output.T)
        # 64 * 64 --> batch * batch 其中 cosine_sim[i][j] 第i个topic和第j个content

        # substract margin from all positive samples cosine_sim()
        margin_diag = torch.full(
            [topic_output.shape[0]], fill_value=self.margin
        )
        # margin_diag --》batch * batch 元素全是fill_value的一个矩阵
        
        cosine_sim = cosine_sim - torch.diag(margin_diag).to(CFG.device)

        # scale cosine to ease training converge
        cosine_sim *= self.sacle
        
        ## batch * batch 

        labels = torch.arange(0, topic_output.shape[0])
        labels = torch.reshape(labels, shape=[-1]).to(CFG.device)
        
        # print(cosine_sim.shape)
        # print(labels.shape)
        
        
        loss = F.cross_entropy(cosine_sim, labels)
        
        return loss

In [24]:
class Custom_Bert_SimCSE_Test(nn.Module):
    def __init__(self):
        super().__init__()

        self.base = AutoModel.from_pretrained(CFG.model_path)
        self.config = AutoConfig.from_pretrained(CFG.model_path)

    def forward(self,
        input_ids,
        attention_mask=None):
        output = self.base(input_ids=input_ids,attention_mask=attention_mask)
        output = output.last_hidden_state
        output = torch.mean(output, dim=1)
        return output

In [25]:
class Custom_Bert_Simple_Test(nn.Module):
    def __init__(self):
        super().__init__()

        self.base = AutoModel.from_pretrained(CFG.model_path)
        self.config = AutoConfig.from_pretrained(CFG.model_path)
        self.drop = nn.Dropout(0.1)
        self.linear = nn.Linear(self.config.hidden_size*3, 1)

    def forward(self,
        input_ids,
        attention_mask=None):
        output = self.base(input_ids=input_ids,attention_mask=attention_mask)
        output = self.drop(output.last_hidden_state)
        attention_mask = torch.unsqueeze(attention_mask, 2)
        #print(topic_output_mask)
        output *= attention_mask
        output = torch.sum(output, dim=1)
        attention_mask = torch.sum(attention_mask, dim=1)
        output /= attention_mask
        return output

## build logger

In [26]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [27]:
def get_logger(filename=CFG.OUTPUT_DIR+ 'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()
LOGGER.info('===============lr_{}==============='.format(CFG.encoder_lr))
LOGGER.info('===============seed_{}==============='.format(CFG.seed))
LOGGER.info('===============total_epochs_{}==============='.format(CFG.epochs))
LOGGER.info('===============num_warmup_steps_{}==============='.format(CFG.num_warmup_steps))



## build pipeline

In [28]:
def build_index(embeddings, ids):

    index = hnswlib.Index(space="cosine", dim=embeddings.shape[-1])

    # Initializing index
    # max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
    # during insertion of an element.
    # The capacity can be increased by saving/loading the index, see below.
    #
    # ef_construction - controls index search speed/build speed tradeoff
    #
    # M - is tightly connected with internal dimensionality of the data. Strongly affects memory consumption (~M)
    # Higher M leads to higher accuracy/run_time at fixed ef/efConstruction
    index.init_index(max_elements=embeddings.shape[0], ef_construction=200, M=1000)

    # Controlling the recall by setting ef:
    # higher ef leads to better accuracy, but slower search
    index.set_ef(1000)

    # Set number of threads used during batch search/construction
    # By default using all available cores
    index.set_num_threads(16)

    
    index.add_items(embeddings, ids)


    return index

In [29]:
def train_fn(train_loader, model, optimizer, epoch, scheduler, device):
    model.train()
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, batch in enumerate(train_loader):
        batch = [i.to(device) for i in batch]
        topic_input_ids, topic_attention_mask, content_input_ids, content_attention_mask, label = batch
        batch_size = label.size(0)
        loss = model(topic_input_ids, content_input_ids, topic_attention_mask, content_attention_mask, label)
        losses.update(loss.item(), batch_size)
        optimizer.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 500)
        optimizer.step()
        global_step += 1
        scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, step, len(train_loader),
                          remain=timeSince(start, float(step + 1) / len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    return losses.avg


def infer(model, dataloader):
    res = []
    for batch in tqdm(dataloader):
        input_ids, attention_mask = [i.to(CFG.device) for i in batch]
        with torch.no_grad():
            output = model(input_ids, attention_mask)
            res.append(output.cpu().numpy())

    return np.vstack(res)


def recall(targets, preds): return len([x for x in targets if x in preds])/(len(targets)+ 1e-16)

def f2_score(y_true, y_pred):

    y_true = [set(i.split()) for i in y_true]
    y_pred = [set(i.split()) for i in y_pred]
    tp = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    fp = np.array([len(x[1] - x[0]) for x in zip(y_true, y_pred)])
    fn = np.array([len(x[0] - x[1]) for x in zip(y_true, y_pred)])
    precision = tp / (tp + fp)
    recs = [recall(t,p) for t,p in list(zip(y_true, y_pred))]
    f2 = tp / (tp + 0.2 * fp + 0.8 * fn)
    return round(f2.mean(), 4), np.nanmean(recs)


def valid_fn(val_df, tokenizer):
    val_df = val_df.reset_index(drop=True)
    val_topic_id = val_df['topic_id'].unique().tolist()
    content_df = pd.read_csv('content.csv')
    content_df = content_df.fillna('')
    content_df['content_full_text'] = content_df['title'] + ' [SEP] ' + content_df['description'] + ' [SEP] ' + content_df['text']
    topic_df = pd.read_csv('topics.csv')
    topic_df = topic_df[topic_df['id'].isin(val_topic_id)]
    topic_df = topic_df.fillna('')
    topic_df['topic_full_text'] = topic_df['title'] + ' [SEP] ' + topic_df['description']
    topic_dataset = TopicTestDataset(topic_df, tokenizer)
    content_dataset = ContentTestDataset(content_df, tokenizer)
    topic_loader = DataLoader(topic_dataset,
                                  batch_size=CFG.batch_size * 2,
                                  shuffle=False,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    content_loader = DataLoader(content_dataset,
                                  batch_size=CFG.batch_size * 2,
                                  shuffle=False,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    model = Custom_Bert_Simple_Test()
    model.load_state_dict(torch.load(CFG.OUTPUT_DIR + "{}_{}_best{}_tmp.pth".format(CFG.model_path.replace('/', '_'),CFG.exp_name,fold)),strict=False)
    model.to(CFG.device)
    model.eval()
    topic_result = infer(model, topic_loader)
    content_result = infer(model, content_loader)
    content_ids = [i for i in range(len(content_df))]
    content_index = build_index(content_result, content_ids)
    results = content_index.knn_query(topic_result, k = 5, num_threads = -1)
    pred = []
    content_uid = content_df['id']
    for result in tqdm(results[0]):
        top_same = ' '.join(content_uid[result].to_list())
        pred.append(top_same)
    corr_df_init = pd.read_csv('correlations.csv')
    corr_df_init = corr_df_init[corr_df_init['topic_id'].isin(val_topic_id)]
    gts = topic_df.merge(corr_df_init, how='left', left_on='id', right_on='topic_id')['content_ids'].to_list()                                 
    score, recall = f2_score(gts, pred)
    del model
    gc.collect()
    return score, recall

def train_loop(fold, model, train_dataset, va_data):
    LOGGER.info(f"========== training ==========")

    # ====================================================
    # loader
    # ====================================================

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    
    # ====================================================
    # model & optimizer
    # ====================================================
    #model = Custom_Bert_Simple()
    #model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
    model.to(CFG.device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
        ]
        return optimizer_parameters

    def get_optimizer(model):

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                'lr': CFG.encoder_lr, 'weight_decay': CFG.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                'lr': CFG.encoder_lr, 'weight_decay': 0.0}
            
        ]
        optimizer = AdamW(optimizer_parameters, lr = CFG.encoder_lr, eps = CFG.eps, betas = CFG.betas)
        return optimizer

    
    optimizer = get_optimizer(model)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        cfg.num_warmup_steps = cfg.num_warmup_steps * num_train_steps
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps,
                num_cycles=cfg.num_cycles
            )
        return scheduler

    num_train_steps = int(len(train_dataset) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    # criterion = torch.nn.CrossEntropyLoss(ignore_index=- 1)

    # criterion = LabelSmoothingLoss()
    best_score = 0

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        #avg_loss = 0.01#train_fn_awp(train_loader, model, optimizer, epoch, scheduler, CFG.device)
        
        avg_loss = train_fn(train_loader, model, optimizer, epoch, scheduler, CFG.device)
        # eval
        torch.save(model.state_dict(),
                       CFG.OUTPUT_DIR + "{}_{}_best{}_tmp.pth".format(CFG.model_path.replace('/', '_'),CFG.exp_name,fold))
        
        score, recall = valid_fn(va_data, tokenizer)

        # scoring
        #score = get_score(predictions, valid_labels)

        elapsed = time.time() - start_time

        LOGGER.info(
            f'Epoch {epoch + 1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch + 1} - Score: {score:.4f} - Recall:{recall:.4f}')
        
        

        if best_score < score:
            best_score = score
            #best_predictions = predictions
            LOGGER.info(f'Epoch {epoch + 1} - Save Best Score: {best_score:.4f} Model')
            torch.save(model.state_dict(),
                       CFG.OUTPUT_DIR + "{}_{}_best{}.pth".format(CFG.model_path.replace('/', '_'),CFG.exp_name,fold))



    torch.cuda.empty_cache()
    gc.collect()
    del scheduler, optimizer, model
    return 

In [30]:
model = Custom_Bert_Simple()
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)
fold = 0
tr_data = df[df['fold']!=fold].reset_index(drop=True)
va_data = df[df['fold']==fold].reset_index(drop=True)
tr_dataset = TrainDataset(tr_data,tokenizer)
va_dataset = TrainDataset(va_data,tokenizer)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## train

In [None]:
va_data.to_parquet('va_data_0_{}.parquet'.format(CFG.exp_name))
val_result = train_loop(fold, model,tr_dataset, va_data)



Epoch: [1][0/41910] Elapsed 0m 3s (remain 2182m 21s) Loss: 0.7683(0.7683) Grad: 13.0857  LR: 0.00000000  
Epoch: [1][100/41910] Elapsed 1m 15s (remain 523m 30s) Loss: 0.7702(0.7685) Grad: 12.6939  LR: 0.00000010  
Epoch: [1][200/41910] Elapsed 2m 28s (remain 513m 33s) Loss: 0.7298(0.7597) Grad: 11.6402  LR: 0.00000019  
Epoch: [1][300/41910] Elapsed 3m 41s (remain 509m 38s) Loss: 0.6841(0.7431) Grad: 12.0506  LR: 0.00000029  
Epoch: [1][400/41910] Elapsed 4m 53s (remain 507m 2s) Loss: 0.5979(0.7187) Grad: 10.7986  LR: 0.00000038  
Epoch: [1][500/41910] Elapsed 6m 6s (remain 504m 54s) Loss: 0.4241(0.6797) Grad: 7.5669  LR: 0.00000048  
Epoch: [1][600/41910] Elapsed 7m 19s (remain 503m 8s) Loss: 0.1288(0.6064) Grad: 2.0316  LR: 0.00000057  
Epoch: [1][700/41910] Elapsed 8m 31s (remain 501m 31s) Loss: 0.1922(0.5480) Grad: 2.2735  LR: 0.00000067  
Epoch: [1][800/41910] Elapsed 9m 44s (remain 500m 3s) Loss: 0.1445(0.5022) Grad: 0.9380  LR: 0.00000076  
Epoch: [1][900/41910] Elapsed 10m 57s 

## eval

In [26]:
#va_data.to_csv('val_0.csv', index=None)
va_data = pd.read_parquet('va_data_0_exp1.parquet')  # random_negative_for_recall_exp4

In [27]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
def infer_sbert(model, dataloader):
    res = []
    for batch in tqdm(dataloader):
        input_ids, attention_mask = [i.to(CFG.device) for i in batch]
        with torch.no_grad():
            output = model(input_ids, attention_mask)
            # Perform pooling
            sentence_embeddings = mean_pooling(output, attention_mask)

            # Normalize embeddings
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
            res.append(sentence_embeddings.cpu().numpy())

    return np.vstack(res)

def recall(targets, preds): return len([x for x in targets if x in preds])/(len(targets)+ 1e-16)

def f2_score(y_true, y_pred):

    y_true = [set(i.split()) for i in y_true]
    y_pred = [set(i.split()) for i in y_pred]
    tp = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    fp = np.array([len(x[1] - x[0]) for x in zip(y_true, y_pred)])
    fn = np.array([len(x[0] - x[1]) for x in zip(y_true, y_pred)])
    precision = tp / (tp + fp)
    recs = [recall(t,p) for t,p in list(zip(y_true, y_pred))]
    f2 = tp / (tp + 0.2 * fp + 0.8 * fn)
    return round(f2.mean(), 4), np.nanmean(recs)
    
def valid_fn(val_df, fold=0):
    #CFG.model_path = 'sentence-transformers/all-MiniLM-L12-v2'
    tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)
    val_df = val_df.reset_index(drop=True)
    val_topic_id = val_df['topic_id'].unique().tolist()
    content_df = pd.read_csv('content.csv')
    content_df = content_df.fillna('')
    content_df['content_full_text'] = content_df['title'] + ' [SEP] ' + content_df['description'] + ' [SEP] ' + content_df['text']
    content_df['content_full_text'] = content_df['content_full_text'].apply(lambda x:clean_text(x))
    topic_df = pd.read_csv('topics.csv')
    topic_df = topic_df[topic_df['id'].isin(val_topic_id)]
    topic_df = topic_df.fillna('')
    topic_df['topic_full_text'] = topic_df['title'] + ' [SEP] ' + topic_df['description']
    topic_df['topic_full_text'] = topic_df['topic_full_text'].apply(lambda x:clean_text(x))
    topic_dataset = TopicTestDataset(topic_df, tokenizer)
    content_dataset = ContentTestDataset(content_df, tokenizer)
    topic_loader = DataLoader(topic_dataset,
                                  batch_size=CFG.batch_size * 2,
                                  shuffle=False,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    content_loader = DataLoader(content_dataset,
                                  batch_size=CFG.batch_size * 2,
                                  shuffle=False,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    
    #model = AutoModel.from_pretrained(CFG.model_path)
    model = Custom_Bert_Simple_Test()
    model.load_state_dict(torch.load(CFG.OUTPUT_DIR + "{}_{}_best{}.pth".format(CFG.model_path.replace('/', '_'),CFG.exp_name,fold),strict=False))
    model.to(CFG.device)
    model.eval()
    topic_result = infer(model, topic_loader)
    content_result = infer(model, content_loader)
    content_ids = [i for i in range(len(content_df))]
    content_index = build_index(content_result, content_ids)
    results = content_index.knn_query(topic_result, k = 100, num_threads = -1)
    pred = []
    content_uid = content_df['id']
    for result in tqdm(results[0]):
        top_same = ' '.join(content_uid[result].to_list())
        pred.append(top_same)
    corr_df_init = pd.read_csv('correlations.csv')
    corr_df_init = corr_df_init[corr_df_init['topic_id'].isin(val_topic_id)]
    val_corr_df = topic_df.merge(corr_df_init, how='left', left_on='id', right_on='topic_id')
    val_corr_df['pred'] = pred
    gts = val_corr_df['content_ids'].to_list()                                 
    score, recall = f2_score(gts, pred)
    del model
    gc.collect()
    return score, recall, val_corr_df

score, recall, val_corr_df = valid_fn(va_data)
print(score)
print( )

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 105/105 [00:21<00:00,  4.99it/s]
100%|██████████| 2407/2407 [07:59<00:00,  5.02it/s]
100%|██████████| 6665/6665 [01:10<00:00, 94.92it/s]


0.0368
0.21595706789469252


In [33]:
val_corr_df.head()

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content,topic_full_text,topic_id,content_ids,pred
0,t_001bcbb22694,Lección 1,,6e90a7,aligned,6,es,t_5d10d6819e04,True,Lección 1 [SEP],t_001bcbb22694,c_1d9dfc709413,c_87da6a40ebc8 c_f274e9c1688a c_31ff9848b09b c...
1,t_001c75b83927,Lección 2,,6e90a7,aligned,6,es,t_b2ae11936b02,True,Lección 2 [SEP],t_001c75b83927,c_60d8a4f8eff9,c_87da6a40ebc8 c_f274e9c1688a c_31ff9848b09b c...
2,t_0021d8020514,Lección 2,,6e90a7,aligned,6,es,t_e26cb5145027,True,Lección 2 [SEP],t_0021d8020514,c_e7e44cb2c32d,c_87da6a40ebc8 c_f274e9c1688a c_31ff9848b09b c...
3,t_002dfcaaf1d7,2.9: L'Hôpital's Rule,,1fb613,supplemental,5,en,t_e19c46e71ee3,True,2.9: L'Hôpital's Rule [SEP],t_002dfcaaf1d7,c_7c35d77064e5,c_5d40a2fae718 c_c62372581afa c_e12deaea53ef c...
4,t_003e944a4758,Lección 12,,6e90a7,aligned,6,es,t_c059c108eb80,True,Lección 12 [SEP],t_003e944a4758,c_8f6966ad85f6,c_87da6a40ebc8 c_f274e9c1688a c_43df58c3332a c...


In [35]:
def f(x1, x2):
    return x1 in x2
val_corr_df['if_recall'] = val_corr_df.apply(lambda x: f(x['content_ids'], x['pred']), axis=1)

In [37]:
np.where(val_corr_df, 1, 0).sum() / len(val_corr_df)

12.155138784696174

In [38]:
val_corr_df

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content,topic_full_text,topic_id,content_ids,pred,if_recall
0,t_001bcbb22694,Lección 1,,6e90a7,aligned,6,es,t_5d10d6819e04,True,Lección 1 [SEP],t_001bcbb22694,c_1d9dfc709413,c_87da6a40ebc8 c_f274e9c1688a c_31ff9848b09b c...,True
1,t_001c75b83927,Lección 2,,6e90a7,aligned,6,es,t_b2ae11936b02,True,Lección 2 [SEP],t_001c75b83927,c_60d8a4f8eff9,c_87da6a40ebc8 c_f274e9c1688a c_31ff9848b09b c...,False
2,t_0021d8020514,Lección 2,,6e90a7,aligned,6,es,t_e26cb5145027,True,Lección 2 [SEP],t_0021d8020514,c_e7e44cb2c32d,c_87da6a40ebc8 c_f274e9c1688a c_31ff9848b09b c...,False
3,t_002dfcaaf1d7,2.9: L'Hôpital's Rule,,1fb613,supplemental,5,en,t_e19c46e71ee3,True,2.9: L'Hôpital's Rule [SEP],t_002dfcaaf1d7,c_7c35d77064e5,c_5d40a2fae718 c_c62372581afa c_e12deaea53ef c...,False
4,t_003e944a4758,Lección 12,,6e90a7,aligned,6,es,t_c059c108eb80,True,Lección 12 [SEP],t_003e944a4758,c_8f6966ad85f6,c_87da6a40ebc8 c_f274e9c1688a c_43df58c3332a c...,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6660,t_ffd57d147a69,Lección 20,,6e90a7,aligned,6,es,t_83ed18a38507,True,Lección 20 [SEP],t_ffd57d147a69,c_02c73701948c,c_87da6a40ebc8 c_31ff9848b09b c_7048090da02f c...,False
6661,t_ffdc013937fc,Book: Introduction to Algebraic Structures (De...,,1fb613,supplemental,3,en,t_6888f65a0882,True,Book: Introduction to Algebraic Structures (De...,t_ffdc013937fc,c_c27c5e711e25,c_cd80b4931223 c_e7ce72a553bd c_b16f03694474 c...,True
6662,t_fff05585df72,11: Systems of Equations and Inequalities,,1fb613,supplemental,4,en,t_5ab3d2eac617,True,11: Systems of Equations and Inequalities [SEP],t_fff05585df72,c_6f255c97f381 c_743e6319d5ae c_88bc7ee86c8b c...,c_933ec74fe303 c_88bc7ee86c8b c_ad9da9f1a277 c...,False
6663,t_fff7782561f4,Introduction,"In certain situations, comparison by division ...",d5fb04,supplemental,3,en,t_2a4dc28b0431,True,"Introduction [SEP] In certain situations, comp...",t_fff7782561f4,c_bca8280a9ad1,c_58e4b13049f6 c_8760f3b2c12f c_69b61f90d63e c...,True
