# Step 3: Get our reranker

Use the candidate pairs recalled in step 2 to continue fine-tuning the retriever model that was fine-tuned in step 1, turning it into a reranker.

In [1]:
import os
import gc
import time
import math
import random
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
from sklearn.model_selection import StratifiedGroupKFold

In [2]:
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Presumably tells transformers to skip its advisory warnings (inferred from the variable name).
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
# Notebook magic: sets TOKENIZERS_PARALLELISM=true in the process environment.
%env TOKENIZERS_PARALLELISM=true
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=true


In [3]:
class CFG:
    # ---- run bookkeeping ----
    seed = 1006                      # fed to seed_everything
    n_folds = 5
    OUTPUT_DIR = '/root/autodl-tmp/'  # prefix of the training log file
    print_freq = 2000                # progress line every N steps in train/valid loops
    num_workers = 4                  # DataLoader worker processes

    # ---- backbone and tokenizer ----
    model = 'autodl-tmp/paraphrase-distilroberta-base-v1-exp21_fold0_epochs10'
    # Loaded once, at class-definition time, from the same checkpoint as the model.
    tokenizer = AutoTokenizer.from_pretrained(model)
    max_len = 512                    # every sample is padded/truncated to this many tokens
    gradient_checkpointing = False

    # ---- optimisation ----
    epochs = 10
    batch_size = 64
    encoder_lr = 1e-5                # learning rate for the pretrained encoder
    decoder_lr = 1e-4                # learning rate for the parameters outside the encoder
    weight_decay = 0.01
    eps = 1e-6                       # AdamW epsilon
    betas = (0.9, 0.999)             # AdamW betas
    max_grad_norm = 0.012            # gradient-norm clipping threshold

    # ---- learning-rate schedule (cosine with warmup) ----
    warmup_ratio = 0.1               # fraction of all training steps spent warming up
    num_cycles = 0.5

In [4]:
# def modity_state_dict(cfg):
#     config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
#     config.hidden_dropout = 0.0
#     config.hidden_dropout_prob = 0.0
#     config.attention_dropout = 0.0
#     config.attention_probs_dropout_prob = 0.0
#     model = AutoModel.from_pretrained(cfg.model, config=config)
#     state = model.state_dict()
    
#     state['embeddings.position_ids'] = torch.arange(0, cfg.max_len, 1).unsqueeze(dim=0)
#     state['embeddings.position_embeddings.weight'] = torch.randn(cfg.max_len, 768)
    
#     return state

In [5]:
def seed_everything(cfg):
    """Seed all random number generators used here from ``cfg.seed``.

    Covers Python's ``random``, NumPy, and PyTorch (CPU and the current CUDA
    device), exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic
    mode.
    """
    seed = cfg.seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

In [6]:
def f2_score(y_true, y_pred):
    """Return the mean per-row F2 score, rounded to 4 decimals.

    Args:
        y_true: iterable (e.g. a pandas Series) of strings holding the
            whitespace-separated ground-truth ids of each row.
        y_pred: iterable of strings holding the whitespace-separated
            predicted ids of each row, aligned by position with ``y_true``.

    Returns:
        The mean of ``tp / (tp + 0.2 * fp + 0.8 * fn)`` over all rows.
        A row where both id sets are empty scores 0.0 instead of producing
        NaN, and an empty input yields 0.0.
    """
    scores = []
    for true_ids, pred_ids in zip(y_true, y_pred):
        truth = set(true_ids.split())
        predicted = set(pred_ids.split())
        tp = len(truth & predicted)
        fp = len(predicted - truth)
        fn = len(truth - predicted)
        denominator = tp + 0.2 * fp + 0.8 * fn
        # denominator is 0 only when both sets are empty; avoid 0/0 -> NaN.
        scores.append(tp / denominator if denominator > 0 else 0.0)
    if not scores:
        return 0.0
    return round(np.mean(scores), 4)

In [7]:
# def combine_data():
#     train = pd.DataFrame()
#     for i in range(1,6):
#         print(f'========read file {i}========')
#         tmp = pd.read_csv(f'autodl-tmp/recall_data_top100_fold0_exp21_{i}.csv')
#         train = pd.concat([train,tmp], axis=0, ignore_index=True)
#         train.drop_duplicates(subset=['topics_ids','content_ids'], keep='first', inplace=True, ignore_index=True)
#         del tmp
#         gc.collect()
#     train.to_csv('autodl-tmp/recall_data_top100_fold0_exp21_full.csv')
#     return train

In [8]:
# def get_max_length(train, cfg):
#     max_length = 0
#     for index, row in tqdm(train.iterrows(), total = len(train)):
#         length = len(cfg.tokenizer(row['title1']+'[TAC]'+row['title2'], add_special_tokens=True)['input_ids'])
#         max_length = max(max_length, length)
#     cfg.max_len = max_length + 2 # cls & sep
#     print(f"max_len: {cfg.max_len}")

In [9]:
def prepare_input(text, cfg):
    """Tokenize one text into fixed-length ``torch.long`` tensors.

    Args:
        text: the string to encode.
        cfg: config object providing ``tokenizer`` and ``max_len``.

    Returns:
        The tokenizer's encoding with every field converted to a
        ``torch.long`` tensor. Sequences are truncated and padded to exactly
        ``cfg.max_len`` tokens so samples can be stacked into a batch.
    """
    text_enc = cfg.tokenizer.encode_plus(
        text,
        return_tensors = None,
        add_special_tokens = True,
        max_length = cfg.max_len,
        # `padding='max_length'` is the supported spelling of the deprecated
        # `pad_to_max_length=True` argument.
        padding = 'max_length',
        truncation = True
    )

    # Convert in place so the tokenizer's own container type is returned.
    for k, v in text_enc.items():
        text_enc[k] = torch.tensor(v, dtype = torch.long)

    return text_enc

In [10]:
class custom_dataset(Dataset):
    """Dataset yielding ``(tokenized text, float label)`` pairs.

    Reads the ``text`` and ``target`` columns of ``df``; tokenization is
    done lazily, per item, through ``prepare_input``.
    """

    def __init__(self, df, cfg):
        self.cfg = cfg
        self.text = df['text'].values
        self.labels = df['target'].values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        raw_text = self.text[item]
        raw_label = self.labels[item]
        encoded = prepare_input(raw_text, self.cfg)
        return encoded, torch.tensor(raw_label, dtype = torch.float)

In [11]:
def collate(inputs):
    """Trim a padded batch to the length of its longest real sequence.

    The longest sequence is found from the row sums of
    ``inputs["attention_mask"]``; every entry of ``inputs`` is then cut to
    that many columns. ``inputs`` is modified in place and returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

In [12]:
class MeanPooling(nn.Module):
    """Average token embeddings over the positions kept by the attention mask."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked-out tokens
        # contribute nothing to the sum.
        mask = attention_mask.unsqueeze(-1).expand_as(last_hidden_state).float()
        summed = (last_hidden_state * mask).sum(1)
        # Clamp the token count so a fully masked row does not divide by zero.
        token_counts = mask.sum(1).clamp(min=1e-9)
        return summed / token_counts
    
class custom_model(nn.Module):
    """Pretrained encoder + mean pooling + a single-logit linear head."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states = True)
        # Switch off every dropout setting on the config before the encoder is built.
        for dropout_attr in (
            'hidden_dropout',
            'hidden_dropout_prob',
            'attention_dropout',
            'attention_probs_dropout_prob',
        ):
            setattr(self.config, dropout_attr, 0.0)
        # Plain load of the checkpoint with the adjusted config.
        self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        if cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Re-initialise a Linear, Embedding or LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Return the mask-aware mean of the encoder's last hidden state."""
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])

    def forward(self, inputs):
        """Return one raw logit per sample."""
        return self.fc(self.feature(inputs))

In [13]:
class AverageMeter:
    """Tracks the latest value and the running weighted average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, the average, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.val = val
        self.sum, self.count = self.sum + val * n, self.count + n
        self.avg = self.sum / self.count

def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    # Fractional seconds are dropped, not rounded.
    return f'{int(whole_minutes)}m {int(leftover_seconds)}s'

def timeSince(since, percent):
    """Describe elapsed time and the projected time remaining.

    ``since`` is a ``time.time()`` timestamp taken at the start of the work
    and ``percent`` is the fraction of the work completed so far; the total
    duration is extrapolated linearly from the two.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '{} (remain {})'.format(asMinutes(elapsed), asMinutes(remaining))

In [14]:
def get_logger(filename=CFG.OUTPUT_DIR+ 'train'):
    """Return this module's logger, writing bare messages to console and file.

    Args:
        filename: path of the log file without extension; ``.log`` is
            appended.

    Returns:
        The ``logging`` logger named after this module, at INFO level, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running this
    # cell would stack handlers and print every message several times.
    # Drop (and close) whatever an earlier call attached first.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Shared logger for the run; start the log with the key hyper-parameters.
LOGGER = get_logger()
LOGGER.info(f'===============lr_{CFG.encoder_lr}===============')
LOGGER.info(f'===============seed_{CFG.seed}===============')
LOGGER.info(f'===============total_epochs_{CFG.epochs}===============')



In [15]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg):
    """Train ``model`` for one epoch under CUDA AMP and return the mean loss.

    Args:
        train_loader: iterable yielding ``(inputs, target)`` batches, where
            ``inputs`` is a mapping of tensors that includes
            ``attention_mask``.
        model: module called as ``model(inputs)``.
        criterion: loss called as ``criterion(logits.view(-1), target)``.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch index, used only in the progress line.
        scheduler: learning-rate scheduler stepped once per batch.
        device: device the batch tensors are moved to.
        cfg: config providing ``max_grad_norm`` and ``print_freq``.

    Returns:
        The sample-weighted average training loss of the epoch.
    """
    model.train()
    # NOTE: a fresh GradScaler is created on every call, so its loss scale
    # restarts from the default at the start of each epoch.
    scaler = torch.cuda.amp.GradScaler(enabled = True)
    losses = AverageMeter()
    start = time.time()
    num_steps = len(train_loader)
    for step, (inputs, target) in enumerate(train_loader):
        # Cut the padding down to the longest sequence in this batch.
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        target = target.to(device)
        batch_size = target.size(0)
        with torch.cuda.amp.autocast(enabled = True):
            y_preds = model(inputs)
            loss = criterion(y_preds.view(-1), target)
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        # Unscale before clipping so the threshold applies to the true gradients.
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()
        if step % cfg.print_freq == 0 or step == (num_steps - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, 
                          step, 
                          num_steps, 
                          remain = timeSince(start, float(step + 1) / num_steps),
                          loss = losses,
                          grad_norm = grad_norm,
                          # get_last_lr() is the supported way to read the
                          # rate the scheduler last set; get_lr() is meant
                          # for internal use inside step().
                          lr = scheduler.get_last_lr()[0]))
    return losses.avg

In [16]:
def valid_fn(valid_loader, model, criterion, device, cfg):
    """Run ``model`` over ``valid_loader`` without gradients.

    Returns a tuple ``(average loss, predictions)`` where ``predictions`` is
    a flat NumPy array of sigmoid probabilities, one per sample, in loader
    order.
    """
    model.eval()
    losses = AverageMeter()
    batch_probabilities = []
    started = time.time()
    total_batches = len(valid_loader)
    for step, (inputs, target) in enumerate(tqdm(valid_loader)):
        # Drop the padding beyond the longest sequence of this batch.
        inputs = collate(inputs)
        for name, tensor in inputs.items():
            inputs[name] = tensor.to(device)
        target = target.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1), target)
        losses.update(loss.item(), target.size(0))
        probabilities = y_preds.sigmoid().squeeze().to('cpu').numpy()
        batch_probabilities.append(probabilities.reshape(-1))
        is_last_batch = step == (total_batches - 1)
        if step % cfg.print_freq == 0 or is_last_batch:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, 
                          total_batches,
                          loss = losses,
                          remain = timeSince(started, float(step + 1) / total_batches)))
    return losses.avg, np.concatenate(batch_probabilities, axis = 0)

In [17]:
def get_best_threshold(x_val, val_predictions, correlations):
    """Search thresholds 0.001 … 0.999 for the one with the best F2 score.

    Args:
        x_val: DataFrame of candidate pairs with ``topics_ids`` and
            ``content_ids`` columns. It is only read, never modified.
        val_predictions: one probability per row of ``x_val``, in row order.
        correlations: ground-truth DataFrame with a ``topic_id`` column and
            a ``content_ids`` column of whitespace-separated ids.

    Returns:
        ``(best_score, best_threshold)``. ``best_threshold`` is ``None``
        (with ``best_score == 0``) when no threshold scores above zero.
    """
    val_predictions = np.asarray(val_predictions)
    pairs = x_val[['topics_ids', 'content_ids']]

    # Loop-invariant work: the evaluated topics and their ground truth do
    # not depend on the threshold, so resolve them once.
    truth = pd.DataFrame({'topic_id': pairs['topics_ids'].unique()})
    truth = truth.merge(correlations, how = 'left', on = 'topic_id')
    true_ids = truth['content_ids']
    topic_order = truth['topic_id'].tolist()

    best_score = 0
    best_threshold = None
    for thres in np.arange(0.001, 1, 0.001):
        # Keep the pairs predicted positive at this threshold and collect
        # the distinct content ids of each topic.
        positives = pairs[val_predictions > thres]
        grouped = positives.groupby('topics_ids')['content_ids'].unique()
        predicted = {topic: ' '.join(ids) for topic, ids in grouped.items()}
        # Topics without any positive pair get an empty prediction string.
        pred_ids = pd.Series(
            [predicted.get(topic, "") for topic in topic_order],
            index = true_ids.index,
        )
        score = f2_score(true_ids, pred_ids)
        if score > best_score:
            best_score = score
            best_threshold = thres
    return best_score, best_threshold

In [18]:
def train_and_evaluate_one_fold(train, correlations, fold, cfg):
    """Fine-tune the reranker with ``fold`` held out and report its F2 score.

    Rows of ``train`` whose ``fold`` column equals ``fold`` form the
    validation set; all other rows are used for training. After every epoch
    the validation probabilities are thresholded with
    ``get_best_threshold``; whenever the score improves, the model weights
    and the validation predictions are saved to
    ``<cfg.model with '/' replaced by '-'>_fold<fold>_rerank_exp21_top100_5models.pth``.

    Args:
        train: DataFrame with ``text``, ``target``, ``fold``, ``topics_ids``
            and ``content_ids`` columns.
        correlations: ground-truth DataFrame passed to ``get_best_threshold``.
        fold: value of the ``fold`` column to hold out.
        cfg: configuration object (see ``CFG``).

    Returns:
        None. Results are logged through ``LOGGER``.
    """
    print(f"========== fold: {fold} training ==========")
    # Split train & validation
    x_train = train[train['fold'] != fold]
    x_val = train[train['fold'] == fold]
    train_dataset = custom_dataset(x_train, cfg)
    valid_dataset = custom_dataset(x_val, cfg)
    
    print("build dataloader")
    train_loader = DataLoader(
        train_dataset, 
        batch_size = cfg.batch_size, 
        shuffle = True, 
        num_workers = cfg.num_workers, 
        pin_memory = True, 
        drop_last = True
    )
    valid_loader = DataLoader(
        valid_dataset, 
        batch_size = cfg.batch_size, 
        shuffle = False, 
        num_workers = cfg.num_workers, 
        pin_memory = True, 
        drop_last = False
    )
    
    # Get model
    model = custom_model(cfg)
    # Add special token
    # print(f'vocab_size={model.model.config.vocab_size}, tokenizer_len={len(cfg.tokenizer)}')
    # model.model.resize_token_embeddings(len(cfg.tokenizer))
    # print(f'Add special token, vocab_size={model.model.config.vocab_size}, tokenizer_len={len(cfg.tokenizer)}')
    model.to(device)
    # Optimizer
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay = 0.0):
        """Build three parameter groups: decayed encoder weights, non-decayed
        encoder biases/LayerNorm parameters, and everything outside the encoder."""
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
            'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters
    optimizer_parameters = get_optimizer_params(
        model, 
        encoder_lr = cfg.encoder_lr, 
        decoder_lr = cfg.decoder_lr,
        weight_decay = cfg.weight_decay
    )
    optimizer = AdamW(
        optimizer_parameters, 
        lr = cfg.encoder_lr, 
        eps = cfg.eps, 
        betas = cfg.betas
    )
    # The scheduler is stepped once per batch, and the train loader drops the
    # last partial batch, so count the batches it really yields rather than
    # deriving the total from the row count.
    num_train_steps = len(train_loader) * cfg.epochs
    num_warmup_steps = int(num_train_steps * cfg.warmup_ratio)
    # Scheduler
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, 
        num_warmup_steps = num_warmup_steps, 
        num_training_steps = num_train_steps, 
        num_cycles = cfg.num_cycles
        )
    # Training & Validation loop
    criterion = nn.BCEWithLogitsLoss(reduction = "mean")
    best_score = 0
    # Threshold that produced best_score; stays None if no epoch scores above 0.
    best_threshold = None
    for epoch in range(cfg.epochs):
        print(f'Epoch {epoch+1} start train')
        start_time = time.time()
        # Train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg)
        # Validation
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device, cfg)
        # Compute f2_score
        print(f'Epoch {epoch+1} start get best threshold')
        score, threshold = get_best_threshold(x_val, predictions, correlations)
        elapsed = time.time() - start_time
        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        # get_best_threshold returns None when no threshold scores above 0;
        # formatting None with ':.5f' would raise.
        threshold_text = 'n/a' if threshold is None else f'{threshold:.5f}'
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f} - Threshold: {threshold_text}')
        if score > best_score:
            best_score = score
            best_threshold = threshold
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save(
                {'model': model.state_dict(), 'predictions': predictions}, 
                f"{cfg.model.replace('/', '-')}_fold{fold}_rerank_exp21_top100_5models.pth"
                )
    torch.cuda.empty_cache()
    gc.collect()
    # The best epoch's score and threshold were already computed inside the
    # loop, so report them directly instead of repeating the search (which
    # also failed on an unbound variable when no epoch ever improved).
    LOGGER.info(f'Our CV score is {best_score} using a threshold of {best_threshold}')

In [19]:
# Seed Python, NumPy and PyTorch from CFG.seed before any data is loaded or any model is built.
seed_everything(CFG)

In [20]:
# train = combine_data()

In [21]:
# train = pd.read_csv('autodl-tmp/recall_data_top100_fold0_exp21_full.csv', usecols=['topics_ids','content_ids','target','fold'])
# train.head()

In [22]:
# change full_text to title
# topics_df = pd.read_csv('topics.csv', usecols=['id', 'title']).fillna("")
# content_df = pd.read_csv('content.csv', usecols=['id', 'title']).fillna("")
# train = train.merge(topics_df, how='left', left_on='topics_ids', right_on='id').drop(['id'], axis=1).rename(columns={'title':'topic_title'})
# train = train.merge(content_df, how='left', left_on='content_ids', right_on='id').drop(['id'], axis=1).rename(columns={'title':'content_title'})
# train['text'] = train['topic_title'] + '[SEP]' + train['content_title']
# train.drop(['topic_title', 'content_title'], axis=1, inplace=True)

# del topics_df, content_df
# gc.collect()

# train.head()

In [23]:
# train.to_csv('autodl-tmp/recall_data_top100_fold0_exp21_title.csv')

In [24]:
# Load the prepared training table. The code below reads its 'text', 'target',
# 'fold', 'topics_ids' and 'content_ids' columns.
train = pd.read_csv('autodl-tmp/recall_data_top100_fold0_exp21_title.csv')

In [25]:
# Ground-truth table: get_best_threshold joins it on 'topic_id' and scores
# predictions against its 'content_ids' column.
correlations = pd.read_csv('correlations.csv')

In [None]:
# Train and evaluate one fold: rows with fold == 0 are held out for validation.
train_and_evaluate_one_fold(train, correlations, 0, CFG)

build dataloader
Epoch 1 start train
Epoch: [1][0/191842] Elapsed 0m 2s (remain 6562m 6s) Loss: 0.7001(0.7001) Grad: 5.7036  LR: 0.00000000  
Epoch: [1][2000/191842] Elapsed 3m 48s (remain 362m 2s) Loss: 0.2815(0.5644) Grad: 3.1695  LR: 0.00000010  
Epoch: [1][4000/191842] Elapsed 7m 37s (remain 357m 49s) Loss: 0.0582(0.3507) Grad: 0.6148  LR: 0.00000021  
Epoch: [1][6000/191842] Elapsed 11m 24s (remain 353m 20s) Loss: 0.0089(0.2649) Grad: 0.2097  LR: 0.00000031  
Epoch: [1][8000/191842] Elapsed 15m 13s (remain 349m 43s) Loss: 0.0097(0.2222) Grad: 0.2448  LR: 0.00000042  
Epoch: [1][10000/191842] Elapsed 18m 59s (remain 345m 18s) Loss: 0.2324(0.1956) Grad: 1.2684  LR: 0.00000052  
Epoch: [1][12000/191842] Elapsed 22m 47s (remain 341m 35s) Loss: 0.0105(0.1785) Grad: 0.2813  LR: 0.00000063  
Epoch: [1][14000/191842] Elapsed 26m 33s (remain 337m 25s) Loss: 0.0697(0.1663) Grad: 0.6603  LR: 0.00000073  
Epoch: [1][16000/191842] Elapsed 30m 20s (remain 333m 28s) Loss: 0.0887(0.1565) Grad: 0.

  0%|          | 0/48332 [00:00<?, ?it/s]

EVAL: [0/48332] Elapsed 0m 1s (remain 1429m 9s) Loss: 0.6380(0.6380) 
EVAL: [2000/48332] Elapsed 0m 38s (remain 15m 0s) Loss: 0.2341(0.2192) 
EVAL: [4000/48332] Elapsed 1m 22s (remain 15m 16s) Loss: 0.0656(0.1869) 
EVAL: [6000/48332] Elapsed 2m 5s (remain 14m 46s) Loss: 0.1222(0.1710) 
EVAL: [8000/48332] Elapsed 2m 45s (remain 13m 52s) Loss: 0.0005(0.1591) 
EVAL: [10000/48332] Elapsed 3m 24s (remain 13m 4s) Loss: 0.1169(0.1542) 
EVAL: [12000/48332] Elapsed 4m 9s (remain 12m 35s) Loss: 0.2145(0.1533) 
EVAL: [14000/48332] Elapsed 5m 1s (remain 12m 20s) Loss: 0.1781(0.1543) 
EVAL: [16000/48332] Elapsed 5m 52s (remain 11m 53s) Loss: 0.0032(0.1561) 
EVAL: [18000/48332] Elapsed 6m 38s (remain 11m 12s) Loss: 0.3083(0.1595) 
EVAL: [20000/48332] Elapsed 7m 14s (remain 10m 16s) Loss: 0.0006(0.1598) 
EVAL: [22000/48332] Elapsed 8m 5s (remain 9m 40s) Loss: 0.0008(0.1481) 
EVAL: [24000/48332] Elapsed 8m 51s (remain 8m 58s) Loss: 0.0015(0.1384) 
EVAL: [26000/48332] Elapsed 9m 39s (remain 8m 17s) Los

Epoch 1 - avg_train_loss: 0.0894  avg_val_loss: 0.0770  time: 23683s
Epoch 1 - Score: 0.3951 - Threshold: 0.00700
Epoch 1 - Save Best Score: 0.3951 Model


Epoch 2 start train
Epoch: [2][0/191842] Elapsed 0m 1s (remain 6115m 46s) Loss: 0.0046(0.0046) Grad: 0.7005  LR: 0.00001000  
Epoch: [2][2000/191842] Elapsed 3m 44s (remain 355m 7s) Loss: 0.2849(0.0853) Grad: 2.6948  LR: 0.00001000  
Epoch: [2][4000/191842] Elapsed 7m 32s (remain 353m 44s) Loss: 0.0003(0.0858) Grad: 0.0167  LR: 0.00001000  
Epoch: [2][6000/191842] Elapsed 11m 17s (remain 349m 41s) Loss: 0.0004(0.0859) Grad: 0.0114  LR: 0.00001000  
Epoch: [2][8000/191842] Elapsed 15m 2s (remain 345m 48s) Loss: 0.0512(0.0857) Grad: 3.0009  LR: 0.00001000  
Epoch: [2][10000/191842] Elapsed 18m 49s (remain 342m 12s) Loss: 0.0682(0.0865) Grad: 2.0958  LR: 0.00001000  
Epoch: [2][12000/191842] Elapsed 22m 34s (remain 338m 22s) Loss: 0.1046(0.0867) Grad: 1.3413  LR: 0.00001000  
Epoch: [2][14000/191842] Elapsed 26m 18s (remain 334m 14s) Loss: 0.0920(0.0864) Grad: 2.0612  LR: 0.00001000  
Epoch: [2][16000/191842] Elapsed 30m 5s (remain 330m 41s) Loss: 0.1249(0.0862) Grad: 0.8915  LR: 0.000010