这一部分代码用于训练出我们的baseline模型

In [5]:
!pip install kaggle

Looking in indexes: https://repo.huaweicloud.com/repository/pypi/simple














In [7]:
!mkdir -p /root/.kaggle
!cp kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [8]:
!kaggle competitions download -c learning-equality-curriculum-recommendations

Downloading learning-equality-curriculum-recommendations.zip to /root

 99%|███████████████████████████████████████▋| 252M/254M [00:34<00:00, 18.5MB/s]

100%|████████████████████████████████████████| 254M/254M [00:35<00:00, 7.57MB/s]


In [9]:
!unzip learning-equality-curriculum-recommendations.zip

Archive:  learning-equality-curriculum-recommendations.zip

  inflating: content.csv             

  inflating: correlations.csv        

  inflating: sample_submission.csv   

  inflating: topics.csv              


In [10]:
!pip install transformers -q
!pip install multiprocesspandas -q
!pip install sentencepiece



Looking in indexes: https://repo.huaweicloud.com/repository/pypi/simple

Collecting sentencepiece

  Downloading https://repo.huaweicloud.com/repository/pypi/packages/0e/7e/a69d054029c7c0470e490b3265bbd1497df9492599b1820b9d5be2c60444/sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)

[K     |████████████████████████████████| 1.3 MB 8.2 MB/s eta 0:00:01

[?25hInstalling collected packages: sentencepiece

Successfully installed sentencepiece-0.1.97



In [13]:
pip install scikit-learn

Looking in indexes: https://repo.huaweicloud.com/repository/pypi/simple







Note: you may need to restart the kernel to use updated packages.


# CV Split

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from multiprocesspandas import applyparallel
from tqdm import tqdm

In [15]:
N_SPLITS = 5

In [16]:
# Load the three competition tables.
topic_df = pd.read_csv('topics.csv')
content_df = pd.read_csv('content.csv')
corr_df = pd.read_csv('correlations.csv')

# The test set has no topics of category 'source', so the baseline ignores them for now.
topic_df_non_source = topic_df.loc[topic_df['category'] != 'source'].reset_index(drop=True)

# Stratification key: category + language + "description is a string" flag + has_content.
topic_df_non_source['stratify'] = (
    topic_df_non_source['category']
    + topic_df_non_source['language']
    + topic_df_non_source['description'].apply(lambda value: str(isinstance(value, str)))
    + topic_df_non_source['has_content'].apply(str)
)

In [17]:
kf = StratifiedGroupKFold(n_splits=N_SPLITS)
# All topics of one topic tree should land on the same side of a split (train or
# validation) to avoid leakage, so the grouping key is the channel.
folds = list(kf.split(topic_df_non_source, y=topic_df_non_source["stratify"], groups=topic_df_non_source["channel"]))
# -1 marks "no fold yet"; every row is overwritten below because each row is in
# exactly one validation split.
topic_df_non_source['fold'] = -1

# A row's fold is the split in which it is part of the validation indices.
for fold, (train_idx, val_idx) in enumerate(folds):
    topic_df_non_source.loc[val_idx, "fold"] = fold




In [18]:
# Map every topic id to its fold; topics that were left out of the split get -1.
fold_df = (
    topic_df
    .merge(topic_df_non_source[['id', 'fold']], on='id', how='left')
    .reset_index(drop=True)
    .loc[:, ['id', 'fold']]
    .fillna(-1)
    .rename(columns={'id': 'topic_id'})
)
# The left merge introduces NaN, which turns the column into float; cast it back.
fold_df['fold'] = fold_df['fold'].astype(int)

In [19]:
# One row per (topic_id, content_id): split the whitespace-separated id list, then explode it.
corr_df = (
    corr_df
    .assign(content_ids=corr_df['content_ids'].map(str.split))
    .explode('content_ids')
    .reset_index(drop=True)
)

In [20]:
# Topic text is "<title> [SEP] <description>"; missing fields become empty strings first.
topic_df = topic_df.fillna('')
topic_df['topic_full_text'] = topic_df['title'] + ' [SEP] ' + topic_df['description']
topic_df = topic_df[['id', 'topic_full_text', 'language']]

# Attach the topic text and language to every correlated (topic, content) pair.
df = (
    corr_df
    .merge(topic_df, left_on='topic_id', right_on='id', how='left')
    .loc[:, ['topic_id', 'content_ids', 'topic_full_text', 'language']]
    .rename(columns={'language': 'topic_language'})
)

In [21]:
# Content text is "<title> [SEP] <description> [SEP] <text>", with missing fields blanked.
content_df = content_df.fillna('')
content_df['content_full_text'] = (
    content_df['title'] + ' [SEP] ' + content_df['description'] + ' [SEP] ' + content_df['text']
)
content_df = content_df[['id', 'content_full_text', 'language']]

# Every row comes from correlations.csv, i.e. it is a positive pair, so the
# similarity label is 1.
df = (
    df
    .merge(content_df, left_on='content_ids', right_on='id', how='left')
    .rename(columns={'language': 'content_language'})
    .assign(label=1)
)

In [23]:
df.head()

Unnamed: 0,topic_id,content_ids,topic_full_text,topic_language,id,content_full_text,content_language,label
0,t_00004da3a1b2,c_1108dd0c7a5d,Откриването на резисторите [SEP] Изследване на...,bg,c_1108dd0c7a5d,Молив като резистор [SEP] Моливът причинява пр...,bg,1
1,t_00004da3a1b2,c_376c5a8eb028,Откриването на резисторите [SEP] Изследване на...,bg,c_376c5a8eb028,Да чуем променливото съпротивление [SEP] Тук ч...,bg,1
2,t_00004da3a1b2,c_5bc0e1e2cba0,Откриването на резисторите [SEP] Изследване на...,bg,c_5bc0e1e2cba0,Променлив резистор (реостат) с графит от молив...,bg,1
3,t_00004da3a1b2,c_76231f9d0b5e,Откриването на резисторите [SEP] Изследване на...,bg,c_76231f9d0b5e,Последователно свързване на галваничен елемент...,bg,1
4,t_00068291e9a4,c_639ea2ef9c95,Entradas e saídas de uma função [SEP] Entenda ...,pt,c_639ea2ef9c95,Dados e resultados de funções: gráficos [SEP] ...,pt,1


# Random Sample According to Language
* 这里后续可以考虑尝试用 BM25 或 TF-IDF 相似度 > 0.8 的样本进行（难负样本）采样来提分

In [None]:
sample_n = 5          # negatives drawn per topic
NEG_SAMPLE_SEED = 42  # fixed seed so the sampled negatives are the same on every run


def negative_smaple(x, candidates):
    """Draw up to `sample_n` same-language rows from `candidates` for one topic group.

    Not called by the sampling loop in this cell; kept so existing callers keep working.

    Args:
        x: rows of a single topic; only the first `topic_language` value is read.
        candidates: frame with `content_language`, `topic_full_text` and
            `content_full_text` columns.

    Returns:
        The sampled `topic_full_text` / `content_full_text` rows.
    """
    # .iloc[0] is positional. The previous x['topic_language'][0] was a label lookup
    # and failed for any group whose index does not contain the label 0.
    topic_language = x['topic_language'].iloc[0]
    same_language = candidates[candidates['content_language'] == topic_language]
    return same_language[['topic_full_text', 'content_full_text']].sample(
        n=min(sample_n, len(same_language)))


rng = np.random.RandomState(NEG_SAMPLE_SEED)

# Split the candidate content texts by language once, instead of re-filtering the
# whole frame for every topic.
content_pool_by_language = {
    language: rows['content_full_text']
    for language, rows in df.groupby('content_language')
}

neg_df = []
for topic_id, sub_df in tqdm(df.groupby('topic_id', sort=False)):
    topic_language = sub_df['topic_language'].iloc[0]
    pool = content_pool_by_language.get(topic_language)
    if pool is None:
        continue  # no content at all in this topic's language
    # Contents already correlated with this topic must not be used as negatives.
    pool = pool[~pool.isin(sub_df['content_full_text'])]
    if pool.empty:
        continue
    picked = pool.sample(n=min(sample_n, len(pool)), random_state=rng)
    neg_df.append(pd.DataFrame({
        # Pair the sampled content with THIS topic's text. Previously the sampled row
        # kept the text of the topic it was correlated with, so a true positive pair
        # was written out with label 0.
        'topic_full_text': sub_df['topic_full_text'].iloc[0],
        'content_full_text': picked.to_numpy(),
        'topic_id': topic_id,
        'label': 0,  # sampled pairs are negatives
    }))

if neg_df:
    neg_df = pd.concat(neg_df, ignore_index=True)
else:
    neg_df = pd.DataFrame(columns=['topic_full_text', 'content_full_text', 'topic_id', 'label'])
neg_df.head()

 22%|██▏       | 13663/61517 [12:11<39:27, 20.21it/s]  

In [None]:
# Stack positives and sampled negatives on their shared columns and drop exact repeats.
df = (
    pd.concat([df[['topic_id', 'topic_full_text', 'content_full_text', 'label']], neg_df])
    .drop_duplicates()
)

In [None]:
# Attach the CV fold of each pair's topic.
df = df.merge(fold_df, on='topic_id', how='left')
df = df[['topic_full_text', 'content_full_text', 'label', 'fold']]
# Topics left out of the split carry fold == -1. Keep only real folds, derived
# from N_SPLITS so the filter follows the split configuration.
df = df[df['fold'].isin(list(range(N_SPLITS)))]

In [None]:
df.to_csv('train_folds.csv', index=None)

In [None]:
# Reload the pairs written to train_folds.csv so training can start from the file.
df = pd.read_csv('train_folds.csv')
# Keep only rows whose fold is one of 0-4.
df = df[df['fold'].isin([0, 1, 2, 3, 4])]

# Create CFG

In [None]:
import numpy as np
import pandas as pd
import time
import math
from sklearn.metrics import f1_score
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
from transformers import BertTokenizer,AutoModel,AdamW,AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
import torch.nn.functional as F
from tqdm import tqdm
import copy
import torch.nn as nn
import os
import json
import gc
import random
from torch.cuda.amp import autocast, GradScaler

In [None]:
class CFG:
    """Settings read by the training cells of this notebook."""

    # paths / model
    input_path = 'LECR'
    OUTPUT_DIR = 'LECR'
    model_path = 'microsoft/mdeberta-v3-base'

    # data
    max_input_length = 124
    batch_size = 32
    num_workers = 2
    num_fold = 5

    # optimisation
    epochs = 5
    encoder_lr = 20e-6
    decoder_lr = 1e-3
    min_lr = 0.5e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0

    # learning-rate schedule
    scheduler = 'cosine'  # one of: 'linear', 'cosine'
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 0

    # runtime
    seed = 42
    device = 'cuda'
    print_freq = 100
    apex = False

    # AWP
    start_awp_epoch = 2  # epoch at which AWP starts
    adv_lr = 1e-5        # AWP learning rate
    adv_eps = 1e-3       # AWP epsilon
    adv_step = 1         # AWP step

In [None]:
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs and request deterministic cuDNN."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(CFG.seed)

In [None]:
class TrainDataset(Dataset):
    """Labelled (topic text, content text) pairs, tokenised separately on access.

    Each item is a 5-tuple: topic input ids, topic attention mask, content input
    ids, content attention mask (all long tensors) and the label as a float tensor.
    """

    def __init__(self,df,tokenizer):
        self.tokenizer = tokenizer
        self.sep_token = tokenizer.sep_token
        self.topic = df['topic_full_text'].values
        self.content = df['content_full_text'].values
        self.label = df['label'].values

    def __len__(self):
        return len(self.topic)

    def _encode(self, raw_text):
        """Replace the literal '[SEP]' marker with the tokenizer's separator and tokenise."""
        text = raw_text.replace('[SEP]', self.sep_token)
        return self.tokenizer(text, truncation=True, max_length=CFG.max_input_length, padding='max_length')

    def __getitem__(self, item):
        topic_enc = self._encode(self.topic[item])
        content_enc = self._encode(self.content[item])
        target = int(self.label[item])

        def as_long(values):
            return torch.as_tensor(values, dtype=torch.long)

        return (
            as_long(topic_enc['input_ids']),
            as_long(topic_enc['attention_mask']),
            as_long(content_enc['input_ids']),
            as_long(content_enc['attention_mask']),
            torch.as_tensor(target, dtype=torch.float),
        )

# Build Model

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.modeling_outputs import SequenceClassifierOutput

class Custom_Bert_Simple(nn.Module):
    """Shared-encoder pair scorer trained with binary cross-entropy.

    Topic and content go through the same encoder. Each sequence is reduced to the
    plain mean of its last hidden states over the sequence axis (the attention mask
    is not used in the mean, so padded positions are included). A linear layer scores
    the concatenation [topic, content, |topic - content|].
    """

    def __init__(self):
        super().__init__()

        self.base = AutoModel.from_pretrained(CFG.model_path)
        self.config = AutoConfig.from_pretrained(CFG.model_path)
        self.linear = nn.Linear(self.config.hidden_size*3, 1)

    def _encode(self, input_ids, attention_mask):
        """Run the shared encoder and average its last hidden states over dim 1."""
        hidden = self.base(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        return hidden.mean(dim=1)

    def forward(self,
        topic_input_ids,
        content_input_ids,
        topic_attention_mask=None,
        content_attention_mask=None, 
        labels=None):
        """Return the BCE-with-logits loss for the batch, or None when `labels` is None."""
        topic_vec = self._encode(topic_input_ids, topic_attention_mask)
        content_vec = self._encode(content_input_ids, content_attention_mask)

        features = torch.cat([topic_vec, content_vec, (topic_vec - content_vec).abs()], 1)
        logits = self.linear(features)

        if labels is None:
            return None
        return F.binary_cross_entropy_with_logits(logits.view(-1), labels.view(-1))

# Build Logger

In [None]:
class AverageMeter(object):
    """Tracks the latest value plus a weighted running sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as the latest value, weighted as `n` observations."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s' (both truncated to ints)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return '<elapsed> (remain <estimate>)' for work started at `since`.

    `percent` is the completed fraction of the work; the total time is extrapolated
    as elapsed / percent, so it must be non-zero.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
def get_logger(filename=CFG.OUTPUT_DIR+ 'train'):
    """Return this module's logger, writing bare messages to the console and to `<filename>.log`.

    Args:
        filename: log file path without the '.log' suffix.

    Returns:
        The INFO-level logger with exactly one stream handler and one file handler.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call. Drop handlers left by an earlier
    # call (e.g. the cell being re-run), otherwise each message is emitted once per call.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()
LOGGER.info('===============lr_{}==============='.format(CFG.encoder_lr))
LOGGER.info('===============seed_{}==============='.format(CFG.seed))
LOGGER.info('===============total_epochs_{}==============='.format(CFG.epochs))
LOGGER.info('===============num_warmup_steps_{}==============='.format(CFG.num_warmup_steps))

# Build Pipeline

In [None]:
def train_fn(train_loader, model, optimizer, epoch, scheduler, device):
    """Run one training epoch and return the sample-weighted mean loss.

    Args:
        train_loader: yields 5-element batches (topic ids, topic mask, content ids,
            content mask, label).
        model: called with those tensors; must return the scalar loss.
        optimizer: stepped once per batch.
        epoch: zero-based epoch index, used only in the progress line.
        scheduler: learning-rate scheduler, stepped once per batch.
        device: device every batch tensor is moved to.

    Returns:
        Average loss over the epoch, weighted by batch size.
    """
    model.train()
    losses = AverageMeter()
    start = time.time()
    for step, batch in enumerate(train_loader):
        batch = [i.to(device) for i in batch]
        topic_input_ids, topic_attention_mask, content_input_ids, content_attention_mask, label = batch
        batch_size = label.size(0)
        loss = model(topic_input_ids, content_input_ids, topic_attention_mask, content_attention_mask, label)
        losses.update(loss.item(), batch_size)
        optimizer.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 500)
        optimizer.step()
        scheduler.step()
        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, step, len(train_loader),
                          remain=timeSince(start, float(step + 1) / len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          # get_last_lr() is the supported accessor; get_lr() is meant
                          # for use inside scheduler.step() and warns when called here.
                          lr=scheduler.get_last_lr()[0]))
    return losses.avg


def valid_fn(valid_loader, model, device):
    """Evaluate `model` on `valid_loader`; return (mean loss, predictions, labels).

    Each batch is read as (input_ids, attention_mask, label), and the model output is
    expected to expose `.loss` and `.logits`. Predictions are the argmax over the
    last logits axis.

    NOTE(review): this does not match the rest of the notebook as written.
    TrainDataset yields five tensors per item (so batch[2] is the content input ids,
    not the label), and Custom_Bert_Simple.forward returns a bare loss rather than an
    object with `.loss` / `.logits`. The only call to this function in train_loop is
    commented out; it needs reworking before being re-enabled.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    labels = []
    start = end = time.time()
    for step, batch in enumerate(valid_loader):
        label = batch[2].to(device)
        mask = batch[1].to(device)
        input_ids = batch[0].to(device)
        batch_size = label.size(0)
        # No gradients are needed for evaluation.
        with torch.no_grad():
            output = model(input_ids, mask, labels=label)
        loss = output.loss
        y_preds = output.logits.argmax(dim=-1)
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        labels.append(label.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader) - 1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step + 1) / len(valid_loader))))
    # Flatten the per-batch arrays into one array each.
    predictions = np.concatenate(preds)
    labels = np.concatenate(labels)
    #print(predictions)
    return losses.avg, predictions, labels

def train_loop(fold, model, train_dataset, valid_dataset):
    """Fine-tune `model` on `train_dataset` and checkpoint the epoch with the lowest training loss.

    Args:
        fold: fold number, used only in the checkpoint file name.
        model: network to train; it is moved to CFG.device.
        train_dataset: dataset of training pairs.
        valid_dataset: dataset of held-out pairs. A loader is built for it, but the
            validation pass is currently disabled, so it does not influence the result.

    Returns:
        None. The best state dict is saved to
        CFG.OUTPUT_DIR + "<CFG.model_path with '/' replaced by '_'>_best<fold>.pth".

    Raises:
        ValueError: if CFG.scheduler is neither 'linear' nor 'cosine'.
    """
    LOGGER.info("========== training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    # Kept so the (currently disabled) validation pass can be switched back on.
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model.to(CFG.device)

    def get_optimizer(model):
        """AdamW over two groups: weight-decayed parameters and bias/LayerNorm parameters."""
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                'lr': CFG.encoder_lr, 'weight_decay': CFG.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                'lr': CFG.encoder_lr, 'weight_decay': 0.0}
        ]
        return AdamW(optimizer_parameters, lr = CFG.encoder_lr, eps = CFG.eps, betas = CFG.betas)

    optimizer = get_optimizer(model)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Build the per-step LR schedule named by cfg.scheduler."""
        # cfg.num_warmup_steps is scaled by the total step count. Keep the result in a
        # local: writing it back to cfg would compound on every later call.
        num_warmup_steps = int(cfg.num_warmup_steps * num_train_steps)
        if cfg.scheduler == 'linear':
            return get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps
            )
        if cfg.scheduler == 'cosine':
            return get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps,
                num_cycles=cfg.num_cycles
            )
        raise ValueError(f"Unknown scheduler: {cfg.scheduler!r}")

    # The scheduler is stepped once per batch, and drop_last=True discards the final
    # partial batch, so the real step count is len(train_loader) per epoch.
    num_train_steps = len(train_loader) * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    # Validation is disabled, so "best" means lowest average training loss.
    best_score = float('inf')

    for epoch in range(CFG.epochs):
        start_time = time.time()

        avg_loss = train_fn(train_loader, model, optimizer, epoch, scheduler, CFG.device)

        elapsed = time.time() - start_time

        LOGGER.info(
            f'Epoch {epoch + 1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s')

        if best_score > avg_loss:
            best_score = avg_loss
            LOGGER.info(f'Epoch {epoch + 1} - Save Best Score: {best_score:.4f} Model')
            torch.save(model.state_dict(),
                       CFG.OUTPUT_DIR + "{}_best{}.pth".format(CFG.model_path.replace('/', '_'),fold))

    # Drop the local references first so the collector and the CUDA cache can
    # actually release what they hold.
    del scheduler, optimizer, model
    gc.collect()
    torch.cuda.empty_cache()
    return

In [None]:
# Train a single model on fold 0: rows of that fold are held out, all other rows train.
model = Custom_Bert_Simple()
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)
fold = 0
tr_data = df[df['fold']!=fold].reset_index(drop=True)
va_data = df[df['fold']==fold].reset_index(drop=True)
tr_dataset = TrainDataset(tr_data,tokenizer)
va_dataset = TrainDataset(va_data,tokenizer)
# train_loop ends with a bare `return`, so val_result is None; the useful output is the
# checkpoint file it saves.
val_result = train_loop(fold, model,tr_dataset, va_dataset)

# Inference

In [None]:
!pip install hnswlib

In [None]:
import numpy as np
import pandas as pd
import time
import math
from sklearn.metrics import f1_score
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
from transformers import BertTokenizer,AutoModel,AdamW,AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
import torch.nn.functional as F
from tqdm import tqdm
import copy
import torch.nn as nn
import os
import json
import gc
import random
from torch.cuda.amp import autocast, GradScaler
import hnswlib  # 在kaggle上这个库要提前离线安装一下

In [None]:
class CFG:
    """Settings read by the inference cells; the values repeat the training configuration."""

    # paths / model
    input_path = 'LECR'
    OUTPUT_DIR = 'LECR'
    model_path = 'microsoft/mdeberta-v3-base'

    # data
    max_input_length = 124
    batch_size = 32
    num_workers = 2
    num_fold = 5

    # optimisation
    epochs = 5
    encoder_lr = 20e-6
    decoder_lr = 1e-3
    min_lr = 0.5e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0

    # learning-rate schedule
    scheduler = 'cosine'  # one of: 'linear', 'cosine'
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 0

    # runtime
    seed = 42
    device = 'cuda'
    print_freq = 100
    apex = False

    # AWP
    start_awp_epoch = 2  # epoch at which AWP starts
    adv_lr = 1e-5        # AWP learning rate
    adv_eps = 1e-3       # AWP epsilon
    adv_step = 1         # AWP step

In [None]:
class Custom_Bert_Simple(nn.Module):
    """Encoder-only module used at inference time to embed one text per row.

    The embedding is the plain mean of the encoder's last hidden states over the
    sequence axis; the attention mask is passed to the encoder but not used in the mean.
    """

    def __init__(self):
        super().__init__()

        self.base = AutoModel.from_pretrained(CFG.model_path)
        self.config = AutoConfig.from_pretrained(CFG.model_path)

    def forward(self,
        input_ids,
        attention_mask=None):
        """Return one embedding per input row."""
        encoded = self.base(input_ids=input_ids,attention_mask=attention_mask)
        return encoded.last_hidden_state.mean(dim=1)

In [None]:
model = Custom_Bert_Simple()
# Deserialise onto the CPU first so the file loads regardless of the device it was
# saved from; the module is moved to CFG.device afterwards.
state_dict = torch.load('LECRmicrosoft_mdeberta-v3-base_best0.pth', map_location='cpu')
# strict=False is required because the training checkpoint also contains the
# `linear.*` scoring head, which this encoder-only module does not define. Report any
# other mismatch instead of letting it pass silently.
load_result = model.load_state_dict(state_dict, strict=False)
unexpected_keys = [key for key in load_result.unexpected_keys if not key.startswith('linear.')]
if load_result.missing_keys or unexpected_keys:
    print('WARNING: checkpoint does not fully match the model - '
          f'missing: {list(load_result.missing_keys)}, unexpected: {unexpected_keys}')
model.to(CFG.device)
model.eval();  # trailing ';' keeps the notebook from printing the whole module

In [None]:
# Load the competition tables used at inference time.
content_df = pd.read_csv('content.csv')
correlations_df = pd.read_csv('correlations.csv')
topics_df = pd.read_csv('topics.csv')
# The 'source' filter below is left disabled, so every topic row is kept here.
#topics_df = topics_df[topics_df['category']!='source'].reset_index(drop=True)
# Submission template: its topic_id column selects the topics to predict for.
sub_df = pd.read_csv('sample_submission.csv')

In [None]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

In [None]:
class Testataset(Dataset):
    """Tokenises topic or content rows for embedding inference.

    The input text follows the layout used to build the training pairs:
    "<title> SEP <description>" and, when the frame has a `text` column,
    "... SEP <text>", where SEP is the tokenizer's separator token surrounded by
    single spaces and a missing field contributes an empty string. Previously the
    separator was skipped for a missing description and the text separator had no
    trailing space, so inference inputs differed from what the model was trained on.

    Each item is (input_ids, attention_mask) as long tensors padded or truncated to
    CFG.max_input_length.
    """
    def __init__(self,df,tokenizer):
        self.title = df['title'].values
        self.description = df['description'].values
        # Only content rows have a `text` column; topics do not.
        self.text = None
        if 'text' in df.columns:
            self.text = df['text'].values

        self.tokenizer = tokenizer
        self.sep_token = tokenizer.sep_token

    def __len__(self):
        return len(self.title)

    @staticmethod
    def _as_text(value):
        """Map a missing value (pandas stores it as a float NaN) to '' and anything else to str."""
        return '' if isinstance(value, float) else str(value)

    def _build_text(self, item):
        """Join the row's fields with ' <sep_token> ', keeping empty fields in place."""
        fields = [self._as_text(self.title[item]), self._as_text(self.description[item])]
        if self.text is not None:
            fields.append(self._as_text(self.text[item]))
        return (' ' + self.sep_token + ' ').join(fields)

    def __getitem__(self, item):
        input_text = self._build_text(item)
        output = self.tokenizer(input_text, truncation=True, max_length=CFG.max_input_length, padding='max_length')

        return torch.as_tensor(output['input_ids'], dtype=torch.long), \
            torch.as_tensor(output['attention_mask'], dtype=torch.long)

In [None]:
# Predictions are written back to sub_df by position, so the topic rows must follow
# sub_df's row order exactly. Filtering topics_df with isin() keeps topics_df's order
# instead; a left merge from sub_df preserves the submission order.
test_topics_df = sub_df[['topic_id']].merge(topics_df, left_on='topic_id', right_on='id', how='left')
assert len(test_topics_df) == len(sub_df), "topics.csv contains duplicate ids for submission topics"

topic_dataset = Testataset(test_topics_df, tokenizer)
content_dataset = Testataset(content_df, tokenizer)
# shuffle=False / drop_last=False: every row is embedded once, in dataset order.
topic_loader = DataLoader(topic_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
content_loader = DataLoader(content_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

In [None]:
def infer(model, dataloader):
    """Embed every batch of `dataloader` with `model` and stack the results row-wise.

    Batches must be (input_ids, attention_mask) pairs. Returns one NumPy array with
    the per-batch outputs stacked in loader order.
    """
    chunks = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(dataloader):
            embeddings = model(input_ids.to(CFG.device), attention_mask.to(CFG.device))
            chunks.append(embeddings.cpu().numpy())

    return np.vstack(chunks)

In [None]:
topic_result = infer(model, topic_loader)
content_result = infer(model, content_loader)

In [None]:
# Index labels are the row positions of content_df.
content_ids = list(range(len(content_df)))

In [None]:
def build_index(embeddings, ids, ef_construction=200, M=160, ef=50, num_threads=16):
    """Build a cosine-space HNSW index over `embeddings`, labelled with `ids`.

    Args:
        embeddings: 2-D array of vectors; its last axis gives the index dimension
            and its first axis the index capacity.
        ids: one integer label per embedding row.
        ef_construction: index search speed / build speed trade-off.
        M: graph connectivity; tied to the internal dimensionality of the data and
            strongly affects memory use (~M). Higher M gives higher accuracy and
            run time at fixed ef / ef_construction.
        ef: query-time recall control; higher is more accurate but slower.
        num_threads: threads used for batch construction and search.

    Returns:
        The populated hnswlib.Index.
    """
    index = hnswlib.Index(space="cosine", dim=embeddings.shape[-1])

    # max_elements is the capacity; inserting more elements than this raises.
    index.init_index(max_elements=embeddings.shape[0], ef_construction=ef_construction, M=M)

    index.set_ef(ef)
    index.set_num_threads(num_threads)

    index.add_items(embeddings, ids)

    return index

In [None]:
content_index = build_index(content_result, content_ids)
results = content_index.knn_query(topic_result, k = 5, num_threads = -1)

In [None]:
# results[0] holds the neighbour labels per topic; map them back to content ids and
# join each row into the space-separated string the submission expects.
conten_uid = content_df['id']
pred = [' '.join(conten_uid[result].to_list()) for result in tqdm(results[0])]

In [None]:
sub_df['content_ids'] = pred
sub_df.head()

In [None]:
sub_df.to_csv('submission.csv', index=None)