# Step 4: Inference

Use our fine-tuned retriever and reranker models to run inference on the test topics, then write the competition submission file.

## Config

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, Markdown
from pathlib import Path
from tqdm.notebook import tqdm

In [2]:
import os
import gc
import re
import time
import math
import string
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
import cupy as cp
from cuml.metrics import pairwise_distances
from cuml.neighbors import NearestNeighbors
%env TOKENIZERS_PARALLELISM=false
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=false


In [3]:
# =========================================================================================
# Configurations
# =========================================================================================
class base_CFG:
    """Settings shared by every retriever config and used directly by the reranker.

    All values are class attributes; the class object itself is what gets
    passed around as ``cfg`` in this notebook.
    """
    # Not referenced by any code visible in this notebook.
    print_freq = 3000
    # Forwarded to every DataLoader; 0 keeps tokenization in the main process.
    num_workers = 0
    # Checkpoint directory of the encoder used as the reranker backbone.
    rerank_model = '/kaggle/input/lecr-retriever/paraphrase-multilingual-mpnet-base-v2-exp21_fold0_epochs10/paraphrase-multilingual-mpnet-base-v2-exp21_fold0_epochs10'
    # NOTE: evaluated when the class body runs, i.e. the tokenizer is loaded from disk at definition time.
    rerank_tokenizer = AutoTokenizer.from_pretrained(rerank_model)
    # Read by custom_model to decide whether to enable gradient checkpointing.
    gradient_checkpointing = False
    # Batch size for both the retriever and the reranker DataLoaders.
    batch_size = 64
    # Token limit applied (with truncation) in prepare_recall_input.
    max_input_length = 512
    # Not referenced by any code visible in this notebook.
    n_folds = 5
    # Number of nearest content items retrieved per topic by each retriever.
    top_n = 50
    # Not referenced by any code visible in this notebook.
    seed = 1006
    # Reranker probability above which a (topic, content) pair is kept.
    threshold = 0.002
    
# Each subclass below adds one retriever: `recall_model` is the checkpoint
# directory and `recall_tokenizer` is loaded from it when the class body runs.
class CFG_1_para_mpnet(base_CFG):
    recall_model = "/kaggle/input/lecr-retriever/paraphrase-multilingual-mpnet-base-v2-exp21_fold0_epochs10/paraphrase-multilingual-mpnet-base-v2-exp21_fold0_epochs10"
    recall_tokenizer = AutoTokenizer.from_pretrained(recall_model)
    
class CFG_2_all_minilm(base_CFG):
    recall_model = "/kaggle/input/lecr-retriever/all-MiniLM-L6-v2-exp21_fold0_epochs10/all-MiniLM-L6-v2-exp21_fold0_epochs10"
    recall_tokenizer = AutoTokenizer.from_pretrained(recall_model)
    
class CFG_3_stsb_roberta(base_CFG):
    recall_model = "/kaggle/input/lecr-retriever/stsb-roberta-base-v2-exp21_fold0_epochs10/stsb-roberta-base-v2-exp21_fold0_epochs10"
    recall_tokenizer = AutoTokenizer.from_pretrained(recall_model)

class CFG_4_distilroberta(base_CFG):
    recall_model = "/kaggle/input/lecr-retriever/paraphrase-distilroberta-base-v1-exp21_fold0_epochs10/paraphrase-distilroberta-base-v1-exp21_fold0_epochs10"
    recall_tokenizer = AutoTokenizer.from_pretrained(recall_model)
    
class CFG_5_all_mpnet(base_CFG):
    recall_model = "/kaggle/input/lecr-retriever/all-mpnet-base-v2-exp21_fold0_epochs10/all-mpnet-base-v2-exp21_fold0_epochs10"
    recall_tokenizer = AutoTokenizer.from_pretrained(recall_model)
    
# Retrievers are run in this order; the first one seeds the candidate lists
# and the remaining ones are merged into them.
CFG_list = [
    CFG_1_para_mpnet, 
    CFG_2_all_minilm, 
    CFG_3_stsb_roberta, 
    CFG_4_distilroberta, 
    CFG_5_all_mpnet
]

## Tree

In [4]:
# Module-level topic table that the `Topic` helper reads via `.loc[topic_id]`.
# index_col=0 makes the first CSV column the index (presumably the topic id —
# confirm against topics.csv). Missing titles/descriptions become "".
topics_df = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/topics.csv", index_col=0).fillna({"title": "", "description": ""})

In [5]:
# define some helper functions and classes to aid with data traversal
class Topic:
    """Lightweight handle on one row of the module-level ``topics_df`` table.

    Only the topic id is stored; every other field (``title``, ``parent``, ...)
    is looked up in ``topics_df`` on access, so the table must still exist and
    be indexed by topic id whenever an instance is used.
    """

    def __init__(self, topic_id):
        self.id = topic_id

    @property
    def parent(self):
        """Parent ``Topic``, or ``None`` when the row's ``parent`` field is missing."""
        parent_id = topics_df.loc[self.id].parent
        if pd.isna(parent_id):
            return None
        return Topic(parent_id)

    @property
    def ancestors(self):
        """List of ancestor topics, nearest parent first and root last."""
        chain = []
        node = self.parent
        while node is not None:
            chain.append(node)
            node = node.parent
        return chain

    def get_breadcrumbs(self, separator=" | ", include_self=False, include_root=True):
        """Join ancestor titles from the root down to this topic.

        Args:
            separator: string placed between consecutive titles.
            include_self: also append this topic's own title at the end.
            include_root: when False, drop the top-most entry of the chain.

        Returns:
            The joined titles, or ``""`` when the chain is empty.
        """
        ancestors = self.ancestors
        if include_self:
            ancestors = [self] + ancestors
        if not include_root:
            ancestors = ancestors[:-1]
        return separator.join(reversed([a.title for a in ancestors]))

    def __eq__(self, other):
        if not isinstance(other, Topic):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None; hash on the same key
        # used for equality so topics can live in sets and dict keys.
        return hash(self.id)

    def __getattr__(self, name):
        """Resolve unknown attributes as columns of this topic's row.

        Raises:
            AttributeError: for ``id``/dunder names or a column that does not exist.
            KeyError: when the topic id itself is not in ``topics_df``.
        """
        # __getattr__ only runs after normal lookup failed. If ``id`` is what is
        # missing (e.g. a half-built instance during copy/unpickle), reading
        # ``self.id`` below would re-enter this method forever. Dunder probes
        # such as ``__setstate__`` are protocol lookups, never table columns.
        if name == "id" or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(name)
        row = topics_df.loc[self.id]
        try:
            return row[name]
        except KeyError as exc:
            # The attribute protocol (hasattr / getattr with default) relies on
            # AttributeError, not KeyError.
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from exc

    def __str__(self):
        return self.title
    
    def __repr__(self):
        return f"<Topic(id={self.id}, title=\"{self.title}\")>"

## Recall

In [6]:
def read_tree_data():
    """Load the test topics and all content rows and build their model input text.

    Topics are restricted to the ids listed in ``sample_submission.csv``. Each
    frame gets a ``full_text`` column and is ordered by the character length of
    that text so batches padded to the longest member stay compact.

    Returns:
        (topics_new, content_new): frames with columns ``id``, ``title`` and
        ``full_text`` and a fresh 0..n-1 index.
    """
    topic_table = pd.read_csv('/kaggle/input/learning-equality-curriculum-recommendations/topics.csv')
    content_table = pd.read_csv('/kaggle/input/learning-equality-curriculum-recommendations/content.csv')
    submission_stub = pd.read_csv('/kaggle/input/learning-equality-curriculum-recommendations/sample_submission.csv')
    # Inner join keeps only the topics that must be predicted
    topic_table = topic_table.merge(submission_stub, how = 'inner', left_on = 'id', right_on = 'topic_id')
    del submission_stub
    gc.collect()

    # Empty strings instead of NaN so the concatenations below never yield NaN
    topic_table = topic_table.fillna("")
    content_table = content_table.fillna("")

    # Content text: title, description and body text
    content_table['full_text'] = (
        content_table['title'] + '[SEP]' + content_table['description'] + '[SEP]' + content_table['text']
    )

    # Topic tree context: ancestor titles, resolved through the Topic helper
    topic_table['tree_context'] = [
        Topic(topic_id).get_breadcrumbs()
        for topic_id in tqdm(topic_table['id'], total = topic_table.shape[0])
    ]
    gc.collect()

    # Topic text: title, description and breadcrumb trail
    topic_table['full_text'] = (
        topic_table['title'] + '[SEP]' + topic_table['description'] + '[SEP]' + topic_table['tree_context']
    )

    # Order by full-text length (not title length) to make inference faster
    topic_table['length'] = topic_table['full_text'].map(len)
    content_table['length'] = content_table['full_text'].map(len)

    kept_columns = ['id', 'title', 'full_text']
    topics_new = topic_table.sort_values('length')[kept_columns].reset_index(drop = True)
    del topic_table
    gc.collect()
    content_new = content_table.sort_values('length')[kept_columns].reset_index(drop = True)
    del content_table
    gc.collect()
    return topics_new, content_new

# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_recall_input(text, cfg):
    """Tokenize one text for a retriever and convert every field to a long tensor.

    The text is truncated to ``cfg.max_input_length`` tokens; padding is left
    to the batch collator.
    """
    encoded = cfg.recall_tokenizer.encode_plus(
        text,
        add_special_tokens = True,
        truncation = True,
        max_length = cfg.max_input_length,
        return_tensors = None,
    )
    # Overwrite the token lists in place so the tokenizer's container type is kept
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype = torch.long)
    return encoded

# =========================================================================================
# Recall dataset
# =========================================================================================
class recall_dataset(Dataset):
    """Map-style dataset that tokenizes the ``full_text`` column on access."""

    def __init__(self, df, cfg):
        self.texts = df['full_text'].values
        self.cfg = cfg

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        # Tokenization happens lazily, one text per lookup
        return prepare_recall_input(self.texts[item], self.cfg)
    
# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Average token states over the positions selected by the attention mask."""

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked tokens contribute zero
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = torch.sum(last_hidden_state * mask, 1)
        # Clamp the token count so a fully masked row divides by 1e-9 rather than 0
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return masked_total / token_count
    
# =========================================================================================
# Recall model
# =========================================================================================
class recall_model(nn.Module):
    """Retriever encoder: pretrained transformer followed by masked mean pooling."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.recall_model)
        self.model = AutoModel.from_pretrained(cfg.recall_model, config = self.config)
        self.pool = MeanPooling()

    def feature(self, inputs):
        """Return one pooled embedding per sequence in the batch."""
        token_states = self.model(**inputs).last_hidden_state
        return self.pool(token_states, inputs['attention_mask'])

    def forward(self, inputs):
        # The embedding itself is the model output; there is no extra head
        return self.feature(inputs)
    
# =========================================================================================
# Get embeddings
# =========================================================================================
def get_embeddings(loader, model, device):
    """Run ``model`` over every batch of ``loader`` and stack the outputs.

    Returns:
        A NumPy array with the per-batch outputs concatenated along axis 0,
        in loader order.
    """
    model.eval()
    chunks = []
    for batch in tqdm(loader):
        # Move the batch tensors to the target device in place
        for name in list(batch.keys()):
            batch[name] = batch[name].to(device)
        with torch.no_grad():
            embeddings = model(batch)
        chunks.append(embeddings.cpu().numpy())
    return np.concatenate(chunks)

In [7]:
# =========================================================================================
# Get neighbors
# =========================================================================================
def _build_recall_loader(df, cfg):
    """Build the ordered, dynamically padded DataLoader used for embedding extraction."""
    return DataLoader(
        recall_dataset(df, cfg),
        batch_size = cfg.batch_size,
        shuffle = False,
        collate_fn = DataCollatorWithPadding(tokenizer = cfg.recall_tokenizer, padding = 'longest'),
        num_workers = cfg.num_workers,
        pin_memory = True,
        drop_last = False,
    )


def get_neighbors(topics, content, cfg):
    """Retrieve the ``cfg.top_n`` nearest content ids for every topic.

    Both frames are embedded with the retriever described by ``cfg`` and
    matched with a cosine-distance nearest-neighbour search.

    Args:
        topics: frame with a ``full_text`` column, one row per topic.
        content: frame with ``id`` and ``full_text`` columns. Neighbour indices
            are row positions in this frame.
        cfg: retriever config (``recall_model``, ``recall_tokenizer``,
            ``batch_size``, ``num_workers``, ``top_n``).

    Returns:
        One list of content ids per topic, in the row order of ``topics``.
    """
    topics_loader = _build_recall_loader(topics, cfg)
    content_loader = _build_recall_loader(content, cfg)

    # Embed topics and content with the same encoder
    model = recall_model(cfg)
    model.to(device)
    topics_preds = get_embeddings(topics_loader, model, device)
    content_preds = get_embeddings(content_loader, model, device)

    # Drop the encoder BEFORE emptying the cache: empty_cache() can only hand
    # back blocks that are no longer referenced, so calling it while `model`
    # is alive would keep the encoder on the GPU during the KNN search.
    del model, topics_loader, content_loader
    gc.collect()
    torch.cuda.empty_cache()

    # Transfer predictions to gpu
    topics_preds_gpu = cp.array(topics_preds)
    content_preds_gpu = cp.array(content_preds)
    del topics_preds, content_preds
    gc.collect()

    # KNN model
    print('Training KNN model...')
    neighbors_model = NearestNeighbors(n_neighbors = cfg.top_n, metric = 'cosine')
    neighbors_model.fit(content_preds_gpu)
    indices = neighbors_model.kneighbors(topics_preds_gpu, return_distance = False)

    # One device-to-host copy for the whole index matrix, then a positional
    # gather per topic (the indices are row positions of the fitted matrix).
    neighbor_rows = indices.get()
    content_ids = content['id'].to_numpy()
    predictions = [content_ids[row].tolist() for row in neighbor_rows]

    # Release memory
    del topics_preds_gpu, content_preds_gpu, neighbors_model, indices, neighbor_rows
    gc.collect()
    return predictions

In [8]:
def combine_preds(preds, tmp_preds):
    """Merge ``tmp_preds`` into ``preds`` in place, topic by topic.

    For each position the two candidate lists are concatenated and duplicates
    are removed, keeping the first occurrence so the resulting order is
    deterministic from run to run.

    Args:
        preds: list of candidate-id lists; updated in place.
        tmp_preds: list of candidate-id lists of the same length as ``preds``.

    Raises:
        ValueError: if the two lists do not have the same length.
    """
    if len(preds) != len(tmp_preds):
        # Previously this was only printed and the merge ran anyway, which
        # either crashed with IndexError or silently merged part of the data.
        raise ValueError(
            f'cannot combine preds with different length: preds length({len(preds)}) and tmp_preds length({len(tmp_preds)})'
        )

    print('start to combine predictions')
    for i, extra in enumerate(tmp_preds):
        # dict.fromkeys de-duplicates while preserving insertion order
        preds[i] = list(dict.fromkeys([*preds[i], *extra]))

In [9]:
topics_new, content_new = read_tree_data()

# The breadcrumbs are already stored in topics_new, so the raw topic table
# (and with it the Topic helper) is no longer needed.
del topics_df
gc.collect()

# The first retriever seeds the candidate lists ...
predictions = get_neighbors(topics_new, content_new, CFG_list[0])

# ... and every further retriever is merged into them.
for recall_cfg in CFG_list[1:]:
    tmp_pred = get_neighbors(topics_new, content_new, recall_cfg)
    combine_preds(predictions, tmp_pred)
    del tmp_pred
    gc.collect()

topics_new['predictions'] = predictions

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2407 [00:00<?, ?it/s]

Training KNN model...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2407 [00:00<?, ?it/s]

Training KNN model...
start to combine predictions


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2407 [00:00<?, ?it/s]

Training KNN model...
start to combine predictions


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2407 [00:00<?, ?it/s]

Training KNN model...
start to combine predictions


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2407 [00:00<?, ?it/s]

Training KNN model...
start to combine predictions


In [10]:
# Size of the merged candidate pool for the first topic; duplicates across
# retrievers were removed, so it is at most top_n * len(CFG_list).
print(f'each topic has {len(predictions[0])} results')

each topic has 167 results


In [11]:
# Keep only the columns the rerank stage reads; selecting a column subset
# (instead of dropping in place) lets the wide frames with `full_text` be freed.
topics = topics_new[['id','title','predictions']]
del topics_new
gc.collect()
content = content_new[['id','title']]
del content_new
gc.collect()

0

In [12]:
# Sanity check: first rows of the trimmed topic and content frames.
print(topics.head())
print(content.head())

               id                                              title  \
0  t_00069b63a70a                                        Transcripts   
1  t_00068291e9a4                    Entradas e saídas de uma função   
2  t_00004da3a1b2                         Откриването на резисторите   
3  t_0006d41a73a8  Графики на експоненциални функции (Алгебра 2 н...   
4  t_4054df11a74e                     Flow Charts: Logical Thinking?   

                                         predictions  
0  [c_d5012b93e8c2, c_55e9d6961b68, c_b057682e5f2...  
1  [c_4847eff1973d, c_aac29bf10ba6, c_493d973175b...  
2  [c_0925f0aa775c, c_a8d9b7a0b623, c_8df0c4d547f...  
3  [c_8f3f94bf80ce, c_b28a81c8c588, c_c60b0012d00...  
4  [c_9c264389bf59, c_1574013b2b6a, c_9e8eb65676d...  
               id title
0  c_7fa2a802de7a     胃
1  c_5925fe0c887b    排尿
2  c_40a35e1b2028    食道
3  c_26db659ff684    牙齿
4  c_8acbe1227d2a    睾酮


In [13]:
# The candidate lists were assigned to the `predictions` column earlier;
# remove the extra top-level name.
del predictions
gc.collect()

42

## Rerank

### prepare test set

In [14]:
# =========================================================================================
# Build our inference set
# =========================================================================================
def build_inference_set(topics, content):
    """Expand every topic's candidate list into one row per (topic, content) pair.

    Args:
        topics: frame with columns ``id``, ``title`` and ``predictions`` (a list
            of content ids per row).
        content: frame indexed by content id with a ``title`` column.

    Returns:
        DataFrame with columns ``topics_ids``, ``content_ids``, ``title1``
        (topic title) and ``title2`` (content title), in topic order and, within
        a topic, in candidate-list order.

    Raises:
        KeyError: if a candidate id is not present in ``content``.
    """
    print('start to build inference set')
    # One dict lookup per pair is far cheaper than a DataFrame.loc scalar
    # lookup. Assumes the content index holds unique ids — TODO confirm.
    title_by_content_id = content['title'].to_dict()

    # Column lists for the inference frame
    topics_ids = []
    content_ids = []
    title1 = []
    title2 = []
    # Iterate over the columns directly instead of materialising a row Series per topic
    topic_rows = zip(topics['id'], topics['title'], topics['predictions'])
    for topics_id, topics_title, predictions in tqdm(topic_rows, total = len(topics)):
        for pred in predictions:
            content_title = title_by_content_id[pred]
            topics_ids.append(topics_id)
            content_ids.append(pred)
            title1.append(topics_title)
            title2.append(content_title)
    # Build inference dataset
    test = pd.DataFrame(
        {'topics_ids': topics_ids,
         'content_ids': content_ids,
         'title1': title1,
         'title2': title2
        }
    )
    # Release memory
    del topics_ids, content_ids, title1, title2, title_by_content_id
    gc.collect()
    return test

# =========================================================================================
# Process test
# =========================================================================================
def preprocess_test(test):
    """Turn the title pairs into the reranker's ``text`` column and order by length.

    The frame is modified in place and also returned: missing titles are
    replaced with a placeholder, ``title1``/``title2`` are merged into ``text``
    and dropped, and rows are sorted by the character length of ``text`` with a
    fresh 0..n-1 index.

    Args:
        test: frame with columns ``topics_ids``, ``content_ids``, ``title1``
            and ``title2``.

    Returns:
        The same frame with columns ``topics_ids``, ``content_ids`` and ``text``.
    """
    print('start to preprocess test')
    # Assign the filled column back: `test[col].fillna(..., inplace=True)` acts
    # on a temporary Series, is deprecated, and is a no-op under Copy-on-Write.
    placeholder = "Title does not exist"
    test['title1'] = test['title1'].fillna(placeholder)
    test['title2'] = test['title2'].fillna(placeholder)
    # Create feature column
    test['text'] = test['title1'] + '[SEP]' + test['title2']
    # Drop titles
    test.drop(['title1', 'title2'], axis = 1, inplace = True)
    # Sort so inference is faster (similar lengths end up in the same batch)
    test['length'] = test['text'].map(len)
    test.sort_values('length', inplace = True)
    test.drop(['length'], axis = 1, inplace = True)
    test.reset_index(drop = True, inplace = True)
    gc.collect()
    return test

In [15]:
# Set id as index for content
content.set_index('id', inplace = True)
# Build inference set
test = build_inference_set(topics, content)
del topics, content
gc.collect()

# Process test set
test = preprocess_test(test)

start to build inference set


  0%|          | 0/5 [00:00<?, ?it/s]

start to preprocess test


### inference

In [16]:
# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_rerank_input(text, cfg):
    """Tokenize one title pair for the reranker and convert fields to long tensors.

    The sequence is truncated to ``cfg.max_input_length`` tokens, matching
    ``prepare_recall_input``. Without truncation an unusually long title pair
    would produce more tokens than the encoder has position embeddings for and
    fail inside the forward pass. Padding is left to the batch collator.

    Args:
        text: the ``title1[SEP]title2`` string for one pair.
        cfg: config providing ``rerank_tokenizer`` and ``max_input_length``.

    Returns:
        The tokenizer output with every field replaced by a ``torch.long`` tensor.
    """
    inputs = cfg.rerank_tokenizer.encode_plus(
        text, 
        return_tensors = None, 
        add_special_tokens = True, 
        max_length = cfg.max_input_length,
        truncation = True,
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

# =========================================================================================
# Rerank dataset
# =========================================================================================
class rerank_dataset(Dataset):
    """Map-style dataset that tokenizes the ``text`` column on access."""

    def __init__(self, df, cfg):
        self.texts = df['text'].values
        self.cfg = cfg

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        # Tokenization happens lazily, one pair per lookup
        return prepare_rerank_input(self.texts[item], self.cfg)

# =========================================================================================
# Mean per-topic recall: share of the true content ids that appear in the predictions
# =========================================================================================
def get_pos_socre(y_true, y_pred):
    """Mean fraction of each row's true ids that also occur in its predicted ids.

    Both arguments are Series of whitespace-separated id strings. The result
    is rounded to 5 decimals.
    """
    true_sets = y_true.map(lambda ids: set(ids.split()))
    pred_sets = y_pred.map(lambda ids: set(ids.split()))
    # Per row: |true ∩ predicted| / |true|
    hit_ratios = np.array([
        len(truth & guess) / len(truth)
        for truth, guess in zip(true_sets, pred_sets)
    ])
    return round(np.mean(hit_ratios), 5)

# =========================================================================================
# Model
# =========================================================================================
class custom_model(nn.Module):
    """Reranker: pretrained encoder, masked mean pooling and a one-logit linear head."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.rerank_model, output_hidden_states = True)
        # Switch off every dropout field the config may use
        for dropout_field in (
            'hidden_dropout',
            'hidden_dropout_prob',
            'attention_dropout',
            'attention_probs_dropout_prob',
        ):
            setattr(self.config, dropout_field, 0.0)
        self.model = AutoModel.from_pretrained(cfg.rerank_model, config = self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Re-initialise a Linear, Embedding or LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Return one pooled embedding per sequence in the batch."""
        token_states = self.model(**inputs).last_hidden_state
        return self.pool(token_states, inputs['attention_mask'])

    def forward(self, inputs):
        # Raw logit per pair; the caller applies the sigmoid
        return self.fc(self.feature(inputs))

In [17]:
# =========================================================================================
# Inference function loop
# =========================================================================================
def inference_fn(test_loader, model, device):
    """Score every batch of ``test_loader`` and return the sigmoid outputs.

    Returns:
        1-D NumPy array with one value per example, in loader order.
    """
    print('start to inference')
    model.eval()
    model.to(device)
    batch_scores = []
    for batch in tqdm(test_loader, total = len(test_loader)):
        # Move the batch tensors to the target device in place
        for name in list(batch.keys()):
            batch[name] = batch[name].to(device)
        with torch.no_grad():
            logits = model(batch)
        # reshape(-1) keeps the result 1-D even when squeeze() collapses a batch of one to a scalar
        batch_scores.append(logits.sigmoid().squeeze().cpu().numpy().reshape(-1))
    return np.concatenate(batch_scores)

# =========================================================================================
# Inference
# =========================================================================================
def _select_content_ids(test, prediction, threshold):
    """Turn pair scores into one space-separated content-id string per topic.

    A pair is kept when its score is above ``threshold``; in addition the
    best-scoring pair of every topic is always kept, so each topic receives at
    least one content id. ``test`` gains the columns ``prediction`` (score) and
    ``predictions`` (0/1 flag) as a side effect.
    """
    test['prediction'] = prediction
    # Highest-scoring pair per topic, used as a fallback
    best_per_topic = test.sort_values('prediction', ascending = False).drop_duplicates('topics_ids', keep = 'first')

    test['predictions'] = np.where(prediction > threshold, 1, 0)
    kept = pd.concat([test[test['predictions'] == 1], best_per_topic])

    # unique() removes the fallback pair when it already passed the threshold
    kept = kept.groupby(['topics_ids'])['content_ids'].unique().reset_index()
    kept['content_ids'] = kept['content_ids'].apply(lambda x: ' '.join(x))
    kept.columns = ['topic_id', 'content_ids']

    # Topics without any kept pair get an empty prediction string
    all_topics = pd.Series(test['topics_ids'].unique())
    missing = all_topics[~all_topics.isin(kept['topic_id'])]
    empty_rows = pd.DataFrame({'topic_id': missing.values, 'content_ids': ""})
    return pd.concat([kept, empty_rows], axis = 0, ignore_index = True)


def inference(test, cfg):
    """Score all (topic, content) pairs with the reranker and write ``submission.csv``.

    Args:
        test: frame with columns ``topics_ids``, ``content_ids`` and ``text``;
            it gains ``prediction`` and ``predictions`` columns as a side effect.
        cfg: config providing ``rerank_model``, ``rerank_tokenizer``,
            ``batch_size``, ``num_workers``, ``gradient_checkpointing`` and
            ``threshold``. An optional ``rerank_weights`` attribute overrides
            the default checkpoint path.

    Returns:
        DataFrame with columns ``topic_id`` and ``content_ids`` (the same data
        that is written to ``submission.csv``).
    """
    # Create dataset and loader
    print('start to create rerank dataset')
    test_dataset = rerank_dataset(test, cfg)
    test_loader = DataLoader(
        test_dataset, 
        batch_size = cfg.batch_size, 
        shuffle = False, 
        collate_fn = DataCollatorWithPadding(tokenizer = cfg.rerank_tokenizer, padding = 'longest'),
        num_workers = cfg.num_workers, 
        pin_memory = True, 
        drop_last = False
    )
    # Get model
    model = custom_model(cfg)
    # Load weights. NOTE: torch.load unpickles the file, so only point this at
    # checkpoints you trust.
    weights_path = getattr(
        cfg,
        'rerank_weights',
        "/kaggle/input/lecr-reranker/autodl-tmp-paraphrase-multilingual-mpnet-base-v2-exp21_fold0_epochs10_fold0_rerank_exp21_top100_5models.pth",
    )
    state = torch.load(weights_path, map_location = torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    # Release memory: drop the references first, then empty the cache;
    # empty_cache() cannot return blocks that are still held by a live model.
    del test_dataset, test_loader, model, state
    gc.collect()
    torch.cuda.empty_cache()
    # Use threshold
    print('start using threshold')
    test_r = _select_content_ids(test, prediction, cfg.threshold)
    test_r.to_csv('submission.csv', index = False)
    return test_r

In [18]:
# Inference
test_r = inference(test, base_CFG)
test_r.head()

start to create rerank dataset
start to inference


  0%|          | 0/12 [00:00<?, ?it/s]

start using threshold


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0
1,t_00068291e9a4,c_639ea2ef9c95 c_ebb7fdf10a7e c_14bf71640ecd c...
2,t_00069b63a70a,c_749b9bfd3a69
3,t_0006d41a73a8,c_b972646631cb c_0c6473c3480d c_d7a0d7eaf799 c...
4,t_4054df11a74e,c_f2d184a98231 c_3695c5dc1df6
