In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Set before `transformers` is imported in a later cell
# (presumably silences the library's advisory warnings — TODO confirm).
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/paraphrase-distilroberta-base-v1/paraphrase-distilroberta-base-v1-exp2/README.md
/kaggle/input/paraphrase-distilroberta-base-v1/paraphrase-distilroberta-base-v1-exp2/model/config.json
/kaggle/input/paraphrase-distilroberta-base-v1/paraphrase-distilroberta-base-v1-exp2/model/merges.txt
/kaggle/input/paraphrase-distilroberta-base-v1/paraphrase-distilroberta-base-v1-exp2/model/sentence_bert_config.json
/kaggle/input/paraphrase-distilroberta-base-v1/paraphrase-distilroberta-base-v1-exp2/model/pytorch_model.bin
/kaggle/input/paraphrase-distilroberta-base-v1/paraphrase-distilroberta-base-v1-exp2/model/config_sentence_transformers.json
/kaggle/input/paraphrase-distilroberta-base-v1/paraphrase-distilroberta-base-v1-exp2/model/modules.json
/kaggle/input/paraphrase-distilroberta-base-v1/paraphrase-distilroberta-base-v1-exp2/tokenizer/tokenizer.json
/kaggle/input/paraphrase-distilroberta-base-v1/paraphrase-distilroberta-base-v1-exp2/tokenizer/vocab.json
/kaggle/input/paraphrase-dist

### Importing Libraries and setting up configuration classes for pretrained model

This code defines three configurations, one per model combination in the ensemble. In each configuration, <code>uns_model</code> and <code>sup_model</code> are the paths of the pre-trained models used for the unsupervised and supervised stages respectively, and <code>uns_tokenizer</code> and <code>sup_tokenizer</code> are the corresponding tokenizers. The <code>pooling</code> attribute indicates how token embeddings are pooled into a single vector. <code>gradient_checkpointing</code> is a flag to determine whether to use gradient checkpointing for more efficient memory usage. <code>add_with_best_prob</code> is a flag indicating whether, at inference time, a topic with no candidate above the probability threshold still receives its highest-probability content item.

Finally, the code creates a list <code>CFG_list</code> that contains all the defined configurations, which will be used in the next steps of the project.

In [6]:
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
import cupy as cp
from cuml.metrics import pairwise_distances
from cuml.neighbors import NearestNeighbors
# Turn off the tokenizers library's own parallelism
# (presumably to avoid clashes with the DataLoader worker processes used below — TODO confirm).
%env TOKENIZERS_PARALLELISM=false
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class CFG1:
    # Ensemble member 1: all-mpnet-base-v2 for the unsupervised stage, xlm-roberta-base for the supervised stage.
    # Base directory of the unsupervised model; loaders append '/tokenizer', '/config' and '/model' to it.
    uns_model = "/kaggle/input/lecr-ensemble/LECR-ENSEMBLE/sentence-transformers-all-mpnet-base-v2"
    # Base directory of the supervised model (same sub-path convention).
    sup_model = "/kaggle/input/lecr-ensemble/LECR-ENSEMBLE/xlm-roberta-base"
    # Both tokenizers are loaded from disk when this class body executes.
    uns_tokenizer = AutoTokenizer.from_pretrained(uns_model + '/tokenizer')
    sup_tokenizer = AutoTokenizer.from_pretrained(sup_model + '/tokenizer')
    # Pooling name checked by custom_model ('mean', 'max', 'min', 'attention', 'WLP' or 'ConcatPool').
    pooling = "mean"
    # When True, custom_model enables gradient checkpointing on its backbone.
    gradient_checkpointing = False
    # When True, inference() gives topics without any accepted candidate their highest-probability content.
    add_with_best_prob = True
    
class CFG2:
    # Ensemble member 2: all-MiniLM-L6-v2 for the unsupervised stage, xlm-roberta-base for the supervised stage.
    # Base directory of the unsupervised model; loaders append '/tokenizer', '/config' and '/model' to it.
    uns_model = "/kaggle/input/lecr-ensemble/LECR-ENSEMBLE/sentence-transformers-all-MiniLM-L6-v2"
    # Base directory of the supervised model (same sub-path convention).
    sup_model = "/kaggle/input/lecr-ensemble/LECR-ENSEMBLE/xlm-roberta-base"
    # Both tokenizers are loaded from disk when this class body executes.
    uns_tokenizer = AutoTokenizer.from_pretrained(uns_model + '/tokenizer')
    sup_tokenizer = AutoTokenizer.from_pretrained(sup_model + '/tokenizer')
    # Pooling name checked by custom_model ('mean', 'max', 'min', 'attention', 'WLP' or 'ConcatPool').
    pooling = "mean"
    # When True, custom_model enables gradient checkpointing on its backbone.
    gradient_checkpointing = False
    # When True, inference() gives topics without any accepted candidate their highest-probability content.
    add_with_best_prob = True
    
class CFG3:
    # Ensemble member 3: stsb-roberta-base-v2 for the unsupervised stage, xlm-roberta-base for the supervised stage.
    # Base directory of the unsupervised model; loaders append '/tokenizer', '/config' and '/model' to it.
    uns_model = "/kaggle/input/stsb-roberta-base-v2/stsb-roberta-base-v2-exp1"
    # Base directory of the supervised model (same sub-path convention).
    sup_model = "/kaggle/input/lecr-ensemble/LECR-ENSEMBLE/xlm-roberta-base"
    # Both tokenizers are loaded from disk when this class body executes.
    uns_tokenizer = AutoTokenizer.from_pretrained(uns_model + '/tokenizer')
    sup_tokenizer = AutoTokenizer.from_pretrained(sup_model + '/tokenizer')
    # Pooling name checked by custom_model ('mean', 'max', 'min', 'attention', 'WLP' or 'ConcatPool').
    pooling = "mean"
    # When True, custom_model enables gradient checkpointing on its backbone.
    gradient_checkpointing = False
    # When True, inference() gives topics without any accepted candidate their highest-probability content.
    add_with_best_prob = True 
    
# Configurations iterated by the final cell; each one produces its own submission_<n>.csv.
CFG_list = [CFG1, CFG2, CFG3]

env: TOKENIZERS_PARALLELISM=false


### Defining a function for reading the data into RAM with basic preprocessing
This is a Python function named <code>read_data(cfg)</code> that takes a configuration object <code>cfg</code> as an input parameter.

The function reads in three CSV files: <code>topics.csv</code>, <code>content.csv</code>, and <code>sample_submission.csv</code>, which contain the data of the Learning Equality - Curriculum Recommendations competition.

After reading in the CSV files, the function merges the <code>topics</code> dataframe with the <code>sample_submission</code> dataframe, which allows the function to only infer test topics.

Next, the function fills in missing values in the <code>title</code> column of both the <code>topics</code> and <code>content</code> dataframes.

Then, the function adds a new column called <code>length</code> to both dataframes, which contains the length of the <code>title</code> column for each row. The dataframes are then sorted in ascending order by the <code>length</code> column to make inference faster.

The function drops several columns from both dataframes that are not needed for the project, and then resets the index of both dataframes.

Finally, the function prints the shapes of both dataframes to verify that they have been read in correctly.

In [8]:
def read_data(cfg, data_dir = '/kaggle/input/learning-equality-curriculum-recommendations'):
    """Load the test topics and all content items, keeping only id, title and language.

    Args:
        cfg: configuration object; accepted for call-site compatibility, not read here.
        data_dir: directory holding `topics.csv`, `content.csv` and `sample_submission.csv`.

    Returns:
        (topics, content): two DataFrames sorted by ascending title length, with a fresh
        0..n-1 index. `topics` contains only the topics listed in `sample_submission.csv`.
    """
    topics = pd.read_csv(os.path.join(data_dir, 'topics.csv'))
    content = pd.read_csv(os.path.join(data_dir, 'content.csv'))
    sample_submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
    # Merge topics with sample submission to only infer test topics
    topics = topics.merge(sample_submission, how = 'inner', left_on = 'id', right_on = 'topic_id')
    # Fill missing titles by assignment: `df[col].fillna(..., inplace=True)` mutates a
    # temporary Series and is a no-op when pandas Copy-on-Write is active.
    topics['title'] = topics['title'].fillna("")
    content['title'] = content['title'].fillna("")
    # Sort by title length so batches contain similarly sized sequences (less padding)
    topics['length'] = topics['title'].str.len()
    content['length'] = content['title'].str.len()
    topics = (
        topics
        .sort_values('length')
        .drop(columns = ['description', 'channel', 'category', 'level', 'parent',
                         'has_content', 'length', 'topic_id', 'content_ids'])
        .reset_index(drop = True)
    )
    content = (
        content
        .sort_values('length')
        .drop(columns = ['description', 'kind', 'text', 'copyright_holder', 'license', 'length'])
        .reset_index(drop = True)
    )
    print(' ')
    print('-' * 50)
    print(f"topics.shape: {topics.shape}")
    print(f"content.shape: {content.shape}")
    return topics, content

### Creating input tensors for the unsupervised model from the tokenizer's encodings

This code defines a function <code>prepare_uns_input</code> that takes two arguments: <code>text</code> and <code>cfg</code>. The function uses the <code>uns_tokenizer</code> attribute of the configuration object <code>cfg</code> to tokenize the input <code>text</code>. The resulting tokenized input is encoded using <code>encode_plus</code> method of the tokenizer. The <code>return_tensors</code> argument is set to <code>None</code>, meaning that the function will not return the encoded input as tensors. The <code>add_special_tokens</code> argument is set to <code>True</code>, meaning that the special tokens of the tokenizer will be added to the encoded input.

The resulting encoded input is stored in a dictionary called <code>inputs</code>, which is then looped over to convert all values to PyTorch tensors of data type <code>long</code>. Finally, the function returns the encoded and tensorized input.

In [9]:
def prepare_uns_input(text, cfg):
    """Tokenize `text` with `cfg.uns_tokenizer` and convert every field to a long tensor.

    The container returned by `encode_plus` is updated in place and returned.
    """
    tokenizer = cfg.uns_tokenizer
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens = True,
        return_tensors = None,
    )
    # Overwrite each field with its tensor form; the key set itself does not change.
    for field, values in list(encoded.items()):
        encoded[field] = torch.tensor(values, dtype = torch.long)
    return encoded

### **Defining Pooling classes**

This is a module defining several pooling layers used in natural language processing tasks, where the outputs of a pre-trained language model (such as <code>BERT</code>) are passed through these pooling layers to obtain a fixed-size representation of the input text. The fixed-size representation can then be used for downstream tasks, such as classification or regression.

The module defines the following pooling layers:

- <code>MeanPooling</code>: computes the mean of the hidden state vectors over the input sequence, weighted by the attention mask to exclude padding tokens.
- <code>AttentionPooling</code>: computes a weighted sum of the hidden state vectors over the input sequence, where the weights are computed using a multi-layer perceptron (<code>MLP</code>) and then normalized with a softmax function, again weighted by the attention mask to exclude padding tokens.
- <code>MaxPooling</code>: computes the element-wise maximum of the hidden state vectors over the input sequence, where the hidden state vectors are set to a very negative value (-10^4) for padding tokens so they are not selected as the maximum.
- <code>MinPooling</code>: computes the element-wise minimum of the hidden state vectors over the input sequence, where the hidden state vectors are set to a very small positive value (10^-4) for padding tokens so they are not selected as the minimum.
- <code>WeightedLayerPooling</code>: computes a weighted average of the hidden state vectors over the layers of the pre-trained language model from <code>layer_start</code> onwards, where the weights can be learned or manually set.
- <code>ConcatPooling</code>: concatenates the hidden state vectors from the last <code>n_layers</code> of the pre-trained language model along the feature dimension, where <code>n_layers</code> is a hyperparameter specified in the <code>pooling_config</code>, and returns the concatenated vector of the first token of each sequence.

In [10]:
class MeanPooling(nn.Module):
    """Average the token vectors of each sequence, counting only positions where the mask is non-zero."""
    def __init__(self):
        super().__init__()
    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the feature dimension so it can weight every hidden unit.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(dim = 1)
        # Clamp keeps the division finite when a sequence is entirely masked.
        token_count = mask.sum(dim = 1).clamp(min = 1e-9)
        return masked_total / token_count

class AttentionPooling(nn.Module):
    """Pool token vectors with learned attention weights; masked positions get zero weight."""
    def __init__(self, hidden_size):
        super().__init__()
        scorer_layers = [
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.GELU(),
            nn.Linear(hidden_size, 1),
        ]
        self.attention = nn.Sequential(*scorer_layers)
    def forward(self, last_hidden_state, attention_mask):
        scores = self.attention(last_hidden_state).float()
        # A score of -inf turns into a weight of exactly 0 after the softmax.
        is_padding = (attention_mask == 0).unsqueeze(-1)
        scores = scores.masked_fill(is_padding, float('-inf'))
        weights = torch.softmax(scores, dim = 1)
        return (weights * last_hidden_state).sum(dim = 1)

class MaxPooling(nn.Module):
    """Element-wise maximum over the sequence dimension, with masked positions pushed to -1e4."""
    def __init__(self):
        super().__init__()
    def forward(self, last_hidden_state, attention_mask):
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Build a filled copy; the caller's tensor is left untouched.
        filled = last_hidden_state.masked_fill(mask == 0, -1e4)
        return filled.max(dim = 1).values
       
class MinPooling(nn.Module):
    """Element-wise minimum over the sequence dimension, ignoring masked positions.

    Masked positions are overwritten with a large positive sentinel (1e4), mirroring the
    -1e4 used by MaxPooling. The previous sentinel of 1e-4 was smaller than most real
    activations, so padding could be returned as the minimum.
    """
    def __init__(self):
        super(MinPooling, self).__init__()
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Work on a copy so the caller's hidden states are not modified.
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = 1e4
        min_embeddings, _ = torch.min(embeddings, dim = 1)
        return min_embeddings
    
class WeightedLayerPooling(nn.Module):
    """Weighted average of per-layer hidden states, using layers `layer_start` onwards.

    When `layer_weights` is not supplied, one learnable weight per used layer is created,
    all initialised to 1.
    """
    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            # +1 because the stacked layer outputs include one more entry than `num_hidden_layers`.
            n_used_layers = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_used_layers, dtype=torch.float))
        self.layer_weights = layer_weights
    def forward(self, features):
        stacked = torch.stack(features['all_layer_embeddings'])
        stacked = stacked[self.layer_start:, :, :, :]

        # Append three trailing singleton axes so the per-layer weights broadcast over the stack.
        weights = self.layer_weights
        for _ in range(3):
            weights = weights.unsqueeze(-1)
        weights = weights.expand(stacked.size())

        averaged = (weights * stacked).sum(dim=0) / self.layer_weights.sum()
        features.update({'token_embeddings': averaged})
        return features
    
class ConcatPooling(nn.Module):
    """Concatenate the first-token vectors of the last `n_layers` hidden-state layers.

    Args:
        backbone_config: object exposing `hidden_size`.
        pooling_config: object exposing `n_layers`.
    """
    def __init__(self, backbone_config, pooling_config):
        super(ConcatPooling, self).__init__()
        self.n_layers = pooling_config.n_layers
        self.output_dim = backbone_config.hidden_size*pooling_config.n_layers

    def forward(self, inputs, backbone_outputs):
        # The helper `get_all_hidden_states` previously called here is not defined in this
        # notebook, so the per-layer states are read from the backbone output directly.
        # NOTE(review): assumes the backbone was run with output_hidden_states=True so that
        # `.hidden_states` is populated — confirm for any new caller.
        all_hidden_states = backbone_outputs.hidden_states

        # Last layer first, then the one before it, and so on, joined along the feature axis.
        concatenate_pooling = torch.cat([all_hidden_states[-(i + 1)] for i in range(self.n_layers)], -1)
        # Keep only the first token of every sequence.
        concatenate_pooling = concatenate_pooling[:, 0]
        return concatenate_pooling

### Unsupervised Dataset

This is a Python class definition for a custom dataset called <code>uns_dataset</code> which inherits from the PyTorch <code>Dataset</code> class.

The <code>__init__</code> method initializes the dataset by taking in two arguments: <code>df</code>, which is a pandas dataframe containing a column named <code>title</code> with the input text data, and <code>cfg</code>, which is a configuration object that contains parameters related to the text preprocessing and encoding.

The <code>__len__</code> method is required by the <code>Dataset</code> class and returns the length of the dataset. In this case, it returns the length of the <code>texts</code> attribute, which is a numpy array of the text data.

The <code>__getitem__</code> method is also required by the <code>Dataset</code> class and defines how to retrieve an individual data sample from the dataset. In this case, it takes an index <code>item</code> as an argument, prepares the input data using a <code>prepare_uns_input</code> function with the text at that index and the <code>cfg</code> configuration object, and returns the prepared input data.

Overall, this class allows the text data to be loaded and prepared on the fly as needed during training or inference.

In [11]:
class uns_dataset(Dataset):
    """Dataset over the `title` column of a DataFrame; items are tokenized lazily on access."""
    def __init__(self, df, cfg):
        self.texts = df['title'].values
        self.cfg = cfg
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, item):
        # Tokenization happens here, once per requested sample.
        title = self.texts[item]
        return prepare_uns_input(title, self.cfg)

**Preparing the input and dataset for the supervised model**

In [12]:
def prepare_sup_input(text, cfg):
    """Tokenize `text` with `cfg.sup_tokenizer` and convert every field to a long tensor.

    The container returned by `encode_plus` is updated in place and returned.
    """
    tokenizer = cfg.sup_tokenizer
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens = True,
        return_tensors = None,
    )
    # Overwrite each field with its tensor form; the key set itself does not change.
    for field, values in list(encoded.items()):
        encoded[field] = torch.tensor(values, dtype = torch.long)
    return encoded

class sup_dataset(Dataset):
    """Dataset over the `text` column of a DataFrame; items are tokenized lazily on access."""
    def __init__(self, df, cfg):
        self.texts = df['text'].values
        self.cfg = cfg
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, item):
        # Tokenization happens here, once per requested sample.
        pair_text = self.texts[item]
        return prepare_sup_input(pair_text, self.cfg)

**Unsupervised model architecture**

In [13]:
class uns_model(nn.Module):
    """Pre-trained encoder followed by mean pooling; produces one embedding per input sequence."""
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        base_dir = cfg.uns_model
        # Config and weights live in separate sub-directories of the model folder.
        self.config = AutoConfig.from_pretrained(base_dir + '/config')
        self.model = AutoModel.from_pretrained(base_dir + '/model', config = self.config)
        self.pool = MeanPooling()
    def feature(self, inputs):
        token_states = self.model(**inputs).last_hidden_state
        return self.pool(token_states, inputs['attention_mask'])
    def forward(self, inputs):
        return self.feature(inputs)

**Getting Embeddings**

In [14]:
def get_embeddings(loader, model, device):
    """Run `model` over every batch of `loader` without gradients and stack the outputs.

    Returns a single numpy array with the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    chunks = []
    for batch in tqdm(loader):
        # Move every tensor of the batch to the target device, updating the batch in place.
        for name, tensor in list(batch.items()):
            batch[name] = tensor.to(device)
        with torch.no_grad():
            embeddings = model(batch)
        chunks.append(embeddings.cpu().numpy())
    return np.concatenate(chunks)

**Computing the positive score (share of true content ids covered by the predictions)**

In [15]:
def get_pos_socre(y_true, y_pred):
    """Mean fraction of each row's true ids that also appear in its predicted ids.

    Both arguments are Series of whitespace-separated id strings; the result is rounded
    to 5 decimals.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))
    covered = np.array([
        len(truth & guess) / len(truth)
        for truth, guess in zip(true_sets, pred_sets)
    ])
    return round(np.mean(covered), 5)

**Building our inference set**

In [16]:
def build_inference_set(topics, content, cfg):
    """Expand each topic's candidate list into one row per (topic, candidate content) pair.

    Args:
        topics: DataFrame with `id`, `language`, `title` and `predictions` columns, where
            `predictions` is a space-separated string of content ids.
        content: DataFrame indexed by content id, with `title` and `language` columns.
        cfg: configuration object; accepted for call-site compatibility, not read here.

    Returns:
        DataFrame with columns `topics_ids`, `content_ids`, `title1`, `title2`,
        `topic_language`, `content_language`; rows follow the topic order, then the
        order of ids inside each `predictions` string.

    Raises:
        KeyError: if a predicted id is missing from `content`'s index.
    """
    # One row per candidate: split the id string, then explode the resulting lists.
    pairs = (
        topics[['id', 'title', 'language']]
        .assign(content_ids = topics['predictions'].str.split(' '))
        .explode('content_ids')
        .reset_index(drop = True)
    )
    # A single batched label lookup replaces two `.loc` calls per pair in a Python loop.
    matched = content.loc[pairs['content_ids'].to_numpy(), ['title', 'language']]
    # Build inference dataset (positional assembly keeps the pair order intact)
    test = pd.DataFrame(
        {'topics_ids': pairs['id'].to_numpy(),
         'content_ids': pairs['content_ids'].to_numpy(),
         'title1': pairs['title'].to_numpy(),
         'title2': matched['title'].to_numpy(),
         'topic_language': pairs['language'].to_numpy(),
         'content_language': matched['language'].to_numpy(),
        }
    )
    # Release memory
    del pairs, matched
    gc.collect()

    return test

**Getting Neighbours**

In [17]:
def get_neighbors(tmp_topics, tmp_content, cfg, n_neighbors = 1000):
    """Embed topic and content titles and attach each topic's nearest content ids.

    Args:
        tmp_topics: DataFrame with a `title` column; it receives a new `predictions`
            column (space-separated content ids, nearest first) in place.
        tmp_content: DataFrame with `title` and `id` columns.
        cfg: configuration providing `uns_model` and `uns_tokenizer`.
        n_neighbors: number of candidates per topic; capped at the number of content rows
            so a small content table does not make the KNN query fail.

    Returns:
        (tmp_topics, tmp_content): the same objects that were passed in.
    """
    def _title_loader(frame):
        # Same loader settings for topics and content; order is preserved (no shuffling).
        return DataLoader(
            uns_dataset(frame, cfg),
            batch_size = 32,
            shuffle = False,
            collate_fn = DataCollatorWithPadding(tokenizer = cfg.uns_tokenizer, padding = 'longest'),
            num_workers = 4,
            pin_memory = True,
            drop_last = False
        )
    # Create unsupervised model to extract embeddings
    model = uns_model(cfg)
    model.to(device)
    # Predict topics, then content
    topics_preds = get_embeddings(_title_loader(tmp_topics), model, device)
    content_preds = get_embeddings(_title_loader(tmp_content), model, device)
    # The encoder is no longer needed: free its GPU memory before the KNN step
    del model
    gc.collect()
    torch.cuda.empty_cache()
    # Transfer predictions to gpu
    topics_preds_gpu = cp.array(topics_preds)
    content_preds_gpu = cp.array(content_preds)
    # Release host copies
    del topics_preds, content_preds
    gc.collect()
    # KNN model
    print(' ')
    print('Training KNN model...')
    k = min(n_neighbors, len(tmp_content))
    neighbors_model = NearestNeighbors(n_neighbors = k, metric = 'cosine')
    neighbors_model.fit(content_preds_gpu)
    indices = neighbors_model.kneighbors(topics_preds_gpu, return_distance = False)
    # KNN indices are row positions in the embedding matrix, so map them to ids by
    # position (one vectorised lookup) rather than by index label.
    content_ids = tmp_content['id'].to_numpy()
    neighbor_ids = content_ids[indices.get()]
    tmp_topics['predictions'] = [' '.join(row) for row in neighbor_ids]
    # Release memory
    del topics_preds_gpu, content_preds_gpu, neighbors_model, neighbor_ids, indices
    gc.collect()
    torch.cuda.empty_cache()
    return tmp_topics, tmp_content

**Processing Test DF**

In [18]:
def preprocess_test(tmp_test):
    """Turn the title pairs into a single `text` column and sort rows by text length.

    The frame is modified in place and also returned: `title1`/`title2` are replaced by
    `text` ("<title1>[SEP]<title2>"), rows are ordered by ascending text length and the
    index is reset.
    """
    # Assign the filled columns back: `df[col].fillna(..., inplace=True)` mutates a
    # temporary Series and is a no-op when pandas Copy-on-Write is active.
    for col in ('title1', 'title2'):
        tmp_test[col] = tmp_test[col].fillna("Title does not exist")
    # Create feature column
    tmp_test['text'] = tmp_test['title1'] + '[SEP]' + tmp_test['title2']
    # Drop titles
    tmp_test.drop(['title1', 'title2'], axis = 1, inplace = True)
    # Sort so inference is faster (similar lengths per batch means less padding)
    tmp_test['length'] = tmp_test['text'].str.len()
    tmp_test.sort_values('length', inplace = True)
    tmp_test.drop(['length'], axis = 1, inplace = True)
    tmp_test.reset_index(drop = True, inplace = True)
    gc.collect()
    torch.cuda.empty_cache()
    return tmp_test

**Model Itself**

In [19]:
class custom_model(nn.Module):
    """Pre-trained backbone + pooling layer + single-logit linear head.

    The pooling layer is selected by `cfg.pooling`: 'mean', 'max', 'min', 'attention',
    'WLP' (weighted layer pooling) or 'ConcatPool' (mean-pooled last four layers,
    concatenated). The instance's own `cfg` is used throughout, so the model does not
    depend on a module-level variable named `CFG` being defined.

    Raises:
        ValueError: if `cfg.pooling` is not one of the supported names.
    """
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.sup_model + '/config', output_hidden_states = True)
        # Disable every dropout variant the backbone config may expose.
        self.config.hidden_dropout = 0.0
        self.config.hidden_dropout_prob = 0.0
        self.config.attention_dropout = 0.0
        self.config.attention_probs_dropout_prob = 0.0
        self.model = AutoModel.from_pretrained(cfg.sup_model + '/model', config = self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        pooling = self.cfg.pooling
        if pooling == 'mean' or pooling == "ConcatPool":
            self.pool = MeanPooling()
        elif pooling == 'max':
            self.pool = MaxPooling()
        elif pooling == 'min':
            self.pool = MinPooling()
        elif pooling == 'attention':
            self.pool = AttentionPooling(self.config.hidden_size)
        elif pooling == "WLP":
            self.pool = WeightedLayerPooling(self.config.num_hidden_layers, layer_start=6)
        else:
            # Fail here rather than with an AttributeError on the first forward pass.
            raise ValueError(f"Unsupported pooling type: {pooling!r}")

        # ConcatPool joins four pooled layers, so the head input is four times wider.
        if pooling == "ConcatPool":
            self.fc = nn.Linear(self.config.hidden_size*4, 1)
        else:
            self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
    def _init_weights(self, module):
        """Initialise `module` with the backbone config's `initializer_range` scheme."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    def feature(self, inputs):
        """Return the pooled representation of `inputs`; the backbone runs exactly once."""
        pooling = self.cfg.pooling
        attention_mask = inputs['attention_mask']

        if pooling == "WLP":
            outputs = self.model(input_ids=inputs['input_ids'], attention_mask=attention_mask)
            pooled = self.pool({'all_layer_embeddings': outputs.hidden_states})
            # First-token vector of the layer-weighted embeddings.
            feature = pooled['token_embeddings'][:, 0]

        elif pooling == "ConcatPool":
            outputs = self.model(input_ids=inputs['input_ids'], attention_mask=attention_mask)
            hidden_states = outputs.hidden_states
            # Pool each of the last four layers, last layer first, and join them.
            feature = torch.cat(
                [self.pool(hidden_states[-k], attention_mask) for k in (1, 2, 3, 4)], -1
            )

        else:
            outputs = self.model(**inputs)
            feature = self.pool(outputs.last_hidden_state, attention_mask)

        return feature
    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

**Inference function**

In [20]:
def inference_fn(test_loader, model, device):
    """Score every batch of `test_loader` and return the sigmoid probabilities as a flat array."""
    model.eval()
    model.to(device)
    batch_probs = []
    progress = tqdm(test_loader, total = len(test_loader))
    for batch in progress:
        # Move every tensor of the batch to the target device, updating the batch in place.
        for name, tensor in list(batch.items()):
            batch[name] = tensor.to(device)
        with torch.no_grad():
            logits = model(batch)
        probs = torch.sigmoid(logits).squeeze().cpu().numpy()
        # reshape(-1) keeps a one-sample batch as a length-1 array instead of a scalar.
        batch_probs.append(probs.reshape(-1))
    return np.concatenate(batch_probs)

def inference(test, cfg, _idx, threshold = 0.0006,
              weights_path = "/kaggle/input/lecr-ensemble/LECR-ENSEMBLE/xlm-roberta-base_fold0_42.pth"):
    """Score all topic/content pairs with the supervised model and write `submission_<_idx+1>.csv`.

    Args:
        test: DataFrame with `text`, `topics_ids`, `content_ids`, `topic_language` and
            `content_language` columns; `probs` and `predictions` columns are added to it.
        cfg: configuration providing `sup_tokenizer`, the `custom_model` settings and
            `add_with_best_prob`.
        _idx: zero-based index of the configuration, used in the output file name.
        threshold: a pair is accepted when its probability is strictly above this value.
        weights_path: checkpoint file; its `'model'` entry is loaded as the state dict.

    Returns:
        DataFrame with `topic_id` and `content_ids` (space-separated) columns.
    """
    # Create dataset and loader
    test_dataset = sup_dataset(test, cfg)
    test_loader = DataLoader(
        test_dataset,
        batch_size = 32,
        shuffle = False,
        collate_fn = DataCollatorWithPadding(tokenizer = cfg.sup_tokenizer, padding = 'longest'),
        num_workers = 2,
        pin_memory = True,
        drop_last = False
    )
    # Get model
    model = custom_model(cfg)

    # Load weights
    state = torch.load(weights_path, map_location = torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)

    # Release memory: drop the references first, otherwise empty_cache() has nothing to free
    del test_dataset, test_loader, model, state
    gc.collect()
    torch.cuda.empty_cache()

    # Use threshold
    test['probs'] = prediction
    test['predictions'] = (test['probs'] > threshold).astype(int)
    # Attach every topic's best probability to each of its rows as `probs_max`
    test = test.merge(test.groupby("topics_ids", as_index=False)["probs"].max(), on="topics_ids", suffixes=["", "_max"])
    display(test.head())

    # Accepted pairs: above the threshold and in the same language as the topic
    test1 = test[(test['predictions'] == 1) & (test['topic_language'] == test['content_language'])]
    test1 = test1.groupby(['topics_ids'])['content_ids'].unique().reset_index()
    test1['content_ids'] = test1['content_ids'].apply(lambda x: ' '.join(x))
    test1.columns = ['topic_id', 'content_ids']
    display(test1.head())

    # Topics left without any accepted pair
    test0 = pd.Series(test['topics_ids'].unique())
    test0 = test0[~test0.isin(test1['topic_id'])]
    test0 = pd.DataFrame({'topic_id': test0.values, 'content_ids': ""})
    if cfg.add_with_best_prob:
        # Fall back to the highest-probability candidate(s). Candidates tied for the
        # maximum each yield their own row here; the final ensembling step groups rows
        # by topic again.
        test0 = test0[["topic_id"]].merge(test[test['probs'] == test['probs_max']][["topics_ids", "content_ids"]],
                                          left_on="topic_id", right_on="topics_ids")[['topic_id', "content_ids"]]
    display(test0.head())
    test_r = pd.concat([test1, test0], axis = 0, ignore_index = True)
    test_r.to_csv(f'submission_{_idx+1}.csv', index = False)

    return test_r

**Generating results**

In [21]:
# Run the full pipeline once per configuration; each pass writes submission_<n>.csv.
# NOTE(review): keep the loop variable named `CFG` unless custom_model is confirmed not to
# read a module-level `CFG` — renaming it could break model construction.
for _idx, CFG in enumerate(CFG_list):
    # Read data
    tmp_topics, tmp_content = read_data(CFG)
    # Run nearest neighbors
    tmp_topics, tmp_content = get_neighbors(tmp_topics, tmp_content, CFG)
    gc.collect()
    torch.cuda.empty_cache()
    # Set id as index for content (must happen after get_neighbors, which reads the `id` column)
    tmp_content.set_index('id', inplace = True)
    # Build inference set (one row per topic/candidate pair)
    tmp_test = build_inference_set(tmp_topics, tmp_content, CFG)
    # Process test set
    tmp_test = preprocess_test(tmp_test)
    # Inference
    inference(tmp_test, CFG, _idx)
    del tmp_topics, tmp_content, tmp_test
    gc.collect()
    torch.cuda.empty_cache()
    
# Ensemble: per topic, take the union of the content ids proposed by every configuration.
df_test = pd.concat([pd.read_csv(f'submission_{_idx + 1}.csv') for _idx in range(len(CFG_list))])
# Empty content_ids cells are read back as NaN; restore them to empty strings before splitting.
df_test.fillna("", inplace = True)
df_test['content_ids'] = df_test['content_ids'].apply(lambda c: c.split(' '))
df_test = df_test.explode('content_ids').groupby(['topic_id'])['content_ids'].unique().reset_index()
df_test['content_ids'] = df_test['content_ids'].apply(lambda c: ' '.join(c))

df_test.to_csv('submission.csv', index = False)
df_test.head()

 
--------------------------------------------------
topics.shape: (5, 3)
content.shape: (154047, 3)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4814 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times



 
Training KNN model...


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

Unnamed: 0,topics_ids,content_ids,topic_language,content_language,text,probs,predictions,probs_max
0,t_00069b63a70a,c_66111e868395,en,en,Transcripts[SEP]DNA,4.851724e-08,0,0.000185
1,t_00069b63a70a,c_bd5e71a65b93,en,en,Transcripts[SEP]DNA,4.851724e-08,0,0.000185
2,t_00069b63a70a,c_c152775f6f7b,en,en,Transcripts[SEP]DNA,4.851724e-08,0,0.000185
3,t_00069b63a70a,c_a7799481219a,en,pt,Transcripts[SEP]DNA,4.851724e-08,0,0.000185
4,t_00069b63a70a,c_b68d68a3868b,en,en,Transcripts[SEP]DNA,4.851724e-08,0,0.000185


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_5bc0e1e2cba0
1,t_00068291e9a4,c_ebb7fdf10a7e c_14bf71640ecd c_ac1672cdcd2c c...
2,t_0006d41a73a8,c_b972646631cb c_0c6473c3480d c_d7a0d7eaf799 c...
3,t_4054df11a74e,c_3695c5dc1df6


Unnamed: 0,topic_id,content_ids
0,t_00069b63a70a,c_749b9bfd3a69


 
--------------------------------------------------
topics.shape: (5, 3)
content.shape: (154047, 3)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4814 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multipro

 
Training KNN model...


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

Unnamed: 0,topics_ids,content_ids,topic_language,content_language,text,probs,predictions,probs_max
0,t_00069b63a70a,c_66111e868395,en,en,Transcripts[SEP]DNA,4.851751e-08,0,0.00019
1,t_00069b63a70a,c_f9389c635f87,en,en,Transcripts[SEP]RNA,4.613222e-08,0,0.00019
2,t_00069b63a70a,c_c152775f6f7b,en,en,Transcripts[SEP]DNA,4.851751e-08,0,0.00019
3,t_00069b63a70a,c_bd5e71a65b93,en,en,Transcripts[SEP]DNA,4.851751e-08,0,0.00019
4,t_00069b63a70a,c_a7799481219a,en,pt,Transcripts[SEP]DNA,4.851751e-08,0,0.00019


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_5bc0e1e2cba0
1,t_00068291e9a4,c_ebb7fdf10a7e c_639ea2ef9c95 c_14bf71640ecd c...
2,t_0006d41a73a8,c_b972646631cb c_0c6473c3480d c_d7a0d7eaf799 c...
3,t_4054df11a74e,c_3695c5dc1df6


Unnamed: 0,topic_id,content_ids
0,t_00069b63a70a,c_186fc761585b


 
--------------------------------------------------
topics.shape: (5, 3)
content.shape: (154047, 3)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4814 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times



 
Training KNN model...


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

Unnamed: 0,topics_ids,content_ids,topic_language,content_language,text,probs,predictions,probs_max
0,t_00069b63a70a,c_83398bf2a6b9,en,ar,Transcripts[SEP]التناسب,2.698774e-08,0,0.00019
1,t_00069b63a70a,c_2aeda03b182e,en,ar,"Transcripts[SEP]""سوبرمان""",2.90229e-08,0,0.00019
2,t_00069b63a70a,c_293622bd38b5,en,ar,Transcripts[SEP]رغمًا عن...,1.029449e-08,0,0.00019
3,t_00069b63a70a,c_61628bebc483,en,en,Transcripts[SEP]Phrasal Verbs,3.826137e-08,0,0.00019
4,t_00069b63a70a,c_0c4b328160dd,en,en,Transcripts[SEP]Direct Speech,1.800586e-07,0,0.00019


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_5bc0e1e2cba0
1,t_00068291e9a4,c_639ea2ef9c95 c_ebb7fdf10a7e c_14bf71640ecd c...
2,t_0006d41a73a8,c_b972646631cb c_0c6473c3480d c_d7a0d7eaf799 c...
3,t_4054df11a74e,c_3695c5dc1df6


Unnamed: 0,topic_id,content_ids
0,t_00069b63a70a,c_186fc761585b


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_5bc0e1e2cba0
1,t_00068291e9a4,c_ebb7fdf10a7e c_14bf71640ecd c_ac1672cdcd2c c...
2,t_00069b63a70a,c_749b9bfd3a69 c_186fc761585b
3,t_0006d41a73a8,c_b972646631cb c_0c6473c3480d c_d7a0d7eaf799 c...
4,t_4054df11a74e,c_3695c5dc1df6
