# Setup

In [1]:
import gc
import pandas as pd
from tqdm.auto import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import DataCollatorWithPadding

# Load the Models

In [2]:
class CFG1:
    """Settings for the first model; the paths point at an all-MiniLM-L6-v2 checkpoint."""
    # Base directory: `/tokenizer` is read from it right below, and `custom_model`
    # reads `/config` and `/model` from the same place.
    sup_model = "../../input/stage-1-all-minilm-l6-v2/all-MiniLM-L6-v2-exp_fold0_epochs10"
    # Fine-tuned checkpoint file; the notebook loads it with torch.load and uses its 'model' entry.
    sup_model_tuned = "../../input/sentence-transformers-all-minilm-l6-v2-fold0-42/sentence-transformers-all-MiniLM-L6-v2_fold0_42.pth"
    # NOTE: this hits the disk as soon as the class body is executed, not on instantiation.
    sup_tokenizer = AutoTokenizer.from_pretrained(f"{sup_model}/tokenizer")
    # Only `batch_size` below is read by the code visible in this notebook; the other
    # three flags are carried along but not consulted here.
    pooling = "mean"
    batch_size = 120
    gradient_checkpointing = False
    add_with_best_prob = False
    
class CFG2:
    """Settings for the second model; the paths point at a paraphrase-multilingual-mpnet-base-v2 checkpoint."""
    # Base directory: `/tokenizer` is read from it right below, and `custom_model`
    # reads `/config` and `/model` from the same place.
    sup_model = "../../input/paraphrasemultilingualmpnetbasev2-origin2/paraphrasemultilingualmpnetbasev2-origin"
    # Fine-tuned checkpoint file; the notebook loads it with torch.load and uses its 'model' entry.
    sup_model_tuned = "../../input/paraphrase-multilingual-mpnet-base-v2-reranker/model-paraphrase-multilingual-mpnet-base-v2-tuned_0.4747.pth"
    # NOTE: this hits the disk as soon as the class body is executed, not on instantiation.
    sup_tokenizer = AutoTokenizer.from_pretrained(f"{sup_model}/tokenizer")
    # Only `batch_size` below is read by the code visible in this notebook; the other
    # three flags are carried along but not consulted here.
    pooling = "mean"
    batch_size = 120
    gradient_checkpointing = False
    add_with_best_prob = True

In [3]:
class MeanPooling(nn.Module):
    """Average token embeddings over the sequence axis, counting only unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of `last_hidden_state` along dimension 1.

        `attention_mask` gets a trailing axis, is broadcast to the shape of
        `last_hidden_state` and cast to float, so masked positions contribute
        zero to the sum and are excluded from the divisor.
        """
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        weighted_total = (last_hidden_state * weights).sum(1)
        # The clamp keeps a fully masked row from dividing by zero.
        token_count = weights.sum(1).clamp(min=1e-9)
        return weighted_total / token_count

In [4]:
class custom_model(nn.Module):
    """Transformer backbone followed by masked mean pooling and a one-output linear head.

    The backbone config and weights are read from ``cfg.sup_model + '/config'``
    and ``cfg.sup_model + '/model'``. All dropout settings on the config are
    forced to 0.0 before the backbone is built.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        self.config = AutoConfig.from_pretrained(f"{cfg.sup_model}/config", output_hidden_states=True)
        # Different architectures name their dropout fields differently, so all four are set.
        for dropout_field in (
            "hidden_dropout",
            "hidden_dropout_prob",
            "attention_dropout",
            "attention_probs_dropout_prob",
        ):
            setattr(self.config, dropout_field, 0.0)

        self.model = AutoModel.from_pretrained(f"{cfg.sup_model}/model", config=self.config)
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 1)
        # Only the freshly created head is re-initialised; the backbone keeps its loaded weights.
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Re-initialise `module` in place if it is a Linear, Embedding or LayerNorm layer.

        Linear/Embedding weights are drawn from N(0, config.initializer_range);
        Linear biases and the Embedding padding row are zeroed. LayerNorm is reset
        to weight 1 / bias 0. Any other module type is left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone on `inputs` and mean-pool its last hidden state with the attention mask."""
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])

    def forward(self, inputs):
        """Return ``(pooled_feature, head_output)`` for a batch of tokenized inputs."""
        pooled = self.feature(inputs)
        return pooled, self.fc(pooled)

# Inference

In [5]:
class sup_dataset(Dataset):
    """Map-style dataset over the `text` column of a dataframe, tokenized one row per lookup."""

    def __init__(self, df, cfg):
        # Keep just the raw column values; `cfg` is handed to the tokenizing helper on access.
        self.texts = df['text'].values
        self.cfg = cfg

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        # Tokenization is done lazily here rather than up front in __init__.
        return prepare_sup_input(self.texts[item], self.cfg)

In [6]:
def prepare_sup_input(text, cfg):
    """Tokenize `text` with ``cfg.sup_tokenizer`` and convert every field to a long tensor.

    Special tokens are added. No truncation or padding is requested here, so
    the returned tensors have whatever length the tokenizer produces for `text`.
    The object returned by the tokenizer is updated in place and returned.
    """
    encoded = cfg.sup_tokenizer.encode_plus(text, return_tensors=None, add_special_tokens=True)
    # Snapshot the keys first, then overwrite each list of ids with its tensor form.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded

In [7]:
# Load the table whose `text` column is fed to both models below.
# The line terminator is pinned to "\n" explicitly — presumably so that stray
# "\r" characters inside text fields are not treated as row breaks; confirm.
df = pd.read_csv("../../input/prep_cleaned_train_context_5fold.csv", lineterminator="\n")

In [8]:
# Show the first row as a quick look at the loaded columns.
df.head(1)

Unnamed: 0,topics_ids,content_ids,channel,topic_title,topic_description,topic_parent_title,topic_parent_description,topic_child_title,topic_child_description,topic_category,topic_language,content_title,content_description,content_text,content_kind,content_language,target,topic_fold,content_fold,text
0,t_3d9ad9931021,c_efb73ad83f4b,ebc86c,,BC Introduction Human Biology Grewal,Butte College [SEP] Campus Courses [SEP] Libre...,,Text [SEP] Introduction to Human Biology [SEP]...,,supplemental,en,,,Orientaciones profesorado Orientaciones profes...,document,es,0,3.0,2.0,BC Introduction Human Biology Grewal [SEP] But...


## Inference with CFG1 and m1

In [9]:
# Build model 1 from the base files, then overwrite its weights with the fine-tuned
# checkpoint. The checkpoint is deserialised onto the CPU; the model is moved to
# the target device later, in the inference cell.
m1 = custom_model(CFG1)

state1 = torch.load(CFG1.sup_model_tuned, map_location="cpu")
# Last expression of the cell, so the key-matching report is displayed.
m1.load_state_dict(state1["model"])

<All keys matched successfully>

In [10]:
# Bind the notebook-level `cfg` to the model-1 settings and wrap the dataframe's
# `text` column in a dataset that tokenizes with that config's tokenizer.
cfg = CFG1()
test_dataset = sup_dataset(df, cfg)

In [26]:
# Batches follow dataframe order (no shuffling, incomplete last batch kept), so the
# concatenated outputs line up row-for-row with `df`. Each batch is padded to the
# longest sequence it contains.
test_loader = DataLoader(
    test_dataset,
    batch_size=cfg.batch_size,
    collate_fn=DataCollatorWithPadding(tokenizer=cfg.sup_tokenizer, padding="longest"),
    shuffle=False,
    drop_last=False,
    num_workers=0,
    pin_memory=True,
)

In [38]:
# Run model 1 over every batch and collect the pooled features and head outputs on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Inputs are tokenized without truncation, so over-long rows are cut to this many
# tokens here, right before they reach the encoder.
max_seq_len = 512

_ = m1.eval()
_ = m1.to(device)

# Per-batch results are gathered in lists and concatenated once after the loop.
# Growing a tensor with torch.concat on every iteration re-copies everything
# accumulated so far, which is quadratic in the number of batches.
feature_chunks = []
pred_chunks = []
for x in tqdm(test_loader, leave=True, position=0, total=len(test_loader)):
    # Slice before the transfer so only the tokens that are actually used are moved to the device.
    x = {k: v[:, :max_seq_len].to(device) for k, v in x.items()}

    with torch.no_grad():
        features, z = m1(x)

    feature_chunks.append(features.cpu())
    pred_chunks.append(z.cpu())

# torch.cat rejects an empty list; fall back to an empty tensor if the loader yielded nothing.
out_features_1 = torch.cat(feature_chunks, dim=0) if feature_chunks else torch.Tensor([])
preds_1 = torch.cat(pred_chunks, dim=0) if pred_chunks else torch.Tensor([])
del feature_chunks, pred_chunks

torch.cuda.empty_cache()
gc.collect()

  0%|          | 0/5127 [00:00<?, ?it/s]

25

In [39]:
# Drop the notebook's references to model 1 and its dataset/loader before the second model is built.
del m1, test_dataset, test_loader

In [40]:
# Sanity check: the first dimension of both tensors should equal the number of dataframe rows.
out_features_1.shape, preds_1.shape

(torch.Size([615170, 384]), torch.Size([615170, 1]))

In [49]:
# Persist the model-1 pooled features.
torch.save(out_features_1, "../../input/pseudo_label/out_features_m1.pt")

In [48]:
# Persist the model-1 head outputs.
torch.save(preds_1, "../../input/pseudo_label/preds_m1.pt")

## Inference with CFG2 and m2

In [50]:
# Build model 2 from the base files, then overwrite its weights with the fine-tuned
# checkpoint. The checkpoint is deserialised onto the CPU; the model is moved to
# the target device later, in the inference cell.
m2 = custom_model(CFG2)

state2 = torch.load(CFG2.sup_model_tuned, map_location="cpu")
# Last expression of the cell, so the key-matching report is displayed.
m2.load_state_dict(state2["model"])

<All keys matched successfully>

In [51]:
# Rebind the notebook-level `cfg` to the model-2 settings and rebuild the dataset,
# so that tokenization now uses the model-2 tokenizer.
cfg = CFG2()
test_dataset = sup_dataset(df, cfg)

In [52]:
# Batches follow dataframe order (no shuffling, incomplete last batch kept), so the
# concatenated outputs line up row-for-row with `df`. Each batch is padded to the
# longest sequence it contains.
test_loader = DataLoader(
    test_dataset,
    batch_size=cfg.batch_size,
    collate_fn=DataCollatorWithPadding(tokenizer=cfg.sup_tokenizer, padding="longest"),
    shuffle=False,
    drop_last=False,
    num_workers=0,
    pin_memory=True,
)

In [54]:
# Run model 2 over every batch and collect the pooled features and head outputs on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Inputs are tokenized without truncation, so over-long rows are cut to this many
# tokens here, right before they reach the encoder.
max_seq_len = 512

_ = m2.eval()
_ = m2.to(device)

# Per-batch results are gathered in lists and concatenated once after the loop.
# Growing a tensor with torch.concat on every iteration re-copies everything
# accumulated so far, which is quadratic in the number of batches.
feature_chunks = []
pred_chunks = []
for x in tqdm(test_loader, leave=True, position=0, total=len(test_loader)):
    # Slice before the transfer so only the tokens that are actually used are moved to the device.
    x = {k: v[:, :max_seq_len].to(device) for k, v in x.items()}

    with torch.no_grad():
        features, z = m2(x)

    feature_chunks.append(features.cpu())
    pred_chunks.append(z.cpu())

# NOTE: the `_1` names are reused on purpose-or-by-accident from the model-1 section:
# from here on they hold MODEL-2 results and the in-memory model-1 tensors are gone.
# The names are kept because the save cell that follows reads exactly these variables.
# torch.cat rejects an empty list; fall back to an empty tensor if the loader yielded nothing.
out_features_1 = torch.cat(feature_chunks, dim=0) if feature_chunks else torch.Tensor([])
preds_1 = torch.cat(pred_chunks, dim=0) if pred_chunks else torch.Tensor([])
del feature_chunks, pred_chunks

torch.cuda.empty_cache()
gc.collect()

  0%|          | 0/5127 [00:00<?, ?it/s]

25

In [55]:
# Drop the notebook's references to model 2 and its dataset/loader.
del m2, test_dataset, test_loader

In [56]:
# Persist the model-2 pooled features and head outputs.
# NOTE: despite the `_1` suffix, `out_features_1` / `preds_1` were reassigned by the
# model-2 inference cell, so what is written here are the model-2 results.
torch.save(out_features_1, "../../input/pseudo_label/out_features_m2.pt")
torch.save(preds_1, "../../input/pseudo_label/preds_m2.pt")