# Импорт используемых библиотек

In [1]:
!pip install lightning

Collecting lightning
  Downloading lightning-2.2.4-py3-none-any.whl.metadata (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m46.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading lightning-2.2.4-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: lightning
Successfully installed lightning-2.2.4


In [3]:
import torch
import transformers
from torch import nn
from torch.utils import data
from torch.utils.data import Dataset, Sampler, DataLoader
import numpy as np
import pandas as pd
import random
from time import time
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import roc_auc_score
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter

from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint

import datetime
import os

import lightning as L

import gc

# Данные

Считаем данные и соберем из них тренировочный датафрейм сколонками
- qid
- query
- text
- label

In [21]:
data_folder = "/kaggle/input/text-reranking-competition-ir-msu-spring-2024"
docs_file = "/kaggle/input/text-reranking-competition-ir-msu-spring-2024/vkmarco-docs.tsv"
qrels_file = "/kaggle/input/text-reranking-competition-ir-msu-spring-2024/vkmarco-doctrain-qrels.tsv"
queries_file = "/kaggle/input/text-reranking-competition-ir-msu-spring-2024/vkmarco-doctrain-queries.tsv"

submission_queries = "/kaggle/input/text-reranking-competition-ir-msu-spring-2024/vkmarco-doceval-queries.tsv"
submission_qdocs = "/kaggle/input/text-reranking-competition-ir-msu-spring-2024/sample_submission.csv"

In [42]:
def read_tsv(docs_file, queries_file, qrels_file):
    from time import time
    start = time()
    print(f"Reading {docs_file}...")
    df_docs = pd.read_csv(docs_file, delimiter='\t', header=None)
    elapsed = time() - start
    print(f"\tDone! time elapsed: ", elapsed)
    
    print(f"Reading {queries_file}...")
    df_queries = pd.read_csv(queries_file, delimiter='\t', header=None)
    elapsed_queries = time() - (start + elapsed)
    print(f"\tDone! time elapsed: ", elapsed_queries)
    
    print(f"Reading {queries_file}...")
    df_qrels = pd.read_csv(qrels_file, delimiter=' ', header=None)
    elapsed_qrels = time() - (start + elapsed + elapsed_queries)
    print(f"\tDone! time elapsed: ", elapsed_qrels)
    
    return (df_docs, df_queries, df_qrels)

def create_base_df(df_docs, df_queries, df_qrels):
    # maybe add title to text field
    df_docs.rename(columns={0 : 'doc_id', 1 : 'url', 2 : 'title', 3 : 'text'}, inplace=True)
    df_docs_droped = df_docs.drop('url', axis=1)

    df_queries = df_queries.rename(columns={0: "qid", 1: "query"})

    df_qrels.rename(columns={0:'qid', 1:'unknown', 2:'doc_id', 3:'label'}, inplace=True)
    df_qrels_droped = df_qrels.drop(['unknown'], axis=1)
    
    df_qrels_query = df_qrels_droped.join(df_queries.set_index(['qid']), on='qid')
    df_qrels_query_docs = df_qrels_query.join(df_docs.set_index('doc_id'), on='doc_id')
    
    needed_cols = ['qid', 'query', 'text', 'label']
    to_drop = list(set(df_qrels_query_docs) - set(needed_cols))
    df_final = df_qrels_query_docs.drop(to_drop, axis=1)
    return df_final

def create_submission_df(df_docs, df_queries, df_qdocs, with_docid=False):
    df_docs.rename(columns={0 : 'doc_id', 1 : 'url', 2 : 'title', 3 : 'text'}, inplace=True)
    df_docs_droped = df_docs.drop('url', axis=1)
    
    df_queries.rename(columns={0: "qid", 1: "query"}, inplace=True)
    
    df_qdocs.rename(columns={"QueryId":'qid', "DocumentId":'doc_id'}, inplace=True)
    
    df_qdocs_query = df_qdocs.join(df_queries.set_index(['qid']), on='qid')
    df_qdocs_query_docs = df_qdocs_query.join(df_docs.set_index('doc_id'), on='doc_id')

    needed_cols = ['qid', 'query', 'text']
    if with_docid:
        needed_cols.append('doc_id')

    to_drop = list(set(df_qdocs_query_docs) - set(needed_cols))
    df_final = df_qdocs_query_docs.drop(to_drop, axis=1)
    return df_final

In [None]:
df_docs, df_queries, df_qrels = read_tsv(docs_file, queries_file, qrels_file)

In [None]:
print(df_docs.info())
df_docs.head()

In [None]:
print(df_queries.info())
df_queries.head()

In [None]:
print(df_qrels.info())
df_qrels.head()

Создадим тренировочный датафрейм

In [None]:
df_train = create_base_df(df_docs, df_queries, df_qrels)

In [None]:
print(df_train.info())
df_train.head()

In [None]:
df_train.to_csv('df_train.csv', sep='\t', index=False)

Сравним данный датасет с ранее созданным из домашки по nn-rerank

In [None]:
df_train = pd.read_csv("/kaggle/input/vk-marco-formatted/train/kaggle/working/df_train.csv",
                       delimiter='\t',
                       dtype={'qid':'int32', 'label':'int8', 'text':'str', 'query':'str'})

In [18]:
# если сохраненноый df_train не нужен:
!rm "/kaggle/working/df_train.csv"

Создадим датафрейм для сабмита

In [None]:
df_subm_queries = pd.read_csv(submission_queries, delimiter='\t', header=None)
df_subm_qdocs = pd.read_csv(submission_qdocs, delimiter=',')

In [None]:
df_subm_queries, df_subm_qdocs

In [None]:
df_subm = create_submission_df(df_docs, df_subm_queries, df_subm_qdocs)

In [None]:
df_subm.to_csv("df_subm.csv", sep='\t', index=False)

# Попробуем использовать XLM-roBERTa для ранжирования

Обучаем без нулевых лейблов. (По исследованию из прошлой домашки, они составляли ~4% от всего датасета)

In [4]:
class RankDataset(Dataset):
    def __init__(self, data, neg_p=1.0, bienc_mode=False):
        self.neg_p = neg_p
        if self.neg_p < 1.:
            self.data = pd.concat([data[data['label'] == 3],
                                   data[data['label'] == 2],
                                   data[data['label'] == 1].sample(frac=self.neg_p),])
        else:
            self.data = data
        self.bienc_mode = bienc_mode
        
    def __getitem__(self, index):
        query, text, label, qid = self.data.iloc[index, [2, 3, 1, 0]]
#         if bienc_mode:
#             return query.lower(), text.lower(), label, qid
        return [query.lower(), text.lower()], label, qid

    def __len__(self):
        return len(self.data)

In [5]:
class SubmitDataset(RankDataset):
    def __init__(self, data, neg_p=1.0, bienc_mode=False):
        super().__init__(data, neg_p, bienc_mode)
        
    def __getitem__(self, index):
        query, text, qid = self.data.iloc[index, [1, 2, 0]]
        return [query.lower(), text.lower()], qid

    def __len__(self):
        return len(self.data)

In [8]:
class VKMarcoDataModule(L.LightningDataModule):
    """
    it needs to be defined before class RankDataset()
    and before class BatchSampler() [its going to DataLoader]
    """
    def __init__(self, setup_dir="/kaggle/input/vk-marco-formatted",
                       subm_dir="/kaggle/input/vkcomp-subm-df",
                       tkn_max_len=64,
                       tokenizer_str='xlm-roberta-base',
                       batch_size=64,
                       test_batch_size=1024,
                       skip_prepare=True,
                       df_train=None,
                       df_test=None,
                       bienc_mode=False):
        super().__init__()
#         self.raw_dir = raw_dir
        self.setup_dir = setup_dir
        self.subm_dir = subm_dir

        # for tokenizer
        self.tkn_max_len = tkn_max_len 
        self.batch_size = batch_size
        self.test_batch_size = test_batch_size
        self.skip_prepare = skip_prepare
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_str)
        self.df_train=df_train
        self.df_test=df_test
        self.bienc_mode = bienc_mode
        """
        working on kaggle:
        if skip_prepare==True, then 
            df_train and df_test will be readed from 
            f'{setup_dir}/df_train.csv' and f'{setup_dir}/df_test.csv'
        else,
            df_train and df_test will be
            saved in (if file persistence is not setted) 
            session-temporary -- current working directory and
            can be readed from f'./df_train.csv' and f'./df_test.csv'
        """
    
    def prepare_data(self):
        pass
        
    def setup(self, stage:str, force=True):
        inter_folders = "kaggle/working"
        if stage == "fit":
            # it takes 6-7 minutes to read df_train
            if self.df_train is not None and not force:
                return
            print("Reading df_train...")
            start = time()
            self.df_train = pd.read_csv(f"{self.setup_dir}/train/{inter_folders}/df_train.csv",
                                        delimiter='\t',
                                        dtype={'qid':'int32', 'label':'int8', 'text':'str', 'query':'str'})
            elapsed = time() - start
            print("df_train is readed, time elsapsed: ", elapsed)
        if stage == "submit":
            if self.df_test is not None and not force:
                return
            print("Reading df_subm...")
            start = time()
            self.df_test = pd.read_csv(f"{self.subm_dir}/df_subm.csv",
                                       delimiter='\t',
                                       dtype={'qid':'int32', 'text':'str', 'query':'str'})
            elapsed = time() - start
            print("df_test is readed, time elapsed: ", elapsed)
    
    def train_length(self):
        if self.df_train is not None:
            return len(self.df_train.index)
        return 0
        #raise Exception("train df is empty, can't compiute train_length")
    
    def simple_train_dataloader(self, num_workers=1):
        train_dataset = RankDataset(self.df_train, neg_p=0.8, bienc_mode=self.bienc_mode)
        return DataLoader(train_dataset, shuffle=True, batch_size=self.batch_size, collate_fn=self._compose_batch, num_workers=num_workers)
    
    def test_dataloader(self, num_workers=1):
        # it's specific as test_step in LightningModule
        test_dataset = SubmitDataset(self.df_test, neg_p=1., bienc_mode=self.bienc_mode)
        return DataLoader(test_dataset, batch_size=self.test_batch_size, collate_fn=self._compose_submit_batch, num_workers=num_workers)
    
    def _compose_submit_batch(self, batch):
        qids = [z for _, z in batch]
        texts = [x for x, _ in batch]
        
        tokens = self.tokenizer(texts, padding=True, truncation=True, max_length=self.tkn_max_len, return_tensors='pt')
        return tokens, qids
    
    def _compose_batch(self, batch):
        # print("VKMARCO bienc_mode:", self.bienc_mode)
        qids = [z for _, _, z in batch]
        texts = [x for x, _, _ in batch]
        ys = np.array([y for _, y, _ in batch])
        
        # from [3] -> [0, 0, 0, 1]
        # from [2] -> [0, 0, 1, 0]
        # from [1] -> [0, 1, 0, 0]
        # from [0] -> [1, 0, 0, 0]
        
        # to probabilities of labels
        coded_ys = np.zeros((ys.shape[0], 4))
        indexes = np.full((coded_ys.shape[0], ), coded_ys.shape[1])
        indexes[0] = 0
        indexes = np.cumsum(indexes) + ys
        np.put(coded_ys, indexes, 1)
   
        coded_ys = torch.tensor(coded_ys).float()

        tokens = self.tokenizer(texts, padding=True, truncation=True, max_length=self.tkn_max_len, return_tensors='pt')
        return tokens, coded_ys, qids
    
    def teardown(self, stage):
        if stage == 'submit':
            del self.df_test
            gc.collect()

In [10]:
dm = VKMarcoDataModule()
dm.setup("fit")
df_tr = dm.df_train

Reading df_train...
df_train is readed, time elsapsed:  331.2066693305969


In [9]:
from torchtext.models import RobertaClassificationHead


class RankBert(nn.Module):
    def __init__(self, labels_num=4, train_layers_count=2):
        super(RankBert, self).__init__()

        self.bert = AutoModel.from_pretrained("xlm-roberta-base")
        self.config = self.bert.config
        self.labels_num = labels_num

        # freeze all layers without bias and LN
        for name, par in self.bert.named_parameters():
            if 'bias' in name or 'LayerNorm' in name:
                continue
            par.requires_grad = False

        layer_count = self.config.num_hidden_layers
        print("train_layers_count type:", type(train_layers_count), train_layers_count)
        print("labels_num type:", type(labels_num), labels_num)

        for i in range(train_layers_count): #unfreeze somw layers
            for par in self.bert.encoder.layer[layer_count - 1 - i].parameters():
                par.requires_grad = True

        self.head = RobertaClassificationHead(num_classes=labels_num, input_dim=self.config.hidden_size)
        
    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        x = self.bert(input_ids=input_ids,
                      token_type_ids=token_type_ids,
                      attention_mask=attention_mask)[0]
#                       )[0][:, 0, :] #hidden_state of [CLS]
        x = self.head(x)
        return x

In [10]:
class LightningModel(L.LightningModule):
    def __init__(self, model, amp_enable, optim, scheduler, loss_fn, device, tb_writer, config, bienc_mode=False):
        super().__init__()
        self.uninitialized_model = model
        self.amp_enable = amp_enable
        self.optim = optim
        self.scheduler = scheduler
        self.loss_fn = loss_fn
        self.config = config
        self.bienc_mode = bienc_mode
        self.tb_writer = tb_writer

        self.running_loss = 0.0
        self.running_auc = 0.0

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        return self.model(input_ids, token_type_ids, attention_mask)
    
    def _move_batch_to_device(self, batch, device):
        if self.bienc_mode:
            batch_q, y, qid, batch_t = batch
            for key in batch_q:
                batch_q[key] = batch_q[key].to(device)
            for key in batch_t:
                batch_t[key] = batch_t[key].to(device)
            batch_x = [batch_q, batch_t]
        else:
            batch_x, y, qid = batch

            for key in batch_x:
                batch_x[key] = batch_x[key].to(device)
        y = y.to(device)
        return batch_x, y, qid
    
    def _move_submit_batch_to_device(self, batch, device):
        batch_x, qid = batch

        for key in batch_x:
            batch_x[key] = batch_x[key].to(device)
        return batch_x, qid

    def training_step(self, batch, batch_idx):
        batch_x, y, qids = self._move_batch_to_device(batch, self.config.DEVICE)
        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=self.amp_enable):
            outputs = self.model(**batch_x)
            loss = loss_fn(outputs, y.detach())

        self.running_loss += loss.detach().item()

        y = y.detach().cpu().int().numpy()
        forauc_outputs = outputs.detach().clone().cpu()
        if y.sum() > 0:
            #compute metric
            with torch.no_grad(): 
                auc = roc_auc_score(y.T,
                                    forauc_outputs.T,
                                    labels=np.array([range(0, self.model.labels_num)]),
                                    multi_class='ovr')
            self.running_auc += np.mean(auc)
        else:
            self.running_auc += 1
        
        #logging to tb
        tb_x = self.current_epoch * config.TRAIN_LENGTH + batch_idx + 1
        self.tb_writer.add_scalar('lr', self.scheduler.get_last_lr()[0], tb_x)

        self.tb_writer.add_scalar('Train/loss', loss, tb_x)
        self.tb_writer.add_scalar('Train/auc', auc, tb_x)
        
        if batch_idx % config.LOG_INTERVAL == self.config.LOG_INTERVAL - 1:
            last_loss = self.running_loss / self.config.LOG_INTERVAL # loss per batch
            last_auc = self.running_auc / self.config.LOG_INTERVAL # loss per batch

            print('  batch {} loss: {}'.format(batch_idx + 1, last_loss))
            print('  batch {} auc: {}'.format(batch_idx + 1, last_auc))
            
            self.tb_writer.add_scalar('Train/running_loss', last_loss, tb_x)
            self.tb_writer.add_scalar('Train/running_auc', last_auc, tb_x)
            
            self.log("rloss", last_loss, on_step=True, prog_bar=True)
            self.log("rauc", last_auc, on_step=True, prog_bar=True)

            self.running_loss = 0.
            self.running_auc = 0.
        
        self.log("train_loss", loss, on_step=True, prog_bar=True)
        self.log("train_auc", auc, on_step=True, prog_bar=True)

        if batch_idx % 10 == 0:
            gc.collect()
            torch.cuda.empty_cache()
        return loss
    
    def on_test_start(self):
        # in format: [p1, p2, p3]. For label 1 prediciton is p1, 2 - p2, 3 - p3
        self.y_test_triplet = []
        self.qids = []
    
    def test_step(self, batch, batch_idx):
        batch_x, qid = self._move_submit_batch_to_device(batch, config.DEVICE)
        preds = self.model(**batch_x)
        
        preds = preds.cpu()
        self.y_test_triplet += [preds]

        self.qids.extend(qid)
        
    def on_test_end(self):
        self.y_test_triplet = torch.cat(self.y_test_triplet).view(-1).cpu()
        self.y_test_triplet = self.y_test_triplet.view(
                                    len(self.y_test_triplet)//self.config.LABELS_NUM, 
                                    self.config.LABELS_NUM
                                    ).cpu()

        self.qids = torch.LongTensor(self.qids).view(-1).cpu()
        return self.y_test_triplet
    
    def _compute_final_metrics(self):
        auc = self._get_auc_score()

        # y_test_triplet[i] (=: output) is triplet of probabilities of labels 1, 2 and 3 ([p1, p2, p3])
        # in torch NDCG preds must be probabilities of relevance, and they are sorted
        # by it. So, to compute NDCG I will use next preprocessing algorithm:
        # get argmax of triplet: from [p1, p2, p3] to [label]
        # get softmax to norm the values to [0, 1]. It will save the order of targets. We need to sort 
        # between different labels, not between same labels.
        # Order of targets is only matters in NDCG
        y_test_labels = self.y_test_triplet.argmax(axis=1) + 1
        y_test = torch.softmax(y_test_labels.float(), dim=0)
        print("in _compute_final_metrics: y_test", type(y_test), y_test)
        print("in _compute_final_metrics shapes of preds, target and qids: ", y_test.shape, self.y_true.shape, self.qids.shape)

        mrr = self._MRR(y_test, self.y_true, self.qids)
        ndcg = self._NDCG(y_test, self.y_true, self.qids)
        
        return {"auc": auc,
                "MRR@10": mrr,
                "NDCG@10": ndcg}

    def configure_model(self):
        # initialize self.model
        self.model = self.uninitialized_model(labels_num=self.config.LABELS_NUM, train_layers_count=self.config.TRAIN_LAYERS)
    
    def configure_optimizers(self):
        self.optimizer = self.optim(self.model.parameters(), lr=self.config.LR, weight_decay=self.config.WD)
        self.scheduler = self.scheduler(self.optimizer,
                                   pct_start=0.1,
                                   max_lr=self.config.LR,
                                   epochs=self.config.EPOCHS, 
                                   steps_per_epoch=self.config.TRAIN_LENGTH)
        return {
                    'optimizer': self.optimizer,
                    'lr_scheduler': {
                        'scheduler': self.scheduler,
                        'interval': 'step'
                    }
               }

In [45]:
y_true = np.array([[0, 0, 0, 1],
                   [1, 0, 0, 0]])
y_pred = np.array([[0.2, 0.2, 0.2, 0.4],
          [0.6, 0.0, 0.0, 0.4]])
print(roc_auc_score(y_true.T, y_pred.T, labels=[0, 1, 2, 3], multi_class="ovo"))

1.0


## Обучение

In [None]:
df_tr

In [12]:
dm = VKMarcoDataModule()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [13]:
class config:
    EPOCHS=5
    LR=5e-5
    WD=0.01
    SAVE_DIR="./cross_encоder_checkpoint/"
    BATCH_SIZE=64
    TRAIN_LENGTH=dm.train_length()
    DEVICE=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    LOG_INTERVAL=250
    TRAIN_LAYERS=2
    LABELS_NUM=4

loss_fn = nn.CrossEntropyLoss()

# this instances will be initialized inside of LightningModule
bert_model = RankBert                           # check configure_model()
optimizer = torch.optim.AdamW                   # check configure_optimizers
scheduler = torch.optim.lr_scheduler.OneCycleLR # check configure_optimizers

cur_date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M").__str__()
codename=f"amp_cross__{cur_date}"
writer = SummaryWriter(f'{config.SAVE_DIR}{codename}')
lightning_model = LightningModel(bert_model,
                       amp_enable=True,
                       optim=optimizer,
                       scheduler=scheduler,
                       loss_fn=loss_fn,
                       device=config.DEVICE,
                       tb_writer=writer,
                       config=config,)
checkpoint_callback = ModelCheckpoint(monitor='train_loss',
                                      save_top_k=1,
                                      mode='min',
                                      dirpath=f'ckpt/{codename}',
                                      filename=codename + "-{epoch:02d}-{train_loss:.2f}",
                                      every_n_train_steps=6500,
                                      save_on_train_epoch_end=True)

In [14]:
gc.collect()

trainer = Trainer(max_epochs=config.EPOCHS, callbacks=[checkpoint_callback], )
train_dataloader = dm.simple_train_dataloader(num_workers=1)

INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [None]:
ckpt_path = "/kaggle/working/ckpt/amp_cross__2024-05-07_11:38/amp_cross__2024-05-07_06:19-epoch=02-val_loss=0.00.ckpt"
if not os.path.isfile(ckpt_path):
    ckpt_path=None
trainer.fit(lightning_model, train_dataloaders=train_dataloader, ckpt_path=ckpt_path)

INFO: Restoring states from the checkpoint path at /kaggle/working/ckpt/amp_cross__2024-05-07_06:19/amp_cross__2024-05-07_06:19-epoch=02-val_loss=0.00.ckpt


train_layers_count type: <class 'int'> 2
labels_num type: <class 'int'> 4


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:361: The dirpath has changed from '/kaggle/working/ckpt/amp_cross__2024-05-07_06:19' to '/kaggle/working/ckpt/amp_cross__2024-05-07_11:38', therefore `best_model_score`, `kth_best_model_path`, `kth_value`, `last_model_path` and `best_k_models` won't be reloaded. Only `best_model_path` will be reloaded.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name    | Type             | Params
---------------------------------------------
0 | loss_fn | CrossEntropyLoss | 0     
1 | model   | RankBert         | 278 M 
---------------------------------------------
14.9 M    Trainable params
263 M     Non-trainable params
278 M     Total params
1,114.549 Total estimated model params size (MB)
INFO: Restored all states from the checkpoint at /kaggle/working/ckpt/amp_cross__2024-05-07_06:19/amp_cross__2024-05-07_06:19-epoch=02-val_loss=0.00.ckpt
/opt/conda/lib/python3.10/site-packages/lightning/pytorch

Training: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/training_epoch_loop.py:161: You're resuming from a checkpoint that ended before the epoch ended and your dataloader is not resumable. This can cause unreliable results if further training is done. Consider using an end-of-epoch checkpoint or make your dataloader resumable by implementing the `state_dict` / `load_state_dict` interface.


  batch 12500 loss: 0.420988166809082
  batch 12500 auc: 0.42438541666666646
  batch 12750 loss: 0.8371126499176026
  batch 12750 auc: 0.8339166666666669
  batch 13000 loss: 0.8318497524261474
  batch 13000 auc: 0.8353750000000001
  batch 13250 loss: 0.8251838693618775
  batch 13250 auc: 0.8389270833333331
  batch 250 loss: 1.0373180189132691
  batch 250 auc: 1.042145833333333
  batch 500 loss: 0.8246340627670288
  batch 500 auc: 0.8392395833333334
  batch 750 loss: 0.8234608969688415
  batch 750 auc: 0.837052083333333
  batch 1000 loss: 0.8318695106506347
  batch 1000 auc: 0.8378958333333334
  batch 1250 loss: 0.8298304872512817
  batch 1250 auc: 0.8354270833333339
  batch 1500 loss: 0.8348938722610474
  batch 1500 auc: 0.8362083333333329
  batch 1750 loss: 0.8332175455093384
  batch 1750 auc: 0.8364895833333329
  batch 2000 loss: 0.8351182594299317
  batch 2000 auc: 0.8337083333333336
  batch 2250 loss: 0.8303181133270263
  batch 2250 auc: 0.8361770833333334
  batch 2500 loss: 0.8276

In [14]:
# dm = VKMarcoDataModule(df_train=df_tr, df_test=df_te)
dm = VKMarcoDataModule()

dm.setup('submit')

Reading df_subm...
df_test is readed, time elapsed:  54.093584299087524


In [15]:
df_te = dm.df_test

In [16]:
ckpt_path = "/kaggle/working/ckpt/amp_cross__2024-05-07_11:38/amp_cross__2024-05-07_11:38-epoch=03-train_loss=0.85.ckpt"
if not os.path.isfile(ckpt_path):
    ckpt_path=None
trainer = Trainer(max_epochs=config.EPOCHS, callbacks=[checkpoint_callback], limit_test_batches=1.)
trainer.test(lightning_model, dataloaders=dm.test_dataloader(), ckpt_path=ckpt_path)

INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO: `Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

INFO: Restoring states from the checkpoint path at /kaggle/working/ckpt/amp_cross__2024-05-07_11:38/amp_cross__2024-05-07_11:38-epoch=03-train_loss=0.85.ckpt


train_layers_count type: <class 'int'> 2
labels_num type: <class 'int'> 4


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: Loaded model weights from the checkpoint at /kaggle/working/ckpt/amp_cross__2024-05-07_11:38/amp_cross__2024-05-07_11:38-epoch=03-train_loss=0.85.ckpt
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{}]

In [17]:
lightning_model.y_test_triplet

tensor([[-14.9708,   0.6602,   1.2086,   0.2003],
        [-17.4569,   1.8869,   1.2938,  -0.7894],
        [-16.5205,   1.8888,   0.9219,  -0.6647],
        ...,
        [-14.9252,   1.1919,   1.2377,  -0.6348],
        [-16.8142,   2.1609,   1.0968,  -1.1651],
        [-13.0181,   0.9224,   1.0698,  -0.4613]])

In [18]:
preds = lightning_model.y_test_triplet.argmax(dim=1)
print(preds.shape)
preds

torch.Size([152692])


tensor([2, 1, 1,  ..., 2, 1, 2])

In [19]:
df_te

Unnamed: 0,qid,query,text
0,3,0 5 текила максимо де кодорниз сильвер 38,Федеральный реестр алкогольной продукции онлай...
1,3,0 5 текила максимо де кодорниз сильвер 38,Нужны реальные отзывы о Текила Montezuma Silve...
2,3,0 5 текила максимо де кодорниз сильвер 38,🍷 Продажа текилы Sauza в магазине WineStyle! П...
3,3,0 5 текила максимо де кодорниз сильвер 38,"Текила – модный алкогольный напиток, ставший в..."
4,3,0 5 текила максимо де кодорниз сильвер 38,мне тута недавно добрые люди дали попить текил...
...,...,...,...
152687,42768,ապօրինի ծագում ունեցող,"null Wednesday, 13 07 2022 ՀԱՅԱՍՏԱՆ ՔԱՂԱՔԱԿԱՆ ..."
152688,42768,ապօրինի ծագում ունեցող,null Հայկական ծագում ունեցող անձնանունների ցան...
152689,42768,ապօրինի ծագում ունեցող,«Ապօրինի ծագում ունեցող գույքի բռնագանձման մաս...
152690,42768,ապօրինի ծագում ունեցող,"null ﻿ Երևան, 16.Հուլիս.2022, 00 : 00 Խմբագրակ..."


## Сохраненение Submit'а

In [None]:
df_docs = pd.read_csv(docs_file, delimiter='\t', header=None)

In [None]:
df_subm_queries = pd.read_csv(submission_queries, delimiter='\t', header=None)
df_subm_qdocs = pd.read_csv(submission_qdocs, delimiter=',')

In [None]:
df_subm = create_submission_df(df_docs, df_subm_queries, df_subm_qdocs, with_docid=True)

In [None]:
df_subm['preds'] = preds

In [None]:
df_final = df_subm.drop(['query', 'text'], axis=1)
df_sorted = df_final.sort_values(['qid', 'preds'], ascending=[True, False])

In [None]:
df_sort_renamed = df_sorted.rename(columns={'qid':"QueryId", 'doc_id': "DocumentId"})
df_to_submit = df_sort_renamed.drop('preds', axis=1)
df_to_submit

In [None]:

df_to_submit.to_csv("submit.csv", columns=['QueryId', 'DocumentId'], index=False)