In [None]:
import pandas as pd
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup, AutoModelForSequenceClassification
import datasets
import numpy as np
from tqdm.auto import tqdm
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import ir_measures
from torch.cuda.amp import autocast, GradScaler
import warnings
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [None]:
# Load the Russian configuration of the mMARCO dataset via the `datasets` library.
# NOTE(review): the cells below treat `data` as a pandas DataFrame with the columns
# 'query', 'positive' and 'negative' (CustomDataset reads rows through `.iloc`), and the
# rendered preview is a DataFrame as well. Presumably a conversion/loading step
# (e.g. picking a split and calling `.to_pandas()`, or reading a saved CSV) is
# missing from this notebook — confirm before running on a fresh kernel.
data = datasets.load_dataset('unicamp-dl/mmarco', 'russian')

In [5]:
# Preview the loaded triples (query / positive passage / negative passage).
data

Unnamed: 0,query,positive,negative
0,можно ли немного кофеина во время беременности,Мы мало что знаем о влиянии кофеина во время б...,"Как правило, беременным женщинам безопасно ест..."
1,какие фрукты произрастают в Австралии,"Passiflora herbertiana. Редкий плод маракуйи, ...","Орех кола - это плод кола, рода деревьев (кола..."
2,насколько велика канадская армия,Канадские вооруженные силы. 1 Первая крупномас...,Канадский институт здоровья врачей (CPHI) - эт...
3,виды фруктовых деревьев,Вишня. Вишневые деревья растут по всему миру. ...,"Орех кола - это плод кола, рода деревьев (кола..."
4,сколько калорий в день теряется при грудном вс...,"Мало того, что грудное вскармливание лучше для...",Однако вам все равно нужно немного ниацина каж...
...,...,...,...
4999995,"заболевания, вызываемые простейшими у животных",Трипаносомоз Это простейшее заболевание животн...,"Антони ван Левенгук: Антони ван Левенгук, голл..."
4999996,что такое биоразнообразие и лекарства,Биоразнообразие играет жизненно важную роль в ...,Обязательно сообщите своему лечащему врачу обо...
4999997,как рассчитать соответствие компании,"Например, если компания соглашается обеспечить...",1 Нетто-зарплата также называется заработной п...
4999998,важная функция натрия -,1 Натрий играет большую роль в балансе жидкост...,Важность эстрогена. Эстрогены важны для поддер...


In [None]:
class CustomDataset(Dataset):
    """Pairwise (query, passage) classification dataset built from ranking triples.

    Every source row holds a query with one positive and one negative passage and
    is exposed as two consecutive items: flat index ``2 * r`` is the
    (query, positive) pair of row ``r`` with label 1, and flat index ``2 * r + 1``
    is the (query, negative) pair with label 0.

    Slicing returns another ``CustomDataset`` that shares the underlying rows and
    tokenizer and only carries the selected flat indices.
    """

    def __init__(self, dataframe, tokenizer, max_length=128, indices=None):
        """
        :param dataframe: pandas DataFrame with the columns ['query', 'positive', 'negative'].
            Any other container that supports ``len()`` and integer indexing and
            yields row mappings with those keys is accepted as well.
        :param tokenizer: any Transformers tokenizer (called with the query and the
            passage as a text pair)
        :param max_length: length every tokenized sequence is truncated/padded to
        :param indices: optional sequence of flat pair indices that restricts the
            dataset to a subset; ``None`` means "all pairs"
        """
        self.dataset = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.indices = indices

    def __len__(self):
        """Return the number of (query, passage) pairs visible through this view."""
        if self.indices is None:
            return len(self.dataset) * 2
        return len(self.indices)

    def __getitem__(self, idx):
        """Return one tokenized pair, or a sub-dataset when ``idx`` is a slice.

        :param idx: integer position or slice, relative to this view
        :return: for an integer, the tokenizer output with the batch dimension
            squeezed away plus a scalar ``labels`` tensor (1 = positive passage,
            0 = negative passage); for a slice, a new ``CustomDataset``
        """
        if isinstance(idx, slice):
            if self.indices is None:
                # Slicing a range is lazy, so no list with one Python int per
                # pair (millions of entries) is built just to take a subset.
                indices = range(len(self.dataset) * 2)[idx]
            else:
                indices = self.indices[idx]

            return CustomDataset(
                self.dataset,
                self.tokenizer,
                self.max_length,
                indices=indices
            )

        original_idx = self.indices[idx] if self.indices is not None else idx

        # Even flat index -> positive pair, odd flat index -> negative pair.
        row_idx = original_idx // 2
        pair_type = original_idx % 2

        # DataFrames are addressed positionally through .iloc; other row
        # containers are indexed directly.
        rows = getattr(self.dataset, "iloc", self.dataset)
        example = rows[row_idx]
        query = example["query"]

        if pair_type == 0:
            text_b = example["positive"]
            label = 1
        else:
            text_b = example["negative"]
            label = 0

        encoded = self.tokenizer(
            query,
            text_b,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        # The tokenizer returns tensors with a leading batch dimension of 1;
        # drop it so the DataLoader can stack items into (batch, seq) tensors.
        for key in list(encoded.keys()):
            encoded[key] = encoded[key].squeeze(0)

        encoded["labels"] = torch.tensor(label, dtype=torch.long)
        return encoded

In [7]:
# Sequence length every (query, passage) pair is truncated/padded to, and the batch size.
max_length = 128
batch_size = 128
# Local directory the tokenizer is loaded from.
tokenizer_name = './tokenizer'

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
# Raise the tokenizer's own length limit; the pairs themselves are still cut to
# `max_length` because CustomDataset passes max_length explicitly.
tokenizer.model_max_length = 1024


# Full pairwise view over `data`: two items (positive / negative pair) per source row.
train_data = CustomDataset(
    dataframe=data,
    tokenizer=tokenizer,
    max_length=max_length
)

# First 1,000,000 flat pair indices, i.e. both pairs of the first 500,000 rows.
train = train_data[:1000000]

# Tokenization happens lazily in CustomDataset.__getitem__ while batches are drawn.
train_loader = DataLoader(
    dataset=train,
    batch_size=batch_size,
    shuffle=True
)

In [9]:
class Classifier(nn.Module):
    """Classification head: Linear -> GELU -> LayerNorm -> Dropout -> Linear.

    :param hidden_size: size of the incoming feature vector (also the width of
        the intermediate layer)
    :param num_labels: number of output logits
    :param dropout_prob: dropout probability applied before the output layer
    """

    def __init__(self, hidden_size, num_labels, dropout_prob=0.1):
        super().__init__()
        # Sub-modules are declared in the same order in which forward() applies them.
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.act = nn.GELU()
        self.layernorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(hidden_size, num_labels)

    def forward(self, x):
        """Map features whose last dimension is ``hidden_size`` to ``num_labels`` logits."""
        hidden = self.layernorm(self.act(self.fc1(x)))
        return self.fc2(self.dropout(hidden))
    

class CrossEncoder(nn.Module):
    """Pretrained transformer encoder followed by a ``Classifier`` head.

    The pair (query, passage) is encoded jointly; the pooled representation is
    fed to the head, which produces ``num_labels`` logits.

    :param pretrained_model_name: name or path passed to ``AutoModel.from_pretrained``
    :param num_labels: number of output logits; ``1`` switches the loss to
        regression (MSE), anything else uses cross-entropy
    :param dropout_prob: dropout probability inside the classification head
    """

    def __init__(self, pretrained_model_name: str, num_labels: int = 2, dropout_prob: float = 0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(pretrained_model_name)
        self.classifier = Classifier(self.encoder.config.hidden_size, num_labels, dropout_prob)

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        """Run the encoder and the head.

        :param input_ids: token ids passed to the encoder
        :param attention_mask: attention mask passed to the encoder
        :param token_type_ids: optional segment ids; forwarded to the encoder only
            when given, so encoders without this argument keep working
        :param labels: optional targets; when given, a loss is computed
        :return: dict with ``"loss"`` (``None`` without labels) and ``"logits"``
        """
        encoder_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if token_type_ids is not None:
            encoder_inputs["token_type_ids"] = token_type_ids
        outputs = self.encoder(**encoder_inputs)

        # Prefer the encoder's own pooled output; otherwise fall back to the
        # hidden state of the first token.
        pooled_output = getattr(outputs, "pooler_output", None)
        if pooled_output is None:
            pooled_output = outputs.last_hidden_state[:, 0, :]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            num_outputs = self.classifier.fc2.out_features
            if num_outputs == 1:
                # MSELoss needs floating-point targets of the same dtype as the
                # predictions; integer class labels would raise a dtype error.
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1).float(), labels.view(-1).float())
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, num_outputs), labels.view(-1))

        return {"loss": loss, "logits": logits}

# Local directory with the pretrained encoder weights used to initialise the cross-encoder.
pretrained_model_name = "./crossenc" 
# Two output logits -> the cross-entropy branch of CrossEncoder.forward is used.
model = CrossEncoder(pretrained_model_name, num_labels=2)
# Rebinds the module-level `tokenizer` to a freshly loaded instance; `train_data` keeps
# using the tokenizer object it was constructed with.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [None]:
# Optimisation hyper-parameters read by the training cell.
weight_decay       = 0.01           # intended AdamW weight-decay coefficient
warmup_ratio       = 0.1            # fraction of all scheduler steps spent on linear LR warm-up
max_grad_norm      = 1.0            # threshold for gradient-norm clipping
layerwise_decay    = 0.8            # NOTE(review): not referenced by any other visible cell
fp16               = True           # enables autocast (mixed precision) in the training loop
accumulation_steps = 1              # gradient-accumulation factor used by the training loop

In [11]:
# NOTE(review): this only overwrites the value stored on the config object; it presumably
# does not resize the position-embedding weights of the already-loaded encoder — confirm.
# The loaders in this notebook feed sequences of max_length = 128.
model.encoder.config.max_position_embeddings = 1024

In [None]:
# Silence every Python warning for the rest of the session. Note that this also hides
# warnings that point at real problems (e.g. undefined-metric warnings from sklearn).
warnings.filterwarnings('ignore')
# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [None]:

# --- Training of the cross-encoder ------------------------------------------------
# Fine-tunes `model` on `train_loader` with AdamW, linear warm-up/decay, optional
# mixed precision and gradient accumulation, saving one checkpoint per epoch.

checkpoint_dir = "./checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)
# Path of a checkpoint file written by this cell to resume from, or None to start fresh.
resume_from_checkpoint = None

epochs = 10
learning_rate = 3e-5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Mixed precision through torch.cuda.amp only makes sense on a CUDA device.
use_amp = bool(fp16) and device.type == "cuda"
# Guard against a zero/negative accumulation factor.
accum_steps = max(1, int(accumulation_steps))

optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
    betas=(0.9, 0.999),
    eps=1e-6,
    weight_decay=weight_decay  # use the configured value instead of the implicit default
)

# The scheduler advances once per optimizer update, not once per batch.
updates_per_epoch = (len(train_loader) + accum_steps - 1) // accum_steps
total_steps = updates_per_epoch * epochs
warmup_steps = int(warmup_ratio * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

scaler = GradScaler(enabled=use_amp)

start_epoch = 0
if resume_from_checkpoint and os.path.isfile(resume_from_checkpoint):
    print(f"Загружаем чекпоинт: {resume_from_checkpoint}")
    # torch.load unpickles the file: only load checkpoints from a trusted source.
    checkpoint = torch.load(resume_from_checkpoint, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    # A checkpoint written with a disabled scaler stores an empty state dict,
    # which an enabled scaler refuses to load.
    if checkpoint.get('scaler_state_dict'):
        scaler.load_state_dict(checkpoint['scaler_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    print(f"Обучение возобновлено с эпохи {start_epoch}")

for epoch in range(start_epoch, epochs):
    # Re-enter training mode every epoch in case an evaluation cell switched to eval().
    model.train()
    total_loss = 0.0
    num_batches = len(train_loader)
    optimizer.zero_grad(set_to_none=True)

    progress = tqdm(train_loader, colour='#fc6b03', desc=f'Эпоха {epoch+1}/{epochs}')
    for step, batch in enumerate(progress, start=1):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        token_type_ids = batch.get("token_type_ids")
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)

        with autocast(enabled=use_amp):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels
            )
            loss = outputs["loss"]

        # Scale each micro-batch so that the accumulated gradient is an average.
        scaler.scale(loss / accum_steps).backward()
        # Log the true (undivided) batch loss.
        total_loss += loss.item()

        # Update once every `accum_steps` batches and on the last batch of the epoch
        # so that trailing gradients are not carried over into the next epoch.
        if step % accum_steps == 0 or step == num_batches:
            # Gradients must be unscaled before clipping by norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            # GradScaler skips the optimizer step on inf/NaN gradients and lowers
            # its scale; do not advance the LR schedule for a skipped update.
            if scaler.get_scale() >= scale_before:
                scheduler.step()
            optimizer.zero_grad(set_to_none=True)

    avg_loss = total_loss / max(num_batches, 1)
    print(f"Средняя потеря (loss) за эпоху: {avg_loss:.4f}")

    checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch}.pt")
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'scaler_state_dict': scaler.state_dict(),
        'loss': avg_loss,
    }, checkpoint_path)
    print(f"Чекпоинт сохранен: {checkpoint_path}")

In [None]:

# --- Hold-out evaluation of the fine-tuned cross-encoder ---------------------------
# Binary classification metrics on pairs that were not part of the training slice.

max_length = 128
val_batch_size = 128

# Flat pair indices [2,000,000, 2,200,000): 200,000 pairs taken from outside the
# training slice train_data[:1000000].
test = train_data[2000000:2200000]

val_loader = torch.utils.data.DataLoader(
    dataset=test,
    batch_size=val_batch_size,
    shuffle=False
)

model.eval()

all_labels = []
all_preds = []

with torch.no_grad():
    for batch in tqdm(val_loader,colour='#35fc03',desc='validation'):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        token_type_ids = batch.get("token_type_ids")
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)

        # Labels are only needed for the metrics: they stay on the CPU and are not
        # passed to the model, which would otherwise compute a loss nobody reads.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        logits = outputs["logits"]
        preds = torch.argmax(logits, dim=1)

        all_labels.extend(batch["labels"].numpy())
        all_preds.extend(preds.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

print("Accuracy: {:.4f}".format(accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))
print("\nClassification Report:\n", classification_report(all_labels, all_preds))

In [None]:

# --- Comparison of several sequence-classification checkpoints ---------------------
# Evaluates every model in `model_ids` on the same slice of pairs and prints
# accuracy / precision / recall / F1 for each of them.

# Display name -> path of the weights (placeholders to be filled in).
model_ids = {
    "Russian-msmarco": "path/to/weights",
    "FRIDA": "path/to/weights",
    'RuCrossEncoder': "path/to/weights"
}

models = {}
tokenizers = {}
for name, model_id in model_ids.items():
    # NOTE(review): every model gets the tokenizer from `tokenizer_name`, and the
    # evaluation pairs below are tokenized by the tokenizer stored in `train_data`.
    # Presumably each model should be fed inputs from its own tokenizer — confirm
    # that all compared checkpoints really share this vocabulary.
    tokenizers[name] = AutoTokenizer.from_pretrained(tokenizer_name)
    models[name] = AutoModelForSequenceClassification.from_pretrained(model_id)

batch_size = 128
max_length = 128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
test = train_data[10000000:12000000]

# Slicing past the end of the dataset silently yields an empty view, and the metric
# functions would then report meaningless numbers (warnings are suppressed above).
if len(test) == 0:
    raise ValueError(
        "Evaluation slice [10000000:12000000] is empty: "
        f"train_data only contains {len(train_data)} pairs."
    )

# The data does not depend on the model, so one loader serves every iteration.
val_loader = torch.utils.data.DataLoader(
    dataset=test,
    batch_size=batch_size,
    shuffle=False
)

for name in models.keys():
    print(f"Оценка модели: {name}")

    all_labels = []
    all_preds = []

    model = models[name].to(device)
    model.eval()

    with torch.no_grad():
        for batch in tqdm(val_loader,colour='#1ad2db',desc='validation'):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            token_type_ids = batch.get("token_type_ids")
            if token_type_ids is not None:
                token_type_ids = token_type_ids.to(device)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            # Labels are only used for the metrics, so they never leave the CPU.
            all_labels.extend(batch["labels"].numpy())
            all_preds.extend(preds.cpu().numpy())

    # Move the evaluated model off the accelerator so that the checkpoints do not
    # pile up in GPU memory while the remaining ones are evaluated.
    model.to("cpu")
    torch.cuda.empty_cache()

    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print("-" * 30)

In [None]:
# Model to be evaluated on the ranking task, loaded together with its own tokenizer.
# The string below is a placeholder for a model name or a local checkpoint path.
model_checkpoint = "name of model/checkpoint of pretrained"  
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Validation data read from an Excel file (placeholder file name).
valid_data = pd.read_excel('name of generated dataset')

class RankingDataset(Dataset):
    """Thin ``Dataset`` wrapper around an already prepared collection of queries."""

    def __init__(self, queries):
        """
        queries: list of dicts of the form:
            {
                "query": "query text",
                "docs": [
                    {"text": "document text", "relevance": relevance_level},
                    ...
                ]
            }
        """
        self.queries = queries

    def __len__(self):
        """Number of queries in the collection."""
        query_count = len(self.queries)
        return query_count

    def __getitem__(self, idx):
        """Return the stored entry at position ``idx`` unchanged."""
        entry = self.queries[idx]
        return entry

# Пример данных
# test_queries = [
#     {
#         "query": "как часто ходить к стоматологу?",
#         "docs": [
#             {"text": "рекомендуется каждые 6 месяцев", "relevance": 2},
#             {"text": "дядя Вася стоматолог", "relevance": 0},
#             {"text": "оптимально раз в полгода", "relevance": 2},
#             {"text": "зубы надо чистить ежедневно", "relevance": 1},
#         ]
#     },
#     # ... другие запросы
# ]

# --- Ranking evaluation ------------------------------------------------------------
# Scores every (query, document) pair with the model and aggregates
# nDCG@10 / MAP@100 / Recall@100 over all queries with ir_measures.

# NOTE(review): RankingDataset indexes its input by integer position and the loop
# below expects dicts with "query" and "docs" keys, while `valid_data` comes straight
# from pd.read_excel. Presumably the frame has to be converted to that
# list-of-dicts layout first — confirm against the real file.
dataset = RankingDataset(valid_data)

# The default collate function would wrap every string into a list and every int
# into a tensor, so str(...) below would produce reprs such as "['text']".
# With batch_size=1 the single raw query dict is handed through unchanged instead.
dataloader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=lambda batch: batch[0]
)

model.eval()
all_results = []

with torch.no_grad():
    for query_idx, query_data in enumerate(tqdm(dataloader, desc="Обработка запросов")):
        query_text = str(query_data["query"])
        docs = query_data["docs"]

        # Nothing to rank (and the tokenizer cannot encode an empty batch).
        if not docs:
            continue

        doc_texts = [str(doc.get("text", "")) for doc in docs]

        inputs = tokenizer(
            [query_text] * len(doc_texts),
            doc_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        logits = model(**inputs).logits
        if logits.shape[-1] == 1:
            # Single-logit head: the sigmoid of the logit is the relevance score.
            doc_scores = torch.sigmoid(logits).squeeze(-1)
        else:
            # Multi-logit head: flattening would yield several scores per document
            # and misalign them with the relevance labels. Use the probability of
            # class 1 — assumes label 1 means "relevant"; TODO confirm for the model.
            doc_scores = torch.softmax(logits, dim=-1)[:, 1]
        scores = doc_scores.float().cpu().numpy()

        query_results = {
            # Position-based id: identical query texts must not be merged.
            "query_id": f"q{query_idx}",
            "query": query_text,
            "scores": scores,
            "true_relevance": [int(doc["relevance"]) for doc in docs]
        }
        all_results.append(query_results)

qrels = []
run = []

for result in all_results:
    query_id = result["query_id"]
    for i, (score, rel) in enumerate(zip(result["scores"], result["true_relevance"])):
        # Document ids only have to be unique within their query.
        qrels.append(ir_measures.Qrel(query_id, f"doc_{i}", rel))
        run.append(ir_measures.ScoredDoc(query_id, f"doc_{i}", float(score)))

metrics = ir_measures.calc_aggregate(
    [
        ir_measures.nDCG@10,
        ir_measures.MAP@100,
        ir_measures.Recall@100,
    ],
    qrels,
    run
)

print("\nРезультаты оценки:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")