In [3]:
import pandas as pd
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import datasets
from torch.utils.data import Dataset
from tqdm import tqdm

In [4]:
data = pd.read_csv('data.csv')

In [5]:
data

Unnamed: 0,query,positive,negative
0,можно ли немного кофеина во время беременности,Мы мало что знаем о влиянии кофеина во время б...,"Как правило, беременным женщинам безопасно ест..."
1,какие фрукты произрастают в Австралии,"Passiflora herbertiana. Редкий плод маракуйи, ...","Орех кола - это плод кола, рода деревьев (кола..."
2,насколько велика канадская армия,Канадские вооруженные силы. 1 Первая крупномас...,Канадский институт здоровья врачей (CPHI) - эт...
3,виды фруктовых деревьев,Вишня. Вишневые деревья растут по всему миру. ...,"Орех кола - это плод кола, рода деревьев (кола..."
4,сколько калорий в день теряется при грудном вс...,"Мало того, что грудное вскармливание лучше для...",Однако вам все равно нужно немного ниацина каж...
...,...,...,...
4999995,"заболевания, вызываемые простейшими у животных",Трипаносомоз Это простейшее заболевание животн...,"Антони ван Левенгук: Антони ван Левенгук, голл..."
4999996,что такое биоразнообразие и лекарства,Биоразнообразие играет жизненно важную роль в ...,Обязательно сообщите своему лечащему врачу обо...
4999997,как рассчитать соответствие компании,"Например, если компания соглашается обеспечить...",1 Нетто-зарплата также называется заработной п...
4999998,важная функция натрия -,1 Натрий играет большую роль в балансе жидкост...,Важность эстрогена. Эстрогены важны для поддер...


In [6]:
import torch
from torch.utils.data import Dataset
import pandas as pd

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128, indices=None):
        """
        :param dataframe: pandas DataFrame с колонками ['query', 'positive', 'negative']
        :param tokenizer: любой токенизатор из Transformers
        :param max_length: максимальная длина токенизированных последовательностей
        """
        self.dataset = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.indices = indices
    
    def __len__(self):
        if self.indices is None:
            return len(self.dataset) * 2  
        return len(self.indices)
    
    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self.indices is None:
                total = len(self.dataset) * 2
                indices = list(range(total))[idx]
            else:
                indices = self.indices[idx]
            
            return CustomDataset(
                self.dataset,
                self.tokenizer,
                self.max_length,
                indices=indices
            )
        
        original_idx = self.indices[idx] if self.indices is not None else idx
        
        row_idx = original_idx // 2
        pair_type = original_idx % 2
        
        example = self.dataset.iloc[row_idx]
        query = example["query"]
        
        if pair_type == 0:
            text_b = example["positive"]
            label = 1
        else:
            text_b = example["negative"]
            label = 0
        
        encoded = self.tokenizer(
            query,
            text_b,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        for key in encoded:
            encoded[key] = encoded[key].squeeze(0)
        
        encoded["labels"] = torch.tensor(label, dtype=torch.long)
        return encoded

In [7]:
max_length = 128
batch_size = 128
tokenizer_name = './tokenizer'

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
tokenizer.model_max_length = 1024


train_data = CustomDataset(
    dataframe=data,
    tokenizer=tokenizer,
    max_length=max_length
)

train = train_data[:1000000]

train_loader = DataLoader(
    dataset=train,
    batch_size=batch_size,
    shuffle=True
)

In [8]:
train_data[0]

{'input_ids': tensor([    1, 35099,   634,  1982,  5421,   622,   565,   782, 26011,     2,
            2,  3284,  2715,   368,  6384,   288,  4184,  2190,  5421,   622,
          565,   782, 26011,   324,  1118,   289,  5977,  4635,    18,   987,
          368,  1667,  1129, 35455,  8560,    16,  2104,   388,   908,  2178,
         2391,  1067,    18,  1194,   388,  9649, 21166,    16,  3661,  4228,
         4357,  5421,   622,  1954,  2614, 32867, 16660,  2391,  1067,    18,
          850,  4533,  3477,    16,  2617, 21614,   281,  6140,   655,  5421,
        45064,   514,   386,    28, 20038,  1959,   694,  1853,  6140,   655,
         5421, 45064,  2692, 20038,  1959,    18,     2,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [9]:
class Classifier(nn.Module):
    def __init__(self, hidden_size, num_labels, dropout_prob=0.1):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.act = nn.GELU()
        self.layernorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(hidden_size, num_labels)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.layernorm(x)
        x = self.dropout(x)
        logits = self.fc2(x)
        return logits
    

class CrossEncoder(nn.Module):
    def __init__(self, pretrained_model_name: str, num_labels: int = 2, dropout_prob: float = 0.1):
        super(CrossEncoder, self).__init__()
        self.encoder = AutoModel.from_pretrained(pretrained_model_name)
        self.classifier = Classifier(self.encoder.config.hidden_size, num_labels, dropout_prob)
    
    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
            pooled_output = outputs.pooler_output
        else:
            pooled_output = outputs.last_hidden_state[:, 0, :]
        
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            if self.classifier.fc2.out_features == 1:
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.classifier.fc2.out_features), labels.view(-1))
        
        return {"loss": loss, "logits": logits}

pretrained_model_name = "./crossenc" 
model = CrossEncoder(pretrained_model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [10]:
from torch.cuda.amp import autocast, GradScaler
         
weight_decay       = 0.01                                 
warmup_ratio       = 0.1          
max_grad_norm      = 1.0            
layerwise_decay    = 0.8           
fp16               = True
accumulation_steps = 1

In [11]:
model.encoder.config.max_position_embeddings = 1024

In [12]:
import warnings
warnings.filterwarnings('ignore')
torch.cuda.empty_cache()

In [None]:
import os
from tqdm import tqdm

checkpoint_dir = "./checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)
resume_from_checkpoint = None

epochs = 10
learning_rate = 3e-5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
    betas=(0.9, 0.999), 
    eps=1e-6
)

total_steps = len(train_loader) * epochs
warmup_steps = int(warmup_ratio * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

scaler = GradScaler()

start_epoch = 0
if resume_from_checkpoint and os.path.isfile(resume_from_checkpoint):
    print(f"Загружаем чекпоинт: {resume_from_checkpoint}")
    checkpoint = torch.load(resume_from_checkpoint, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    scaler.load_state_dict(checkpoint['scaler_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    print(f"Обучение возобновлено с эпохи {start_epoch}")

model.train()
for epoch in range(start_epoch, epochs):
    total_loss = 0.0
    
    for batch in tqdm(train_loader, colour='#fc6b03', desc=f'Эпоха {epoch+1}/{epochs}'):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        token_type_ids = batch.get("token_type_ids")
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)
        
        optimizer.zero_grad()
        with autocast(enabled=fp16):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels
            )
            loss = outputs["loss"] / accumulation_steps
        
        scaler.scale(loss).backward()

        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Средняя потеря (loss) за эпоху: {avg_loss:.4f}")

    checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch}.pt")
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'scaler_state_dict': scaler.state_dict(),
        'loss': avg_loss,
    }, checkpoint_path)
    print(f"Чекпоинт сохранен: {checkpoint_path}")

Эпоха 1/10: 100%|[38;2;252;107;3m██████████[0m| 7813/7813 [34:39<00:00,  3.76it/s]


Средняя потеря (loss) за эпоху: 0.5470
Чекпоинт сохранен: ./checkpoints/checkpoint_epoch_0.pt


Эпоха 2/10: 100%|[38;2;252;107;3m██████████[0m| 7813/7813 [34:57<00:00,  3.72it/s]


Средняя потеря (loss) за эпоху: 0.4360
Чекпоинт сохранен: ./checkpoints/checkpoint_epoch_1.pt


Эпоха 3/10: 100%|[38;2;252;107;3m██████████[0m| 7813/7813 [34:32<00:00,  3.77it/s]


Средняя потеря (loss) за эпоху: 0.3129
Чекпоинт сохранен: ./checkpoints/checkpoint_epoch_2.pt


Эпоха 4/10: 100%|[38;2;252;107;3m██████████[0m| 7813/7813 [34:28<00:00,  3.78it/s]


Средняя потеря (loss) за эпоху: 0.2088
Чекпоинт сохранен: ./checkpoints/checkpoint_epoch_3.pt


Эпоха 5/10: 100%|[38;2;252;107;3m██████████[0m| 7813/7813 [34:51<00:00,  3.74it/s]


Средняя потеря (loss) за эпоху: 0.1407
Чекпоинт сохранен: ./checkpoints/checkpoint_epoch_4.pt


Эпоха 6/10: 100%|[38;2;252;107;3m██████████[0m| 7813/7813 [34:32<00:00,  3.77it/s]


Средняя потеря (loss) за эпоху: 0.0983
Чекпоинт сохранен: ./checkpoints/checkpoint_epoch_5.pt


Эпоха 7/10:  25%|[38;2;252;107;3m██▌       [0m| 1974/7813 [08:44<25:43,  3.78it/s]

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

max_length = 128
val_batch_size = 128

test = train_data[2000000:2200000]

val_loader = torch.utils.data.DataLoader(
    dataset=test,
    batch_size=val_batch_size,
    shuffle=False
)

model.eval()

all_labels = []
all_preds = []

with torch.no_grad():
    for batch in tqdm(val_loader,colour='#35fc03',desc='validation'):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        token_type_ids = batch.get("token_type_ids")
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels
        )
        logits = outputs["logits"]
        preds = torch.argmax(logits, dim=1)
        
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

print("Accuracy: {:.4f}".format(accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))
print("\nClassification Report:\n", classification_report(all_labels, all_preds))

validation: 100%|[38;2;53;252;3m██████████[0m| 1563/1563 [08:38<00:00,  3.02it/s]


Accuracy: 0.6882
Precision: 0.6359
Recall: 0.8804
F1 Score: 0.7385

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.50      0.61    100000
           1       0.64      0.88      0.74    100000

    accuracy                           0.69    200000
   macro avg       0.72      0.69      0.68    200000
weighted avg       0.72      0.69      0.68    200000



In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model_ids = {
    "Russian-msmarco": "./pretrained/MSmarco",
    "BGE": "./pretrained/BGE",
}

models = {}
tokenizers = {}
for name, model_id in model_ids.items():
    tokenizers[name] = AutoTokenizer.from_pretrained(tokenizer_name)
    models[name] = AutoModelForSequenceClassification.from_pretrained(model_id)

batch_size = 128
max_length = 128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
test = train_data[100000:120000]

for name in models.keys():
    print(f"Оценка модели: {name}")
    
    val_loader = torch.utils.data.DataLoader(
        dataset=test,
        batch_size=batch_size,
        shuffle=False
    )
    
    all_labels = []
    all_preds = []
    
    model = models[name].to(device)
    model.eval()
    
    with torch.no_grad():
        for batch in tqdm(val_loader,colour='#1ad2db',desc='validation'):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            
            token_type_ids = batch.get("token_type_ids")
            if token_type_ids is not None:
                token_type_ids = token_type_ids.to(device)
            
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
    
    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print("-" * 30)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./pretrained/MSmarco and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ./pretrained/BGE and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Оценка модели: Russian-msmarco


validation: 100%|[38;2;26;210;219m██████████[0m| 157/157 [00:39<00:00,  3.94it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.5000
Precision: 0.0000
Recall:    0.0000
F1 Score:  0.0000
------------------------------
Оценка модели: BGE


validation: 100%|[38;2;26;210;219m██████████[0m| 157/157 [00:40<00:00,  3.84it/s]

Accuracy:  0.5000
Precision: 0.0000
Recall:    0.0000
F1 Score:  0.0000
------------------------------



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
