In [6]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, models, losses
from sentence_transformers.evaluation import LabelAccuracyEvaluator
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import os

# Optional: limit the CUDA devices visible to this process to GPUs 0 and 1.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0,1"})


## Preparing the dataset

In [7]:
from datasets import load_dataset
from sentence_transformers import InputExample

# Fetch SNLI as a split-keyed dataset dict; the cells below index it with
# 'train' and 'validation'.
snli = load_dataset(path='snli')

# SNLI label ids: 0 = entailment, 1 = neutral, 2 = contradiction.
# Only 'entailment' pairs (label == 0) are used as positives for
# MultipleNegativesRankingLoss.
def make_examples(dataset, include_negative=False):
    """Turn SNLI-style rows into sentence-pair ``InputExample`` objects.

    Args:
        dataset: Iterable of mapping-like rows exposing the keys ``'premise'``,
            ``'hypothesis'`` and ``'label'``. It is traversed exactly once, so
            one-shot iterables are supported.
        include_negative: When true, contradiction rows (label == 2) are also
            emitted, with ``label=0.0``. Leave this off when building training
            data for a loss that treats every pair as a positive.

    Returns:
        A list containing all entailment pairs (``label=1.0``) in dataset
        order, followed by all contradiction pairs (``label=0.0``) in dataset
        order when ``include_negative`` is set. Every pair is ordered
        ``[premise, hypothesis]``. Rows whose premise or hypothesis is empty,
        and rows with any other label, are skipped.
    """
    entailment_label = 0
    contradiction_label = 2

    positives = []
    negatives = []
    # One traversal fills both buckets; the previous two-comprehension form
    # walked the dataset twice whenever negatives were requested.
    for item in dataset:
        label = item['label']
        if label == entailment_label:
            bucket, score = positives, 1.0
        elif include_negative and label == contradiction_label:
            bucket, score = negatives, 0.0
        else:
            continue

        premise, hypothesis = item['premise'], item['hypothesis']
        if premise and hypothesis:
            # Same [premise, hypothesis] order for positives and negatives.
            bucket.append(InputExample(texts=[premise, hypothesis], label=score))

    return positives + negatives

# Training set: entailment pairs only. Validation set: entailment pairs plus
# the extra label-0.0 pairs that make_examples adds when include_negative=True.
train_examples = make_examples(dataset=snli['train'], include_negative=False)
val_examples = make_examples(dataset=snli['validation'], include_negative=True)

print(f"Train: {len(train_examples)}, Val: {len(val_examples)}")


Train: 183416, Val: 6607


In [8]:
# Batches of 512 pairs; only the training loader is shuffled.
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=512,
    shuffle=True,
)
# NOTE(review): val_dataloader is not referenced by the visible cells (the
# evaluator is built directly from val_examples) — confirm it is still needed.
val_dataloader = DataLoader(
    dataset=val_examples,
    batch_size=512,
    shuffle=False,
)

## Model setup

In [9]:
# Stack a transformer encoder and a pooling layer into one SentenceTransformer.
model_name = "answerdotai/ModernBERT-base"
bert = models.Transformer(model_name_or_path=model_name, max_seq_length=8192)
# Pooling layer sized to the encoder's word-embedding dimension; the pooling
# mode is left at the library default.
pooling = models.Pooling(
    word_embedding_dimension=bert.get_word_embedding_dimension(),
)
sbert_model = SentenceTransformer(modules=[bert, pooling])

## Loss function

In [10]:
# Ranking loss bound to the model being trained; scale and similarity function
# stay at the library defaults.
train_loss = losses.MultipleNegativesRankingLoss(model=sbert_model)

## Validation

In [11]:
# Split the validation examples into three parallel lists: first sentence,
# second sentence, and the 1.0 / 0.0 target score.
sents1, sents2, labels = [], [], []
for ex in val_examples:
    sents1.append(ex.texts[0])
    sents2.append(ex.texts[1])
    labels.append(ex.label)

val_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=sents1,
    sentences2=sents2,
    scores=labels,
    name='validation',
)

## Training

In [40]:
# fit() defaults to warmup_steps=10000 with the 'WarmupLinear' scheduler. One
# epoch over train_dataloader is only a few hundred steps, so with the default
# the whole run would sit inside the warm-up ramp and the learning rate would
# never get near its configured peak. Warm up over the first 10% of the steps
# that will actually run instead.
num_epochs = 1
warmup_steps = max(1, int(0.1 * len(train_dataloader) * num_epochs))

sbert_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=val_evaluator,
    evaluation_steps=100,  # run val_evaluator every 100 training steps
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path='./outputs/models/sbert/sbert-modernbert-nli',
    save_best_model=True,  # keep the checkpoint with the best evaluator score
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/358 [00:00<?, ?it/s]

  eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
  eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
  eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
  eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
  eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
  eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
  eval_pearson_dot, _ = pearsonr(labels, dot_products)
  eval_spearman_dot, _ = spearmanr(labels, dot_products)
  eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
  eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
  eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
  eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
  eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
  eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
  eval_pearson_dot, _ = pearsonr(labels, dot_products)
  eval_spearman_dot, _ = spearmanr(labels