In [25]:
from datasets import load_dataset
from sentence_transformers import InputExample

# Load both train and validation splits
hebnli = load_dataset('HebArabNlpProject/HebNLI')

ENTAILMENT = "entailment"
CONTRADICTION = "contradiction"
PREMISE = "translation1"
HYPOTHESIS = "translation2"
LABEL = "original_label"

# Use only 'entailment' pairs (label==2) for MultipleNegativesRankingLoss
def make_examples(dataset, include_negative=False):
    examples = [
        InputExample(texts=[item[PREMISE], item[HYPOTHESIS]], label=1.0)
        for item in dataset
        if item[LABEL] == ENTAILMENT and item[PREMISE] and item[HYPOTHESIS]
    ]
    if include_negative:
        examples += [
            InputExample(texts=[item[HYPOTHESIS], item[PREMISE]], label=0.0)
            for item in dataset
            if item[LABEL] == CONTRADICTION and item[PREMISE] and item[HYPOTHESIS]
        ]
    return examples

test_examples = make_examples(hebnli['test'], include_negative=True)

print(f"Train: {len(test_examples)}")


Train: 390


In [26]:
from sentence_transformers import SentenceTransformer

model_path = "/home/nlp/achimoa/workspace/hebrew_text_retrieval/outputs/models/sbert/sbert-hebmodernbert-hebnli/ckpt_20250522_1841_ep1-ba136000"
# model_path = "/home/nlp/achimoa/workspace/hebrew_text_retrieval/outputs/models/sbert/sbert-hebmodernbert-hebnli/ckpt_20250603_1331_ep4-ba628000"
model = SentenceTransformer(model_path)


In [27]:
sents1 = [ex.texts[0] for ex in test_examples]
sents2 = [ex.texts[1] for ex in test_examples]
labels = [ex.label for ex in test_examples]  # 1.0 for entailment, 0.0 for contradiction

In [28]:
embs1 = model.encode(sents1, batch_size=32, convert_to_numpy=True, show_progress_bar=True)
embs2 = model.encode(sents2, batch_size=32, convert_to_numpy=True, show_progress_bar=True)

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

cos_scores = np.array([cosine_similarity([a], [b])[0][0] for a, b in zip(embs1, embs2)])


In [30]:
threshold = 0.5
pred_labels = (cos_scores > threshold).astype(float)

In [31]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score

print("Accuracy:", accuracy_score(labels, pred_labels))
print("F1 Score:", f1_score(labels, pred_labels))
print("Macro F1 Score:", f1_score(labels, pred_labels, average='macro'))
print("ROC-AUC:", roc_auc_score(labels, cos_scores))
print(classification_report(labels, pred_labels, digits=3))


Accuracy: 0.6512820512820513
F1 Score: 0.7094017094017094
Macro F1 Score: 0.6367521367521367
ROC-AUC: 0.7545760572270146
              precision    recall  f1-score   support

         0.0      0.746     0.454     0.564       194
         1.0      0.610     0.847     0.709       196

    accuracy                          0.651       390
   macro avg      0.678     0.650     0.637       390
weighted avg      0.678     0.651     0.637       390



In [32]:
import numpy as np
from sklearn.metrics import f1_score

thresholds = np.arange(0.3, 0.8, 0.01)
scores = [f1_score(labels, (cos_scores > t).astype(float)) for t in thresholds]
best_thresh = thresholds[np.argmax(scores)]
print(f"Best F1 threshold: {best_thresh:.2f}")


Best F1 threshold: 0.54


In [33]:
pred_labels = (cos_scores > best_thresh).astype(float)

In [36]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score

print("Accuracy:", accuracy_score(labels, pred_labels))
print("F1 Score:", f1_score(labels, pred_labels, average=None))
print("Macro F1 Score:", f1_score(labels, pred_labels, average='macro'))
print("ROC-AUC:", roc_auc_score(labels, cos_scores))
print(classification_report(labels, pred_labels, digits=3))


Accuracy: 0.6846153846153846
F1 Score: [0.63501484 0.72234763]
Macro F1 Score: 0.678681233296046
ROC-AUC: 0.7545760572270146
              precision    recall  f1-score   support

         0.0      0.748     0.552     0.635       194
         1.0      0.648     0.816     0.722       196

    accuracy                          0.685       390
   macro avg      0.698     0.684     0.679       390
weighted avg      0.698     0.685     0.679       390



In [14]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(labels, pred_labels))

[[107  87]
 [ 36 160]]


In [None]:
import random
random.seed(42)

indices = list(range(len(sents1)))
random.shuffle(indices)
sample_size = 10

print("Showing 10 random validation examples:")
for i in indices[:sample_size]:
    print(f"Premise:    {sents1[i]}")
    print(f"Hypothesis: {sents2[i]}")
    print(f"Gold label: {labels[i]}, Cosine: {cos_scores[i]:.3f}, Predicted: {pred_labels[i]}\n")


Showing 10 random validation examples:
Premise:    המנהלים לא מכירים אף אחד אחר.
Hypothesis: במסגרת הסדרים כאלה, מנהלים מחברות שונות שמכירים זה את זה ישבו בדירקטוריונים אחד של השני.
Gold label: 0.0, Cosine: 0.375, Predicted: 0.0

Premise:    יעדי אחריות רק עוברים ביקורת
Hypothesis: כדי להבטיח שמקבלי ההחלטות יקבלו מידע שימושי, רלוונטי, אמין, ומתקבל בזמן הנכון, ארגוני מימון מובילים קובעים יעדי אחריות שמשתרעים הרבה מעבר לקבלת ביקורת בלתי מוגבלת.
Gold label: 0.0, Cosine: 0.429, Predicted: 0.0

Premise:    היה סוכר אבל לא היו עבדים.
Hypothesis: סוכר ועבדים
Gold label: 0.0, Cosine: 0.549, Predicted: 1.0

Premise:    לא הייתה להקה שניגנה על הבמה לאחר שהכריחו את הלהקה האחרת לרדת מהבמה.
Hypothesis: על הבמה, להקה אחרת התחילה לנגן.
Gold label: 0.0, Cosine: 0.691, Predicted: 1.0

Premise:    נתתי הרצאה על הכלכלה האמריקאית בתל אביב, ישראל, בחודש שעבר.
Hypothesis: בחודש שעבר, בזמן שהייתי בישראל, דיברתי על הכלכלה האמריקאית.
Gold label: 1.0, Cosine: 0.788, Predicted: 1.0

Premise:    מבקרים אחרים מציב

In [19]:
from collections import Counter
Counter(labels)

Counter({1.0: 196, 0.0: 194})