### Import Libraries

In [1]:
import torch
import numpy as np
from datasets import load_dataset, Value
from sentence_transformers import CrossEncoder,SentenceTransformer, util
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from collections import defaultdict
from tqdm import tqdm
import random
from datasets import load_dataset,  Dataset


2025-06-06 19:23:19.183002: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749237799.418666      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749237799.491324      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Dataset

In [2]:
# Load the "anti-ai/ViNLI-Zalo-supervised" dataset via `datasets.load_dataset`.
# The result is indexed by split name further down (e.g. dataset["train"]).
dataset = load_dataset("anti-ai/ViNLI-Zalo-supervised")

README.md:   0%|          | 0.00/133 [00:00<?, ?B/s]

law_vi.jsonl.gz:   0%|          | 0.00/37.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32980 [00:00<?, ? examples/s]

In [3]:
def restructure_data(split, seed=42):
    """Turn one split of the module-level ``dataset`` into labelled pairs.

    Every source row contributes two samples that share the same query:
    ``(query, positive)`` with ``label == 1`` and ``(query, hard_neg)`` with
    ``label == 0``. The samples are then shuffled and wrapped in a
    ``datasets.Dataset``.

    Args:
        split: Key used to index the global ``dataset`` (e.g. ``"train"``).
        seed: Seed for the shuffle. A fixed default keeps the row order, and
            therefore any seeded ``train_test_split`` applied afterwards,
            identical across kernel restarts. Pass ``None`` to get a
            different order on every call.

    Returns:
        A ``Dataset`` with the columns ``"query"``, ``"document"`` and
        ``"label"``, containing two rows per source row.
    """
    split_data = dataset[split]  # look the split up once instead of per column

    samples = []
    for q, pos, neg in zip(split_data['query'], split_data['positive'], split_data['hard_neg']):
        samples.append({"query": q, "document": pos, "label": 1})
        samples.append({"query": q, "document": neg, "label": 0})

    # A private generator makes the order reproducible without touching the
    # state of the global `random` module.
    random.Random(seed).shuffle(samples)
    return Dataset.from_list(samples)

In [4]:
# Build the shuffled (query, document, label) pair dataset from the "train" split.
full_train_dataset = restructure_data("train")

In [5]:
# Two-stage split: hold out 10% of the pairs as the test set, then hold out
# 10% of the remainder as the validation set (both stages seeded with 42).
# NOTE(review): the split is per row, and restructure_data emits two rows per
# query, so a query's positive and negative pairs can land in different
# splits -- confirm this is acceptable for the evaluation.
split_dataset = full_train_dataset.train_test_split(test_size=0.1, seed=42)
temp_train, test_dataset = split_dataset["train"], split_dataset["test"]

train_valid_split = temp_train.train_test_split(test_size=0.1, seed=42)
train_dataset, valid_dataset = train_valid_split["train"], train_valid_split["test"]

### Evaluation

In [6]:
def batched_predict(model, pairs, batch_size=1, show_progress=False):
    """Score ``pairs`` with ``model.predict`` in slices of ``batch_size``.

    Args:
        model: Object exposing ``model.model.eval()`` and
            ``model.predict(batch, show_progress_bar=False)``.
        pairs: Sliceable sequence of inputs; each slice is handed to
            ``model.predict`` unchanged.
        batch_size: Number of pairs per ``predict`` call; must be >= 1.
        show_progress: When true, wrap the batch loop in a ``tqdm`` bar.
            Defaults to ``False``, i.e. no progress output.

    Returns:
        A flat list with the scores of all batches, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative step would
            otherwise make the loop empty and silently return no scores.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    batch_starts = range(0, len(pairs), batch_size)
    if show_progress:
        batch_starts = tqdm(batch_starts, desc="Predicting")

    model.model.eval()
    scores = []
    with torch.no_grad():
        for start in batch_starts:
            batch = pairs[start:start + batch_size]
            scores.extend(model.predict(batch, show_progress_bar=False))
            # Releases cached allocator blocks after every batch. Presumably
            # meant to limit GPU memory pressure -- it costs throughput, so
            # consider dropping it if memory is not a concern.
            torch.cuda.empty_cache()
    return scores

In [7]:
def evaluate_crossencoder(model_name, test_dataset, batch_size=1, threshold=0.5):
    """Evaluate one CrossEncoder checkpoint as a binary classifier.

    Loads ``model_name``, scores every (query, document) pair of
    ``test_dataset`` with ``batched_predict``, turns the scores into 0/1
    predictions and prints accuracy, F1, precision and recall.

    Args:
        model_name: Name or path handed to ``CrossEncoder``.
        test_dataset: Object whose ``"query"``, ``"document"`` and ``"label"``
            columns are read via ``[]``.
        batch_size: Number of pairs per prediction batch.
        threshold: Scores ``>= threshold`` are predicted as label 1.
            NOTE(review): the scale of the scores depends on the activation
            each checkpoint applies inside ``predict`` (probabilities vs. raw
            logits), so a single 0.5 cut-off may not be comparable across
            models -- confirm per model (0.0 is the usual cut-off for logits).

    Returns:
        A dict with the keys ``"model"``, ``"accuracy"``, ``"f1"``,
        ``"precision"`` and ``"recall"``, so results can be collected and
        compared instead of only being printed.
    """
    print(f"\n--- Evaluating model: {model_name} ---")
    # SECURITY: trust_remote_code=True runs Python code shipped with the
    # checkpoint repository. Only use it with sources you trust, and consider
    # pinning a revision.
    model = CrossEncoder(
        model_name,
        trust_remote_code=True,
        device='cuda' if torch.cuda.is_available() else 'cpu'
    )

    test_pairs = list(zip(test_dataset["query"], test_dataset["document"]))
    true_labels = test_dataset["label"]

    pred_scores = batched_predict(model, test_pairs, batch_size=batch_size)
    pred_labels = [1 if score >= threshold else 0 for score in pred_scores]

    # zero_division=0 keeps the degenerate "no positives predicted/present"
    # case at 0.0 without emitting an UndefinedMetricWarning.
    acc = accuracy_score(true_labels, pred_labels)
    f1 = f1_score(true_labels, pred_labels, zero_division=0)
    precision = precision_score(true_labels, pred_labels, zero_division=0)
    recall = recall_score(true_labels, pred_labels, zero_division=0)

    print("Classification Evaluation:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")

    return {
        "model": model_name,
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }
In [8]:
# Checkpoints to benchmark; each name is passed to `evaluate_crossencoder`
# in the evaluation loop.
crossencoder_models = [
    "cross-encoder/ms-marco-MiniLM-L6-v2",
    # Disabled entry -- reason not recorded here (TODO confirm or remove):
    #"lightblue/reranker_0.5_cont_filt",
    "Data-Lab/bge-reranker-v2-m3-cross-encoder-v0.1",
    "jinaai/jina-reranker-v2-base-multilingual",
    "Alibaba-NLP/gte-multilingual-reranker-base",
    "BAAI/bge-reranker-base",
    "hghaan/rerank_model",
]

In [9]:
# NOTE(review): `biencoder_models` and `evaluate_biencoder` are not defined in
# the visible cells, so the bi-encoder run below stays disabled.
# for model_name in biencoder_models:
#     evaluate_biencoder(model_name, test_dataset, batch_size=32)

# Evaluate every cross-encoder checkpoint on the held-out test pairs,
# scoring one pair per batch (batch_size=1).
for model_name in crossencoder_models:
    evaluate_crossencoder(model_name, test_dataset, batch_size=1)


--- Evaluating model: cross-encoder/ms-marco-MiniLM-L6-v2 ---


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Classification Evaluation:
Accuracy:  0.7818
F1 Score:  0.8158
Precision: 0.7029
Recall:    0.9719

--- Evaluating model: Data-Lab/bge-reranker-v2-m3-cross-encoder-v0.1 ---


config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Classification Evaluation:
Accuracy:  0.9168
F1 Score:  0.9221
Precision: 0.8623
Recall:    0.9909

--- Evaluating model: jinaai/jina-reranker-v2-base-multilingual ---


config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

configuration_xlm_roberta.py:   0%|          | 0.00/2.73k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual:
- configuration_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_xlm_roberta.py:   0%|          | 0.00/43.8k [00:00<?, ?B/s]

mlp.py:   0%|          | 0.00/6.21k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual:
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


xlm_padding.py:   0%|          | 0.00/9.82k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual:
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mha.py:   0%|          | 0.00/28.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual:
- mha.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


embedding.py:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual:
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


block.py:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual:
- block.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual:
- modeling_xlm_roberta.py
- mlp.py
- xlm_padding.py
- mha.py
- embedding.py
- block.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/557M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Classification Evaluation:
Accuracy:  0.9018
F1 Score:  0.8904
Precision: 0.9996
Recall:    0.8027

--- Evaluating model: Alibaba-NLP/gte-multilingual-reranker-base ---


config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/612M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Classification Evaluation:
Accuracy:  0.9409
F1 Score:  0.9412
Precision: 0.9303
Recall:    0.9524

--- Evaluating model: BAAI/bge-reranker-base ---


config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

Classification Evaluation:
Accuracy:  0.8116
F1 Score:  0.7662
Precision: 0.9995
Recall:    0.6212

--- Evaluating model: hghaan/rerank_model ---


config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Classification Evaluation:
Accuracy:  0.9985
F1 Score:  0.9985
Precision: 0.9970
Recall:    1.0000
