In [3]:
# Install the beir PyPI package
!pip install beir

Collecting beir
  Using cached beir-2.0.0-py3-none-any.whl
Collecting sentence-transformers (from beir)
  Using cached sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting pytrec-eval (from beir)
  Using cached pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting faiss-cpu (from beir)
  Using cached faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting elasticsearch==7.9.1 (from beir)
  Using cached elasticsearch-7.9.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting datasets (from beir)
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets->beir)
  Using cached pyarrow-18.1.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->beir)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets->beir)
  Using cached pandas-2.2.3-cp311-cp3

In [4]:
from time import time
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os
import random

#### Logging setup: print debug information to stdout
# INFO-level records are formatted as "<timestamp> - <message>" and passed to
# BEIR's LoggingHandler.
log_settings = {
    "format": '%(asctime)s - %(message)s',
    "datefmt": '%Y-%m-%d %H:%M:%S',
    "level": logging.INFO,
    "handlers": [LoggingHandler()],
}
logging.basicConfig(**log_settings)
#### /print debug information to stdout

dataset = "mrtydi"
language = "indonesian"  # language-specific subfolder inside the unzipped dataset

#### Download <dataset>.zip and unzip it under ./datasets
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
out_dir = os.path.join(os.getcwd(), "datasets")
dataset_root = util.download_and_unzip(url, out_dir)
print("Dataset downloaded here: {}".format(dataset_root))

#### Point the data loader at the language subfolder of the downloaded dataset.
# The folder is derived from the path returned by the download step, so it
# follows `dataset` and `out_dir` instead of duplicating them in a second
# hard-coded string.
# Expected layout of that folder for the split loaded below:
# (1) corpus.jsonl  (format: jsonlines)
# (2) queries.jsonl (format: jsonlines)
# (3) qrels/dev.tsv (format: tsv ("\t"))

data_path = os.path.join(dataset_root, language)
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="dev")

#### Dense retrieval with a Sentence-BERT (SBERT) encoder ####
#### Any pretrained sentence-transformers checkpoint can be plugged in here.
#### Complete list - https://www.sbert.net/docs/pretrained_models.html
#### NOTE(review): scoring below uses the dot product ("dot"); confirm that this
#### matches the similarity the checkpoint was fine-tuned with.

encoder = models.SentenceBERT("AryoshiW/distilbert-en-id-qa")
model = DRES(encoder, batch_size=128)
retriever = EvaluateRetrieval(model, score_function="dot")

#### Run retrieval and time it (format of results is identical to qrels)
start_time = time()
results = retriever.retrieve(corpus, queries)
end_time = time()
elapsed = end_time - start_time
print("Time taken to retrieve: {:.2f} seconds".format(elapsed))

#### Evaluate the retrieval: NDCG@k, MAP@k, Recall@k, Precision@k
k_values = retriever.k_values
logging.info("Retriever evaluation for k in: {}".format(k_values))
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, k_values)

#### Custom metrics, evaluated in this order: mrr, r_cap, hole
mrr, recall_cap, hole = (
    retriever.evaluate_custom(qrels, results, k_values, metric=metric_name)
    for metric_name in ("mrr", "r_cap", "hole")
)

#### Print top-k documents retrieved ####
top_k = 10

# Pick one query at random and rank its retrieved documents by descending score.
query_id, ranking_scores = random.choice(list(results.items()))
scores_sorted = sorted(ranking_scores.items(), key=lambda item: item[1], reverse=True)
logging.info("Query : %s\n" % queries[query_id])

# Iterate over a slice rather than indexing 0..top_k-1: a query with fewer than
# top_k retrieved documents would otherwise raise IndexError.
for rank, (doc_id, _score) in enumerate(scores_sorted[:top_k]):
    doc = corpus[doc_id]
    # Format: Rank x: ID [Title] Body
    logging.info("Rank %d: %s [%s] - %s\n" % (rank+1, doc_id, doc.get("title"), doc.get("text")))

  from tqdm.autonotebook import tqdm


2024-12-30 08:15:34 - Downloading mrtydi.zip ...


/workspace/datasets/mrtydi.zip:   0%|          | 0.00/8.23G [00:00<?, ?iB/s]

2024-12-30 08:32:51 - Unzipping mrtydi.zip ...
Dataset downloaded here: /workspace/datasets/mrtydi
2024-12-30 08:35:00 - Loading Corpus...


  0%|          | 0/1469399 [00:00<?, ?it/s]

2024-12-30 08:35:06 - Loaded 1469399 DEV Documents.
2024-12-30 08:35:06 - Doc Example: {'text': 'Asam deoksiribonukleat, lebih dikenal dengan singkatan DNA (bahasa Inggris: d</b>eoxyribo<b data-parsoid=\'{"dsr":[417,424,3,3]}\'>n</b>ucleic a</b>cid), adalah sejenis biomolekul yang menyimpan dan menyandi instruksi-instruksi genetika setiap organisme dan banyak jenis virus. Instruksi-instruksi genetika ini berperan penting dalam pertumbuhan, perkembangan, dan fungsi organisme dan virus. DNA merupakan asam nukleat; bersamaan dengan protein dan karbohidrat, asam nukleat adalah makromolekul esensial bagi seluruh makhluk hidup yang diketahui. Kebanyakan molekul DNA terdiri dari dua unting biopolimer yang berpilin satu sama lainnya membentuk heliks ganda. Dua unting DNA ini dikenal sebagai polinukleotida karena keduanya terdiri dari satuan-satuan molekul yang disebut nukleotida. Tiap-tiap nukleotida terdiri atas salah satu jenis basa nitrogen (guanina (G), adenina (A), timina (T), atau sitosi

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/199 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/106k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

(…)ipynb_checkpoints/config-checkpoint.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

2024-12-30 08:35:24 - Encoding Queries...


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2024-12-30 08:35:25 - Sorting Corpus by document length (Longest first)...
2024-12-30 08:35:27 - Scoring Function: Dot Product (dot)
2024-12-30 08:35:27 - Encoding Batch 1/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:35:54 - Encoding Batch 2/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:36:20 - Encoding Batch 3/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:36:47 - Encoding Batch 4/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:37:13 - Encoding Batch 5/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:37:39 - Encoding Batch 6/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:38:06 - Encoding Batch 7/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:38:32 - Encoding Batch 8/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:39:23 - Encoding Batch 10/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:39:49 - Encoding Batch 11/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:40:12 - Encoding Batch 12/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:40:35 - Encoding Batch 13/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:40:56 - Encoding Batch 14/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:41:16 - Encoding Batch 15/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:41:33 - Encoding Batch 16/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:41:47 - Encoding Batch 17/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:42:00 - Encoding Batch 18/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:42:13 - Encoding Batch 19/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:42:27 - Encoding Batch 20/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:42:42 - Encoding Batch 21/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:42:55 - Encoding Batch 22/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:43:05 - Encoding Batch 23/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:43:16 - Encoding Batch 24/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:43:26 - Encoding Batch 25/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:43:35 - Encoding Batch 26/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:43:44 - Encoding Batch 27/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:43:51 - Encoding Batch 28/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:43:58 - Encoding Batch 29/30...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2024-12-30 08:44:04 - Encoding Batch 30/30...


Batches:   0%|          | 0/152 [00:00<?, ?it/s]

Time taken to retrieve: 521.68 seconds
2024-12-30 08:44:06 - Retriever evaluation for k in: [1, 3, 5, 10, 100, 1000]
2024-12-30 08:44:06 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-12-30 08:44:07 - 

2024-12-30 08:44:07 - NDCG@1: 0.1226
2024-12-30 08:44:07 - NDCG@3: 0.1708
2024-12-30 08:44:07 - NDCG@5: 0.1842
2024-12-30 08:44:07 - NDCG@10: 0.2014
2024-12-30 08:44:07 - NDCG@100: 0.2340
2024-12-30 08:44:07 - NDCG@1000: 0.2550
2024-12-30 08:44:07 - 

2024-12-30 08:44:07 - MAP@1: 0.1226
2024-12-30 08:44:07 - MAP@3: 0.1585
2024-12-30 08:44:07 - MAP@5: 0.1658
2024-12-30 08:44:07 - MAP@10: 0.1731
2024-12-30 08:44:07 - MAP@100: 0.1789
2024-12-30 08:44:07 - MAP@1000: 0.1796
2024-12-30 08:44:07 - 

2024-12-30 08:44:07 - Recall@1: 0.1226
2024-12-30 08:44:07 - Recall@3: 0.2067
2024-12-30 08:44:07 - Recall@5: 0.2394
2024-12-30 08:44:07 - Recall@10: 0.2917
2024-12-30 08:44:07 - Recall@100: 0.4551
20