# "Настройка" BEIR

In [3]:
!pip install datasets
!pip install beir
!pip install gdown

Collecting beir
  Downloading beir-2.0.0.tar.gz (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting sentence-transformers (from beir)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting pytrec_eval (from beir)
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting faiss_cpu (from beir)
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting elasticsearch==7.9.1 (from beir)
  Downloading elasticsearch-7.9.1-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading elasticsearch-7.9.1-py2.py3-none-any.whl (219 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.2/219.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading faiss_cpu-1.8.0.post1-cp310-cp310-manyl

In [None]:
#!git clone git@github.com:kngrg/rus_beir.git

In [None]:
import os
import gdown
import zipfile


# Google Drive download link for the rus-mmarco archive.
url = "https://drive.google.com/uc?export=download&id=1RkR7YQ2AewNg96MmHF7sksCC_oxxob5V"

# Everything is stored under ./datasets relative to the current working directory.
out_dir = os.path.join(os.getcwd(), "datasets")
os.makedirs(out_dir, exist_ok=True)
output_file_path = os.path.join(out_dir, "rus-mmarco.zip")

# gdown may return None instead of raising when the download fails, and a
# failed Drive request can leave a non-zip payload on disk. Check both up
# front so the failure is reported clearly instead of surfacing later as a
# BadZipFile / FileNotFoundError from the extraction step.
downloaded_path = gdown.download(url, output_file_path, quiet=False)
if downloaded_path is None or not zipfile.is_zipfile(output_file_path):
    raise RuntimeError(
        f"Failed to download a valid zip archive from {url} to {output_file_path}"
    )

# The archive to unpack is exactly the file that was just downloaded.
zip_path = output_file_path
print(zip_path)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(out_dir)
print(f"Dataset extracted here: {out_dir}")

In [None]:
# Load the dataset through the Hugging Face Hub
from beir.datasets.data_loader_hf import HFDataLoader

# This cell and the GenericDataLoader cell below both bind `corpus`, `queries`
# and `qrels`; whichever is executed last determines what the retrieval cells use.
# NOTE(review): `hf_repo_qrels` looks like a path inside the corpus repo rather
# than a separate dataset id — confirm that it resolves on the Hub.
hf_loader = HFDataLoader(
    hf_repo="kngrg/rus-mmarco",
    hf_repo_qrels="kngrg/rus-mmarco/qrels",
    streaming=False,
    keep_in_memory=False,
)
corpus, queries, qrels = hf_loader.load(split='train')


In [None]:
from beir.datasets.data_loader import GenericDataLoader

# Dataset folder, relative to the working directory
# (presumably the directory unpacked from rus-mmarco.zip — verify after extraction).
data_path = "datasets/rus-mmarco"
local_loader = GenericDataLoader(data_path)

# The split can be "dev" or "train".
corpus, queries, qrels = local_loader.load(split="dev")

# Elasticsearch

In [None]:
%%bash

# Stop at the first failing command so a failed or corrupted download is
# never unpacked.
set -euo pipefail

wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.2-linux-x86_64.tar.gz
wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.2-linux-x86_64.tar.gz.sha512

# Verify the archive against its published SHA-512 checksum BEFORE extracting it.
shasum -a 512 -c elasticsearch-oss-7.9.2-linux-x86_64.tar.gz.sha512

tar -xzf elasticsearch-oss-7.9.2-linux-x86_64.tar.gz

# The server is launched as the `daemon` user in a later cell, so that user
# must own the installation tree.
sudo chown -R daemon:daemon elasticsearch-7.9.2/

In [None]:
%%bash --bg

# Launch the Elasticsearch server as the `daemon` user; `--bg` keeps the
# process running in the background so the notebook can continue.
sudo --set-home --user=daemon elasticsearch-7.9.2/bin/elasticsearch

In [None]:
import time
import urllib.request

# Wait until the Elasticsearch instance actually answers on its HTTP port.
# A fixed sleep is either too short on a slow machine (the next cells then
# fail with connection errors) or needlessly long on a fast one, so poll the
# endpoint with an upper bound instead.
ES_URL = "http://localhost:9200/"
ES_STARTUP_TIMEOUT_S = 120  # give up after this many seconds
ES_POLL_INTERVAL_S = 2      # pause between connection attempts

es_deadline = time.monotonic() + ES_STARTUP_TIMEOUT_S
while True:
    try:
        # Any successful HTTP response means the node is accepting requests.
        with urllib.request.urlopen(ES_URL, timeout=5):
            break
    except OSError as exc:
        # URLError, HTTPError and connection errors all derive from OSError;
        # they mean "not up yet" until the deadline passes.
        if time.monotonic() >= es_deadline:
            raise TimeoutError(
                f"Elasticsearch did not respond at {ES_URL} "
                f"within {ES_STARTUP_TIMEOUT_S} s"
            ) from exc
        time.sleep(ES_POLL_INTERVAL_S)

In [None]:
%%bash

# Sanity check: list the processes whose `ps -ef` line mentions "elasticsearch".
# The grep command itself also matches this pattern, so non-empty output alone
# does not prove the server is running — look for the server's own line.
ps -ef | grep elasticsearch

In [None]:
%%bash

# Send a GET request to the root endpoint on port 9200 and print the response
# (progress output suppressed).
curl --silent --request GET "localhost:9200/"

In [None]:
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval

#### Elasticsearch connection settings
index_name = "mmarco"
hostname = "localhost"
# When True, an existing index with the same name is deleted and all documents are re-indexed.
initialize = True

#### Build the BM25 searcher (Russian language setting) and wrap it in the evaluation helper
model = BM25(
    index_name=index_name,
    hostname=hostname,
    initialize=initialize,
    language="russian",
)
retriever = EvaluateRetrieval(model)

#### Retrieve lexical (BM25) results (format of results is identical to qrels)
results = retriever.retrieve(corpus, queries)

In [None]:
#### Score the retrieved results against the relevance judgements (NDCG@k, MAP@K, ...)
k_values = retriever.k_values  # cut-offs configured on the retriever
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, k_values)

In [None]:
# Collect the four score dictionaries and print every entry, one
# "<name> <value>" pair per line, with a blank gap after each metric family.
metrics = {"ndcg": ndcg, "_map": _map, "recall": recall, "precision": precision}
for scores in metrics.values():
    for cutoff_name, score in scores.items():
        print(cutoff_name, score)
    # '\n' plus print's own newline leaves two empty lines between groups.
    print('\n')
