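Stored BM25 baseline results from running BM25_all.py (per-dataset scores with the k1/b settings used, followed by macro averages over the four datasets):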

dataset       k1     b    nDCG@1   nDCG@3   nDCG@5  nDCG@10         R@1        R@3        R@5       R@10
clapnq       1.5   0.4    0.2143   0.2021   0.2617      0.3     0.0571     0.1702     0.3409     0.4302
cloud        1.5   0.2    0.2105   0.1974   0.2141   0.2415     0.0921     0.1974     0.2303     0.2895
fiqa         1.2   0.4    0.1389   0.1295   0.1619   0.1812      0.044      0.125     0.1991     0.2454
govt         1.5   0.8    0.2683   0.2569   0.2713     0.31     0.1138     0.2378     0.2886     0.3825
nDCG@1 macro: 0.208
nDCG@3 macro: 0.19647499999999998
nDCG@5 macro: 0.22724999999999998
nDCG@10 macro: 0.258175
Recall@1 macro: 0.07675
Recall@3 macro: 0.18259999999999998
Recall@5 macro: 0.264725
Recall@10 macro: 0.33690000000000003



In [4]:
from pathlib import Path

import pandas as pd

from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval import models
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval.evaluation import EvaluateRetrieval

In [3]:
# Project root, resolved from the current working directory.
# If the notebook is later moved into a subdirectory such as notebooks/,
# switch to:
# BASE_DIR = Path("..").resolve()
BASE_DIR = Path(".").resolve()

# Per-dataset paths: "data_dir" is the dataset folder under dataset/,
# "faiss_dir" is the matching "<name>-bge-faiss" folder under indexes/.
DATASETS = {
    name: {
        "data_dir": BASE_DIR / "dataset" / name,
        "faiss_dir": BASE_DIR / "indexes" / f"{name}-bge-faiss",
    }
    for name in ("clapnq", "cloud", "fiqa", "govt")
}


BASE_DIR: /Users/leeediazk/Uni_Tuebingen/CL_5th/Challenges_CL/Multi-turn-RAG


In [5]:
# --- Tunable settings --------------------------------------------------
MODEL_NAME = "BAAI/bge-base-en-v1.5"
BATCH_SIZE = 64  # adjustable
SPLIT = "train"  # the qrels folder currently holds train.tsv
K_VALUES = [1, 3, 5, 10]  # cutoffs passed to retriever.evaluate

# --- Retrieval stack ---------------------------------------------------
# Sentence-embedding encoder loaded by model name.
# NOTE(review): the BGE model card suggests prepending a query instruction
# for retrieval; none is configured here -- confirm this is intended.
embedding_model = models.SentenceBERT(MODEL_NAME)

# DenseRetrievalExactSearch (exact search) built on top of the encoder.
dres_model = DRES(embedding_model, batch_size=BATCH_SIZE)

# EvaluateRetrieval wrapper, scoring with cosine similarity.
retriever = EvaluateRetrieval(dres_model, score_function="cos_sim")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
def evaluate_dataset(dataset_name: str) -> dict:
    """Run dense retrieval on one configured dataset and score the results.

    Looks up the dataset's directories in ``DATASETS``, loads corpus, queries
    and qrels for ``SPLIT`` with ``GenericDataLoader``, encodes and retrieves
    through the module-level ``retriever``, and evaluates at ``K_VALUES``.

    Args:
        dataset_name: Key into ``DATASETS`` (for example ``"clapnq"``).

    Returns:
        ``{"ndcg": ..., "recall": ...}`` with the nDCG and recall dicts
        returned by ``retriever.evaluate``. BEIR keys those dicts by metric
        label (``"NDCG@1"``, ``"Recall@1"``, ...), not by the integer cutoff.

    Raises:
        KeyError: If ``dataset_name`` is not a key of ``DATASETS``.
    """
    cfg = DATASETS[dataset_name]
    data_dir = cfg["data_dir"]
    faiss_dir = cfg["faiss_dir"]

    print(f"Dataset: {dataset_name}")
    print("data_dir :", data_dir)
    print("faiss_dir:", faiss_dir)

    # Make sure the output directory exists before anything is written to it.
    faiss_dir.mkdir(parents=True, exist_ok=True)

    # Load the dataset into BEIR's corpus / queries / qrels structures.
    # TODO: some variable names here could be unified later.
    corpus, queries, qrels = GenericDataLoader(
        data_folder=str(data_dir)
    ).load(split=SPLIT)

    print(f"#docs = {len(corpus)}, #queries = {len(queries)}")

    # Encode and retrieve. faiss_dir is only handed over as the encode output
    # path here; presumably the embeddings are stored there -- confirm.
    results = retriever.encode_and_retrieve(
        corpus,
        queries,
        encode_output_path=str(faiss_dir),
    )

    # evaluate() returns four dicts (nDCG, MAP, recall, precision); unpacking
    # into two names raised "too many values to unpack". Only nDCG and recall
    # are reported, so the other two are discarded.
    ndcg, _map, recall, _precision = retriever.evaluate(qrels, results, K_VALUES)

    return {
        "ndcg": ndcg,
        "recall": recall,
    }


In [7]:
# Evaluate every dataset declared in DATASETS rather than a second hard-coded
# list of names, so the configuration and this loop cannot drift apart.
# Results are stored one by one (not via a comprehension) so datasets that
# already finished stay in all_metrics if a long run is interrupted.
all_metrics = {}

for name in DATASETS:
    all_metrics[name] = evaluate_dataset(name)

all_metrics

Dataset: clapnq
data_dir : /Users/leeediazk/Uni_Tuebingen/CL_5th/Challenges_CL/Multi-turn-RAG/dataset/clapnq
faiss_dir: /Users/leeediazk/Uni_Tuebingen/CL_5th/Challenges_CL/Multi-turn-RAG/indexes/clapnq-bge-faiss


  0%|          | 0/183408 [00:00<?, ?it/s]

#docs = 183408, #queries = 208


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/782 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Flatten the per-dataset metrics into a long-format table with one row per
# (dataset, cutoff k).
rows = []

for ds_name, metrics in all_metrics.items():
    ndcg = metrics["ndcg"]
    recall = metrics["recall"]

    for k in K_VALUES:
        # BEIR keys its metric dicts by label ("NDCG@5", "Recall@5"), not by
        # the integer cutoff, so indexing with k alone raised KeyError.
        rows.append({
            "dataset": ds_name,
            "k": k,
            "nDCG": ndcg[f"NDCG@{k}"],
            "Recall": recall[f"Recall@{k}"],
        })

df_results = pd.DataFrame(rows)
df_results