DistilBERT & ANCE: dense retrieval evaluation on SciDocs with BEIR

In [1]:
# Install (or upgrade to) the newest BEIR release; the import cell below pulls
# its data loader, retrieval models and evaluation helpers from this package.
# NOTE(review): the version is not pinned, so a re-run may resolve to a different release.
!pip install -U beir

Collecting beir
  Downloading beir-0.2.2-py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 2.5 MB/s 
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.7.1.post2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.4 MB)
[K     |████████████████████████████████| 8.4 MB 7.6 MB/s 
Collecting sentence-transformers
  Downloading sentence-transformers-2.0.0.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 4.6 MB/s 
Collecting elasticsearch
  Downloading elasticsearch-7.15.0-py2.py3-none-any.whl (378 kB)
[K     |████████████████████████████████| 378 kB 45.7 MB/s 
[?25hCollecting pytrec-eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
Collecting tensorflow-text
  Downloading tensorflow_text-2.6.0-cp37-cp37m-manylinux1_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 34.9 MB/s 
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.10.3-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 

In [2]:
import pathlib, os
import pandas as pd
from tqdm.notebook import tqdm
from beir import util, LoggingHandler
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval import models
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.datasets.data_loader import GenericDataLoader

# Name of the BEIR dataset archive to evaluate on.
dataset = 'scidocs'

# The archive URL is derived from the dataset name.
url_template = 'https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip'
url = url_template.format(dataset)

# Destination directory handed to the download helper: ./datasets under the
# current working directory.
out_dir = os.path.join(os.getcwd(), 'datasets')
data_path = util.download_and_unzip(url, out_dir)

# Load a single split; the available choices are 'test', 'train' and 'dev'.
loader = GenericDataLoader(data_path)
corpus, queries, qrels = loader.load(split='test')

  from tqdm.autonotebook import tqdm


/content/datasets/scidocs.zip:   0%|          | 0.00/136M [00:00<?, ?iB/s]

  0%|          | 0/25657 [00:00<?, ?it/s]

In [3]:
def eval_metrics(model_name, ndcg, _map, recall, precision, k=10):
  """Build a one-row summary table of retrieval metrics at cutoff ``k``.

  Args:
    model_name: Label used as the index of the single result row.
    ndcg: Mapping that contains the key ``'NDCG@<k>'``.
    _map: Mapping that contains the key ``'MAP@<k>'``.
    recall: Mapping that contains the key ``'Recall@<k>'``.
    precision: Mapping that contains the key ``'P@<k>'``.
    k: Rank cutoff whose metrics are reported. Defaults to 10, which
      reproduces the previously hard-coded behaviour.

  Returns:
    A ``pandas.DataFrame`` with one row (indexed by ``model_name``) and the
    columns ``MAP@<k>``, ``NDCG@<k>``, ``P@<k>``, ``Recall@<k>`` in that order.

  Raises:
    KeyError: If one of the mappings has no entry for the requested cutoff.
  """
  c_map = 'MAP@{}'.format(k)
  c_ndcg = 'NDCG@{}'.format(k)
  c_pre = 'P@{}'.format(k)
  c_rec = 'Recall@{}'.format(k)

  # Each value is wrapped in a list so the frame has exactly one row.
  eval_dict = {
      c_map: [_map[c_map]],
      c_ndcg: [ndcg[c_ndcg]],
      c_pre: [precision[c_pre]],
      c_rec: [recall[c_rec]],
  }
  return pd.DataFrame(data=eval_dict, index=[model_name])

DistilBERT

In [4]:
# Wrap the 'msmarco-distilbert-base-v3' SentenceBERT encoder in the exact-search
# dense retriever, encoding in batches of 128.
encoder_distilbert = models.SentenceBERT('msmarco-distilbert-base-v3')
model_distilbert = DRES(encoder_distilbert, batch_size=128)

# score_function='cos_sim' selects the similarity used to rank documents.
retriever_distilbert = EvaluateRetrieval(model_distilbert, score_function='cos_sim')
results_distilbert = retriever_distilbert.retrieve(corpus, queries)

# Evaluate the run against the relevance judgments at the retriever's k values.
metrics_distilbert = retriever_distilbert.evaluate(
    qrels, results_distilbert, retriever_distilbert.k_values)
ndcg_d, _map_d, recall_d, precision_d = metrics_distilbert

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/545 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/201 [00:00<?, ?it/s]

In [5]:
# One-row metric summary for DistilBERT; as the last expression it is rendered
# as the cell output.
eval_metrics(
    model_name='DistilBERT',
    ndcg=ndcg_d,
    _map=_map_d,
    recall=recall_d,
    precision=precision_d,
)

Unnamed: 0,MAP@10,NDCG@10,P@10,Recall@10
DistilBERT,0.07561,0.13328,0.0683,0.13832


ANCE

In [6]:
# Wrap the 'msmarco-roberta-base-ance-firstp' SentenceBERT encoder in the
# exact-search dense retriever (library-default batch size).
encoder_ance = models.SentenceBERT('msmarco-roberta-base-ance-firstp')
model_ance = DRES(encoder_ance)

# score_function='dot' selects the similarity used to rank documents.
retriever_ance = EvaluateRetrieval(model_ance, score_function='dot')
results_ance = retriever_ance.retrieve(corpus, queries)

# Evaluate the run against the relevance judgments at the retriever's k values.
metrics_ance = retriever_ance.evaluate(
    qrels, results_ance, retriever_ance.k_values)
ndcg_a, _map_a, recall_a, precision_a = metrics_ance

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/787 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/461 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/7.14k [00:00<?, ?B/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/201 [00:00<?, ?it/s]

In [7]:
# One-row metric summary for ANCE; as the last expression it is rendered as the
# cell output.
eval_metrics(
    model_name='ANCE',
    ndcg=ndcg_a,
    _map=_map_a,
    recall=recall_a,
    precision=precision_a,
)

Unnamed: 0,MAP@10,NDCG@10,P@10,Recall@10
ANCE,0.06951,0.12187,0.0609,0.1236


end of fun.