# Set up BEIR

In [1]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [2]:
!pip install beir



In [3]:
from beir import util, LoggingHandler

import logging
import pathlib, os

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

  from tqdm.autonotebook import tqdm


# Setup Model(s)

In [4]:
!pip install transformers numpy torch



In [5]:
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from transformers import AutoModel, AutoTokenizer, BertForSequenceClassification
import numpy as np
import torch
from tqdm import trange
import os
from typing import List, Dict

class FinBERT:
    def __init__(self, model_path: str, device, **kwargs):
        self.device = device
        # self.bert_q = AutoModel.from_pretrained(model_path)
        self.bert_q = AutoModel.from_pretrained(model_path, num_labels=2)
        self.bert_q.eval()
        self.bert_q.to(self.device)

        # self.bert_d = AutoModel.from_pretrained(model_path)
        self.bert_d = AutoModel.from_pretrained(model_path, num_labels=2)
        self.bert_d.eval()
        self.bert_d.to(self.device)

        # self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


    def encode_queries(self, queries: List[Dict[str, str]], batch_size: int = 16, **kwargs) -> np.ndarray:
        query_embeddings = []

        with torch.no_grad():
          for start_idx in trange(0, len(queries), batch_size):
            encoded = self.tokenizer(queries[start_idx:start_idx+batch_size], truncation=True, padding=True, return_tensors='pt', max_length=512)
            encoded.to(self.device)
            model_out = self.bert_q(**encoded)
            query_embeddings += model_out.last_hidden_state[:, 0, :].detach().cpu()

        return torch.stack(query_embeddings)

    def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int, **kwargs) -> np.ndarray:
        corpus_embeddings = []

        with torch.no_grad():
          for start_idx in trange(0, len(corpus), batch_size):
            titles = [row['title'] for row in corpus[start_idx: start_idx + batch_size]]
            texts = [row['text']  for row in corpus[start_idx: start_idx + batch_size]]
            encoded = self.tokenizer(titles, texts, truncation='longest_first', padding=True, return_tensors='pt', max_length=512)
            encoded.to(self.device)
            model_out = self.bert_d(**encoded)
            corpus_embeddings += model_out.last_hidden_state[:, 0, :].detach().cpu()

        return torch.stack(corpus_embeddings)

2024-04-04 21:20:10 - PyTorch version 2.2.2 available.
2024-04-04 21:20:10 - Loading faiss.
2024-04-04 21:20:10 - Successfully loaded faiss.


In [6]:
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from transformers import AutoModel, AutoTokenizer, BertForSequenceClassification
import numpy as np
import torch
from tqdm import tqdm
from tqdm import trange
import os
from typing import List, Dict
import requests
import zipfile

def download_file(url, path, filename, zip=False):
    """Downloads and extracts zip file.
    ----------
    Arguments:
        url: str - zip url
        path: str - the path to download the file
        filename: str - name of the file
        zip - bool - if file is zip or not
    """
    # https://stackoverflow.com/questions/37573483/progress-bar-while-download-file-over-http-with-requests/37573701
    # Streaming
    r = requests.get(url, stream=True)
    # Total size in bytes.
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024 #1 Kibibyte

    t=tqdm(total=total_size, unit='iB', unit_scale=True)
    # Download file
    with open(path/filename, 'wb') as f:
        for data in r.iter_content(block_size):
            t.update(len(data))
            f.write(data)
    t.close()
    if total_size != 0 and t.n != total_size:
        print("ERROR, download failed.")

    if zip == True:
        # Extract and delete zip file
        with open(path/filename, 'rb') as fileobj:
            z = zipfile.ZipFile(fileobj)
            z.extractall(path)
            z.close()
        os.remove(path/filename)
    else:
        pass

def download_model():
    if (not os.path.exists(bert_model_path)):
        download_file("https://www.dropbox.com/s/sh2h9o5yd7v4ku6/bert-qa.zip?dl=1", pathlib.Path.cwd(), bert_model_path + ".zip", zip=True)
    if (not os.path.exists(checkpoint_finbert)):
        download_file("https://www.dropbox.com/s/12uiuumz4vbqvhk/2_finbert-qa-50_512_16_3e6.pt?dl=1", pathlib.Path.cwd(), "2_finbert-qa-50_512_16_3e6.pt", zip=False)
        

bert_model_path = "bert-qa"   
checkpoint_finbert = "2_finbert-qa-50_512_16_3e6.pt"

class FinBERTQA:
    def __init__(self, model_path: str, device, **kwargs):
        self.device = device

        download_model()
        
        model = BertForSequenceClassification.from_pretrained(bert_model_path, cache_dir=None, num_labels=2)
        model.to(device)
        model.eval()
        model.load_state_dict(torch.load(checkpoint_finbert, map_location=device))
        self.bert_q = model
        self.bert_d = model

        self.tokenizer = AutoTokenizer.from_pretrained(bert_model_path)
        # self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


    def encode_queries(self, queries: List[Dict[str, str]], batch_size: int = 16, **kwargs) -> np.ndarray:
        query_embeddings = []

        with torch.no_grad():
          for start_idx in trange(0, len(queries), batch_size):
            encoded = self.tokenizer(queries[start_idx:start_idx+batch_size], truncation=True, padding=True, return_tensors='pt', max_length=512)
            encoded.to(self.device)
            model_out = self.bert_q(**encoded, output_hidden_states=True)
            # query_embeddings += model_out.hidden_states[-1][:, 0, :].detach().cpu()

            # Alternatively, we can use the average of the last hidden state 
            query_embeddings += model_out.hidden_states[-1].mean(dim=1).detach().cpu()

        return torch.stack(query_embeddings)

    def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int, **kwargs) -> np.ndarray:
        corpus_embeddings = []

        with torch.no_grad():
          for start_idx in trange(0, len(corpus), batch_size):
            titles = [row['title'] for row in corpus[start_idx: start_idx + batch_size]]
            texts = [row['text']  for row in corpus[start_idx: start_idx + batch_size]]
            encoded = self.tokenizer(titles, texts, truncation='longest_first', padding=True, return_tensors='pt', max_length=512)
            encoded.to(self.device)
            model_out = self.bert_d(**encoded, output_hidden_states=True)
            # corpus_embeddings += model_out.hidden_states[-1][:, 0, :].detach().cpu()
            corpus_embeddings += model_out.hidden_states[-1].mean(dim=1).detach().cpu()

        return torch.stack(corpus_embeddings)


In [7]:
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from transformers import AutoModel, AutoTokenizer
import numpy as np
import torch
from tqdm import trange
import os
from typing import List, Dict

class SciNCL:
    def __init__(self, model_path: str, device, **kwargs):
        self.device = device
        self.bert_q = AutoModel.from_pretrained(model_path)
        self.bert_q.eval()
        self.bert_q.to(self.device)

        self.bert_d = AutoModel.from_pretrained(model_path)
        self.bert_d.eval()
        self.bert_d.to(self.device)

        self.tokenizer = AutoTokenizer.from_pretrained(model_path)


    def encode_queries(self, queries: List[Dict[str, str]], batch_size: int = 16, **kwargs) -> np.ndarray:
        query_embeddings = []

        with torch.no_grad():
          for start_idx in trange(0, len(queries), batch_size):
            encoded = self.tokenizer(queries[start_idx:start_idx+batch_size], truncation=True, padding=True, return_tensors='pt', max_length=512)
            encoded.to(self.device)
            model_out = self.bert_q(**encoded)
            query_embeddings += model_out.last_hidden_state[:, 0, :].detach().cpu()

        return torch.stack(query_embeddings)

    def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int, **kwargs) -> np.ndarray:
        corpus_embeddings = []

        with torch.no_grad():
          for start_idx in trange(0, len(corpus), batch_size):
            titles = [row['title'] for row in corpus[start_idx: start_idx + batch_size]]
            texts = [row['text']  for row in corpus[start_idx: start_idx + batch_size]]
            encoded = self.tokenizer(titles, texts, truncation='longest_first', padding=True, return_tensors='pt', max_length=512)
            encoded.to(self.device)
            model_out = self.bert_d(**encoded)
            corpus_embeddings += model_out.last_hidden_state[:, 0, :].detach().cpu()

        return torch.stack(corpus_embeddings)

# Setup Datasets

In [8]:
import pathlib, os
from beir import util

def download_dataset(dataset_name: str):
  url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset_name)
  out_dir = os.path.join(os.getcwd(), "datasets")
  data_path = util.download_and_unzip(url, out_dir)
  print("Dataset downloaded here: {}".format(data_path))
  return data_path

In [9]:
#!ls datasets/scifact/

In [10]:
from beir.datasets.data_loader import GenericDataLoader


# dataset = "scifact"
# dataset = "fiqa"
dataset="trec-covid"
# dataset="msmarco"
# dataset = "scidocs"
# dataset = "nfcorpus"
# dataset = "nq"
data_path = download_dataset(dataset_name=dataset)
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test") # or split = "train" or "dev"

Dataset downloaded here: /Users/alexmano/Documents/projects/information-retrieval/ir-cross-evaluations/datasets/trec-covid
2024-04-04 21:20:10 - Loading Corpus...


100%|██████████| 171332/171332 [00:00<00:00, 270140.77it/s]

2024-04-04 21:20:10 - Loaded 171332 TEST Documents.
2024-04-04 21:20:10 - Doc Example: {'text': 'OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract 




# Evaluate

In [11]:
from beir.retrieval.evaluation import EvaluateRetrieval

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = device if not torch.backends.mps.is_available() else "mps"

# finbert = DRES(FinBERT(model_path="ProsusAI/finbert", device=device), batch_size=16)
# medcpt = DRES(MedCPT("ncbi/MedCPT-Query-Encoder", "ncbi/MedCPT-Article-Encoder", "ncbi/MedCPT-Query-Encoder", device=device), batch_size=256)
finbertqa = DRES(FinBERTQA(model_path="", device=device), batch_size=16)
# scincl = DRES(SciNCL(model_path="malteos/scincl", device=device), batch_size=16)
retriever = EvaluateRetrieval(finbertqa, score_function="cos_sim")

results = retriever.retrieve(corpus, queries)

2024-04-04 21:20:11 - Encoding Queries...


100%|██████████| 4/4 [00:00<00:00,  5.12it/s]


2024-04-04 21:20:12 - Sorting Corpus by document length (Longest first)...
2024-04-04 21:20:12 - Scoring Function: Cosine Similarity (cos_sim)
2024-04-04 21:20:12 - Encoding Batch 1/4...


100%|██████████| 3125/3125 [24:27<00:00,  2.13it/s]


2024-04-04 21:44:40 - Encoding Batch 2/4...


100%|██████████| 3125/3125 [18:10<00:00,  2.87it/s]


2024-04-04 22:02:50 - Encoding Batch 3/4...


100%|██████████| 3125/3125 [07:43<00:00,  6.75it/s]


2024-04-04 22:10:34 - Encoding Batch 4/4...


100%|██████████| 1334/1334 [00:40<00:00, 32.58it/s]


In [12]:
ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, results, retriever.k_values)
ndcg, _map, recall, precision

2024-04-04 22:11:15 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-04-04 22:11:15 - 

2024-04-04 22:11:15 - NDCG@1: 0.1400
2024-04-04 22:11:15 - NDCG@3: 0.1383
2024-04-04 22:11:15 - NDCG@5: 0.1274
2024-04-04 22:11:15 - NDCG@10: 0.1238
2024-04-04 22:11:15 - NDCG@100: 0.0605
2024-04-04 22:11:15 - NDCG@1000: 0.0456
2024-04-04 22:11:15 - 

2024-04-04 22:11:15 - MAP@1: 0.0004
2024-04-04 22:11:15 - MAP@3: 0.0008
2024-04-04 22:11:15 - MAP@5: 0.0010
2024-04-04 22:11:15 - MAP@10: 0.0015
2024-04-04 22:11:15 - MAP@100: 0.0030
2024-04-04 22:11:15 - MAP@1000: 0.0046
2024-04-04 22:11:15 - 

2024-04-04 22:11:15 - Recall@1: 0.0004
2024-04-04 22:11:15 - Recall@3: 0.0011
2024-04-04 22:11:15 - Recall@5: 0.0015
2024-04-04 22:11:15 - Recall@10: 0.0028
2024-04-04 22:11:15 - Recall@100: 0.0112
2024-04-04 22:11:15 - Recall@1000: 0.0412
2024-04-04 22:11:15 - 

2024-04-04 22:11:15 - P@1: 0.1600
2024-04-04 22:11:15

({'NDCG@1': 0.14,
  'NDCG@3': 0.13827,
  'NDCG@5': 0.12736,
  'NDCG@10': 0.12376,
  'NDCG@100': 0.06049,
  'NDCG@1000': 0.04559},
 {'MAP@1': 0.00038,
  'MAP@3': 0.00082,
  'MAP@5': 0.00105,
  'MAP@10': 0.0015,
  'MAP@100': 0.00303,
  'MAP@1000': 0.00457},
 {'Recall@1': 0.00038,
  'Recall@3': 0.00108,
  'Recall@5': 0.00147,
  'Recall@10': 0.00282,
  'Recall@100': 0.01124,
  'Recall@1000': 0.04123},
 {'P@1': 0.16,
  'P@3': 0.17333,
  'P@5': 0.148,
  'P@10': 0.142,
  'P@100': 0.0562,
  'P@1000': 0.01976})

In [13]:
mrr = retriever.evaluate_custom(qrels, results, retriever.k_values, metric="mrr")
recall_cap = retriever.evaluate_custom(qrels, results, retriever.k_values, metric="recall_cap")
hole = retriever.evaluate_custom(qrels, results, retriever.k_values, metric="hole")
top_k_accuracy = retriever.evaluate_custom(qrels, results, retriever.k_values, metric="top_k_accuracy")
mrr, recall_cap, hole, top_k_accuracy

2024-04-04 22:11:15 - 

2024-04-04 22:11:15 - MRR@1: 0.1600
2024-04-04 22:11:15 - MRR@3: 0.2567
2024-04-04 22:11:15 - MRR@5: 0.2657
2024-04-04 22:11:15 - MRR@10: 0.2893
2024-04-04 22:11:15 - MRR@100: 0.3025
2024-04-04 22:11:15 - MRR@1000: 0.3030
2024-04-04 22:11:15 - 

2024-04-04 22:11:15 - R_cap@1: 0.1600
2024-04-04 22:11:15 - R_cap@3: 0.1733
2024-04-04 22:11:15 - R_cap@5: 0.1480
2024-04-04 22:11:15 - R_cap@10: 0.1420
2024-04-04 22:11:15 - R_cap@100: 0.0562
2024-04-04 22:11:15 - R_cap@1000: 0.0414
2024-04-04 22:11:15 - 

2024-04-04 22:11:15 - Hole@1: 0.5000
2024-04-04 22:11:15 - Hole@3: 0.5200
2024-04-04 22:11:15 - Hole@5: 0.5600
2024-04-04 22:11:15 - Hole@10: 0.5920
2024-04-04 22:11:15 - Hole@100: 0.6988
2024-04-04 22:11:15 - Hole@1000: 0.7557
2024-04-04 22:11:15 - 

2024-04-04 22:11:15 - Accuracy@1: 0.1600
2024-04-04 22:11:15 - Accuracy@3: 0.3800
2024-04-04 22:11:15 - Accuracy@5: 0.4200
2024-04-04 22:11:15 - Accuracy@10: 0.6000
2024-04-04 22:11:15 - Accuracy@100: 0.8800
2024-04-04 2

({'MRR@1': 0.16,
  'MRR@3': 0.25667,
  'MRR@5': 0.26567,
  'MRR@10': 0.28929,
  'MRR@100': 0.30251,
  'MRR@1000': 0.30303},
 {'R_cap@1': 0.16,
  'R_cap@3': 0.17333,
  'R_cap@5': 0.148,
  'R_cap@10': 0.142,
  'R_cap@100': 0.0562,
  'R_cap@1000': 0.04138},
 {'Hole@1': 0.5,
  'Hole@3': 0.52,
  'Hole@5': 0.56,
  'Hole@10': 0.592,
  'Hole@100': 0.6988,
  'Hole@1000': 0.7557},
 {'Accuracy@1': 0.16,
  'Accuracy@3': 0.38,
  'Accuracy@5': 0.42,
  'Accuracy@10': 0.6,
  'Accuracy@100': 0.88,
  'Accuracy@1000': 1.0})