# Evaluation

> Evaluation: Evaluators for zeroqaret project - ColBERT

In [1]:
#| default_exp evaluation

In [2]:
#| hide
from nbdev.showdoc import *

In [1]:
#| hide
import nbdev; nbdev.nbdev_export()

## Imports

In [4]:
#| export
from loguru import logger
import os
from pathlib import Path
from fastcore.basics import patch_to, patch

from zeroqaret.helper import *
from zeroqaret.dataset import BEIRDataset, our_list as eval_list

from getpass import getpass
from typing import Union, Dict, List

from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection

import pandas as pd

  from tqdm.autonotebook import tqdm


In [5]:
# # 
# mlflow_uri = getpass("Enter mlflow uri: ")

In [6]:

# setup_mlflow(mlflow_uri, "zeroqaret")

## Get Datasets

In [7]:
# these are the list of datasets to be evaluated
eval_list 

['fiqa', 'trec-covid']

In [8]:
beir_datasets = BEIRDataset()

[32m2023-10-27 00:37:15.805[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36m__init__[0m:[36m51[0m - [1mDatasets will be saved in '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets'[0m


In [9]:
# download and store all datasets in a dictionary (datasets)
datasets = {}

for ds in eval_list:
    datasets[ds] = beir_datasets.load_dataset(ds)

[32m2023-10-27 00:37:15.822[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m62[0m - [1mDownloading dataset 'fiqa'...[0m
[32m2023-10-27 00:37:15.823[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m65[0m - [1mSaved on '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/fiqa'[0m


  0%|          | 0/57638 [00:00<?, ?it/s]

[32m2023-10-27 00:37:16.215[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m62[0m - [1mDownloading dataset 'trec-covid'...[0m
[32m2023-10-27 00:37:16.216[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m65[0m - [1mSaved on '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/trec-covid'[0m


  0%|          | 0/171332 [00:00<?, ?it/s]

Exception ignored in: <function tqdm.__del__ at 0x7fa0c2d7a0d0>
Traceback (most recent call last):
  File "/home/bengsoon/conda/envs/xcs224/lib/python3.9/site-packages/tqdm/std.py", line 1161, in __del__
    def __del__(self):
KeyboardInterrupt: 


KeyboardInterrupt: 

In [None]:
datasets.keys()

## SBERT Model

In [None]:
from beir.retrieval import models
from beir.retrieval.search.dense import DenseRetrievalExactSearch
from beir.retrieval.evaluation import EvaluateRetrieval
from beir import util
from time import time

import random

In [None]:
# Sentence-BERT checkpoint evaluated in this section.
sbert_model_name = "all-mpnet-base-v2"
sbert_model = models.SentenceBERT(model_path=sbert_model_name)
# Must be a plain int: a trailing comma here would silently bind the tuple (256,).
batch_size = 256

normalize = True

In [None]:
# Wrap the SBERT encoder in BEIR's DenseRetrievalExactSearch, then hand it to the
# evaluation helper, which is configured to score with the dot product.
sbert_model = DenseRetrievalExactSearch(
    models.SentenceBERT(sbert_model_name),
    batch_size=256,
    corpus_chunk_size=512 * 9999,
)
sbert_retriever = EvaluateRetrieval(sbert_model, score_function="dot")

In [None]:
corpus, queries, qrels = datasets["fiqa"]

In [None]:
start_time = time()
sbert_results = sbert_retriever.retrieve(corpus, queries)
end_time = time()
print("Time taken to retrieve: {:.2f} seconds".format(end_time - start_time))

> Format of `results` from `retriever.retrieve`:
``` python
    {
        str(qid) : {
            str(pid) : score
        }
    }
```

In [None]:
logger.info("Retriever evaluation for k in: {}".format(sbert_retriever.k_values))
sbert_ndcg, sbert_map, sbert_recall, sbert_precision = sbert_retriever.evaluate(qrels, sbert_results, sbert_retriever.k_values)

In [None]:
sbert_ndcg

In [None]:
sbert_map

In [None]:
sbert_recall

In [None]:
sbert_precision

In [None]:
# Additional BEIR metrics (MRR, capped recall, hole rate) for the SBERT run.
mrr = sbert_retriever.evaluate_custom(qrels, sbert_results, sbert_retriever.k_values, metric="mrr")
recall_cap = sbert_retriever.evaluate_custom(qrels, sbert_results, sbert_retriever.k_values, metric="r_cap")
hole = sbert_retriever.evaluate_custom(qrels, sbert_results, sbert_retriever.k_values, metric="hole")

In [None]:
top_k = 10

# Pick one random query from the SBERT results and order its documents by descending score.
query_id, ranking_scores = random.choice(list(sbert_results.items()))
scores_sorted = sorted(ranking_scores.items(), key=lambda item: item[1], reverse=True)
logger.info("Query : %s\n" % queries[query_id])

In [None]:
# Slicing (rather than indexing over range(top_k)) keeps this safe when the query
# returned fewer than `top_k` documents.
for rank, (doc_id, _score) in enumerate(scores_sorted[:top_k], start=1):
    # Format: Rank x: ID [Title] Body
    logger.info("Rank %d: %s [%s] - %s\n" % (rank, doc_id, corpus[doc_id].get("title"), corpus[doc_id].get("text")))

In [None]:
# NDCG@k of the SBERT run (a bare `ndcg` is only defined later, in the ColBERT section).
sbert_ndcg

In [None]:
# from beir.retrieval import models
# from beir import util
# from typing import Union, Tuple, List
# from datetime import datetime
# import torch
# import sys

In [None]:

# logger.info("Computing Document Embeddings...")
# if normalize:
#     corpus_embs = model.encode_corpus(reduced_corpus, batch_size=128, convert_to_tensor=True, normalize_embeddings=True)
# else:
#     corpus_embs = model.encode_corpus(reduced_corpus, batch_size=128, convert_to_tensor=True)

In [None]:
# class SBERTEval(models.SentenceBERT):
#     def __init__(self,
#                  model_path: Union[str, Tuple] = None, 
#                  sep: str = " ", # separator for corpus title and texts.
#                  normalize: bool = True, # if True, normalize encodings. Use dot-product if normalize, otherwise cosine-sim.
#                  encoding_batch_size: int = 128, # batch size for document embedding calculations.
#                  eval_list: List[str] = ["fiqa", "trec-covid"], # list of the names of the BEIR datasets.
#                  k_value: int = 10, # Top-k retrieval value for similarity search
                 
#                 ) -> None:
#         """ 
#         Wrapper function for models.SentenceBERT with evaluation and experimentation functionality with MLflow. 
#         Adapted from https://github.com/beir-cellar/beir/blob/main/examples/benchmarking/benchmark_sbert.py
#         """
#         super().__init__(model_path, sep)
#         self.normalize = normalize
#         self.encoding_batch_size = encoding_batch_size
#         self.k_value = k_value

#         ### LOAD Datasets ###
#         # list of evaluation datasets
#         self.eval_list = eval_list 
#         self.beir_datasets = BEIRDataset()
#         # download and store all datasets in a dictionary (datasets)
#         self.datasets = {ds: self.beir_datasets.load_dataset(ds) for ds in self.eval_list}
#         logger.info(f"BEIR datasets downloaded and loaded in self.datasets dictionary")

#     def eval_dataset(self,
#                      dataset_name: str
#                     ) -> None:

#         logger.info(f" Evaluation for '{dataset_name}' ".center(80, "#"))
                    
#         # load dataset
#         corpus, queries, qrels = self.datasets[dataset_name]
#         corpus_ids, query_ids = list(corpus), list(queries)

#         corpus = [corpus[corpus_id] for corpus_id in corpus_ids]
        
#         # compute document embeddgs
#         logger.info("Computing Document Embeddings...")
#         if self.normalize:
#             self.corpus_embs = self.encode_corpus(corpus, batch_size=self.encoding_batch_size, convert_to_tensor=True, normalize_embeddings=True)
#         else:
#             self.corpus_embs = self.encode_corpus(corpus, batch_size=self.encoding_batch_size, convert_to_tensor=True)

#         #### Saving benchmark times
#         time_taken_all = {}
        
#         for query_id in query_ids:
#             query = queries[query_id]
            
#             #### Compute query embedding and retrieve similar scores using dot-product
#             start = datetime.now()
#             if normalize:
#                 query_emb = model.encode_queries([query], batch_size=1, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=False)
#                 #### Dot product for normalized embeddings is equal to cosine similarity
#                 sim_scores = util.dot_score(query_emb, corpus_embs)
#             else:
#                 query_emb = model.encode_queries([query], batch_size=1, convert_to_tensor=True, show_progress_bar=False)
#                 #### Behind the hood, this cos_sim function will normalize the tensors first before applying dot-product 
#                 sim_scores = util.cos_sim(query_emb, corpus_embs)
            
#             #### Get top-k ranking
#             sim_scores_top_k_values, sim_scores_top_k_idx = torch.topk(sim_scores, self.k_value, dim=1, largest=True, sorted=True)
#             end = datetime.now()
            
#             #### Measuring time taken in ms (milliseconds)
#             time_taken = (end - start)
#             time_taken = time_taken.total_seconds() * 1000
#             time_taken_all[query_id] = time_taken
#             logger.info("{}: {} {:.2f}ms".format(query_id, query, time_taken))

#             time_taken = list(time_taken_all.values())
#             logger.info("Average time taken: {:.2f}ms".format(sum(time_taken)/len(time_taken_all)))
            
#             #### Measuring Index size consumed by document embeddings
#             corpus_embs = corpus_embs.cpu()
#             cpu_memory = sys.getsizeof(np.asarray([emb.numpy() for emb in corpus_embs]))
            
#             logging.info("Number of documents: {}, Dim: {}".format(len(corpus_embs), len(corpus_embs[0])))
#             logging.info("Index size (in MB): {:.2f}MB".format(cpu_memory*0.000001))

In [None]:
# sbert = SBERTEval(sbert_model_name)

## ColBERTv2 - vanilla

In [None]:
# faiss-gpu
# %conda install -c pytorch -c nvidia faiss-gpu=1.7.4 mkl=2021 blas=1.0=mkl

# torch
# %pip install torch==1.13.1 torchaudio==0.13.1 torchvision==0.14.1

# others
# %pip install bitarray datasets gitpython ninja scipy spacy tqdm transformers ujson flask python-dotenv

## git clone colbert repo into "../ColBERT"
# !cd .. && git clone https://github.com/stanford-futuredata/ColBERT.git

### Preprocessing Dataset

> The original code in `load_collection()` from _ColBERT/colbert/evaluation/loaders.py_ required monotonic `pid`, but that is not necessarily our case. We'll have to monkey patch it to pass that assertion at line 166: ```assert pid == 'id' or int(pid) == line_idx, f"pid={pid}, line_idx={line_idx}"``` 

In [None]:
dataset_name = "fiqa"
corpus, queries, qrels = beir_datasets.load_dataset(dataset_name)

# BeIR document ids are not guaranteed to be monotonic, while ColBERT works with the
# enumerated position of each passage. Keep a lookup from that position to the BeIR id:
### collection_ids = {colbert_index: beir_index}
collection_ids = dict(enumerate(corpus))

# Convert the dataset to ColBERT's on-disk format and load it back.
# Note that `queries` is rebound from the BeIR value to a ColBERT `Queries` object here.
collection_path, queries_path = beir_datasets.convert_for_colbert(dataset_name)
collection = Collection(path=collection_path)
queries = Queries(path=queries_path)

# queries_ids = list(queries)
# queries = list(queries.values())

Let's look at an example of a query

In [None]:
queries[8]

... and an example of a passage from the collection

In [None]:
print(collection[10])

### Indexing

In [None]:
nbits = 2   # encode each dimension with 2 bits
doc_maxlen = 300 # truncate passages at 300 tokens

index_name = f'{dataset_name}.{nbits}bits'

In [None]:
checkpoint = 'colbert-ir/colbertv2.0'

In [None]:
with Run().context(RunConfig(nranks=1, experiment='notebook')):  # nranks specifies the number of GPUs to use
    config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits, kmeans_niters=4) # kmeans_niters specifies the number of iterations of k-means clustering; 4 is a good and fast default.
                                                                                # Consider larger numbers for small datasets.

    indexer = Indexer(checkpoint=checkpoint, config=config)
    indexer.index(name=index_name, collection=collection, overwrite='reuse')

In [None]:
indexer.index??

In [None]:
indexer.get_index()

In [None]:
# To create the searcher using its relative name (i.e., not a full path), set
# experiment=value_used_for_indexing in the RunConfig.
with Run().context(RunConfig(experiment='notebook')):
    searcher = Searcher(index=index_name, collection=collection)

In [None]:
query = queries[8] # try with an in-range query or supply your own
print(f"#> {query}")

In [None]:
# Find the top-10 passages for this query (k=10)
results = searcher.search(query, k=10)

In [None]:
qrels['8']

In [None]:
# Print out the top-k retrieved passages
for passage_id, passage_rank, passage_score in zip(*results):
    print(f"\t [{passage_rank}] \t\t {passage_score:.1f} \t\t {searcher.collection[passage_id]}")

### Batch Search

In [None]:
rankings = searcher.search_all(queries, 5)

In [None]:
rankings.todict()

## ColBERTv2 as BeIR Retriever

In [None]:
dataset_name = "fiqa"
corpus, queries, qrels = beir_datasets.load_dataset(dataset_name)

# BeIR document ids are not guaranteed to be monotonic, while ColBERT works with the
# enumerated position of each passage. Keep a lookup from that position to the BeIR id
# (stored as a string):
### collection_ids = {colbert_index: beir_index}
collection_ids = {position: str(beir_id) for position, beir_id in enumerate(corpus)}

# Convert the dataset to ColBERT's on-disk format and load it back.
# Note that `queries` is rebound from the BeIR value to a ColBERT `Queries` object here.
collection_path, queries_path = beir_datasets.convert_for_colbert(dataset_name)
collection = Collection(path=collection_path)
queries = Queries(path=queries_path)

# queries_ids = list(queries)
# queries = list(queries.values())

In [None]:
nbits = 2   # encode each dimension with 2 bits
doc_maxlen = 300 # truncate passages at 300 tokens

index_name = f'{dataset_name}.{nbits}bits'

In [None]:
#| export
class ColBERTRetrievalSearch(Indexer):
    def __init__(self, 
                 checkpoint: str, # ColBERT checkpoint
                 index_name: str, # name of the index
                 experiment_name: str, # name of experiment
                 collection: "Collection", # collection object in Collection format
                 collection_ids: Dict, # {colbert_index: beir_pid}
                 doc_maxlen: int, # passed to ColBERTConfig as `doc_maxlen`
                 nbits: int, # passed to ColBERTConfig as `nbits`
                 kmeans: int = 4, # passed to ColBERTConfig as `kmeans_niters`
                 overwrite_param: Union[bool, str] = 'reuse', # forwarded to `Indexer.index(overwrite=...)`
                 **kwargs):
        """
        Retrieval Search wrapper for ColBERTv2, adapted from BeIR's `DenseRetrievalExactSearch`
         (https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/search/dense/exact_search.py#L12).

        The difference to BeIR's implementation is that indexing happens at initialization:
            `collection` is indexed under `index_name` inside the `experiment_name` run context,
            and a `Searcher` over that index is stored in `self.searcher`.

        `overwrite_param` is handed to `Indexer.index` unchanged; with the default `'reuse'` an
            existing index of the same name is presumably reused instead of rebuilt
            (see ColBERT's `Indexer.index` to confirm).

        Extra keyword arguments are accepted and ignored.
        """
        self.checkpoint = checkpoint
        self.index_name = index_name
        self.collection = collection
        self.collection_ids = collection_ids
        self.experiment_name = experiment_name
        self.doc_maxlen = doc_maxlen
        self.nbits = nbits
        self.kmeans = kmeans
        self.overwrite_param = overwrite_param
        
        with Run().context(RunConfig(nranks=1, experiment=experiment_name)):  # nranks specifies the number of GPUs to use
            config = ColBERTConfig(doc_maxlen=self.doc_maxlen, nbits=self.nbits, kmeans_niters=self.kmeans) # kmeans_niters specifies the number of iterations of k-means clustering; 4 is a good and fast default.
                                                                                        # Consider larger numbers for small datasets.
        
            super().__init__(checkpoint=self.checkpoint, config=config)
            self.index(name=self.index_name, collection=self.collection, overwrite=self.overwrite_param)
            
            # Created inside the same run context so the index can be addressed by its relative name.
            self.searcher = Searcher(index=self.index_name, collection=self.collection)

    def search(self,
               corpus: "Collection" = None, # unused (the collection was indexed at init); kept for EvaluateRetrieval compatibility
               queries: "Queries" = None, # queries in Queries format
               k: int = 10, # top-K value
               score_function = None, # redundant; here to make it compatible with function call from EvaluateRetrieval
               filter_fn = None, # forwarded to `Searcher.search_all`
               full_length_search: bool = False, # forwarded to `Searcher.search_all`
               **kwargs,
              ) -> Dict[str, Dict[str, float]]:
        """
        Run every query against the pre-built index and return the rankings as
        `{str(qid): {str(beir_pid): score}}`.

        ColBERT passage indices are mapped back to BeIR ids through `self.collection_ids`.
        The mapping is also stored in `self.results`.

        Raises `ValueError` if `queries` is not provided, and `KeyError` if a retrieved
        passage index is missing from `self.collection_ids`.
        """
        if queries is None:
            raise ValueError("`queries` must be provided to run a search.")

        rankings = self.searcher.search_all(queries, k, filter_fn, full_length_search)

        # Each ranking entry is a (passage index, rank, score) triple; the rank is implied by the score.
        # Ids are coerced to `str` and scores to `float` to honour the declared return type.
        self.results = {
            str(qid): {
                str(self.collection_ids[pid]): float(score)
                for pid, _rank, score in passages
            }
            for qid, passages in rankings.items()
        }

        return self.results

In [None]:
model = ColBERTRetrievalSearch(checkpoint, 
                                   index_name, 
                                   experiment_name="ColBERTRetrievalSearch_test", 
                                   collection=collection, 
                                   collection_ids=collection_ids,
                                   doc_maxlen=doc_maxlen, 
                                   nbits=nbits, 
                                   overwrite_param="reuse")

In [None]:
retriever = EvaluateRetrieval(model)

In [None]:
results = retriever.retrieve(collection, queries)

In [None]:
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)

In [None]:
ndcg

In [None]:
_map

In [None]:
recall

In [None]:
precision

## Results Collector

In [None]:
test_df = pd.DataFrame()
test_df["test"] = pd.Series(ndcg)

In [None]:
test_df

In [None]:
#| export
class ResultsCollector:
    """ Collect results from Retrieval Evaluation 

    Four DataFrames are kept (`ndcg`, `map`, `recall`, `precision`), each with one column
    per evaluated model and one row per metric name returned by the retriever.
    """
    def __init__(self):
        # Create the tables up front so they can be read before the first `evaluate` call.
        self.ndcg = pd.DataFrame()
        self.map = pd.DataFrame()
        self.recall = pd.DataFrame()
        self.precision = pd.DataFrame()

    @staticmethod
    def _with_column(frame, model_name, scores):
        """ Return `frame` with `scores` ({metric_name: value}) stored under column `model_name` """
        column = pd.Series(scores)
        if len(frame.columns) > 0:
            # Column assignment aligns on the existing index and would silently drop
            # metric rows that earlier models did not report, so add those rows first.
            new_rows = column.index.difference(frame.index, sort=False)
            if len(new_rows) > 0:
                frame = frame.reindex(frame.index.append(new_rows))
        frame[model_name] = column
        return frame

    def evaluate(self,
                 model_name, # column name under which the scores are stored
                 retriever, # object exposing `k_values` and `evaluate(qrels, results, k_values)`
                 qrels, # relevance judgements, passed through to `retriever.evaluate`
                 results): # retrieval results, passed through to `retriever.evaluate`
        """
        Evaluate `results` against `qrels` with `retriever` and record the four metric
        dictionaries under `model_name`. Evaluating the same `model_name` again overwrites
        its column. Returns None.
        """
        ndcg, map_scores, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)

        self.ndcg = self._with_column(self.ndcg, model_name, ndcg)
        self.map = self._with_column(self.map, model_name, map_scores)
        self.recall = self._with_column(self.recall, model_name, recall)
        self.precision = self._with_column(self.precision, model_name, precision)
            

In [None]:
results_collector = ResultsCollector()

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()