# Evaluation

> Evaluation: Evaluators for zeroqaret project - ColBERT

In [None]:
#| default_exp evaluation

In [1]:
#| hide
from nbdev.showdoc import *

In [2]:
#| hide
import nbdev; nbdev.nbdev_export()

## Imports

In [61]:
from zeroqaret.dataset import BEIRDataset

In [1]:
#| export
from loguru import logger
import os
from pathlib import Path
from fastcore.basics import patch_to, patch

from zeroqaret.helper import create_header
from zeroqaret.dataset import BEIRDataset

from getpass import getpass
from typing import Union, Dict, List

import pandas as pd
from tqdm import tqdm

  from tqdm.autonotebook import tqdm


In [78]:
from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Trainer
from colbert.data import Queries, Collection

In [2]:
from beir.retrieval.models import SentenceBERT
from beir.retrieval.search.dense import DenseRetrievalExactSearch
from beir.retrieval.evaluation import EvaluateRetrieval
from beir import util
from time import time

import random

In [3]:
from beir import util
from typing import Union, Tuple, List
from datetime import datetime
import torch
import numpy as np
import sys

In [4]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import util as sbert_util

## Get Datasets

In [5]:
beir_datasets = BEIRDataset()

[32m2023-10-31 12:41:15.471[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36m__init__[0m:[36m51[0m - [1mDatasets will be saved in '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets'[0m


## Results Collector

In [6]:
#| export
class ResultsCollector:
    """ Collect results from Retrieval Evaluation for single dataset."""
    def __init__(self,
                 model_path: str = None,
                 dataset_name: str = None,
                 split: str = "test",
                 ):
        self.model_path = model_path
        self.dataset_name = dataset_name
        self.split = split
        logger.info("ResultsCollector object initialized.")
        
    def collect(self,
                experiment_name, 
                retriever,
                results,
                results_time):
        
        ndcg, map, recall, precision = retriever.evaluate(retriever.qrels, results, retriever.k_values)
        
        if not hasattr(self, "ndcg"): self.ndcg = pd.DataFrame()
        if not hasattr(self, "map"): self.map = pd.DataFrame()
        if not hasattr(self, "recall"): self.recall = pd.DataFrame()
        if not hasattr(self, "precision"): self.precision = pd.DataFrame()
        if not hasattr(self, "time"): self.time = pd.DataFrame()
        
        self.ndcg[experiment_name] = pd.Series(ndcg)
        self.map[experiment_name] = pd.Series(map)
        self.recall[experiment_name] = pd.Series(recall)
        self.precision[experiment_name] = pd.Series(precision)
        self.time[experiment_name] = pd.Series(results_time)

    @property
    def all(self):
        metrics = ["ndcg", "map", "recall", "precision", "time"]
        total_df = pd.DataFrame()
        for attr in self.__dir__():
            if attr in metrics:
                total_df = pd.concat((total_df, getattr(self, attr)))
        return total_df

In [320]:
@patch_to(ResultsCollector)
def save_as_csv(self,
                file_path: str,
                table: str
               ):
    df = getattr(self, table)
    df.to_csv(file_path, index = False)
    logger.info(f"Table '{table}' saved as '{file_path}'.")

## SBERT Model

In [None]:
sbert_model_name = "all-mpnet-base-v2"
sbert_model = models.SentenceBERT(model_path=sbert_model_name)
batch_size = 256,

normalize = True

In [None]:
sbert_model = DenseRetrievalExactSearch(models.SentenceBERT(sbert_model_name, ), batch_size = 256, corpus_chunk_size=512*9999)
sbert_retriever = EvaluateRetrieval(sbert_model, score_function="dot")

In [None]:
corpus, queries, qrels = beir_datasets.load_dataset("scifact")

In [None]:
start_time = time()
sbert_results = sbert_retriever.retrieve(corpus, queries)
end_time = time()
print("Time taken to retrieve: {:.2f} seconds".format(end_time - start_time))

> Format of `results` from `retriever.retrieve`:
``` python
    {
        str(qid) : {
            str(pid) : score
        }
    }
```

In [None]:
logger.info("Retriever evaluation for k in: {}".format(sbert_retriever.k_values))
sbert_ndcg, sbert_map, sbert_recall, sbert_precision = sbert_retriever.evaluate(qrels, sbert_results, sbert_retriever.k_values)

In [None]:
mrr = sbert_retriever.evaluate_custom(qrels, sbert_results, sbert_retriever.k_values, metric="mrr")
recall_cap = sbert_retriever.evaluate_custom(qrels, sbert_results, sbert_retriever.k_values, metric="r_cap")
hole = sbert_retriever.evaluate_custom(qrels, sbert_results, sbert_retriever.k_values, metric="hole")

In [None]:
top_k = 10

query_id, ranking_scores = random.choice(list(sbert_results.items()))
scores_sorted = sorted(ranking_scores.items(), key=lambda item: item[1], reverse=True)
logger.info("Query : %s\n" % queries[query_id])

In [None]:
for rank in range(top_k):
    doc_id = scores_sorted[rank][0]
    # Format: Rank x: ID [Title] Body
    logger.info("Rank %d: %s [%s] - %s\n" % (rank+1, doc_id, corpus[doc_id].get("title"), corpus[doc_id].get("text")))

In [None]:
# from beir.retrieval import models
# from beir import util
# from typing import Union, Tuple, List
# from datetime import datetime
# import torch
# import sys

In [None]:

# logger.info("Computing Document Embeddings...")
# if normalize:
#     corpus_embs = model.encode_corpus(reduced_corpus, batch_size=128, convert_to_tensor=True, normalize_embeddings=True)
# else:
#     corpus_embs = model.encode_corpus(reduced_corpus, batch_size=128, convert_to_tensor=True)

## SBERTEval

In [7]:
class SBERTEval(EvaluateRetrieval):
    def __init__(self,
                 model_path: Union[str, Tuple] = None,      
                 normalize: bool = True, # if True, normalize encodings. Use dot-product if normalize, otherwise cosine-sim.
                 encoding_batch_size: int = 128, # batch size for document embedding calculations.
                 k_values: List[int] = [1,3,5,10,100,1000], # Top-k retrieval values for similarity search
                 
                ) -> None:
        """ 
        Wrapper function for models.SentenceBERT with evaluation and experimentation functionality with MLflow. 
        Adapted from https://github.com/beir-cellar/beir/blob/main/examples/benchmarking/benchmark_sbert.py
        """
        self.model_path = model_path
        self.normalize = normalize
        self.encoding_batch_size = encoding_batch_size
        self.k_values = k_values
        self.top_k = max(k_values)

        ### SBERT model ###
        self.model = SentenceBERT(self.model_path)
                
        ### initialize EvaluateRetrieval
        super().__init__(self.model)

        ### BEIRDatasets class ###
        self.beir_datasets = BEIRDataset()



In [8]:
@patch_to(SBERTEval)
def compute_corpus_embeddings(self, corpus):
    if self.normalize:
        return self.model.encode_corpus(corpus, batch_size=self.encoding_batch_size, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=True)
    else:
        return self.model.encode_corpus(corpus, batch_size=self.encoding_batch_size, convert_to_tensor=True, show_progress_bar=True)

In [9]:
@patch_to(SBERTEval)
def search_queries(self,
                   queries: Union[str, List], # single query or batch queries
                   top_k: int
                  ) -> (List[List[int]], List[List[float]]) :

    """
    Performs cosine similarity calculation between query and document embeddings.
    Returns (List[list of top-k docs indices for each query], List[similarity score for each query])  
    """
    
    if isinstance(queries, str):
        queries = [queries]
    
    if self.normalize:
        queries_emb = self.model.encode_queries(queries, batch_size=1, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=False)
        #### Dot product for normalized embeddings is equal to cosine similarity
        sim_scores = util.dot_score(queries_emb.to("cuda"), self.doc_embeddings.to("cuda"))
    else:
        queries_emb = self.model.encode_queries(queries, batch_size=1, convert_to_tensor=True, show_progress_bar=False)
        #### Behind the hood, this cos_sim function will normalize the tensors first before applying dot-product 
        sim_scores = util.cos_sim(queries_emb.to("cuda"), self.doc_embeddings.to("cuda"))
    
    #### Get top-k ranking
    sim_scores[torch.isnan(sim_scores)] = -1
    sim_scores_top_k_values, sim_scores_top_k_idx = torch.topk(sim_scores, top_k, dim=1, largest=True, sorted=True)
    
    sim_scores_top_k_values = sim_scores_top_k_values.cpu().tolist()
    sim_scores_top_k_idx = sim_scores_top_k_idx.cpu().tolist()
    
    return (sim_scores_top_k_idx, sim_scores_top_k_values)

In [10]:
@patch_to(SBERTEval)
def beir_retrieval(self,
                   dataset_name: str, # beir dataset name
                   split: str = 'test', # split name
                ) -> (Dict[str, Dict[str, float]], float):
    """ Retrieval for BeIR dataset. 
    
    Returns tuple of
     - retrieval results `{str(qid) : {str(pid) : float(score)}}`
     - time: `{"Average Query Time (ms/it)": float, "Total Query Time (s)": float, "Total Document Embedding Time (s)": float}

     """
    logger.info(create_header(f" Evaluation for '{dataset_name}' "))
    
    # load dataset
    self.raw_corpus, self.queries, self.qrels = self.beir_datasets.load_dataset(dataset_name, split)
    self.corpus_ids, self.query_ids = list(self.raw_corpus), list(self.queries)

    self.corpus = [self.raw_corpus[corpus_id] for corpus_id in self.corpus_ids]

    logger.info(f"Pre-computing Document Embeddings for '{dataset_name}' dataset...")
    #### Measuring Index size consumed by document embeddings
    start = datetime.now()
    self.doc_embeddings = self.compute_corpus_embeddings(self.corpus)
    self.doc_embeddings = self.doc_embeddings.cpu()
    end = datetime.now()
    total_doc_emb_time = (end - start)
    total_doc_emb_time = total_doc_emb_time.total_seconds()
    
    cpu_memory = sys.getsizeof(np.asarray([emb.numpy() for emb in self.doc_embeddings]))
    
    logger.info("Number of documents: {}, Dim: {}".format(len(self.doc_embeddings), len(self.doc_embeddings[0])))
    logger.info("Index size (in MB): {:.2f}MB".format(cpu_memory*0.000001))
    logger.info(f"Time taken for pre-computing corpus embedding: {total_doc_emb_time:.2f} s")
    logger.info("Pre-computing of Document Embeddings done.\n\n")
    
    #### Query benchmarking evaluation
    logger.info(f"Starting query benchmark evaluation ...")
    time_taken_all = {}
    beir_results = {}
    for query_id in tqdm(self.query_ids):
        query = self.queries[query_id]
        
        #### Compute query embedding and retrieve similar scores using dot-product
        start = datetime.now()
        query_search_top_idx, query_search_score = self.search_queries(query, self.top_k)
        end = datetime.now()
        
        #### Measuring time taken in ms (milliseconds)
        time_taken = (end - start)
        time_taken = time_taken.total_seconds() * 1000
        time_taken_all[query_id] = time_taken
        # logger.info("{}: {} {:.2f}ms".format(query_id, query, time_taken))

        # append to search_results {str(qid) : {str(pid) : score}}
        beir_results[str(query_id)] = {str(self.corpus_ids[id]): score for id, score in zip(query_search_top_idx[0], query_search_score[0])}

    total_query_time = sum(list(time_taken_all.values())) # in ms
    average_query_time = total_query_time/len(time_taken_all) # ms/it
    logger.info("Average time taken: {:.2f} ms / query".format(average_query_time))
    logger.info("Total time taken: {:.2f} s".format(total_query_time))

    time = {}
    time["Average Query Time (ms/it)"] = average_query_time
    time["Total Query Time (s)"] = total_query_time / 1000 # in seconds
    time["Total Document Embedding Time (s)"] = total_doc_emb_time

    return (beir_results, time)

## ColBERTv2 as BeIR Retriever

In [45]:
#| export
class ColBERTRetrievalSearch(Indexer):
    def __init__(self, 
                 checkpoint: str, # ColBERT checkpoint
                 index_name: str, # name of the index
                 experiment_name: str, # name of experiment
                 collection: "Collection", # collection object in Collection format
                 collection_ids: Dict, # {colbert_index: beir_pid}
                 doc_maxlen: int,
                 nbits: int,
                 kmeans: int = 4,
                 overwrite_param: Union[bool, str] = 'reuse',
                 **kwargs):
        """
        Retrieval Search wrapper for ColBERTv2, adapted from BeIR's `DenseRetrievalExactSearch`
         (https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/search/dense/exact_search.py#L12).

        The difference to BeIR's implementation is that if `corpus` and `corpus_ids` are passed at initialization stage, 
            it will pre-compute document encodings and store it. 

        If `index_name` and `overwrite = 'reuse'        
        """
        self.checkpoint = checkpoint
        self.index_name = index_name
        self.collection = collection
        self.collection_ids = collection_ids
        self.experiment_name = experiment_name
        self.doc_maxlen = doc_maxlen
        self.nbits = nbits
        self.kmeans = kmeans
        self.overwrite_param = overwrite_param
        
        with Run().context(RunConfig(nranks=1, experiment=experiment_name)):  # nranks specifies the number of GPUs to use
            config = ColBERTConfig(doc_maxlen=self.doc_maxlen, nbits=self.nbits, kmeans_niters=self.kmeans) # kmeans_niters specifies the number of iterations of k-means clustering; 4 is a good and fast default.
                                                                                        # Consider larger numbers for small datasets.
        
            super().__init__(checkpoint=self.checkpoint, config=config)
            self.index(name=self.index_name, collection=self.collection, overwrite=self.overwrite_param)
            
            self.searcher = Searcher(index=self.index_name, collection=self.collection)

    def search(self,
               corpus: "Collection" = None, # corpus in Collection format
               queries: "Queries" = None, # queries in Queries format
               k: int = 10, # top-K value
               score_function = None, # redundant; here to make it compatible with function call from EvaluateRetrieval
               filter_fn = None,              
               full_length_search: bool = False,
               **kwargs,
              ) -> Dict[str, Dict[str, float]]:

        res = self.searcher.search_all(queries, k, filter_fn, full_length_search)
        self.results = {}
        for qid, doc_res in res.items():
            doc_res = {self.collection_ids[cid] : score for cid, rank, score in doc_res}
            self.results[str(qid)] = doc_res

        return self.results

## `SciFact`

In [11]:
dataset_name = "scifact"

In [12]:
scifact_results = ResultsCollector(dataset_name=dataset_name)

[32m2023-10-31 12:41:22.562[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m12[0m - [1mResultsCollector object initialized.[0m


### SBERT Baseline

We will use `multi-qa-MiniLM-L6-cos-v1` as our baseline;

_This model was tuned for semantic search: Given a query/question, if can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs._ [Ref](https://www.sbert.net/docs/pretrained_models.html#:~:text=Model%20Overview,-The%20following%20table&text=The%20all%2Dmpnet%2Dbase%2D,all%20existing%20sentence%2Dtransformers%20models.)

In [13]:
baseline_model = "multi-qa-MiniLM-L6-cos-v1"

In [14]:
baseline_retriever = SBERTEval(model_path = baseline_model)

[32m2023-10-31 12:41:27.090[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36m__init__[0m:[36m51[0m - [1mDatasets will be saved in '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets'[0m


In [15]:
results, results_time = baseline_retriever.beir_retrieval(dataset_name)

[32m2023-10-31 12:41:32.009[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m13[0m - [1m
   
****************************************************************************************************   
******                                                                                        ******   
*                                     Evaluation for 'scifact'                                     *   
******                                                                                        ******   
****************************************************************************************************
[0m
[32m2023-10-31 12:41:32.012[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m62[0m - [1mDownloading dataset 'scifact'...[0m
[32m2023-10-31 12:41:32.012[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m65[0m - [1mSaved on '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/scifact'[0

  0%|          | 0/5183 [00:00<?, ?it/s]

[32m2023-10-31 12:41:32.095[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m21[0m - [1mPre-computing Document Embeddings for 'scifact' dataset...[0m


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

[32m2023-10-31 12:41:44.262[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m32[0m - [1mNumber of documents: 5183, Dim: 384[0m
[32m2023-10-31 12:41:44.264[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m33[0m - [1mIndex size (in MB): 7.96MB[0m
[32m2023-10-31 12:41:44.265[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m34[0m - [1mTime taken for pre-computing corpus embedding: 12.15 s[0m
[32m2023-10-31 12:41:44.266[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m35[0m - [1mPre-computing of Document Embeddings done.

[0m
[32m2023-10-31 12:41:44.267[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m38[0m - [1mStarting query benchmark evaluation ...[0m
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:02<00:00, 105.73it/s]
[32m2023-10-31 12:41:47.109[0m | [1mINFO    [0m | [

In [16]:
scifact_results.collect(experiment_name="Baseline SBERT", retriever=baseline_retriever, results=results, results_time=results_time)

In [17]:
scifact_results.all

Unnamed: 0,Baseline SBERT
NDCG@1,0.43667
NDCG@3,0.50444
NDCG@5,0.52354
NDCG@10,0.54029
NDCG@100,0.58909
NDCG@1000,0.60131
MAP@1,0.41028
MAP@3,0.47785
MAP@5,0.49084
MAP@10,0.49919


### Finetuning SBERT
We will finetune `multi-qa-MiniLM-L6-cos-v1` with the generated Title, Questions

https://github.com/UKPLab/sentence-transformers/blob/master/examples/unsupervised_learning/query_generation/2_programming_train_bi-encoder.py

In [18]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets

In [19]:
gen_scifact_df = pd.read_csv("../datasets/scifact/qg/scifact_qg_all.csv", index_col=0)
pids = gen_scifact_df["pid"].tolist()
passages = gen_scifact_df["passage"].tolist()
titles = gen_scifact_df["title"].tolist()
questions = gen_scifact_df["question"].tolist()

In [20]:
corpus, queries, qrels = beir_datasets.load_dataset(dataset_name)
corpus_ids, query_ids = list(corpus), list(queries)

[32m2023-10-31 12:41:50.769[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m62[0m - [1mDownloading dataset 'scifact'...[0m
[32m2023-10-31 12:41:50.772[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m65[0m - [1mSaved on '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/scifact'[0m


  0%|          | 0/5183 [00:00<?, ?it/s]

In [21]:
train_examples = []
for p, t, q in zip(passages, titles, questions):
    anchor = str(t) + " - " + str(q)
    train_examples.append(InputExample(texts=[anchor, p]))

In [22]:
train_examples[158].texts

['Mutations in CTLA-4 Cause Complex Immune Dysregulation Syndrome - What is the syndrome associated with mutations in CTLA-4?',
 'Autosomal dominant immune dysregulation syndrome in humans with CTLA4 mutations The protein cytotoxic T lymphocyte antigen-4 (CTLA-4) is an essential negative regulator of immune responses, and its loss causes fatal autoimmunity in mice. We studied a large family in which five individuals presented with a complex, autosomal dominant immune dysregulation syndrome characterized by hypogammaglobulinemia, recurrent infections and multiple autoimmune clinical features. We identified a heterozygous nonsense mutation in exon 1 of CTLA4. Screening of 71 unrelated patients with comparable clinical phenotypes identified five additional families (nine individuals) with previously undescribed splice site and missense mutations in CTLA4. Clinical penetrance was incomplete (eight adults of a total of 19 genetically proven CTLA4 mutation carriers were considered unaffected

In [23]:
train_dataloader = datasets.NoDuplicatesDataLoader(train_examples, batch_size=8)

In [24]:
ft_model = SentenceTransformer(baseline_model)

In [25]:
train_loss = losses.MultipleNegativesRankingLoss(ft_model)

In [26]:
# Tune the model
model_path = f"../models/{baseline_model.replace('-', '_')}_ft"
num_epochs = 3
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)
ft_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_epochs, warmup_steps=warmup_steps, show_progress_bar=True, checkpoint_path=model_path)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/647 [00:00<?, ?it/s]

Iteration:   0%|          | 0/647 [00:00<?, ?it/s]

Iteration:   0%|          | 0/647 [00:00<?, ?it/s]

In [29]:
ft_model.save(model_path)

In [31]:
ft_retriever = SBERTEval(model_path, normalize=False)

[32m2023-10-31 12:45:41.498[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36m__init__[0m:[36m51[0m - [1mDatasets will be saved in '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets'[0m


In [32]:
ft_results, ft_time = ft_retriever.beir_retrieval(dataset_name)

[32m2023-10-31 12:45:43.454[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m13[0m - [1m
   
****************************************************************************************************   
******                                                                                        ******   
*                                     Evaluation for 'scifact'                                     *   
******                                                                                        ******   
****************************************************************************************************
[0m
[32m2023-10-31 12:45:43.456[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m62[0m - [1mDownloading dataset 'scifact'...[0m
[32m2023-10-31 12:45:43.457[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m65[0m - [1mSaved on '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/scifact'[0

  0%|          | 0/5183 [00:00<?, ?it/s]

[32m2023-10-31 12:45:43.528[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m21[0m - [1mPre-computing Document Embeddings for 'scifact' dataset...[0m


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

[32m2023-10-31 12:45:53.553[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m32[0m - [1mNumber of documents: 5183, Dim: 384[0m
[32m2023-10-31 12:45:53.554[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m33[0m - [1mIndex size (in MB): 7.96MB[0m
[32m2023-10-31 12:45:53.555[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m34[0m - [1mTime taken for pre-computing corpus embedding: 10.01 s[0m
[32m2023-10-31 12:45:53.555[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m35[0m - [1mPre-computing of Document Embeddings done.

[0m
[32m2023-10-31 12:45:53.556[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m38[0m - [1mStarting query benchmark evaluation ...[0m
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:02<00:00, 111.92it/s]
[32m2023-10-31 12:45:56.240[0m | [1mINFO    [0m | [

In [33]:
scifact_results.collect(f"{baseline_model}_ft", ft_retriever, ft_results, ft_time)

In [35]:
scifact_results.all

Unnamed: 0,Baseline SBERT,multi-qa-MiniLM-L6-cos-v1_ft
NDCG@1,0.43667,0.43667
NDCG@3,0.50444,0.50672
NDCG@5,0.52354,0.53238
NDCG@10,0.54029,0.55935
NDCG@100,0.58909,0.59926
NDCG@1000,0.60131,0.61148
MAP@1,0.41028,0.41667
MAP@3,0.47785,0.48263
MAP@5,0.49084,0.49939
MAP@10,0.49919,0.5116


In [None]:
# faiss-gpu
# %conda install -c pytorch -c nvidia faiss-gpu=1.7.4 mkl=2021 blas=1.0=mkl

# torch
# %pip install torch=1.13.1 torchaudio==0.13.1 torchvision==0.14.1

# others
# %pip install bitarray datasets gitpython ninja scipy spacy tqdm transformers ujson flask python-dotenv

## git clone colbert repo into "../ColBERT"
# !cd .. && git clone https://github.com/stanford-futuredata/ColBERT.git

### ColBERTv2 Baseline

In [43]:
dataset_name = "scifact"
corpus, queries, qrels = beir_datasets.load_dataset(dataset_name)

# The indices in BeIR datasets may not be monotic, 
### so we will need a dictionary with enumerated indices (which is used in ColBERT) as keys and BeIR index as values
### collection_ids = {colbert_index: beir_index}
collection_ids = {idx: str(val) for idx, val in enumerate(list(corpus))}

# Load datasets for ColBERT
collection_path, queries_path = beir_datasets.convert_for_colbert(dataset_name)
collection, queries = Collection(path=collection_path), Queries(path=queries_path)

# queries_ids = list(queries)
# queries = list(queries.values())

checkpoint = 'colbert-ir/colbertv2.0'

  0%|          | 0/5183 [00:00<?, ?it/s]

  0%|          | 0/5183 [00:00<?, ?it/s]

[32m2023-10-31 16:44:05.532[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mconvert_for_colbert[0m:[36m98[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/scifact/colbert/scifact_collection.tsv ...[0m
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 5183/5183 [00:00<00:00, 33566.24it/s]
[32m2023-10-31 16:44:05.694[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mconvert_for_colbert[0m:[36m105[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/scifact/colbert/scifact_queries.tsv ...[0m
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:00<00:00, 315598.50it/s]

[Oct 31, 16:44:05] #> Loading collection...
0M 
[Oct 31, 16:44:05] #> Loading the queries from /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/scifact/colbert/scifact_queries.tsv ...
[Oct 31, 16:44:05] #> Got 300 queries. All QIDs are unique.






In [44]:
nbits = 2   # encode each dimension with 2 bits
doc_maxlen = 300 # truncate passages at 300 tokens

index_name = f'{dataset_name}.{nbits}bits'

In [46]:
model = ColBERTRetrievalSearch(checkpoint, 
                                   index_name, 
                                   experiment_name="ColBERTRetrievalSearch_test", 
                                   collection=collection, 
                                   collection_ids=collection_ids,
                                   doc_maxlen=doc_maxlen, 
                                   nbits=nbits, 
                                   overwrite_param="reuse")



[Oct 31, 16:44:09] #> Creating directory /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/ColBERTRetrievalSearch_test/indexes/scifact.2bits 


#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 2,
    "kmeans_niters": 4,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 3e-6,
    "maxsteps": 500000,
    "save_every": null,
    "warmup": null,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": false,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen":

0it [00:00, ?it/s]

[Oct 31, 16:44:44] [0] 		 #> Saving chunk 0: 	 5,183 passages and 1,229,853 embeddings. From #0 onward.


1it [00:11, 11.85s/it]
100%|██████████| 1/1 [00:00<00:00, 433.34it/s]
  0%|          | 0/16384 [00:00<?, ?it/s]

[Oct 31, 16:44:44] [0] 		 #> Checking all files were saved...
[Oct 31, 16:44:44] [0] 		 Found all files!
[Oct 31, 16:44:44] [0] 		 #> Building IVF...
[Oct 31, 16:44:44] [0] 		 #> Loading codes...
[Oct 31, 16:44:44] [0] 		 Sorting codes...
[Oct 31, 16:44:44] [0] 		 Getting unique codes...
[Oct 31, 16:44:44] #> Optimizing IVF to store map from centroids to list of pids..
[Oct 31, 16:44:44] #> Building the emb2pid mapping..
[Oct 31, 16:44:44] len(emb2pid) = 1229853


100%|██████████| 16384/16384 [00:00<00:00, 72281.28it/s]


[Oct 31, 16:44:44] #> Saved optimized IVF to /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/ColBERTRetrievalSearch_test/indexes/scifact.2bits/ivf.pid.pt
[Oct 31, 16:44:44] [0] 		 #> Saving the indexing metadata to /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/ColBERTRetrievalSearch_test/indexes/scifact.2bits/metadata.json ..
#> Joined...
[Oct 31, 16:44:50] #> Loading codec...
[Oct 31, 16:44:50] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Oct 31, 16:44:50] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Oct 31, 16:44:51] #> Loading IVF...
[Oct 31, 16:44:51] #> Loading doclens...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1502.26it/s]

[Oct 31, 16:44:51] #> Loading codes and residuals...



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 45.36it/s]


In [47]:
retriever = EvaluateRetrieval(model)

In [48]:
results = retriever.retrieve(collection, queries)

300it [00:03, 91.29it/s]


In [53]:
retriever.qrels = qrels

In [54]:
scifact_results.collect("ColBERTv2 Baseline", retriever, results, {'Average Query Time (ms/it)': 10.95, 'Total Query Time (s)': 3.0, 'Total Document Embedding Time (s)': None})


In [56]:
scifact_results.all

Unnamed: 0,Baseline SBERT,multi-qa-MiniLM-L6-cos-v1_ft,ColBERTv2 Baseline
NDCG@1,0.43667,0.43667,0.58667
NDCG@3,0.50444,0.50672,0.65424
NDCG@5,0.52354,0.53238,0.6728
NDCG@10,0.54029,0.55935,0.69195
NDCG@100,0.58909,0.59926,0.7165
NDCG@1000,0.60131,0.61148,0.72385
MAP@1,0.41028,0.41667,0.55717
MAP@3,0.47785,0.48263,0.62802
MAP@5,0.49084,0.49939,0.64135
MAP@10,0.49919,0.5116,0.65074


### Finetune ColBERTv2

In [267]:
def finetune_colbert(experiment_name: str, 
                     csv_file: str, # containing at least 'pid', 'passage' and 'question' columns
                     mode: str = "w", # overwrite the training files (triples.jsonl, {queries,collection}.tsv) previously generated
                     replace: bool = False, # if True, will throw error if training files already exists
                     nranks: int = 1, # number of GPUs
                     bsize: int = 32, # batch size
                     lr: float = 1e-05, # learning rate
                     doc_maxlen: int = 300, # max length for document
                     dim: int = 128, # dimension
                     accumsteps: int = 2, 
                     use_ib_negatives: bool = False, 
                     checkpoint: str = "colbert-ir/colbertv2.0", # finetuning from colbertv2.0
                     root_path: str = "../models/", # we will save checkpoints to "../models/{experiment_name}"                     
                    ) -> str:
    """
    Finetunes colbert model from `checkpoint` with data from `csv_file`

    Returns best `checkpoint_path`
    """
    
    beir_dataset = BEIRDataset()

        
    triples_path, queries_path, collection_path = beir_datasets.prepare_qg_for_colbert_training(csv_file, mode=mode, replace=replace)  
    
    with Run().context(RunConfig(nranks=nranks, experiment=experiment_name)):

        config = ColBERTConfig(
            checkpoint = checkpoint,
            bsize=bsize,
            experiment=experiment_name,
            root=f"{root_path}",
        )
    # config = ColBERTConfig(checkpoint=checkpoint, bsize=bsize, lr=lr, warmup=None, nway=0, doc_maxlen=doc_maxlen, dim=dim, accumsteps=accumsteps, use_ib_negatives=use_ib_negatives)
        
    trainer = Trainer(
        triples=triples_path,
        queries=queries_path,
        collection=collection_path,
        config=config,
    )

    trainer.train(checkpoint=checkpoint)

In [268]:
finetune_colbert(experiment_name="scifact_colbertv2_finetuned",
                          csv_file="../datasets/scifact/qg/scifact_qg_all.csv")

[32m2023-11-01 12:46:41.433[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36m__init__[0m:[36m51[0m - [1mDatasets will be saved in '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets'[0m
[32m2023-11-01 12:46:41.436[0m | [1mINFO    [0m | [36m__main__[0m:[36mprepare_qg_for_colbert_training[0m:[36m29[0m - [1mCreating ColBERT training files from ../datasets/scifact/qg/colbert_training...[0m
Training files: : 5183it [00:00, 22564.51it/s]
[32m2023-11-01 12:46:41.777[0m | [1mINFO    [0m | [36m__main__[0m:[36mprepare_qg_for_colbert_training[0m:[36m52[0m - [1mtriples.jsonl, queries,tsv and collection.tsv files created in ../datasets/scifact/qg/colbert_training.[0m


#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 1,
    "kmeans_niters": 4,
    "resume": false,
    "similarity": "cosine",
    "bsize": 32,
    "accumsteps": 1,
    "lr": 3e-6,
    "maxsteps": 500000,
    "save_every": null,
    "warmup": null,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": false,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 220,
    "mask_punctuation": true,
    "checkpoint": "colbert-ir\/colbertv2.0",
    "triples": "..\/datasets\/scifact\/qg\/colbert_training\/triples.jsonl",
    





#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: .  Diffusion Tensor Magnetic Resonance Imaging of Human Newborn Cerebral White Matter - What is the purpose of applying a line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis to measure the apparent diffusion coefficient, calculate relative anisotropy, and delineate three-dimensional fiber architecture in cerebral white matter in preterm and full-term infants?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1, 19241, 23435,  8060, 17011, 12126,  1997,  2529, 20662,
        18439,  2317,  3043,  1011,  2054,  2003,  1996,  3800,  1997, 11243,
         1037,  2240, 13594, 19241,  1011, 18215,  8060, 17011, 12126,  1006,
        27011,   102])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1])

#>>>    24.47 9.2 		|		 15.27
[Nov 01, 12:46:5

In [270]:
dataset_name = "scifact"
corpus, queries, qrels = beir_datasets.load_dataset(dataset_name)

# The indices in BeIR datasets may not be monotic, 
### so we will need a dictionary with enumerated indices (which is used in ColBERT) as keys and BeIR index as values
### collection_ids = {colbert_index: beir_index}
collection_ids = {idx: str(val) for idx, val in enumerate(list(corpus))}

# Load datasets for ColBERT
collection_path, queries_path = beir_datasets.convert_for_colbert(dataset_name)
collection, queries = Collection(path=collection_path), Queries(path=queries_path)

# queries_ids = list(queries)
# queries = list(queries.values())

checkpoint = '/home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/default/none/2023-10/31/12.41.13/checkpoints/colbert'


colbert_model_ft = ColBERTRetrievalSearch(checkpoint, 
                                   index_name, 
                                   experiment_name="scifact_colbertv2_ft", 
                                   collection=collection, 
                                   collection_ids=collection_ids,
                                   doc_maxlen=doc_maxlen, 
                                   nbits=nbits, 
                                   overwrite_param="reuse")

  0%|          | 0/5183 [00:00<?, ?it/s]

  0%|          | 0/5183 [00:00<?, ?it/s]

[32m2023-11-01 12:48:32.289[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mconvert_for_colbert[0m:[36m98[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/scifact/colbert/scifact_collection.tsv ...[0m
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 5183/5183 [00:00<00:00, 34289.09it/s]
[32m2023-11-01 12:48:32.446[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mconvert_for_colbert[0m:[36m105[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/scifact/colbert/scifact_queries.tsv ...[0m
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:00<00:00, 303788.31it/s]

[Nov 01, 12:48:32] #> Loading collection...
0M 
[Nov 01, 12:48:32] #> Loading the queries from /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/scifact/colbert/scifact_queries.tsv ...
[Nov 01, 12:48:32] #> Got 300 queries. All QIDs are unique.



[Nov 01, 12:48:32] #> Creating directory /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/scifact_colbertv2_ft/indexes/scifact.2bits 







#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 2,
    "kmeans_niters": 4,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 3e-6,
    "maxsteps": 500000,
    "save_every": null,
    "warmup": null,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": false,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 300,
    "mask_punctuation": true,
    "checkpoint": "\/home\/bengsoon\/Projects\/xcs224u_project\/zeroqaret\/nbs\/experiments\/default\/none\/2023-10\/31\/12.41

0it [00:00, ?it/s]

[Nov 01, 12:49:05] [0] 		 #> Saving chunk 0: 	 5,183 passages and 1,229,853 embeddings. From #0 onward.


1it [00:12, 12.02s/it]
100%|██████████| 1/1 [00:00<00:00, 458.44it/s]
  0%|          | 0/16384 [00:00<?, ?it/s]

[Nov 01, 12:49:06] [0] 		 #> Checking all files were saved...
[Nov 01, 12:49:06] [0] 		 Found all files!
[Nov 01, 12:49:06] [0] 		 #> Building IVF...
[Nov 01, 12:49:06] [0] 		 #> Loading codes...
[Nov 01, 12:49:06] [0] 		 Sorting codes...
[Nov 01, 12:49:06] [0] 		 Getting unique codes...
[Nov 01, 12:49:06] #> Optimizing IVF to store map from centroids to list of pids..
[Nov 01, 12:49:06] #> Building the emb2pid mapping..
[Nov 01, 12:49:06] len(emb2pid) = 1229853


100%|██████████| 16384/16384 [00:00<00:00, 97233.90it/s]


[Nov 01, 12:49:06] #> Saved optimized IVF to /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/scifact_colbertv2_ft/indexes/scifact.2bits/ivf.pid.pt
[Nov 01, 12:49:06] [0] 		 #> Saving the indexing metadata to /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/scifact_colbertv2_ft/indexes/scifact.2bits/metadata.json ..
#> Joined...
[Nov 01, 12:49:08] #> Loading codec...
[Nov 01, 12:49:08] #> Loading IVF...
[Nov 01, 12:49:08] #> Loading doclens...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2256.22it/s]

[Nov 01, 12:49:08] #> Loading codes and residuals...



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 66.47it/s]


In [271]:
colbert_retriever_ft = EvaluateRetrieval(colbert_model_ft)
results = colbert_retriever_ft.retrieve(collection, queries)
colbert_retriever_ft.qrels = qrels

300it [00:03, 86.55it/s]


In [272]:
scifact_results.collect("ColBERTv2_ft", retriever, results, {'Average Query Time (ms/it)': 12.0, 'Total Query Time (s)': 3.0, 'Total Document Embedding Time (s)': None})

In [385]:
scifact_corpus, scifact_queries, _ = beir_datasets.load_dataset("scifact")

  0%|          | 0/5183 [00:00<?, ?it/s]

In [390]:
scifact_queries

{'1': '0-dimensional biomaterials show inductive properties.',
 '3': '1,000 genomes project enables mapping of genetic sequence variation consisting of rare variants with larger penetrance effects than common variants.',
 '5': '1/2000 in UK have abnormal PrP positivity.',
 '13': '5% of perinatal mortality is due to low birth weight.',
 '36': 'A deficiency of vitamin B12 increases blood levels of homocysteine.',
 '42': 'A high microerythrocyte count raises vulnerability to severe anemia in homozygous alpha (+)- thalassemia trait subjects.',
 '48': 'A total of 1,000 people in the UK are asymptomatic carriers of vCJD infection.',
 '49': 'ADAR1 binds to Dicer to cleave pre-miRNA.',
 '50': 'AIRE is expressed in some skin tumors.',
 '51': 'ALDH1 expression is associated with better breast cancer outcomes.',
 '53': 'ALDH1 expression is associated with poorer prognosis in breast cancer.',
 '54': 'AMP-activated protein kinase (AMPK) activation increases inflammation-related fibrosis in the lung

In [273]:
scifact_results.all

Unnamed: 0,Baseline SBERT,multi-qa-MiniLM-L6-cos-v1_ft,ColBERTv2 Baseline,ColBERTv2_ft
NDCG@1,0.43667,0.43667,0.58667,0.58
NDCG@3,0.50444,0.50672,0.65424,0.64954
NDCG@5,0.52354,0.53238,0.6728,0.66908
NDCG@10,0.54029,0.55935,0.69195,0.69023
NDCG@100,0.58909,0.59926,0.7165,0.71544
NDCG@1000,0.60131,0.61148,0.72385,0.72109
MAP@1,0.41028,0.41667,0.55717,0.5505
MAP@3,0.47785,0.48263,0.62802,0.62367
MAP@5,0.49084,0.49939,0.64135,0.63747
MAP@10,0.49919,0.5116,0.65074,0.64757


In [323]:
scifact_results.save_as_csv(f"../datasets/{dataset_name}/20231101_{dataset_name}_results.csv", "all")

[32m2023-11-01 18:14:39.462[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_as_csv[0m:[36m8[0m - [1mTable 'all' saved as '../datasets/scifact/20231101_scifact_results.csv'.[0m


## `nfcorpus`

In [274]:
dataset_name = 'nfcorpus'

In [275]:
nfcorpus_results = ResultsCollector(dataset_name=dataset_name)

[32m2023-11-01 14:51:22.852[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m12[0m - [1mResultsCollector object initialized.[0m


### SBERT Baseline

In [276]:
baseline_model = "multi-qa-MiniLM-L6-cos-v1"

In [277]:
baseline_retriever = SBERTEval(model_path = baseline_model)

[32m2023-11-01 14:52:03.040[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36m__init__[0m:[36m51[0m - [1mDatasets will be saved in '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets'[0m


In [278]:
results, results_time = baseline_retriever.beir_retrieval(dataset_name)

[32m2023-11-01 14:52:05.558[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m13[0m - [1m
   
****************************************************************************************************   
******                                                                                        ******   
                                     Evaluation for 'nfcorpus'                                     *   
******                                                                                        ******   
****************************************************************************************************
[0m
[32m2023-11-01 14:52:05.560[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m62[0m - [1mDownloading dataset 'nfcorpus'...[0m
[32m2023-11-01 14:52:05.561[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m65[0m - [1mSaved on '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/nfcorpus'

  0%|          | 0/3633 [00:00<?, ?it/s]

[32m2023-11-01 14:52:05.639[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m21[0m - [1mPre-computing Document Embeddings for 'nfcorpus' dataset...[0m


Batches:   0%|          | 0/29 [00:00<?, ?it/s]

[32m2023-11-01 14:52:13.047[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m32[0m - [1mNumber of documents: 3633, Dim: 384[0m
[32m2023-11-01 14:52:13.049[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m33[0m - [1mIndex size (in MB): 5.58MB[0m
[32m2023-11-01 14:52:13.049[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m34[0m - [1mTime taken for pre-computing corpus embedding: 7.40 s[0m
[32m2023-11-01 14:52:13.050[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m35[0m - [1mPre-computing of Document Embeddings done.

[0m
[32m2023-11-01 14:52:13.051[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m38[0m - [1mStarting query benchmark evaluation ...[0m
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 323/323 [00:02<00:00, 123.01it/s]
[32m2023-11-01 14:52:15.681[0m | [1mINFO    [0m | [3

In [280]:
nfcorpus_results.collect(experiment_name="Baseline SBERT", retriever=baseline_retriever, results=results, results_time=results_time)

In [281]:
nfcorpus_results.all

Unnamed: 0,Baseline SBERT
NDCG@1,0.38854
NDCG@3,0.34838
NDCG@5,0.32423
NDCG@10,0.29674
NDCG@100,0.2666
NDCG@1000,0.35088
MAP@1,0.05024
MAP@3,0.07971
MAP@5,0.0915
MAP@10,0.10533


### Finetuning SBERT

In [282]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets

In [283]:
gen_nfcorpus_df = pd.read_csv("../datasets/nfcorpus/qg/nfcorpus_qg_all.csv", index_col=0)
pids = gen_nfcorpus_df["pid"].tolist()
passages = gen_nfcorpus_df["passage"].tolist()
titles = gen_nfcorpus_df["title"].tolist()
questions = gen_nfcorpus_df["question"].tolist()

In [284]:
corpus, queries, qrels = beir_datasets.load_dataset(dataset_name)
corpus_ids, query_ids = list(corpus), list(queries)

[32m2023-11-01 14:53:58.663[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m62[0m - [1mDownloading dataset 'nfcorpus'...[0m
[32m2023-11-01 14:53:58.666[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m65[0m - [1mSaved on '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/nfcorpus'[0m


  0%|          | 0/3633 [00:00<?, ?it/s]

In [285]:
train_examples = []
for p, t, q in zip(passages, titles, questions):
    anchor = str(t) + " - " + str(q)
    train_examples.append(InputExample(texts=[anchor, p]))

In [299]:
queries

{'PLAIN-2': 'Do Cholesterol Statin Drugs Cause Breast Cancer?',
 'PLAIN-12': 'Exploiting Autophagy to Live Longer',
 'PLAIN-23': 'How to Reduce Exposure to Alkylphenols Through Your Diet',
 'PLAIN-33': 'What’s Driving America’s Obesity Problem?',
 'PLAIN-44': 'Who Should be Careful About Curcumin?',
 'PLAIN-56': 'Foods for Glaucoma',
 'PLAIN-68': 'What is Actually in Chicken Nuggets?',
 'PLAIN-78': 'What Do Meat Purge and Cola Have in Common?',
 'PLAIN-91': 'Chronic Headaches and Pork Parasites',
 'PLAIN-102': 'Stopping Heart Disease in Childhood',
 'PLAIN-112': 'Food Dyes and ADHD',
 'PLAIN-123': 'How Citrus Might Help Keep Your Hands Warm',
 'PLAIN-133': 'Starving Tumors of Their Blood Supply',
 'PLAIN-143': 'Are Dental X-Rays Safe?',
 'PLAIN-153': 'How Should I Take Probiotics?',
 'PLAIN-165': 'Breast Cancer & Alcohol: How Much is Safe?',
 'PLAIN-175': 'Diet and Cellulite',
 'PLAIN-186': 'Best Treatment for Constipation',
 'PLAIN-196': 'Should We Avoid Titanium Dioxide?',
 'PLAIN-20

In [286]:
train_examples[158].texts

['Association between multivitamin use and breast cancer risk - Is there a significant association between multivitamin use and the risk of breast cancer?',
 'Multivitamin supplement use and risk of breast cancer: a meta-analysis. BACKGROUND: The association between consumption of multivitamins and breast cancer is inconsistent in epidemiologic studies. OBJECTIVE: To perform a meta-analysis of cohort and case-control studies to evaluate multivitamin intake and its relationship with breast cancer risk. METHODS: The published literature was systematically searched and reviewed using MEDLINE (1950 through July 2010), EMBASE (1980 through July 2010), and the Cochrane Central Register of Controlled Trials (The Cochrane Library 2010 issue 1). Studies that included specific risk estimates were pooled using a random-effects model. The bias and quality of these studies were assessed with REVMAN statistical software (version 5.0) and the GRADE method of the Cochrane Collaboration. RESULTS: Eight

In [287]:
train_dataloader = datasets.NoDuplicatesDataLoader(train_examples, batch_size=8)

In [288]:
ft_model = SentenceTransformer(baseline_model)

In [289]:
train_loss = losses.MultipleNegativesRankingLoss(ft_model)

In [290]:
# Tune the model
model_path = f"../models/{dataset_name}_{baseline_model.replace('-', '_')}_ft"
num_epochs = 3
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)
ft_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_epochs, warmup_steps=warmup_steps, show_progress_bar=True, checkpoint_path=model_path)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/454 [00:00<?, ?it/s]

Iteration:   0%|          | 0/454 [00:00<?, ?it/s]

Iteration:   0%|          | 0/454 [00:00<?, ?it/s]

In [291]:
ft_model.save(model_path)

In [292]:
ft_retriever = SBERTEval(model_path, normalize=False)

[32m2023-11-01 14:57:19.525[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36m__init__[0m:[36m51[0m - [1mDatasets will be saved in '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets'[0m


In [319]:
print("\n\n".join([corpus[corpus_ids[i]]["text"] for i in ft_retriever.search_queries("Is Milk and Mucus a Myth?", 10)[0][0]]))

Excessive milk consumption has a long association with increased respiratory tract mucus production and asthma. Such an association cannot be explained using a conventional allergic paradigm and there is limited medical evidence showing causality. In the human colon, beta-casomorphin-7 (beta-CM-7), an exorphin derived from the breakdown of A1 milk, stimulates mucus production from gut MUC5AC glands. In the presence of inflammation similar mucus overproduction from respiratory tract MUC5AC glands characterises many respiratory tract diseases. beta-CM-7 from the blood stream could stimulate the production and secretion of mucus production from these respiratory glands. Such a hypothesis could be tested in vitro using quantitative RT-PCR to show that the addition of beta-CM-7 into an incubation medium of respiratory goblet cells elicits an increase in MUC5AC mRNA and by identifying beta-CM-7 in the blood of asthmatic patients. This association may not necessarily be simply cause and effec

In [293]:
ft_results, ft_time = ft_retriever.beir_retrieval(dataset_name)

[32m2023-11-01 14:57:19.531[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m13[0m - [1m
   
****************************************************************************************************   
******                                                                                        ******   
                                     Evaluation for 'nfcorpus'                                     *   
******                                                                                        ******   
****************************************************************************************************
[0m
[32m2023-11-01 14:57:19.532[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m62[0m - [1mDownloading dataset 'nfcorpus'...[0m
[32m2023-11-01 14:57:19.533[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m65[0m - [1mSaved on '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/nfcorpus'

  0%|          | 0/3633 [00:00<?, ?it/s]

[32m2023-11-01 14:57:19.615[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m21[0m - [1mPre-computing Document Embeddings for 'nfcorpus' dataset...[0m


Batches:   0%|          | 0/29 [00:00<?, ?it/s]

[32m2023-11-01 14:57:27.296[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m32[0m - [1mNumber of documents: 3633, Dim: 384[0m
[32m2023-11-01 14:57:27.297[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m33[0m - [1mIndex size (in MB): 5.58MB[0m
[32m2023-11-01 14:57:27.298[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m34[0m - [1mTime taken for pre-computing corpus embedding: 7.67 s[0m
[32m2023-11-01 14:57:27.299[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m35[0m - [1mPre-computing of Document Embeddings done.

[0m
[32m2023-11-01 14:57:27.300[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m38[0m - [1mStarting query benchmark evaluation ...[0m
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 323/323 [00:02<00:00, 130.77it/s]
[32m2023-11-01 14:57:29.772[0m | [1mINFO    [0m | [3

In [294]:
nfcorpus_results.collect(f"{baseline_model}_ft", ft_retriever, ft_results, ft_time)

In [295]:
nfcorpus_results.all

Unnamed: 0,Baseline SBERT,multi-qa-MiniLM-L6-cos-v1_ft
NDCG@1,0.38854,0.38545
NDCG@3,0.34838,0.33932
NDCG@5,0.32423,0.31247
NDCG@10,0.29674,0.28704
NDCG@100,0.2666,0.25411
NDCG@1000,0.35088,0.33565
MAP@1,0.05024,0.04151
MAP@3,0.07971,0.07134
MAP@5,0.0915,0.08202
MAP@10,0.10533,0.0967


### ColBERTv2 Baseline

In [350]:
corpus, queries, qrels = beir_datasets.load_dataset(dataset_name)

# The indices in BeIR datasets may not be monotic, 
### so we will need a dictionary with enumerated indices (which is used in ColBERT) as keys and BeIR index as values
### collection_ids = {colbert_index: beir_index}
collection_ids = list(corpus)
queries_ids = list(queries)

# Load datasets for ColBERT
collection_path, queries_path = beir_datasets.convert_for_colbert(dataset_name)
collection, queries = Collection(path=collection_path), Queries(path=queries_path)

# queries = list(queries.values())

checkpoint = 'colbert-ir/colbertv2.0'

  0%|          | 0/3633 [00:00<?, ?it/s]

  0%|          | 0/3633 [00:00<?, ?it/s]

[32m2023-11-01 18:29:40.094[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_for_colbert[0m:[36m27[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/nfcorpus/colbert/nfcorpus_collection.tsv ...[0m
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3633/3633 [00:00<00:00, 30834.29it/s]
[32m2023-11-01 18:29:40.217[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_for_colbert[0m:[36m34[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/nfcorpus/colbert/nfcorpus_queries.tsv ...[0m
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 323/323 [00:00<00:00, 468127.23it/s]

[Nov 01, 18:29:40] #> Loading collection...
0M 
[Nov 01, 18:29:40] #> Loading the queries from /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/nfcorpus/colbert/nfcorpus_queries.tsv ...
[Nov 01, 18:29:40] #> Got 323 queries. All QIDs are unique.






In [332]:
nbits = 2   # encode each dimension with 2 bits
doc_maxlen = 300 # truncate passages at 300 tokens

index_name = f'{dataset_name}.{nbits}bits'

In [333]:
model = ColBERTRetrievalSearch(checkpoint, 
                                   index_name, 
                                   experiment_name=f"ColBERTv2Base_{dataset_name}", 
                                   collection=collection, 
                                   collection_ids=collection_ids,
                                   doc_maxlen=doc_maxlen, 
                                   nbits=nbits, 
                                   overwrite_param="reuse")



[Nov 01, 18:21:28] #> Creating directory /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/ColBERTv2Base_nfcorpus/indexes/nfcorpus.2bits 


#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 2,
    "kmeans_niters": 4,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 3e-6,
    "maxsteps": 500000,
    "save_every": null,
    "warmup": null,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": false,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 300

0it [00:00, ?it/s]

[Nov 01, 18:21:54] [0] 		 #> Saving chunk 0: 	 3,633 passages and 855,846 embeddings. From #0 onward.
[Nov 01, 18:21:54] [0] 		 #> Checking all files were saved...
[Nov 01, 18:21:54] [0] 		 Found all files!
[Nov 01, 18:21:54] [0] 		 #> Building IVF...
[Nov 01, 18:21:54] [0] 		 #> Loading codes...
[Nov 01, 18:21:54] [0] 		 Sorting codes...


1it [00:08,  8.20s/it]
100%|██████████| 1/1 [00:00<00:00, 708.98it/s]
100%|██████████| 8192/8192 [00:00<00:00, 83510.53it/s]


[Nov 01, 18:21:54] [0] 		 Getting unique codes...
[Nov 01, 18:21:54] #> Optimizing IVF to store map from centroids to list of pids..
[Nov 01, 18:21:54] #> Building the emb2pid mapping..
[Nov 01, 18:21:54] len(emb2pid) = 855846
[Nov 01, 18:21:55] #> Saved optimized IVF to /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/ColBERTv2Base_nfcorpus/indexes/nfcorpus.2bits/ivf.pid.pt
[Nov 01, 18:21:55] [0] 		 #> Saving the indexing metadata to /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/ColBERTv2Base_nfcorpus/indexes/nfcorpus.2bits/metadata.json ..
#> Joined...
[Nov 01, 18:21:58] #> Loading codec...
[Nov 01, 18:21:58] #> Loading IVF...
[Nov 01, 18:21:58] #> Loading doclens...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2044.01it/s]

[Nov 01, 18:21:59] #> Loading codes and residuals...



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 89.83it/s]


In [334]:
retriever = EvaluateRetrieval(model)

In [340]:
retriever.retrieve??

[0;31mSignature:[0m
[0mretriever[0m[0;34m.[0m[0mretrieve[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mcorpus[0m[0;34m:[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mqueries[0m[0;34m:[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mstr[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mfloat[0m[0;34m][0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
    [0;32mdef[0m [0mretrieve[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mcorpus[0m[0;34m:[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mquer

In [365]:
results = retriever.retrieve(collection, queries)

# the keys in results need to be converted back to original qids
results = {queries_ids[int(k)]:v   for k, v in results.items()}

323it [00:02, 108.39it/s]


In [366]:
retriever.qrels = qrels

In [367]:
nfcorpus_results.collect("ColBERTv2 Baseline", retriever, results, {'Average Query Time (ms/it)': 9.39, 'Total Query Time (s)': 3.0, 'Total Document Embedding Time (s)': None})


In [368]:
nfcorpus_results.all

Unnamed: 0,Baseline SBERT,multi-qa-MiniLM-L6-cos-v1_ft,ColBERTv2 Baseline
NDCG@1,0.38854,0.38545,0.47214
NDCG@3,0.34838,0.33932,0.41224
NDCG@5,0.32423,0.31247,0.38223
NDCG@10,0.29674,0.28704,0.34323
NDCG@100,0.2666,0.25411,0.3033
NDCG@1000,0.35088,0.33565,0.35523
MAP@1,0.05024,0.04151,0.06371
MAP@3,0.07971,0.07134,0.10011
MAP@5,0.0915,0.08202,0.11375
MAP@10,0.10533,0.0967,0.12995


### Finetuning ColBERTv2

In [375]:
??finetune_colbert

[0;31mSignature:[0m
[0mfinetune_colbert[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mexperiment_name[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcsv_file[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmode[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'w'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreplace[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnranks[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbsize[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m32[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlr[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m1e-05[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdoc_maxlen[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m300[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdim[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m128[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maccumsteps[

In [370]:
finetune_colbert(experiment_name=f"{dataset_name}_colbertv2_finetuned",
                          csv_file=f"../datasets/{dataset_name}/qg/{dataset_name}_qg_all.csv")

[32m2023-11-01 18:37:14.683[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36m__init__[0m:[36m51[0m - [1mDatasets will be saved in '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets'[0m
[32m2023-11-01 18:37:14.685[0m | [1mINFO    [0m | [36m__main__[0m:[36mprepare_qg_for_colbert_training[0m:[36m29[0m - [1mCreating ColBERT training files from ../datasets/nfcorpus/qg/colbert_training...[0m
Training files: : 3633it [00:00, 22149.57it/s]
[32m2023-11-01 18:37:14.934[0m | [1mINFO    [0m | [36m__main__[0m:[36mprepare_qg_for_colbert_training[0m:[36m52[0m - [1mtriples.jsonl, queries,tsv and collection.tsv files created in ../datasets/nfcorpus/qg/colbert_training.[0m


#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 1,
    "kmeans_niters": 4,
    "resume": false,
    "similarity": "cosine",
    "bsize": 32,
    "accumsteps": 1,
    "lr": 3e-6,
    "maxsteps": 500000,
    "save_every": null,
    "warmup": null,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": false,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 220,
    "mask_punctuation": true,
    "checkpoint": "colbert-ir\/colbertv2.0",
    "triples": "..\/datasets\/nfcorpus\/qg\/colbert_training\/triples.jsonl",
   





#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: .  Association of Statin Use with Breast Cancer Survival - What was the association between statin use and breast cancer survival in a population-based cohort of breast cancer patients from Finland?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2523,  1997, 28093,  2378,  2224,  2007,  7388,  4456,
         7691,  1011,  2054,  2001,  1996,  2523,  2090, 28093,  2378,  2224,
         1998,  7388,  4456,  7691,  1999,  1037,  2313,  1011,  2241,  2522,
        27794,   102])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1])

#>>>    24.1 8.38 		|		 15.72
[Nov 01, 18:37:22] 0 2.2649578568234574e-06
#>>>    24.3 8.53 		|		 15.770000000000001
[Nov 01, 18:37:22] 1 2.264250058942707e-06
#>>>    24.44 9.95 		|		 14.490000000000002
[Nov 01, 18:37:23] 2 2.3017967130108445e-06
#>>>    24.

In [376]:
checkpoint = '/home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/default/none/2023-10/31/12.41.13/checkpoints/colbert'


colbert_model_ft = ColBERTRetrievalSearch(checkpoint, 
                                   index_name, 
                                   experiment_name=f"{dataset_name}_colbertv2_ft", 
                                   collection=collection, 
                                   collection_ids=collection_ids,
                                   doc_maxlen=doc_maxlen, 
                                   nbits=nbits, 
                                   overwrite_param="reuse")



[Nov 01, 19:02:33] #> Creating directory /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/nfcorpus_colbertv2_ft/indexes/nfcorpus.2bits 


#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 2,
    "kmeans_niters": 4,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 3e-6,
    "maxsteps": 500000,
    "save_every": null,
    "warmup": null,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": false,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 300,

0it [00:00, ?it/s]

[Nov 01, 19:02:56] [0] 		 #> Saving chunk 0: 	 3,633 passages and 855,846 embeddings. From #0 onward.
[Nov 01, 19:02:56] [0] 		 #> Checking all files were saved...
[Nov 01, 19:02:56] [0] 		 Found all files!
[Nov 01, 19:02:56] [0] 		 #> Building IVF...
[Nov 01, 19:02:56] [0] 		 #> Loading codes...
[Nov 01, 19:02:56] [0] 		 Sorting codes...


1it [00:08,  8.17s/it]
100%|██████████| 1/1 [00:00<00:00, 728.94it/s]
100%|██████████| 8192/8192 [00:00<00:00, 86335.56it/s]


[Nov 01, 19:02:56] [0] 		 Getting unique codes...
[Nov 01, 19:02:56] #> Optimizing IVF to store map from centroids to list of pids..
[Nov 01, 19:02:56] #> Building the emb2pid mapping..
[Nov 01, 19:02:56] len(emb2pid) = 855846
[Nov 01, 19:02:57] #> Saved optimized IVF to /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/nfcorpus_colbertv2_ft/indexes/nfcorpus.2bits/ivf.pid.pt
[Nov 01, 19:02:57] [0] 		 #> Saving the indexing metadata to /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/nfcorpus_colbertv2_ft/indexes/nfcorpus.2bits/metadata.json ..
#> Joined...
[Nov 01, 19:02:58] #> Loading codec...
[Nov 01, 19:02:58] #> Loading IVF...
[Nov 01, 19:02:58] #> Loading doclens...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1517.48it/s]

[Nov 01, 19:02:58] #> Loading codes and residuals...



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 92.66it/s]


In [378]:
colbert_retriever_ft = EvaluateRetrieval(colbert_model_ft)
results = colbert_retriever_ft.retrieve(collection, queries)

# the keys in results need to be converted back to original qids
results = {queries_ids[int(k)]:v   for k, v in results.items()}

colbert_retriever_ft.qrels = qrels

323it [00:02, 117.89it/s]


In [379]:
nfcorpus_results.collect("ColBERTv2_ft", retriever, results, {'Average Query Time (ms/it)': 8.48, 'Total Query Time (s)': 2.0, 'Total Document Embedding Time (s)': None})

In [380]:
nfcorpus_results.all

Unnamed: 0,Baseline SBERT,multi-qa-MiniLM-L6-cos-v1_ft,ColBERTv2 Baseline,ColBERTv2_ft
NDCG@1,0.38854,0.38545,0.47214,0.47214
NDCG@3,0.34838,0.33932,0.41224,0.40997
NDCG@5,0.32423,0.31247,0.38223,0.38063
NDCG@10,0.29674,0.28704,0.34323,0.34277
NDCG@100,0.2666,0.25411,0.3033,0.30339
NDCG@1000,0.35088,0.33565,0.35523,0.35371
MAP@1,0.05024,0.04151,0.06371,0.0634
MAP@3,0.07971,0.07134,0.10011,0.09915
MAP@5,0.0915,0.08202,0.11375,0.11327
MAP@10,0.10533,0.0967,0.12995,0.12969


In [381]:
nfcorpus_results.save_as_csv(f"../datasets/{dataset_name}/20231101_{dataset_name}_results.csv", "all")

[32m2023-11-01 19:05:21.985[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_as_csv[0m:[36m8[0m - [1mTable 'all' saved as '../datasets/nfcorpus/20231101_nfcorpus_results.csv'.[0m


#### all-mpnet-base-v2

In [260]:
word_emb = models.Transformer('sentence-transformers/all-mpnet-base-v2')

In [240]:
pooling = models.Pooling(word_emb.get_word_embedding_dimension)

In [264]:
model = SentenceTransformer('all-mpnet-base-v2')

In [265]:
train_loss = losses.MultipleNegativesRankingLoss(model)

In [266]:
# Tune the model
num_epochs = 3
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_epochs, warmup_steps=warmup_steps, show_progress_bar=True, checkpoint_path="../models/scifact_all_mpnetv2_ft")

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1295 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1295 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1295 [00:00<?, ?it/s]

In [267]:
model.save("../models/scifact_all_mpnetv2_ft")

In [268]:
ft_retriever = SBERTEval("../models/scifact_all_mpnetv2_ft/", normalize=False)

[32m2023-10-31 12:24:57.778[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36m__init__[0m:[36m51[0m - [1mDatasets will be saved in '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets'[0m


In [269]:
ft_results, ft_time = ft_retriever.beir_retrieval("scifact")

[32m2023-10-31 12:24:59.035[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m13[0m - [1m
   
****************************************************************************************************   
******                                                                                        ******   
*                                     Evaluation for 'scifact'                                     *   
******                                                                                        ******   
****************************************************************************************************
[0m
[32m2023-10-31 12:24:59.037[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m62[0m - [1mDownloading dataset 'scifact'...[0m
[32m2023-10-31 12:24:59.039[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m65[0m - [1mSaved on '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/scifact'[0

  0%|          | 0/5183 [00:00<?, ?it/s]

[32m2023-10-31 12:24:59.100[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m21[0m - [1mPre-computing Document Embeddings for 'scifact' dataset...[0m


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

[32m2023-10-31 12:25:37.090[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m32[0m - [1mNumber of documents: 5183, Dim: 768[0m
[32m2023-10-31 12:25:37.092[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m33[0m - [1mIndex size (in MB): 15.92MB[0m
[32m2023-10-31 12:25:37.093[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m34[0m - [1mTime taken for pre-computing corpus embedding: 37.97 s[0m
[32m2023-10-31 12:25:37.094[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m35[0m - [1mPre-computing of Document Embeddings done.

[0m
[32m2023-10-31 12:25:37.094[0m | [1mINFO    [0m | [36m__main__[0m:[36mbeir_retrieval[0m:[36m38[0m - [1mStarting query benchmark evaluation ...[0m
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:04<00:00, 67.91it/s]
[32m2023-10-31 12:25:41.517[0m | [1mINFO    [0m | 

In [270]:
scifact_results.collect("scifact_all_mpnetv2_ft", ft_retriever, ft_results, ft_time)

In [272]:
scifact_results.ndcg

Unnamed: 0,Zero-shot SBERT,finetune_mpnet,scifact_all_mpnetv2_ft
NDCG@1,0.53333,0.35667,0.50667
NDCG@3,0.60009,0.45264,0.56486
NDCG@5,0.62803,0.47535,0.59359
NDCG@10,0.6557,0.504,0.61472
NDCG@100,0.68911,0.54203,0.6535
NDCG@1000,0.69653,0.55711,0.66212


### Paired Encodings

In [None]:
gen_scifact_df = pd.read_csv("../datasets/scifact/qg/scifact_qg_all.csv", index_col=0)

In [None]:
gen_scifact_df

In [None]:
pids = gen_scifact_df["pid"].tolist()
passages = gen_scifact_df["passage"].tolist()
titles = gen_scifact_df["title"].tolist()
questions = gen_scifact_df["question"].tolist()

In [None]:
corpus, queries, qrels = beir_datasets.load_dataset("scifact")
corpus_ids, query_ids = list(corpus), list(queries)

In [None]:
sbert_model_name = "all-mpnet-base-v2"
sbert_model = SentenceTransformer(sbert_model_name)

In [None]:
passages_emb = sbert_model.encode(passages, convert_to_tensor=True)
titles_emb = sbert_model.encode(titles, convert_to_tensor=True)
questions_emb = sbert_model.encode(questions, convert_to_tensor=True)

In [None]:
def search_queries_paired(passages_emb,
                          titles_emb,
                          questions_emb,
                          queries: Union[str, List], # single query or batch queries
                          top_k: int,
                          model
                  ) -> (List[List[int]], List[List[float]]) :

    """
    Performs cosine similarity calculation between query and generated-(passage, titles, questions) embeddings.
    Returns (List[list of top-k docs indices for each query], List[similarity score for each query])  
    """
    
    if isinstance(queries, str):
        queries = [queries]
    
    queries_emb = model.encode(queries, convert_to_tensor=True)
    sim_scores_passages = sbert_util.cos_sim(queries_emb, passages_emb)
    sim_scores_titles = sbert_util.cos_sim(queries_emb, titles_emb)
    sim_scores_questions = sbert_util.cos_sim(queries_emb, questions_emb)
    
    average_sim_scores = torch.vstack((sim_scores_passages, sim_scores_questions, sim_scores_titles)).mean(0, keepdim=True)
    
    # #### Get top-k ranking
    average_sim_scores[torch.isnan(average_sim_scores)] = -1
    sim_scores_top_k_values, sim_scores_top_k_idx = torch.topk(average_sim_scores, top_k, dim=1, largest=True, sorted=True)
    
    sim_scores_top_k_values = sim_scores_top_k_values.cpu().tolist()
    sim_scores_top_k_idx = sim_scores_top_k_idx.cpu().tolist()
    
    return sim_scores_top_k_values, sim_scores_top_k_idx

In [None]:
def search_queries_with_emb(ref_emb,
                          queries: Union[str, List], # single query or batch queries
                          top_k: int,
                          model
                  ) -> (List[List[int]], List[List[float]]) :

    """
    Performs cosine similarity calculation between query and generated-(passage, titles, questions) embeddings.
    Returns (List[list of top-k docs indices for each query], List[similarity score for each query])  
    """
    
    if isinstance(queries, str):
        queries = [queries]
    
    queries_emb = model.encode(queries, convert_to_tensor=True)
    sim_scores = sbert_util.cos_sim(queries_emb, ref_emb)
    # sim_scores_titles = sbert_util.cos_sim(queries_emb, titles_emb)
    # sim_scores_questions = sbert_util.cos_sim(queries_emb, questions_emb)
    
    
    # #### Get top-k ranking
    sim_scores[torch.isnan(sim_scores)] = -1
    sim_scores_top_k_values, sim_scores_top_k_idx = torch.topk(sim_scores, top_k, dim=1, largest=True, sorted=True)
    
    sim_scores_top_k_values = sim_scores_top_k_values.cpu().tolist()
    sim_scores_top_k_idx = sim_scores_top_k_idx.cpu().tolist()
    
    return sim_scores_top_k_values, sim_scores_top_k_idx

In [None]:
def rerank(ref_emb: Dict[str, Dict[str, str]],
           query_emb: Dict[str, str],
           results: Dict[str, Dict[str, float]],
           top_k: int) -> Dict[str, Dict[str, float]]:
    
        ref_emb[:top_k]        new_corpus = {}
    
        for query_id in results:
            if len(results[query_id]) > top_k:
                for (doc_id, _) in sorted(results[query_id].items(), key=lambda item: item[1], reverse=True)[:top_k]:
                    new_corpus[doc_id] = corpus[doc_id]
            else:
                for doc_id in results[query_id]:
                    new_corpus[doc_id] = corpus[doc_id]
                    
        return self.retriever.search(new_corpus, queries, top_k, self.score_function)

In [None]:
passages_results_docs_ids = search_queries_with_emb(passages_emb, query, retriever.top_k+500)

In [None]:
titles_emb[passages_result_docs_ids]

In [None]:
results = {}

for query_id in query_ids:
    query = queries[query_id]
    _, passages_result_docs_ids = search_queries_with_emb(passages_emb, query, retriever.top_k+500)
    _, questions_result_doc_ids = search_queries_with_emb(titles_emb[passages_result_docs_ids], query, retriever.top_k+250)
    final_result_scores, final_result_doc_ids = search_queries_with_emb(titles_emb[questions_result_doc_ids], query, retriever.top_k)
    results[query_id] =  {str(corpus_ids[id]): score for id, score in zip(final_result_doc_ids[0], final_result_scores[0])}
    

In [None]:
scifact_results.collect(experiment_name="Paired SBERT - First Try", retriever=retriever, results=results, results_time=None)

In [None]:
scifact_results.all

In [None]:
sim_scores_passages.shape # (1, corpus length)

In [None]:
concat_sim_scores = torch.vstack((sim_scores_passages, sim_scores_questions, sim_scores_titles))

In [None]:
concat_sim_scores.shape

In [None]:
concat_sim_scores.mean(0)

## ColBERTv2 - vanilla

### Preprocessing Dataset

> The original code in `load_collection()` from _ColBERT/colbert/evaluation/loaders.py_ required monotonic `pid`, but that is not necessarily our case. We'll have to monkey patch it to pass that assertion at line 166: ```assert pid == 'id' or int(pid) == line_idx, f"pid={pid}, line_idx={line_idx}"``` 

In [None]:
dataset_name = "fiqa"
corpus, queries, qrels = beir_datasets.load_dataset(dataset_name)

# The indices in BeIR datasets may not be monotic, 
### so we will need a dictionary with enumerated indices (which is used in ColBERT) as keys and BeIR index as values
### collection_ids = {colbert_index: beir_index}
collection_ids = {idx: val for idx, val in enumerate(list(corpus))}

# Load datasets for ColBERT
collection_path, queries_path = beir_datasets.convert_for_colbert(dataset_name)
collection, queries = Collection(path=collection_path), Queries(path=queries_path)

# queries_ids = list(queries)
# queries = list(queries.values())

Let's look at an example of a query

In [None]:
queries[8]

... and an example of a passage from the collection

In [None]:
print(collection[10])

### Indexing

In [None]:
nbits = 2   # encode each dimension with 2 bits
doc_maxlen = 300 # truncate passages at 300 tokens

index_name = f'{dataset_name}.{nbits}bits'

In [None]:
checkpoint = 'colbert-ir/colbertv2.0'

In [None]:
with Run().context(RunConfig(nranks=1, experiment='notebook')):  # nranks specifies the number of GPUs to use
    config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits, kmeans_niters=4) # kmeans_niters specifies the number of iterations of k-means clustering; 4 is a good and fast default.
                                                                                # Consider larger numbers for small datasets.

    indexer = Indexer(checkpoint=checkpoint, config=config)
    indexer.index(name=index_name, collection=collection, overwrite='reuse')

In [None]:
indexer.index??

In [None]:
indexer.get_index()

In [None]:
# To create the searcher using its relative name (i.e., not a full path), set
# experiment=value_used_for_indexing in the RunConfig.
with Run().context(RunConfig(experiment='notebook')):
    searcher = Searcher(index=index_name, collection=collection)

In [None]:
query = queries[8] # try with an in-range query or supply your own
print(f"#> {query}")

In [None]:
# Find the top-3 passages for this query
results = searcher.search(query, k=10)

In [None]:
qrels['8']

In [None]:
# Print out the top-k retrieved passages
for passage_id, passage_rank, passage_score in zip(*results):
    print(f"\t [{passage_rank}] \t\t {passage_score:.1f} \t\t {searcher.collection[passage_id]}")

### Batch Search

In [None]:
rankings = searcher.search_all(queries, 5)

In [None]:
rankings.todict()

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()