# Baseline

> Baseline Evaluation: baseline evaluation for the zeroqaret project. The evaluation modules here will also be reused for other evaluations.

In [1]:
#| default_exp baseline

In [1]:
#| hide
# Run nbdev's export step for this project.
import nbdev

nbdev.nbdev_export()

In [2]:
#| hide
from nbdev.showdoc import *

## Imports

In [3]:
from loguru import logger
import os
from pathlib import Path
from fastcore.basics import patch_to, patch

from zeroqaret.dataset import BEIRDataset, our_list as eval_list
from zeroqaret.evaluation import ColBERTRetrievalSearch, ResultsCollector

from beir.retrieval import models
from beir.retrieval.search.dense import DenseRetrievalExactSearch
from beir.retrieval.evaluation import EvaluateRetrieval
from beir import util
from time import time

from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection

import random

  from tqdm.autonotebook import tqdm


## Get Datasets

In [4]:
# Datasets to be evaluated (`our_list` from zeroqaret.dataset, imported here as `eval_list`).
eval_list 

['fiqa', 'trec-covid']

In [5]:
# Dataset helper: used below to load each BeIR dataset and to convert it for ColBERT.
beir_datasets = BEIRDataset()

[32m2023-10-26 17:26:32.422[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36m__init__[0m:[36m51[0m - [1mDatasets will be saved in '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets'[0m


## FIQA

In [15]:
dataset_name = "fiqa"
corpus, queries, qrels = beir_datasets.load_dataset(dataset_name)

# BeIR document ids may not be monotonic, whereas ColBERT uses enumerated
# indices. Keep a lookup from one to the other:
#     collection_ids = {colbert_index: beir_index}
collection_ids = {position: str(doc_id) for position, doc_id in enumerate(corpus)}

# Convert the dataset for ColBERT, then load the resulting collection and queries.
collection_path, queries_path = beir_datasets.convert_for_colbert(dataset_name)
colbert_collection = Collection(path=collection_path)
colbert_queries = Queries(path=queries_path)

  0%|          | 0/57638 [00:00<?, ?it/s]

  0%|          | 0/57638 [00:00<?, ?it/s]

[32m2023-10-26 16:12:36.997[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mconvert_for_colbert[0m:[36m98[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/fiqa/colbert/fiqa_collection.tsv ...[0m
100%|████| 57638/57638 [00:00<00:00, 66117.34it/s]
[32m2023-10-26 16:12:37.884[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mconvert_for_colbert[0m:[36m105[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/fiqa/colbert/fiqa_queries.tsv ...[0m
100%|███████| 648/648 [00:00<00:00, 487604.77it/s]

[Oct 26, 16:12:37] #> Loading collection...
0M 





[Oct 26, 16:12:38] #> Loading the queries from /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/fiqa/colbert/fiqa_queries.tsv ...
[Oct 26, 16:12:38] #> Got 648 queries. All QIDs are unique.



### Baseline SBERT

In [16]:
# SBERT baseline settings.
# NOTE(review): `batch_size` and `normalize` are not referenced by the other
# cells visible in this notebook (the retriever cell hard-codes its batch size).
sbert_model_name = "all-mpnet-base-v2"
sbert_model = models.SentenceBERT(model_path=sbert_model_name)
batch_size = 256  # must be a plain int: a trailing comma here would bind the tuple (256,)
normalize = True

In [17]:
# Wrap the SBERT encoder in BeIR's exact dense search and score with dot product.
# corpus_chunk_size = 512 * 9999 (~5.1M) is far larger than the FiQA corpus
# (57,638 documents per the progress bars above), so the corpus is presumably
# encoded as a single chunk -- TODO confirm against the BeIR implementation.
sbert_model = DenseRetrievalExactSearch(
    models.SentenceBERT(sbert_model_name),
    batch_size=256,
    corpus_chunk_size=512 * 9999,
)
sbert_retriever = EvaluateRetrieval(sbert_model, score_function="dot")

In [18]:
# Time the SBERT retrieval over the FiQA corpus and queries (wall-clock, via time()).
start_time = time()
sbert_results = sbert_retriever.retrieve(corpus, queries)
end_time = time()
print("Time taken to retrieve: {:.2f} seconds".format(end_time - start_time))

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/226 [00:00<?, ?it/s]

Time taken to retrieve: 284.14 seconds


In [27]:
# Collector for the FiQA metrics; one column is added per evaluated model below.
results_collector = ResultsCollector()

In [28]:
# Evaluate the SBERT run against the FiQA qrels and record it as "SBERT Baseline".
results_collector.evaluate("SBERT Baseline", sbert_retriever, qrels, sbert_results)

In [29]:
# Show the NDCG@k table collected so far.
results_collector.ndcg

Unnamed: 0,SBERT Baseline
NDCG@1,0.49074
NDCG@3,0.45489
NDCG@5,0.47133
NDCG@10,0.49963
NDCG@100,0.56564
NDCG@1000,0.58932


### Baseline ColBERT

In [36]:
# ColBERT baseline configuration for the current `dataset_name`.
checkpoint = "colbert-ir/colbertv2.0"
experiment_name = "baseline"

nbits = 2         # encode each dimension with 2 bits
doc_maxlen = 300  # truncate passages at 300 tokens
kmeans = 4        # the config echoed by the indexing run below lists "kmeans_niters": 4

# The index name depends on the dataset, so each dataset gets its own index directory.
index_name = f"{experiment_name}_{dataset_name}.{nbits}bits"

In [37]:
# Build the ColBERT retrieval wrapper for FiQA. In the recorded run this cell
# created the index directory, encoded the passages, and loaded the index.
# NOTE(review): `overwrite_param=True` presumably replaces an existing index of
# the same name -- confirm in zeroqaret.evaluation.
colbert_model = ColBERTRetrievalSearch(
    checkpoint,
    index_name,
    experiment_name,
    colbert_collection,
    collection_ids,
    doc_maxlen,
    nbits,
    kmeans,
    overwrite_param=True,
)



[Oct 26, 16:59:17] #> Creating directory /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/notebook/indexes/baseline_fiqa.2bits 


#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 2,
    "kmeans_niters": 4,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 3e-6,
    "maxsteps": 500000,
    "save_every": null,
    "warmup": null,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": false,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 300,
    "ma

0it [00:00, ?it/s]

[Oct 26, 17:02:16] [0] 		 #> Saving chunk 0: 	 25,000 passages and 3,215,621 embeddings. From #0 onward.


1it [00:55, 55.68s/it]

[Oct 26, 17:02:17] [0] 		 #> Encoding 25000 passages..
[Oct 26, 17:03:12] [0] 		 #> Saving chunk 1: 	 25,000 passages and 3,246,215 embeddings. From #25,000 onward.


2it [01:51, 55.69s/it]

[Oct 26, 17:03:13] [0] 		 #> Encoding 7638 passages..
[Oct 26, 17:03:30] [0] 		 #> Saving chunk 2: 	 7,638 passages and 975,896 embeddings. From #50,000 onward.


3it [02:08, 42.81s/it]
100%|██████████| 3/3 [00:00<00:00, 285.55it/s]


[Oct 26, 17:03:30] [0] 		 #> Checking all files were saved...
[Oct 26, 17:03:30] [0] 		 Found all files!
[Oct 26, 17:03:30] [0] 		 #> Building IVF...
[Oct 26, 17:03:30] [0] 		 #> Loading codes...
[Oct 26, 17:03:30] [0] 		 Sorting codes...
[Oct 26, 17:03:31] [0] 		 Getting unique codes...
[Oct 26, 17:03:31] #> Optimizing IVF to store map from centroids to list of pids..
[Oct 26, 17:03:31] #> Building the emb2pid mapping..
[Oct 26, 17:03:31] len(emb2pid) = 7437732


100%|██████████| 32768/32768 [00:00<00:00, 40875.58it/s]


[Oct 26, 17:03:32] #> Saved optimized IVF to /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/notebook/indexes/baseline_fiqa.2bits/ivf.pid.pt
[Oct 26, 17:03:32] [0] 		 #> Saving the indexing metadata to /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/notebook/indexes/baseline_fiqa.2bits/metadata.json ..
#> Joined...
[Oct 26, 17:03:37] #> Loading codec...
[Oct 26, 17:03:37] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Oct 26, 17:03:37] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Oct 26, 17:03:37] #> Loading IVF...
[Oct 26, 17:03:38] #> Loading doclens...


100%|██████████████| 3/3 [00:00<00:00, 808.25it/s]

[Oct 26, 17:03:38] #> Loading codes and residuals...



100%|███████████████| 3/3 [00:00<00:00, 17.86it/s]


In [41]:
# Wrap the ColBERT search in BeIR's EvaluateRetrieval (no score_function passed, so its default applies).
colbert_retriever = EvaluateRetrieval(colbert_model)

In [42]:
# Run ColBERT retrieval for the FiQA queries, using the ColBERT-format collection and queries.
colbert_results = colbert_retriever.retrieve(colbert_collection, colbert_queries)

648it [00:08, 75.66it/s]


In [43]:
# Evaluate the ColBERT run against the same FiQA qrels and record it as "ColBERT Baseline".
results_collector.evaluate("ColBERT Baseline", colbert_retriever, qrels, colbert_results)

In [46]:
# Show the NDCG@k table, now with both the SBERT and ColBERT baselines.
results_collector.ndcg

Unnamed: 0,SBERT Baseline,ColBERT Baseline
NDCG@1,0.49074,0.33796
NDCG@3,0.45489,0.31461
NDCG@5,0.47133,0.32837
NDCG@10,0.49963,0.35277
NDCG@100,0.56564,0.41278
NDCG@1000,0.58932,0.442


> Results for FIQA

## TREC-Covid

In [6]:
dataset_name = "trec-covid"
trec_corpus, trec_queries, trec_qrels = beir_datasets.load_dataset(dataset_name)

# BeIR document ids may not be monotonic, whereas ColBERT uses enumerated
# indices. Keep a lookup from one to the other:
#     trec_collection_ids = {colbert_index: beir_index}
trec_collection_ids = {position: str(doc_id) for position, doc_id in enumerate(trec_corpus)}

# Convert the dataset for ColBERT, then load the resulting collection and queries.
trec_collection_path, trec_queries_path = beir_datasets.convert_for_colbert(dataset_name)
trec_colbert_collection = Collection(path=trec_collection_path)
trec_colbert_queries = Queries(path=trec_queries_path)

[32m2023-10-26 17:26:34.698[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m62[0m - [1mDownloading dataset 'trec-covid'...[0m
[32m2023-10-26 17:26:34.701[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mload_dataset[0m:[36m65[0m - [1mSaved on '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/trec-covid'[0m


  0%|          | 0/171332 [00:00<?, ?it/s]

  0%|          | 0/171332 [00:00<?, ?it/s]

[32m2023-10-26 17:26:37.782[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mconvert_for_colbert[0m:[36m98[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/trec-covid/colbert/trec-covid_collection.tsv ...[0m
100%|██| 171332/171332 [00:03<00:00, 44269.15it/s]
[32m2023-10-26 17:26:41.714[0m | [1mINFO    [0m | [36mzeroqaret.dataset[0m:[36mconvert_for_colbert[0m:[36m105[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/trec-covid/colbert/trec-covid_queries.tsv ...[0m
100%|█████████| 50/50 [00:00<00:00, 300021.75it/s]


[Oct 26, 17:26:41] #> Loading collection...
0M 
[Oct 26, 17:26:42] #> Loading the queries from /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/trec-covid/colbert/trec-covid_queries.tsv ...
[Oct 26, 17:26:42] #> Got 50 queries. All QIDs are unique.



### Baseline SBERT

In [7]:
# SBERT baseline settings.
# NOTE(review): `batch_size` and `normalize` are not referenced by the other
# cells visible in this notebook (the retriever cell hard-codes its batch size).
sbert_model_name = "all-mpnet-base-v2"
sbert_model = models.SentenceBERT(model_path=sbert_model_name)
batch_size = 256  # must be a plain int: a trailing comma here would bind the tuple (256,)
normalize = True

In [8]:
# Wrap the SBERT encoder in BeIR's exact dense search and score with dot product.
# corpus_chunk_size = 512 * 9999 (~5.1M) is far larger than the TREC-COVID corpus
# (171,332 documents per the progress bars above), so the corpus is presumably
# encoded as a single chunk -- TODO confirm against the BeIR implementation.
sbert_model = DenseRetrievalExactSearch(
    models.SentenceBERT(sbert_model_name),
    batch_size=128,
    corpus_chunk_size=512 * 9999,
)
sbert_retriever = EvaluateRetrieval(sbert_model, score_function="dot")

In [9]:
# Time the SBERT retrieval over the TREC-COVID corpus and queries (wall-clock, via time()).
start_time = time()
sbert_trec_results = sbert_retriever.retrieve(trec_corpus, trec_queries)
end_time = time()
print("Time taken to retrieve: {:.2f} seconds".format(end_time - start_time))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1339 [00:00<?, ?it/s]

Time taken to retrieve: 985.40 seconds


In [20]:
from fastcore.utils import save_pickle

# Persist the SBERT retrieval results so the slow retrieval step (about 985 s in
# the recorded run) does not have to be repeated.
sbert_trec_results_path = Path(f"../datasets/{dataset_name}/results/sbert_trec_results.pkl")

# Create the `results/` folder first: writing into a missing directory raises
# FileNotFoundError, and nothing else in this notebook creates it.
sbert_trec_results_path.parent.mkdir(parents=True, exist_ok=True)

save_pickle(sbert_trec_results_path, sbert_trec_results)

In [10]:
# Separate collector for the TREC-COVID metrics; one column is added per evaluated model below.
trec_results_collector = ResultsCollector()

In [12]:
# Evaluate the SBERT run against the TREC-COVID qrels and record it as "SBERT Baseline".
trec_results_collector.evaluate("SBERT Baseline", sbert_retriever, trec_qrels, sbert_trec_results)

In [15]:
# Show the MAP@k table collected so far.
trec_results_collector.map

Unnamed: 0,SBERT Baseline
MAP@1,0.00195
MAP@3,0.00465
MAP@5,0.00672
MAP@10,0.01218
MAP@100,0.07061
MAP@1000,0.19734


### Baseline ColBERT

In [21]:
# ColBERT baseline configuration for the current `dataset_name`.
checkpoint = "colbert-ir/colbertv2.0"
experiment_name = "baseline"

nbits = 2         # encode each dimension with 2 bits
doc_maxlen = 300  # truncate passages at 300 tokens
kmeans = 4        # the config echoed by the indexing run below lists "kmeans_niters": 4

# The index name depends on the dataset, so each dataset gets its own index directory.
index_name = f"{experiment_name}_{dataset_name}.{nbits}bits"

In [22]:
# Build the ColBERT retrieval wrapper for TREC-COVID. In the recorded run this
# cell created the index directory, encoded the passages, and loaded the index.
# NOTE(review): `overwrite_param=True` presumably replaces an existing index of
# the same name -- confirm in zeroqaret.evaluation.
colbert_model = ColBERTRetrievalSearch(
    checkpoint,
    index_name,
    experiment_name,
    trec_colbert_collection,
    trec_collection_ids,
    doc_maxlen,
    nbits,
    kmeans,
    overwrite_param=True,
)



[Oct 26, 17:49:00] #> Creating directory /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/notebook/indexes/baseline_trec-covid.2bits 


#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 2,
    "kmeans_niters": 4,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 3e-6,
    "maxsteps": 500000,
    "save_every": null,
    "warmup": null,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": false,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 300,
 

0it [00:00, ?it/s]

[Oct 26, 17:54:44] [0] 		 #> Saving chunk 0: 	 25,000 passages and 3,741,686 embeddings. From #0 onward.


1it [00:57, 57.85s/it]

[Oct 26, 17:54:46] [0] 		 #> Encoding 25000 passages..
[Oct 26, 17:55:42] [0] 		 #> Saving chunk 1: 	 25,000 passages and 4,776,533 embeddings. From #25,000 onward.


2it [01:56, 58.56s/it]

[Oct 26, 17:55:46] [0] 		 #> Encoding 25000 passages..
[Oct 26, 17:56:42] [0] 		 #> Saving chunk 2: 	 25,000 passages and 4,964,069 embeddings. From #50,000 onward.


3it [02:56, 58.90s/it]

[Oct 26, 17:56:45] [0] 		 #> Encoding 25000 passages..
[Oct 26, 17:57:40] [0] 		 #> Saving chunk 3: 	 25,000 passages and 3,995,756 embeddings. From #75,000 onward.


4it [03:54, 58.54s/it]

[Oct 26, 17:57:43] [0] 		 #> Encoding 25000 passages..
[Oct 26, 17:58:38] [0] 		 #> Saving chunk 4: 	 25,000 passages and 3,455,785 embeddings. From #100,000 onward.


5it [04:51, 58.11s/it]

[Oct 26, 17:58:40] [0] 		 #> Encoding 25000 passages..
[Oct 26, 17:59:36] [0] 		 #> Saving chunk 5: 	 25,000 passages and 4,875,300 embeddings. From #125,000 onward.


6it [05:50, 58.40s/it]

[Oct 26, 17:59:39] [0] 		 #> Encoding 21332 passages..
[Oct 26, 18:00:27] [0] 		 #> Saving chunk 6: 	 21,332 passages and 4,102,886 embeddings. From #150,000 onward.


7it [06:40, 57.28s/it]
  0%|          | 0/7 [00:00<?, ?it/s]

[Oct 26, 18:00:30] [0] 		 #> Checking all files were saved...
[Oct 26, 18:00:30] [0] 		 Found all files!
[Oct 26, 18:00:30] [0] 		 #> Building IVF...
[Oct 26, 18:00:30] [0] 		 #> Loading codes...
[Oct 26, 18:00:30] [0] 		 Sorting codes...


100%|██████████| 7/7 [00:00<00:00, 154.08it/s]


[Oct 26, 18:00:32] [0] 		 Getting unique codes...
[Oct 26, 18:00:32] #> Optimizing IVF to store map from centroids to list of pids..
[Oct 26, 18:00:32] #> Building the emb2pid mapping..
[Oct 26, 18:00:33] len(emb2pid) = 29912015


100%|██████████| 65536/65536 [00:03<00:00, 20425.94it/s]


[Oct 26, 18:00:36] #> Saved optimized IVF to /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/notebook/indexes/baseline_trec-covid.2bits/ivf.pid.pt
[Oct 26, 18:00:36] [0] 		 #> Saving the indexing metadata to /home/bengsoon/Projects/xcs224u_project/zeroqaret/nbs/experiments/notebook/indexes/baseline_trec-covid.2bits/metadata.json ..
#> Joined...
[Oct 26, 18:00:43] #> Loading codec...
[Oct 26, 18:00:43] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Oct 26, 18:00:43] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Oct 26, 18:00:44] #> Loading IVF...
[Oct 26, 18:00:44] #> Loading doclens...


100%|██████████████| 7/7 [00:00<00:00, 707.47it/s]

[Oct 26, 18:00:44] #> Loading codes and residuals...



100%|███████████████| 7/7 [00:00<00:00, 11.13it/s]


In [23]:
# Wrap the TREC-COVID ColBERT search in BeIR's EvaluateRetrieval (no score_function passed, so its default applies).
colbert_retriever = EvaluateRetrieval(colbert_model)

In [24]:
# Run ColBERT retrieval for the TREC-COVID queries, using the ColBERT-format collection and queries.
trec_colbert_results = colbert_retriever.retrieve(trec_colbert_collection, trec_colbert_queries)


#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . what is the origin of COVID-19, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2054,  2003,  1996,  4761,  1997,  2522, 17258,  1011,
         2539,   102,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])



50it [00:01, 48.26it/s]


In [25]:
# Evaluate the ColBERT run against the same TREC-COVID qrels and record it as "ColBERT Baseline".
trec_results_collector.evaluate("ColBERT Baseline", colbert_retriever, trec_qrels, trec_colbert_results)

In [26]:
# Show the NDCG@k table with both the SBERT and ColBERT baselines.
trec_results_collector.ndcg

Unnamed: 0,SBERT Baseline,ColBERT Baseline
NDCG@1,0.6,0.79
NDCG@3,0.54989,0.76061
NDCG@5,0.52905,0.76205
NDCG@10,0.51318,0.73627
NDCG@100,0.41738,0.53686
NDCG@1000,0.4322,0.45803


In [27]:
# Show the MAP@k table with both the SBERT and ColBERT baselines.
trec_results_collector.map

Unnamed: 0,SBERT Baseline,ColBERT Baseline
MAP@1,0.00195,0.00205
MAP@3,0.00465,0.00578
MAP@5,0.00672,0.00945
MAP@10,0.01218,0.0174
MAP@100,0.07061,0.09543
MAP@1000,0.19734,0.22305


In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev

nbdev.nbdev_export()