# Dataset

> Dataset preparation and exploration: this is where we download, prepare, and explore the BEIR datasets used in our experiments.

In [102]:
#| default_exp dataset

In [21]:
#| hide
from nbdev.showdoc import *

In [22]:
#| hide
import nbdev; nbdev.nbdev_export()

## Imports

In [23]:
#| export
from loguru import logger
import os
from pathlib import Path
from fastcore.basics import patch_to, patch

from beir.datasets.data_loader import GenericDataLoader
from beir import util

import csv
from tqdm import tqdm

from typing import Union, List, Tuple
import re

## BEIR datasets

In [24]:
#| export
# BEIR dataset identifiers, one per line for easier diffing.
# NOTE(review): presumably the BEIR subsets evaluated in UDAPDR — confirm.
udapdr_list = [
    'arguana',
    'webis-touche2020',
    'trec-covid',
    'nfcorpus',
    'hotpotqa',
    'dbpedia-entity',
    'climate-fever',
    'fever',
    'scifact',
    'scidocs',
    'fiqa',
]

#| export
# Both entries also appear in `udapdr_list`.
our_list = [
    "fiqa",
    "trec-covid",
]

In [25]:
#| export
class BEIRDataset:
    """Download BEIR datasets on demand and load them with `GenericDataLoader`.

    Attributes are documented with comments in `__init__`.
    """

    def __init__(self, 
                 url: str = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip"
                ):
        """ Wrapper to load and manage BEIR datasets.

        Args:
            url: URL template for a dataset archive; `{}` is replaced by the
                dataset name in `load_dataset`.
        """
        # URL template, formatted with the dataset name at download time.
        self.url = url

        ###  self.out_dir is where we will save all the datasets ###
        try:
            # Running as an exported module: go two levels up from this file.
            parent_dir = Path(__file__).absolute().parents[1]
        except (NameError, IndexError):
            # `__file__` is undefined in an interactive session/notebook (NameError),
            # or the file sits too close to the filesystem root (IndexError):
            # fall back to the parent of the current working directory.
            # Deliberately narrower than a bare `except:` so that
            # KeyboardInterrupt/SystemExit are never swallowed here.
            parent_dir = Path().absolute().parents[0]
        self.out_dir = (parent_dir / "datasets").as_posix()
        logger.info(f"Datasets will be saved in '{self.out_dir}'")

        ### this is where we will store all the datasets with their corresponding paths ###
        # Maps dataset name -> path returned by `util.download_and_unzip`.
        self.datasets = {}

    def load_dataset(self,
                     dataset: str,
                     split: str = "test",
                    ) -> Tuple[dict, dict, dict]:
        """Fetch `dataset` if this instance has not seen it yet, then load one split.

        Args:
            dataset: Dataset name substituted into `self.url` (e.g. "fiqa").
            split: Split name forwarded to `GenericDataLoader.load`.

        Returns:
            The loader's result: corpus, queries, qrels.
        """
        if dataset not in self.datasets:
            # Only names unknown to *this instance* trigger the download helper;
            # the mapping is in-memory and starts empty for every new instance.
            logger.info(f"Downloading dataset '{dataset}'...")
            cur_url = self.url.format(dataset)
            dataset_path = util.download_and_unzip(cur_url, self.out_dir)
            logger.info(f"Saved on '{dataset_path}'")

            self.datasets[dataset] = dataset_path

        return GenericDataLoader(self.datasets[dataset]).load(split=split) # corpus, queries, qrels

    

In [26]:
# Instantiate the wrapper with the default URL template; it logs where datasets will be saved.
beir_dataset = BEIRDataset()

[32m2023-10-26 09:30:01.422[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m25[0m - [1mDatasets will be saved in '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets'[0m


In [16]:
# Fetch TREC-COVID (first request on this instance) and load its default "test" split.
trec_covid = beir_dataset.load_dataset("trec-covid")

[32m2023-10-24 10:43:37.347[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataset[0m:[36m35[0m - [1mDownloading dataset 'trec-covid'...[0m


/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/trec-covid.zip:   0%|          | 0.00/70.5M [00:00<…

[32m2023-10-24 10:45:07.605[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataset[0m:[36m38[0m - [1mSaved on '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/trec-covid'[0m


  0%|          | 0/171332 [00:00<?, ?it/s]

In [202]:
# Show the dataset-name -> local-path mapping filled in by `load_dataset`.
beir_dataset.datasets

{'trec-covid': '/home/bengsoon/Projects/xcs224u_project/zeroshot_qa_retrieval/datasets/trec-covid'}

In [27]:
#| export
@patch_to(BEIRDataset)
def convert_for_colbert(self,
                         dataset: str,
                         split: str = "test",
                        ) -> Tuple[str, str]: 

    """ 
    Converts downloaded BeIR datasets into tsv for ColBERT.
    Returns (collection_path, queries_path)

    Two tab-separated files are written to a `colbert` sub-directory of the
    dataset's folder:

    * `{dataset}_collection.tsv`: one row per document with
      `running index, "<title> <text>", original doc id`.
    * `{dataset}_queries.tsv`: one row per query with `query id, query text`.

    Tabs, carriage returns and newlines inside titles, texts and queries are
    replaced by spaces (via `self.preprocess`) so each record stays on one line.

    Args:
        dataset: Dataset name, passed to `load_dataset`.
        split: Split name, passed to `load_dataset`.
    """

    # load corpus and queries (qrels are not needed for the conversion)
    corpus, queries, _ = self.load_dataset(dataset, split)

    # make dir
    colbert_dir = Path(self.datasets[dataset]) / "colbert"
    colbert_dir.mkdir(parents=True, exist_ok=True)

    # collection_path and queries_path
    collection_path = (colbert_dir / f"{dataset}_collection.tsv").as_posix()
    queries_path = (colbert_dir / f"{dataset}_queries.tsv").as_posix()

    # NOTE(review): with QUOTE_MINIMAL a field containing a double quote is
    # wrapped in quotes and its quotes are doubled — confirm the downstream
    # TSV reader tolerates that.
    logger.info("Preprocessing Corpus and Saving to {} ...".format(collection_path))
    # newline="" is what the csv module requires of files handed to csv.writer
    # (otherwise Windows emits "\r\r\n"); utf-8 keeps non-ASCII text portable
    # regardless of the platform's default encoding.
    with open(collection_path, "w", newline="", encoding="utf-8") as f_out:
        writer = csv.writer(f_out, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        for idx, (doc_id, doc) in enumerate(tqdm(corpus.items(), total=len(corpus))):
            title = self.preprocess(doc.get("title", ""))
            text = self.preprocess(doc.get("text", ""))
            writer.writerow([idx, (title + " " + text).strip(), doc_id])

    logger.info("Preprocessing Queries and Saving to {} ...".format(queries_path))
    with open(queries_path, "w", newline="", encoding="utf-8") as f_out:
        writer = csv.writer(f_out, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        for qid, query in tqdm(queries.items(), total=len(queries)):
            # Clean queries the same way as documents so an embedded tab/newline
            # cannot split a record.
            writer.writerow([qid, self.preprocess(query)])

    return collection_path, queries_path

@patch_to(BEIRDataset)
def preprocess(self, text):
    """Return `text` with each carriage return, tab and newline replaced by one space."""
    cleaned = text
    for control_char in ("\r", "\t", "\n"):
        cleaned = cleaned.replace(control_char, " ")
    return cleaned

In [28]:
# Fresh instance: its in-memory `datasets` mapping starts empty again.
beir_dataset = BEIRDataset()

[32m2023-10-26 09:30:07.455[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m25[0m - [1mDatasets will be saved in '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets'[0m


In [30]:
# Write FiQA's collection and queries TSVs for ColBERT; the cell displays the two output paths.
beir_dataset.convert_for_colbert("fiqa")

  0%|          | 0/57638 [00:00<?, ?it/s]

[32m2023-10-26 09:31:33.190[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_for_colbert[0m:[36m26[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/fiqa/colbert/fiqa_collection.tsv ...[0m
100%|████| 57638/57638 [00:00<00:00, 64175.72it/s]
[32m2023-10-26 09:31:34.103[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_for_colbert[0m:[36m33[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/fiqa/colbert/fiqa_queries.tsv ...[0m
100%|███████| 648/648 [00:00<00:00, 398403.55it/s]


('/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/fiqa/colbert/fiqa_collection.tsv',
 '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/fiqa/colbert/fiqa_queries.tsv')

In [206]:
# Same conversion for TREC-COVID (default "test" split).
beir_dataset.convert_for_colbert("trec-covid")

[32m2023-10-23 18:00:08.237[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataset[0m:[36m35[0m - [1mDownloading dataset 'trec-covid'...[0m
[32m2023-10-23 18:00:08.239[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataset[0m:[36m38[0m - [1mSaved on '/home/bengsoon/Projects/xcs224u_project/zeroshot_qa_retrieval/datasets/trec-covid'[0m


  0%|          | 0/171332 [00:00<?, ?it/s]

[32m2023-10-23 18:00:09.665[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_for_colbert[0m:[36m23[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroshot_qa_retrieval/datasets/trec-covid/colbert/trec-covid_collection.tsv ...[0m
100%|██| 171332/171332 [00:03<00:00, 47369.46it/s]
[32m2023-10-23 18:00:13.286[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_for_colbert[0m:[36m30[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroshot_qa_retrieval/datasets/trec-covid/colbert/trec-covid_queries.tsv ...[0m
100%|█████████| 50/50 [00:00<00:00, 365357.49it/s]


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()