# Dataset

> Dataset preparation & exploration: this is where we prepare and explore the datasets for our experiment.

In [102]:
#| default_exp dataset

In [1]:
#| hide
from nbdev.showdoc import *

In [22]:
#| hide
import nbdev; nbdev.nbdev_export()

## Imports

In [120]:
#| export
from loguru import logger
import os
from pathlib import Path
from fastcore.basics import patch_to, patch
from fastcore.utils import  *

from beir.datasets.data_loader import GenericDataLoader
from beir import util

from zeroqaret.helper import write_file

import csv
from tqdm import tqdm

from typing import Union, List, Tuple
import re

import pandas as pd
import ujson

## BEIR datasets

In [24]:
#| export
# BEIR dataset names grouped under the "udapdr" label.
# NOTE(review): presumably the datasets evaluated in the UDAPDR work — confirm.
udapdr_list = [
    'arguana',
    'webis-touche2020',
    'trec-covid',
    'nfcorpus',
    'hotpotqa',
    'dbpedia-entity',
    'climate-fever',
    'fever',
    'scifact',
    'scidocs',
    'fiqa',
]

# BEIR dataset names singled out for this project.
# NOTE(review): presumably the subset used in our own experiment — confirm.
our_list = [
    "fiqa",
    "trec-covid",
]

In [3]:
#| export
class BEIRDataset:
    """
    Wrapper to download, cache and load BEIR datasets.

    Attributes (set in `__init__`):
        url: URL template whose `{}` placeholder is filled with the dataset name.
        out_dir: POSIX path of the folder where datasets are downloaded and unzipped.
        datasets: mapping `dataset name -> local dataset path`, filled by `load_dataset`.
    """

    def __init__(self, 
                 url: str = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip" # URL template; `{}` is replaced by the dataset name
                ):
        """ Wrapper to load and manage BEIR datasets."""
        self.url = url

        # `self.out_dir` is where all the datasets are saved: a `datasets` folder
        # two levels above this file, or one level above the current working
        # directory when no module file is available.
        try:
            parent_dir = Path(__file__).absolute().parents[1]
        except (NameError, IndexError):
            # NameError: `__file__` is not defined (e.g. when run from a notebook).
            # IndexError: the file path is too shallow to have a grandparent folder.
            # Anything else (KeyboardInterrupt, real bugs) is no longer swallowed.
            parent_dir = Path().absolute().parents[0]
        self.out_dir = (parent_dir / "datasets").as_posix()
        logger.info(f"Datasets will be saved in '{self.out_dir}'")

        # dataset name -> local path of the downloaded dataset
        self.datasets = {}

    def load_dataset(self,
                     dataset: str, # dataset name, substituted into `self.url`
                     split: str = "test", # split passed on to `GenericDataLoader.load`
                    ):
        """
        Download `dataset` (only the first time it is requested on this instance)
        and load the requested `split`.

        Returns whatever `GenericDataLoader(...).load(split=split)` returns,
        i.e. corpus, queries, qrels.
        """
        if dataset not in self.datasets:
            logger.info(f"Downloading dataset '{dataset}'...")
            cur_url = self.url.format(dataset)
            dataset_path = util.download_and_unzip(cur_url, self.out_dir)
            logger.info(f"Saved on '{dataset_path}'")

            # remember the local path so later calls skip the download step
            self.datasets[dataset] = dataset_path

        return GenericDataLoader(self.datasets[dataset]).load(split=split) # corpus, queries, qrels

    

In [26]:
# Instantiate the wrapper with the default download URL template; the constructor logs the output folder
beir_dataset = BEIRDataset()

[32m2023-10-26 09:30:01.422[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m25[0m - [1mDatasets will be saved in '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets'[0m


In [16]:
# Download trec-covid (first request on this instance) and load its default "test" split
trec_covid = beir_dataset.load_dataset("trec-covid")

[32m2023-10-24 10:43:37.347[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataset[0m:[36m35[0m - [1mDownloading dataset 'trec-covid'...[0m


/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/trec-covid.zip:   0%|          | 0.00/70.5M [00:00<…

[32m2023-10-24 10:45:07.605[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataset[0m:[36m38[0m - [1mSaved on '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets/trec-covid'[0m


  0%|          | 0/171332 [00:00<?, ?it/s]

In [202]:
# Inspect the dataset-name -> local-path mapping that `load_dataset` fills in
beir_dataset.datasets

{'trec-covid': '/home/bengsoon/Projects/xcs224u_project/zeroshot_qa_retrieval/datasets/trec-covid'}

In [132]:
#| export
@patch_to(BEIRDataset)
def convert_for_colbert(self,
                         dataset: str, # name of the BEIR dataset to convert
                         split: str = "test", # split passed on to `load_dataset`
                        ) -> Tuple[str, str]: 

    """ 
    Converts downloaded BeIR datasets into tsv for ColBERT.

    Two files are written into a `colbert` folder inside the dataset folder:
        - `{dataset}_collection.tsv`: "row index \t title + text \t original doc id"
        - `{dataset}_queries.tsv`: "row index \t query text"

    Note that the row index (not the original BEIR query id) is written as the
    first column of the queries file.

    Returns (collection_path, queries_path)
    """

    # load corpus and queries (qrels are not needed for the conversion)
    corpus, queries, _ = self.load_dataset(dataset, split)

    # make dir
    colbert_dir = Path(self.datasets[dataset]) / "colbert"
    colbert_dir.mkdir(parents=True, exist_ok=True)

    # collection_path and queries_path
    collection_path = (colbert_dir / f"{dataset}_collection.tsv").as_posix()
    queries_path = (colbert_dir / f"{dataset}_queries.tsv").as_posix()

    logger.info("Preprocessing Corpus and Saving to {} ...".format(collection_path))
    # newline="" is what the csv module expects for files it writes to; the
    # explicit encoding keeps the output independent of the platform default.
    with open(collection_path, 'w', newline="", encoding="utf-8") as f_out:
        writer = csv.writer(f_out, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        for idx, (doc_id, doc) in enumerate(tqdm(corpus.items(), total=len(corpus))):
            title = self.preprocess(doc.get("title", ""))
            text = self.preprocess(doc.get("text", ""))
            writer.writerow([idx, (title + " " + text).strip(), doc_id])

    logger.info("Preprocessing Queries and Saving to {} ...".format(queries_path))
    with open(queries_path, 'w', newline="", encoding="utf-8") as f_out:
        writer = csv.writer(f_out, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        for idx, query in enumerate(tqdm(queries.values(), total=len(queries))):
            # sanitise like the corpus so a tab/newline in a query cannot break the row
            writer.writerow([idx, self.preprocess(query)])

    return collection_path, queries_path

@patch_to(BEIRDataset)
def preprocess(self, text):
    """Return `text` with every carriage return, tab and newline replaced by one space each."""
    cleaned = text
    for control_char in ("\r", "\t", "\n"):
        cleaned = cleaned.replace(control_char, " ")
    return cleaned

In [131]:
# Scratch check of a guard condition: with these values, does the warning branch fire?
overwrite, mode = False, "a"

# same truth table as `(overwrite and mode == "a") or (not overwrite)`
if (not overwrite) or (mode == "a"):
    print("Warning!")



In [126]:
#| export
@patch_to(BEIRDataset)
def prepare_qg_for_colbert_training(self,
                                    csv_path: str, # path to CSV training file (containing pid, passage, question, title columns)
                                    mode: str = "a", # mode used to open the output files ("a" appends, "w" truncates first)
                                    replace: bool = False, # a non-empty output folder is only accepted when `replace=True` and `mode != "a"`
                                    seed: int = None, # optional seed to make the negative sampling reproducible
                                ) -> Tuple[str, str, str]:
    """
    Converting and preparing the dataframe loaded from `csv_path` (containing at least 'pid', 'passage', 'question' and 'title' columns) for training.
    This will create the following in a `colbert_training` folder next to the CSV:
        - collection.tsv: TSV containing "row index \t passage text \t pid"
        - queries.tsv: TSV containing "generated qid \t title - question"
        - triples.jsonl: jsonl containing a [qid, pid+, pid-] list per line

    The row index of the CSV is used both as qid and as passage index. The
    negative of each row is the index of a *different* row: a random
    permutation is rotated by one position, so no row is paired with itself.
    NOTE(review): two rows holding the same passage text can still be paired;
    de-duplicate the CSV beforehand if that matters.

    Raises:
        - AssertionError if the output folder is not empty, unless `replace=True` and `mode != "a"`
        - ValueError if the CSV has fewer than two rows (no negative can be drawn)

    Returns tuple of:
        - Path to triples.jsonl
        - Path to queries.tsv
        - Path to collection.tsv
    """
    import random  # local import: only needed for the negative sampling below

    # this is the path to save the files
    save_path = Path(csv_path).parent / "colbert_training"
    # make directory if it does not exist
    save_path.mkdir(exist_ok=True)

    # same truth table as `(replace and mode == "a") or (not replace)`;
    # raised explicitly so the check also runs under `python -O`
    if (not replace) or (mode == "a"):
        if os.listdir(save_path):
            raise AssertionError(f"'{save_path}' is not empty! Please ensure that the folder is backed up and cleared before proceeding!")

    logger.info(f"Creating ColBERT training files in {save_path}...")

    # read csv as dataframe
    train_df = pd.read_csv(csv_path)
    n_rows = len(train_df)
    if n_rows < 2:
        raise ValueError(f"'{csv_path}' has {n_rows} row(s); at least 2 are needed to sample a negative passage.")

    # query as a combination of {title} - {question}
    queries = train_df["title"].astype(str) + " - " + train_df["question"].astype(str)

    # negative sampling: shuffle the row indices, then pair every index with its
    # successor in the shuffled order (wrapping around), so negative != positive
    order = list(range(n_rows))
    random.Random(seed).shuffle(order)
    negatives = [0] * n_rows
    for position, row_id in enumerate(order):
        negatives[row_id] = order[(position + 1) % n_rows]

    triples_path = f"{save_path}/triples.jsonl"
    queries_path = f"{save_path}/queries.tsv"
    collection_path = f"{save_path}/collection.tsv"

    # open every file once (instead of once per row) so that `mode="w"`
    # truncates a single time and then receives all the rows
    with open(triples_path, mode, encoding="utf-8") as f_triples, \
         open(queries_path, mode, encoding="utf-8") as f_queries, \
         open(collection_path, mode, encoding="utf-8") as f_collection:
        rows = zip(range(n_rows), negatives, train_df["pid"], train_df["passage"], queries)
        for row_id, negative_id, pid, passage, query in tqdm(rows, total=n_rows, desc="Training files: "):
            f_triples.write(ujson.dumps([row_id, row_id, negative_id]) + '\n')
            # tabs/newlines inside the text would break the one-row-per-line TSV layout
            f_queries.write(f"{row_id} \t {self.preprocess(query)} \n")
            f_collection.write(f"{row_id} \t {self.preprocess(str(passage))} \t {pid} \n")

    logger.info(f"triples.jsonl, queries.tsv and collection.tsv files created in {save_path}.")

    return (triples_path, queries_path, collection_path)

In [117]:
# Create a BEIRDataset instance for the conversion examples below
beir_dataset = BEIRDataset()

[32m2023-11-01 00:36:21.482[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m25[0m - [1mDatasets will be saved in '/home/bengsoon/Projects/xcs224u_project/zeroqaret/datasets'[0m


In [119]:
# Build triples.jsonl / queries.tsv / collection.tsv in a `colbert_training` folder beside the scifact QG CSV
beir_dataset.prepare_qg_for_colbert_training("../datasets/scifact/qg/scifact_qg_all.csv")

[32m2023-11-01 00:36:29.534[0m | [1mINFO    [0m | [36m__main__[0m:[36mprepare_qg_for_colbert_training[0m:[36m21[0m - [1mCreating ColBERT training files from ../datasets/scifact/qg/colbert_training...[0m
Training files: : 5183it [00:00, 22680.41it/s]
[32m2023-11-01 00:36:29.946[0m | [1mINFO    [0m | [36m__main__[0m:[36mprepare_qg_for_colbert_training[0m:[36m44[0m - [1mtriples.jsonl, queries,tsv and collection.tsv files created in ../datasets/scifact/qg/colbert_training.[0m


In [206]:
# Write the ColBERT collection/queries TSVs for trec-covid; returns (collection_path, queries_path)
beir_dataset.convert_for_colbert("trec-covid")

[32m2023-10-23 18:00:08.237[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataset[0m:[36m35[0m - [1mDownloading dataset 'trec-covid'...[0m
[32m2023-10-23 18:00:08.239[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataset[0m:[36m38[0m - [1mSaved on '/home/bengsoon/Projects/xcs224u_project/zeroshot_qa_retrieval/datasets/trec-covid'[0m


  0%|          | 0/171332 [00:00<?, ?it/s]

[32m2023-10-23 18:00:09.665[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_for_colbert[0m:[36m23[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroshot_qa_retrieval/datasets/trec-covid/colbert/trec-covid_collection.tsv ...[0m
100%|██| 171332/171332 [00:03<00:00, 47369.46it/s]
[32m2023-10-23 18:00:13.286[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_for_colbert[0m:[36m30[0m - [1mPreprocessing Corpus and Saving to /home/bengsoon/Projects/xcs224u_project/zeroshot_qa_retrieval/datasets/trec-covid/colbert/trec-covid_queries.tsv ...[0m
100%|█████████| 50/50 [00:00<00:00, 365357.49it/s]


In [122]:
#| hide
import nbdev; nbdev.nbdev_export()