# COVID Abstract Data Generator

This is the data generator script for the COVID 19 paper dataset where we look to map titles to abstracts of papers. This task shares a lot of similarity to document retrieval, as seen in RAG systems today. This dataset still has a lot of token overlap, making it somewhat easier to get good accuracy for, but it is more challenging than the Walmart-Amazon dataset.

In [None]:
# Download covid_abstracts.csv from https://www.kaggle.com/datasets/anandhuh/covid-abstracts

In [1]:
import pandas as pd

In [2]:
data_df = pd.read_csv("./data/covid-abstracts/covid_abstracts.csv")
data_df = data_df.reset_index().rename({"index": "id"}, axis=1)

In [3]:
def get_catalog_and_queries_df(title_abstract_df):

    target_ids = np.unique(walmart_df["subject_id"].values)
    catalog_df = meta_data_df[walmart_df["subject_id"].isin(target_ids)]
    catalog_df = catalog_df.rename({"subject_id": "catalog_id"})
    
    train_df = {
        "input_text": [],
        "match_id": [],
        "judgment": [],
    }

    for j, row in tqdm(data_df.iterrows(), total=len(data_df.index)):
        source_id = row["source_id"]
        target_id = row["target_id"]

        train_df["input_text"].append(amazon_df[amazon_df["subject_id"] == target_id].iloc[0]["title"])
        train_df["match_id"].append(source_id)
        train_df["judgment"].append(row["matching"])

    queries_df = pd.DataFrame(train_df)
    
    return catalog_df, queries_df

In [4]:
from tqdm import tqdm

def get_catalog_and_queries_df(data_df):
    data = {
        "input_text": [],
        "match_id": [],
        "judgment": []
    }

    for j, row in tqdm(data_df.iterrows(), total=len(data_df.index)):
        data["input_text"].append(row["title"])
        data["match_id"].append(row["id"])
        data["judgment"].append(True)

        # Add negative Sample
        data["input_text"].append(row["title"])
        data["match_id"].append((row["id"]+1) % len(data_df.index))
        data["judgment"].append(False)

    queries_df = pd.DataFrame(data)

    catalog_df = data_df.copy()
    catalog_df = catalog_df.rename({"id": "catalog_id", "abstract": "text"}, axis=1)
    del catalog_df["title"] 

    return catalog_df,queries_df

In [5]:
# 60 20 20 split in pandas
from sklearn.model_selection import train_test_split

# Split data_df into train (60%), validation (20%), and test (20%)
train_df, temp_df = train_test_split(data_df, test_size=0.4, random_state=42)
validation_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Print the sizes of the train, validation, and test dataframes
print("Train data size:", len(train_df))
print("Validation data size:", len(validation_df))
print("Test data size:", len(test_df))

Train data size: 6000
Validation data size: 2000
Test data size: 2000


In [6]:
train_catalog_df, train_queries_df = get_catalog_and_queries_df(train_df)
val_catalog_df, val_queries_df = get_catalog_and_queries_df(validation_df)
test_catalog_df, test_queries_df = get_catalog_and_queries_df(test_df)

100%|██████████| 6000/6000 [00:00<00:00, 27630.86it/s]
100%|██████████| 2000/2000 [00:00<00:00, 28013.01it/s]
100%|██████████| 2000/2000 [00:00<00:00, 28051.70it/s]


In [7]:
train_catalog_df.to_csv("data/covid-abstracts/train_catalog.csv", index=False)
train_queries_df.to_csv("data/covid-abstracts/train_queries.csv", index=False)
val_catalog_df.to_csv("data/covid-abstracts/val_catalog.csv", index=False)
val_queries_df.to_csv("data/covid-abstracts/val_queries.csv", index=False)
test_catalog_df.to_csv("data/covid-abstracts/test_catalog.csv", index=False)
test_queries_df.to_csv("data/covid-abstracts/test_queries.csv", index=False)