In [10]:
import os
import json
import pandas as pd

from src.process_html import (
    load_documents_from_jsonl,
)

%load_ext autoreload
%autoreload 2

In [2]:
# Root folder for every input and output file used by this notebook,
# expressed relative to the current working directory.
data_dir = os.path.join(os.curdir, 'data')

In [3]:
# Reload the per-domain document files saved earlier, so the notebook can be
# resumed without re-running the steps that produced them.
filename = os.path.join(data_dir, "documents_{}.jsonl")

# One loader call per domain, evaluated in the order sports -> finance -> movie.
documents_sports, documents_finance, documents_movie = (
    load_documents_from_jsonl(input_path=filename.format(domain))
    for domain in ("sports", "finance", "movie")
)

In [5]:
# Gather the interaction ids referenced by the documents of all three vector
# databases. Accumulating straight into a set removes duplicates as we go.
interaction_ids = set()

for documents in (documents_sports, documents_finance, documents_movie):
    for doc in documents:
        interaction_ids.add(doc.metadata['interaction_id'])

print("Num of unique interaction_ids in all three vector databases:", len(interaction_ids))

Num of unique interaction_ids in all three vector databases: 101


In [6]:
# Select a reproducible subset of interaction_ids to use as the question set.
num_questions = 50
random_seed = 42

# `interaction_ids` is a set of strings, and string hashing is randomized per
# Python process, so the set's iteration order changes between kernel
# restarts. Sorting first gives the sampler a stable input order; without it
# the same `random_seed` would pick different questions on every fresh run.
candidate_ids = sorted(interaction_ids)

# Fraction of the population that is sampled. No longer used for sampling
# itself, but kept so that any other cell reading `question_fraq` still works.
question_fraq = num_questions / len(candidate_ids) if candidate_ids else 0.0

# Ask for an exact row count instead of a fraction: `frac` is rounded to a
# count internally and raises when it exceeds 1 (more questions requested
# than ids available). The count is capped at the population size.
selected_question_ids = pd.Series(candidate_ids).sample(
    n=min(num_questions, len(candidate_ids)),
    random_state=random_seed,
).to_list()

In [7]:
# Retrieve metadata, query and answer for each selected question from the
# raw dataset (one JSON record per line).
path = os.path.join(data_dir, "crag_task_1_and_2_dev_v4.jsonl")

fields_to_keep = [
    'interaction_id',
    'domain',
    'question_type',
    'static_or_dynamic',
    'query',
    'answer',
]

# Membership is tested once per line of the raw file; a set makes each test
# constant-time instead of a scan over the whole list of selected ids.
selected_id_lookup = set(selected_question_ids)

metadata_list = []

with open(path, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines, which json.loads would reject.
        if not line.strip():
            continue

        record = json.loads(line)
        if record['interaction_id'] in selected_id_lookup:
            # Keep only the fields needed downstream; records are appended in
            # the order they appear in the raw file.
            metadata_list.append(
                {field: record[field] for field in fields_to_keep}
            )

In [8]:
# One row per selected question, one column per retained field.
question_metadata = pd.DataFrame(metadata_list)

# Sanity check: number of rows, then the distinct values of each
# categorical column.
print(question_metadata.shape[0])
for column in ('question_type', 'domain', 'static_or_dynamic'):
    print(question_metadata[column].unique())

# Preview the first two rows as the cell output.
question_metadata.head(2)

50
['aggregation' 'post-processing' 'comparison' 'multi-hop']
['movie' 'finance' 'sports']
['static']


Unnamed: 0,interaction_id,domain,question_type,static_or_dynamic,query,answer
0,47859020-9974-4c81-a897-96594beca8fb,movie,aggregation,static,how many family movies were there that came ou...,109
1,80365e4f-795e-4039-8afb-b7a8e8d54285,movie,post-processing,static,what was the average budget for all movies in ...,"$147,375,000"


In [9]:
# Persist the question set alongside the other data files; the DataFrame
# index is left out of the CSV.
question_metadata_path = os.path.join(data_dir, "question_metadata.csv")
question_metadata.to_csv(question_metadata_path, index=False)