In [25]:
import os
import json
import pandas as pd

from src.process_html import (
    record_to_document_list, 
    record_sanity_check,
    save_documents_as_jsonl,
    remove_duplicate_pages,
)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Data Selection

1. Download file `crag_task_1_and_2_dev_v4.jsonl.bz2` from [here](https://github.com/facebookresearch/CRAG/tree/main/data)
2. In a terminal, run `bunzip2 crag_task_1_and_2_dev_v4.jsonl.bz2`; the file will be replaced by `crag_task_1_and_2_dev_v4.jsonl`

In [2]:
# Directory, relative to the working directory, that holds the raw CRAG file
# and every file derived from it in this notebook.
data_dir = os.path.join(os.curdir, "data")

In [5]:
# Extract the metadata fields used as filters from every record of the raw file.
path = os.path.join(data_dir, "crag_task_1_and_2_dev_v4.jsonl")

# Only these fields are copied out of each record.
fields_to_keep = [
    'interaction_id',
    'domain',
    'question_type',
    'static_or_dynamic',
    'query',
]

metadata_list = []
with open(path, "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at end of file);
        # json.loads would raise on them.
        if not line.strip():
            continue

        record = json.loads(line)

        # A missing field raises KeyError on purpose: every record is expected
        # to carry all of the filter fields.
        metadata_list.append({field: record[field] for field in fields_to_keep})

In [6]:
# Convert metadata_list (one dict per record) to a DataFrame
metadata_df = pd.DataFrame(data=metadata_list)

# Report the table size, then preview the first two rows as the cell output
print('metadata_df shape:', metadata_df.shape)
metadata_df.head(n=2)

metadata_df shape: (2706, 5)


Unnamed: 0,interaction_id,domain,question_type,static_or_dynamic,query
0,7bb29eb4-12f9-45f9-bf8a-66832b3c8962,sports,post-processing,static,how many 3-point attempts did steve nash avera...
1,a2486535-98e7-4876-9880-80374eac2fa8,movie,simple_w_condition,static,what is a movie to feature a person who can cr...


In [7]:
# Persist the full, unfiltered metadata table; the row index is not written.
metadata_csv_path = os.path.join(data_dir, "crag_v4_metadata.csv")
metadata_df.to_csv(metadata_csv_path, index=False)

In [8]:
# Allowed values for each metadata field used when filtering samples.
# Domains that get a knowledge base:
selected_domains = [
    "finance",
    "movie",
    "sports",
]
# Question types to keep:
selected_question_type = [
    "comparison",
    "aggregation",
    "post-processing",
    "multi-hop",
]
# Only questions labelled "static" are kept:
selected_static_or_dynamic = [
    "static",
]

In [9]:
# Build one boolean mask per filter; a row is kept only if it passes all three.
domain_mask = metadata_df['domain'].isin(selected_domains)
question_type_mask = metadata_df['question_type'].isin(selected_question_type)
static_or_dynamic_mask = metadata_df['static_or_dynamic'].isin(selected_static_or_dynamic)

metadata_filtered = metadata_df[domain_mask & question_type_mask & static_or_dynamic_mask]

print('metadata shape after filtering:', metadata_filtered.shape)

metadata shape after filtering: (264, 5)


In [12]:
# Select roughly 100 samples for the knowledge base.
# The same fraction is sampled from every (domain, question_type) group, and
# each group's sample size is rounded independently, so the total is only
# approximately knowledge_base_size.
knowledge_base_size = 100
knowledge_base_frac = knowledge_base_size / len(metadata_filtered)

random_seed = 42
groupby_cols = ['domain', 'question_type']

# Stratified sampling by domain and question_type.
# The groups are iterated explicitly instead of using groupby(...).apply(...):
# apply() operating on the grouping columns is deprecated in pandas, and once
# those columns are excluded the 'domain' / 'question_type' columns needed
# further down would be missing. Iterating yields each group with all of its
# columns, in the same (sorted) group order, and every group is sampled with
# the same seed, so the selected rows and their order are unchanged.
knowledge_base_metadata = pd.concat(
    [
        group.sample(frac=knowledge_base_frac, random_state=random_seed)
        for _, group in metadata_filtered.groupby(groupby_cols)
    ]
)

knowledge_base_metadata.head(2)

  ).apply(


Unnamed: 0,interaction_id,domain,question_type,static_or_dynamic,query
469,f068a33c-2458-4d27-8448-02906e6b5fa5,finance,aggregation,static,how many major stock exchanges are there in th...
977,8e6f6025-d096-432b-b1d3-7fbd42007e2c,finance,aggregation,static,how many times did ftai aviation pay dividends...


In [13]:
# sanity check: make sure selected samples are in desired categories
print(knowledge_base_metadata['question_type'].unique())
print(knowledge_base_metadata['domain'].unique())
print(knowledge_base_metadata['static_or_dynamic'].unique())

['aggregation' 'comparison' 'multi-hop' 'post-processing']
['finance' 'movie' 'sports']
['static']


In [14]:
# Persist the sampled knowledge-base metadata; the row index is not written.
kb_metadata_csv_path = os.path.join(data_dir, "kb_metadata.csv")
knowledge_base_metadata.to_csv(kb_metadata_csv_path, index=False)

# 2. Text Cleaning

Each query is associated with 5 retrieved web pages in HTML format. Extract the content from the HTML and parse each page into a `Document` object.

Create three knowledge bases for "sports", "finance", and "movie" respectively.

In [15]:
# Split the sampled metadata into one independent copy per domain.
# .copy() gives each frame its own data, so columns can be added later without
# touching knowledge_base_metadata.
kb_domain = knowledge_base_metadata['domain']

# "sports" knowledge base
sports_meta = knowledge_base_metadata.loc[kb_domain == "sports"].copy()
print(sports_meta.shape)

# "finance" knowledge base
finance_meta = knowledge_base_metadata.loc[kb_domain == "finance"].copy()
print(finance_meta.shape)

# "movie" knowledge base
movie_meta = knowledge_base_metadata.loc[kb_domain == "movie"].copy()
print(movie_meta.shape)

(42, 5)
(19, 5)
(40, 5)


In [16]:
# Add a per-sample counter, starting at 0, that is incremented each time the
# sample's record is loaded into its knowledge base.
for domain_meta in (sports_meta, finance_meta, movie_meta):
    domain_meta['loaded_to_kb'] = 0

In [18]:
# Output lists, one per knowledge base; filled in file order.
documents_sports = []
documents_finance = []
documents_movie = []

# Route each domain to its metadata frame and its output list so that all
# three knowledge bases share a single code path.
kb_targets = {
    "sports": (sports_meta, documents_sports),
    "finance": (finance_meta, documents_finance),
    "movie": (movie_meta, documents_movie),
}

# Selected interaction ids per domain, built once as sets so the membership
# test inside the loop does not rebuild and scan a list for every record.
selected_ids = {
    domain: set(meta['interaction_id'])
    for domain, (meta, _) in kb_targets.items()
}

path = os.path.join(data_dir, "crag_task_1_and_2_dev_v4.jsonl")

with open(path, "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not line.strip():
            continue

        record = json.loads(line)
        interaction_id = record['interaction_id']
        domain = record['domain']

        # Skip records from other domains and records that were not sampled.
        if domain not in kb_targets or interaction_id not in selected_ids[domain]:
            continue

        meta, documents = kb_targets[domain]
        is_current = meta['interaction_id'] == interaction_id

        # Check the record against its metadata row.
        # NOTE(review): the exact checks live in src.process_html — confirm there.
        record_sanity_check(record, meta[is_current].iloc[0])

        # Convert the record's pages into a list of Document objects and append
        # them in place, so documents_sports / documents_finance /
        # documents_movie keep pointing at the filled lists.
        documents.extend(record_to_document_list(record))

        # Count how many times this sample was loaded; `meta` is the same
        # object as the per-domain frame, so the frame is updated in place.
        meta.loc[is_current, 'loaded_to_kb'] += 1

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parse

In [20]:
# Number of documents collected per domain (sports, finance, movie)
for domain_documents in (documents_sports, documents_finance, documents_movie):
    print(len(domain_documents))

204
95
179


In [21]:
# Keep only unique pages in each knowledge base.
# NOTE(review): per the original comment, duplicates are identified by URL —
# confirm against remove_duplicate_pages in src.process_html.
documents_sports = remove_duplicate_pages(documents_sports)
documents_finance = remove_duplicate_pages(documents_finance)
documents_movie = remove_duplicate_pages(documents_movie)

# Document counts after de-duplication (sports, finance, movie)
for unique_documents in (documents_sports, documents_finance, documents_movie):
    print(len(unique_documents))



144
71
144


In [24]:
# Save each knowledge base to its own jsonl file: documents_<domain>.jsonl
filename = os.path.join(data_dir, "documents_{}.jsonl")

documents_by_domain = (
    ("sports", documents_sports),
    ("finance", documents_finance),
    ("movie", documents_movie),
)

for domain_name, domain_documents in documents_by_domain:
    save_documents_as_jsonl(
        documents=domain_documents,
        output_path=filename.format(domain_name),
    )