In [None]:
import bz2
import json
import os.path
import re
import requests
import shutil
import tarfile
from glob import glob

import pandas as pd
from tqdm.notebook import tqdm
from huggingface_hub import snapshot_download
from windows_toasts import Toast, InteractableWindowsToaster

from config import HOTPOT_QA_ROOT, HOTPOT_DOCUMENT_ROOT

# Download QA part of HotpotQA

The dataset contains two folders:
- fullwiki: QA without gold documents, used to test retrieval and generation
- distractor: QA with gold documents, used to test generation

However, both folders share the same training and validation data, so we only use the **fullwiki** part.

In [None]:
# Fetch the `hotpotqa/hotpot_qa` dataset repo from the Hugging Face Hub into
# HOTPOT_QA_ROOT, unless the `fullwiki` folder is already present there.
fullwiki_dir = f'{HOTPOT_QA_ROOT}/fullwiki'

if not os.path.exists(fullwiki_dir):
    snapshot_download(
        repo_id='hotpotqa/hotpot_qa',
        repo_type='dataset',
        local_dir=HOTPOT_QA_ROOT,
        cache_dir='caches',
    )
else:
    print('Hotpot QA is already downloaded')

## Examine QA samples

Each QA sample has the following fields:
- id
- question
- answer
- type: str; question type, either 'bridge' or 'comparison'
    - 'bridge': Asks for a fact whose retrieval requires an intermediate entity (the bridge)
    - 'comparison': Compare the same attribute of two entities
- level: str; difficulty level, one of 'easy', 'medium', or 'hard'
- supporting_facts: list\[dict{'title', 'index'}]; The gold documents used to answer the question
    - It gives titles and document indices; the actual sentences can be found in the **context** field or in the HotpotQA documents (see the next part).
- context: list\[dict{'title', 'sentences'}]; 10 documents, of which 2 are gold and 8 are distractors
    - It gives titles and sentences

In [None]:
# Load shard 00000 of the fullwiki training split and preview its first rows.
train_shard_path = f'{HOTPOT_QA_ROOT}/fullwiki/train-00000-of-00002.parquet'
df = pd.read_parquet(train_shard_path)
df.head(3)

# Download documents of HotpotQA

In [None]:
# Download
#
# Streams the preprocessed Wikipedia archive to disk. The data is first written
# to a temporary `.part` file and only renamed to `bz2_path` once the transfer
# has finished, so an interrupted download is never mistaken for a complete
# archive by the `os.path.exists` guard on the next run.

url = ('https://nlp.stanford.edu/projects/hotpotqa'
       '/enwiki-20171001-pages-meta-current-withlinks-processed.tar.bz2')
bz2_path = (f'{HOTPOT_DOCUMENT_ROOT}'
            f'/enwiki-20171001-pages-meta-current-withlinks-processed.tar.bz2')

if os.path.exists(bz2_path):
    print('Hotpot document is already downloaded')
else:
    # Make sure the destination directory exists before opening the file
    os.makedirs(os.path.dirname(bz2_path), exist_ok=True)
    partial_path = f'{bz2_path}.part'
    chunk_size = 8192

    # Request downloading url; the timeout is (connect, read) in seconds so a
    # stalled server cannot hang the cell forever, and the context manager
    # releases the connection even if writing fails
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        response.raise_for_status()

        # Content-Length may be absent; tqdm then shows progress without a total
        content_length = response.headers.get('Content-Length')
        total_size = int(content_length) if content_length is not None else None

        # Save file
        with tqdm(total=total_size, unit='B', unit_scale=True) as progress_bar:
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        file.write(chunk)
                        progress_bar.update(len(chunk))

    # Publish the finished download under its final name
    os.replace(partial_path, bz2_path)

In [None]:
# Unzip and flatten
#
# Extracts the archive into HOTPOT_DOCUMENT_ROOT, then moves every extracted
# `.bz2` file to `HOTPOT_DOCUMENT_ROOT/{i}.bz2` and removes the extracted tree.

if os.path.exists(f'{HOTPOT_DOCUMENT_ROOT}/0.bz2'):
    print('Hotpot document is already unzipped and flattened')
else:
    # Directory the archive is expected to extract into: the archive path
    # without its '.tar.bz2' suffix (this is the tree removed at the end)
    extracted_dir = bz2_path[:-len('.tar.bz2')]

    # Use the 'data' extraction filter when this Python version provides it, so
    # archive members cannot escape the destination directory
    extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

    # Unzip; the progress bar tracks compressed bytes read from the archive
    file_size = os.path.getsize(bz2_path)
    with open(bz2_path, 'rb') as file:
        with tqdm.wrapattr(file, 'read', total=file_size) as file_wrapper:
            with tarfile.open(fileobj=file_wrapper, mode="r:bz2") as tar:
                tar.extractall(HOTPOT_DOCUMENT_ROOT, **extract_kwargs)

    # Flatten. Only the extracted tree is searched: a recursive `**` pattern
    # rooted at HOTPOT_DOCUMENT_ROOT would also match the downloaded
    # `.tar.bz2` archive itself and rename it to one of the numbered files.
    # Sorting makes the numbering independent of filesystem listing order.
    data_paths = sorted(glob(f'{extracted_dir}/**/*.bz2', recursive=True))
    for i, data_path in enumerate(tqdm(data_paths)):
        shutil.move(data_path, f'{HOTPOT_DOCUMENT_ROOT}/{i}.bz2')
    shutil.rmtree(extracted_dir)