In [9]:
import json
from datasets import Dataset, DatasetDict

In [4]:
# Read the HotpotQA dev (distractor) file. The encoding is pinned to UTF-8 so the
# non-ASCII text in the data is decoded the same way on every platform instead of
# depending on the locale's default encoding.
with open('../../tmp/hotpotqa/hotpot_dev_distractor_v1.json', encoding='utf-8') as f:
    hotpotqa_distractor_samples = json.load(f)
# Last expression of the cell: display how many samples were loaded.
len(hotpotqa_distractor_samples)

7405

In [17]:
def dedup_consecutive(sequence: list) -> list:
    """Collapse every run of adjacent equal items down to its first item.

    Only neighbouring repeats are dropped; a value that shows up again after
    a different value is kept again. The input is not modified and a new
    list is returned.
    """
    collapsed = []
    for current in sequence:
        if collapsed and not (current != collapsed[-1]):
            # Same as the item we kept last: still inside the same run.
            continue
        collapsed.append(current)
    return collapsed

# Sanity checks: only adjacent repeats collapse (the second 2 is kept because a 3
# separates it from the first), and an empty input yields an empty list.
assert dedup_consecutive([1, 1, 2, 3, 2, 4, 4]) == [1, 2, 3, 2, 4]
assert dedup_consecutive([]) == []

In [22]:
def convert_hotpotqa_to_musique(hotpotqa_sample):
    """Convert one HotpotQA sample into a MuSiQue-shaped sample dict.

    Args:
        hotpotqa_sample: Mapping with the keys read here: ``_id``,
            ``question``, ``answer``, ``supporting_facts`` (a sequence of
            entries whose first element is a paragraph title) and ``context``
            (a sequence of ``(title, sentences)`` pairs).

    Returns:
        A dict with the keys ``id``, ``paragraphs``, ``question``,
        ``question_decomposition``, ``answer``, ``answer_aliases`` (always an
        empty list) and ``answerable`` (always ``True``).

    Raises:
        ValueError: If a supporting-fact title has no paragraph with the same
            title in ``context``.
    """
    musique_sample = {
        "id": hotpotqa_sample["_id"],
        "paragraphs": [],
        "question": hotpotqa_sample["question"],
        "question_decomposition": [],
        "answer": hotpotqa_sample["answer"],
        "answer_aliases": [],
        "answerable": True
    }

    # Step 1: Process context paragraphs.
    # A paragraph is marked as supporting when its title appears in any supporting fact.
    supporting_paragraph_titles = [sf[0] for sf in hotpotqa_sample["supporting_facts"]]
    for idx, (title, sentences) in enumerate(hotpotqa_sample["context"]):
        paragraph_text = "".join(sentences)  # Concatenate sentences into one paragraph
        musique_sample["paragraphs"].append({
            "idx": idx,
            "is_supporting": title in supporting_paragraph_titles,
            "paragraph_text": paragraph_text,
            "title": title
        })

    # Step 2: Placeholder question decomposition.
    # No real sub-questions are available, so one step is emitted per run of
    # consecutive supporting facts that share a title. Each step repeats the
    # original question and has an empty answer.
    supporting_paragraph_titles_deduped = dedup_consecutive(supporting_paragraph_titles)
    for step_id, title in enumerate(supporting_paragraph_titles_deduped):
        # Index of the first context paragraph whose title matches this supporting fact.
        paragraph_support_idx = next(
            (p['idx'] for p in musique_sample["paragraphs"] if title == p['title']),
            None,
        )
        if paragraph_support_idx is None:
            # A bare next() would leak StopIteration here, which iterator-based
            # callers (e.g. map()) interpret as "end of data" and silently
            # truncate their output. Fail loudly instead.
            raise ValueError(
                f"Supporting fact title {title!r} not found in the context of "
                f"sample {hotpotqa_sample['_id']!r}"
            )
        musique_sample["question_decomposition"].append({
            "answer": "",
            "id": step_id,
            "paragraph_support_idx": paragraph_support_idx,
            "question": hotpotqa_sample["question"]  # Original question; no decomposition exists
        })

    return musique_sample


In [23]:
# Take the record at index 1 of the loaded samples as a worked example and display it.
hotpotqa_sample = hotpotqa_distractor_samples[1]
hotpotqa_sample

{'_id': '5a8c7595554299585d9e36b6',
 'answer': 'Chief of Protocol',
 'question': 'What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?',
 'supporting_facts': [['Kiss and Tell (1945 film)', 0],
  ['Shirley Temple', 0],
  ['Shirley Temple', 1]],
 'context': [['Meet Corliss Archer',
   ["Meet Corliss Archer, a program from radio's Golden Age, ran from January 7, 1943 to September 30, 1956.",
    ' Although it was CBS\'s answer to NBC\'s popular "A Date with Judy", it was also broadcast by NBC in 1948 as a summer replacement for "The Bob Hope Show".',
    ' From October 3, 1952 to June 26, 1953, it aired on ABC, finally returning to CBS.',
    " Despite the program's long run, fewer than 24 episodes are known to exist."]],
  ['Shirley Temple',
   ["Shirley Temple Black (April 23, 1928 – February 10, 2014) was an American actress, singer, dancer, businesswoman, and diplomat who was Hollywood's number one box-office draw as a child actress fr

In [24]:
# Run the converter on the example record and display the result for a visual check.
musique_sample = convert_hotpotqa_to_musique(hotpotqa_sample)
musique_sample

{'id': '5a8c7595554299585d9e36b6',
 'paragraphs': [{'idx': 0,
   'is_supporting': False,
   'paragraph_text': 'Meet Corliss Archer, a program from radio\'s Golden Age, ran from January 7, 1943 to September 30, 1956. Although it was CBS\'s answer to NBC\'s popular "A Date with Judy", it was also broadcast by NBC in 1948 as a summer replacement for "The Bob Hope Show". From October 3, 1952 to June 26, 1953, it aired on ABC, finally returning to CBS. Despite the program\'s long run, fewer than 24 episodes are known to exist.',
   'title': 'Meet Corliss Archer'},
  {'idx': 1,
   'is_supporting': True,
   'paragraph_text': "Shirley Temple Black (April 23, 1928 – February 10, 2014) was an American actress, singer, dancer, businesswoman, and diplomat who was Hollywood's number one box-office draw as a child actress from 1935 to 1938. As an adult, she was named United States ambassador to Ghana and to Czechoslovakia and also served as Chief of Protocol of the United States.",
   'title': 'Shir

In [25]:
# Build the MuSiQue-format record for every loaded HotpotQA sample.
musique_samples = [
    convert_hotpotqa_to_musique(raw_sample)
    for raw_sample in hotpotqa_distractor_samples
]

# Wrap the records in a HuggingFace Dataset and expose it as the "validation" split.
dataset = Dataset.from_list(musique_samples)
dataset_dict = DatasetDict({"validation": dataset})

# Upload to the Hub (network side effect); the returned value is the cell's output.
dataset_dict.push_to_hub("bdsaglam/hotpotqa-distractor")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/bdsaglam/hotpotqa-distractor/commit/0db9d87d3306b7b14f2b289a040e416b7baf3654', commit_message='Upload dataset', commit_description='', oid='0db9d87d3306b7b14f2b289a040e416b7baf3654', pr_url=None, pr_revision=None, pr_num=None)