In [None]:
import json

def jprint(obj):
    """Pretty-print ``obj`` to stdout as JSON text indented by two spaces."""
    rendered = json.dumps(obj, indent=2)
    print(rendered)

In [None]:
def print_dict_structure(d, indent=0):
    """
    Print an outline of a nested dictionary: one line per key with the type of its value.

    Nested dictionaries are expanded recursively, four spaces deeper per level.
    A list is summarised from its first element only (recursing into it when that
    element is itself a list or a dict); an empty list is reported as such.

    Parameters:
    d (dict): The dictionary whose structure to print.
    indent (int): Number of leading spaces for this level (used by the recursion).
    """
    def describe_list(items, depth):
        """Print a summary of ``items`` derived from its first element."""
        pad = " " * depth
        if len(items) == 0:
            print(pad + "[empty list]")
            return

        head = items[0]
        if isinstance(head, list):
            print(pad + "[list of list]")
            describe_list(head, depth + 4)
        elif isinstance(head, dict):
            print(pad + "[list of dict]")
            print_dict_structure(head, depth + 4)
        else:
            print(f"{pad}[list of {type(head).__name__}]")

    margin = " " * indent
    child_indent = indent + 4
    for key, value in d.items():
        # The label keeps its trailing space even when the details go on following lines.
        label = f"{margin}{key!s}: "
        if isinstance(value, dict):
            print(label)
            print_dict_structure(value, child_indent)
        elif isinstance(value, list):
            print(label)
            describe_list(value, child_indent)
        else:
            print(label + type(value).__name__)


In [None]:
from datasets import load_dataset, Dataset, DatasetDict

In [None]:
# Load the "distractor" configuration of hotpotqa/hotpot_qa and display the result.
# NOTE(review): trust_remote_code=True is forwarded to load_dataset; presumably it allows
# the dataset repository's own loading script to execute locally — confirm the source is
# trusted and that the installed `datasets` version still accepts this argument.
dsd = load_dataset("hotpotqa/hotpot_qa", "distractor", trust_remote_code=True)
dsd

In [None]:
# Inspect the record at index 1 of the train split (the same record a later cell converts).
dsd['train'][1]

In [None]:
# Load the 'answerable' configuration of bdsaglam/musique-mini and display the result.
# Presumably this serves as the reference for the target record layout — verify.
musique_dsd = load_dataset('bdsaglam/musique-mini', 'answerable')
musique_dsd

In [None]:
# Show the question_decomposition field of the first train record.
musique_dsd['train'][0]['question_decomposition']

In [None]:
# Print the key/type outline of the first train record.
print_dict_structure(musique_dsd['train'][0])

In [None]:
def convert_hotpotqa_to_musique(hotpotqa_sample):
    """
    Convert one HotpotQA record into a MuSiQue-shaped record.

    Parameters:
    hotpotqa_sample (dict): A record providing ``id``, ``question``, ``answer``,
        ``context`` (with parallel ``title`` and ``sentences`` sequences) and
        ``supporting_facts`` (with a ``title`` sequence).

    Returns:
    dict: A new dict with the keys ``id``, ``paragraphs``, ``question``,
        ``question_decomposition``, ``answer``, ``answer_aliases`` (always empty)
        and ``answerable`` (always True).

        * ``paragraphs`` has one entry per context paragraph; its text is the
          paragraph's sentences joined with a single space, and ``is_supporting``
          is True when the paragraph title occurs among the supporting-fact titles.
        * ``question_decomposition`` has one entry per supporting-fact title, in
          order, so a title listed several times yields several entries. Each
          entry repeats the original question, has an empty answer, and points at
          the first context paragraph with that title (-1 when there is none).

    Raises:
    AssertionError: If no context paragraph is marked as supporting.
    """
    context_titles = hotpotqa_sample["context"]["title"]
    supporting_titles = hotpotqa_sample["supporting_facts"]["title"]

    # Build both lookups once so the loops below do not rescan the title lists
    # for every paragraph / supporting fact.
    supporting_title_set = set(supporting_titles)
    first_idx_by_title = {}
    for position, context_title in enumerate(context_titles):
        # setdefault keeps the first position when a title is repeated in the context.
        first_idx_by_title.setdefault(context_title, position)

    musique_sample = {
        "id": hotpotqa_sample["id"],
        "paragraphs": [],
        "question": hotpotqa_sample["question"],
        "question_decomposition": [],
        "answer": hotpotqa_sample["answer"],
        "answer_aliases": [],
        "answerable": True
    }

    # Step 1: one paragraph entry per (title, sentences) pair of the context.
    context_pairs = zip(context_titles, hotpotqa_sample["context"]["sentences"])
    for idx, (title, sentences) in enumerate(context_pairs):
        musique_sample["paragraphs"].append({
            "idx": idx,
            "is_supporting": title in supporting_title_set,
            "paragraph_text": " ".join(sentences),  # Concatenate sentences into paragraph
            "title": title
        })

    # There must be at least one supporting paragraph
    assert any(p["is_supporting"] for p in musique_sample["paragraphs"]), (
        f"no supporting paragraph found for sample {hotpotqa_sample['id']!r}"
    )

    # Step 2: the source has no sub-questions, so emit one placeholder step per
    # supporting-fact title that carries the original question and an empty answer.
    for idx, title in enumerate(supporting_titles):
        musique_sample["question_decomposition"].append({
            "answer": "",
            "id": idx,
            # -1 marks a supporting title that is absent from the context titles.
            "paragraph_support_idx": first_idx_by_title.get(title, -1),
            "question": hotpotqa_sample["question"]
        })

    return musique_sample


In [None]:
# Example HotPotQA sample: take the train record at index 1 and pretty-print it as JSON.
hotpotqa_sample = dsd['train'][1]
jprint(hotpotqa_sample)

In [None]:
# Convert the example record to the MuSiQue layout and pretty-print the result as JSON.
hotpotqa_sample_musique = convert_hotpotqa_to_musique(hotpotqa_sample)
jprint(hotpotqa_sample_musique)


In [None]:
# Run the converter over the loaded HotpotQA data with .map and display the result.
# NOTE(review): presumably dsd holds several splits and .map converts each of them — confirm.
hotpotqa_musique_dsd = dsd.map(convert_hotpotqa_to_musique)
hotpotqa_musique_dsd

In [None]:
# Upload the converted data to the "bdsaglam/hotpotqa-distractor" repository.
# NOTE(review): this is an external side effect that repeats on every run of the cell;
# presumably it needs Hub credentials with write access — confirm before running all cells.
hotpotqa_musique_dsd.push_to_hub("bdsaglam/hotpotqa-distractor")


In [None]:
def sample_evenly(dataset, n_samples, seed=None):
    """
    Yield randomly chosen rows for each difficulty level, at most ``n_samples`` per level.

    Levels are processed in the fixed order 'easy', 'hard', 'medium'. For each one
    the rows whose ``level`` field equals it are shuffled and the first
    ``n_samples`` of them are yielded. A level with fewer matching rows yields
    all of them, and a level with none yields nothing, instead of failing on an
    out-of-range selection.

    Parameters:
    dataset: Object providing ``filter``, ``shuffle``, ``select`` and ``len`` in the
        style of a Hugging Face ``Dataset``; rows must expose a ``level`` field.
    n_samples (int): Maximum number of rows to take per level.
    seed (int | None): Passed to ``shuffle`` so the selection can be reproduced;
        None (the default) leaves the shuffle unseeded, as before.

    Yields:
    The selected rows, level by level.
    """
    for level in ['easy', 'hard', 'medium']:
        level_rows = dataset.filter(lambda x: x['level'] == level)
        # Never ask for more rows than this level actually has.
        take = min(n_samples, len(level_rows))
        if take <= 0:
            continue
        yield from level_rows.shuffle(seed=seed).select(range(take))

def publish_mini_variant(path: str, config_name: str, n_samples: int):
    """
    Build a down-sampled copy of a dataset and upload it.

    Loads configuration ``config_name`` of the dataset at ``path``, rebuilds each
    split from the rows produced by ``sample_evenly(split, n_samples)``, and pushes
    the result to ``f"{path}-mini"`` under the same configuration name.

    Parameters:
    path (str): Identifier of the source dataset; the upload target is this plus "-mini".
    config_name (str): Configuration to load and to publish under.
    n_samples (int): Sample count forwarded to ``sample_evenly`` for every split.
    """
    source_splits = load_dataset(path, config_name)
    mini_splits = DatasetDict()
    for split_name, split_rows in source_splits.items():
        # Materialise the sampled rows first, then rebuild the split from them.
        sampled_rows = list(sample_evenly(split_rows, n_samples))
        mini_splits[split_name] = Dataset.from_list(sampled_rows)
    target_repo = f"{path}-mini"
    mini_splits.push_to_hub(target_repo, config_name=config_name)

In [None]:
# Publish the mini variant of the "default" configuration: n_samples=100 is forwarded to
# sample_evenly for each split, and the result is pushed to "bdsaglam/hotpotqa-distractor-mini".
publish_mini_variant("bdsaglam/hotpotqa-distractor", "default", 100)