In [1]:
from bellem.utils import set_seed
import numpy as np
from datasets import load_dataset, DatasetDict, Dataset

# Fix the random seed once, before any sampling cell runs.
# NOTE(review): assumes bellem's set_seed seeds the RNGs that the dataset
# shuffling below relies on — confirm against bellem.utils.
set_seed(89)

## N-hop variants

In [2]:
def publish_nhop_variant(path: str, config_name: str, n_hop: int):
    """Publish a copy of a dataset restricted to questions with exactly ``n_hop`` hops.

    Every split of ``path``/``config_name`` is filtered down to the examples whose
    ``question_decomposition`` has length ``n_hop``; the result is pushed to the
    hub repository ``f"{path}-{n_hop}hop"`` under the same config name.

    Args:
        path: Hub path of the source dataset.
        config_name: Configuration of the source dataset to load and to publish under.
        n_hop: Required number of entries in ``question_decomposition``.
    """

    def has_requested_hops(example):
        return len(example['question_decomposition']) == n_hop

    source_splits = load_dataset(path, config_name)
    filtered_splits = DatasetDict()
    for split_name, split_ds in source_splits.items():
        filtered_splits[split_name] = split_ds.filter(has_requested_hops)

    repo_id = f"{path}-{n_hop}hop"
    filtered_splits.push_to_hub(repo_id, config_name=config_name)

In [3]:
# publish_nhop_variant("bdsaglam/musique", "default", 2)
# publish_nhop_variant("bdsaglam/musique", "answerable", 2)

## Mini version with equal distribution of number of hops

In [4]:
# Sample from the dataset so that every hop count gets the same quota of examples.
def sample_evenly(dataset, n_samples):
    """Yield a random sample of ``dataset`` balanced across hop counts.

    The hop count of an example is ``len(example['question_decomposition'])``.
    ``n_samples`` is split evenly across the distinct hop counts present
    (integer division, so any remainder is dropped). Each hop group is shuffled
    and up to that quota of examples is yielded from it.

    Args:
        dataset: Dataset-like object providing ``map``, ``filter``, ``shuffle``,
            ``select``, ``len()`` and column access by name.
        n_samples: Total number of examples requested across all hop counts.

    Yields:
        Example dicts, each carrying an added ``'n_hops'`` field. Fewer than
        ``n_samples`` examples are produced when ``n_samples`` is not divisible
        by the number of hop groups or when a group is smaller than its quota.

    Note:
        ``shuffle()`` is called without an explicit seed, so repeatability
        depends on whatever seeding was done before calling this function.
    """
    # Nothing to sample; returning here also avoids dividing by zero hop groups.
    if len(dataset) == 0:
        return

    dataset = dataset.map(lambda x: {'n_hops': len(x['question_decomposition'])})
    n_hops = np.unique(dataset['n_hops'])
    samples_per_hop = n_samples // len(n_hops)
    for hop in n_hops:
        hop_subset = dataset.filter(lambda x: x['n_hops'] == hop)
        # Clamp to the group size: selecting indices past the end of a small
        # hop group would raise instead of returning what is available.
        n_take = min(samples_per_hop, len(hop_subset))
        yield from hop_subset.shuffle().select(range(n_take))

In [5]:
def publish_mini_variant(path: str, config_name: str, n_samples: int):
    """Publish a small, hop-balanced sample of a dataset.

    For every split of ``path``/``config_name``, ``sample_evenly`` draws the
    examples, which are materialised into a new ``Dataset``. The collected
    splits are pushed to the hub repository ``f"{path}-mini"`` under the same
    config name.

    Args:
        path: Hub path of the source dataset.
        config_name: Configuration of the source dataset to load and to publish under.
        n_samples: Sample budget passed to ``sample_evenly`` for each split.
    """
    source_splits = load_dataset(path, config_name)

    mini_splits = DatasetDict()
    for split_name, split_ds in source_splits.items():
        sampled_rows = list(sample_evenly(split_ds, n_samples))
        mini_splits[split_name] = Dataset.from_list(sampled_rows)

    repo_id = f"{path}-mini"
    mini_splits.push_to_hub(repo_id, config_name=config_name)

In [6]:
# publish_mini_variant("bdsaglam/musique", "default", 300)
# publish_mini_variant("bdsaglam/musique", "answerable", 300)

## The subset used in my thesis

In [7]:
def publish_thesis_variant(path: str, config_name: str, record_ids: list[str]):
    """Publish the subset of a dataset made up of the given record ids.

    Every split of ``path``/``config_name`` is filtered to the examples whose
    ``'id'`` is in ``record_ids``. Splits that end up empty are left out. The
    remaining splits are pushed to the hub repository ``f"{path}-thesis"``
    under the same config name.

    Args:
        path: Hub path of the source dataset.
        config_name: Configuration of the source dataset to load and to publish under.
        record_ids: Ids of the examples to keep.
    """
    # Build the lookup once: set membership is constant-time, whereas testing
    # against the list would rescan it for every example in every split.
    wanted_ids = set(record_ids)

    dsd = load_dataset(path, config_name)
    target_dsd = DatasetDict()
    for split, ds in dsd.items():
        ds_subset = ds.filter(lambda x: x['id'] in wanted_ids)
        # Skip splits that contain none of the requested records.
        if len(ds_subset) == 0:
            continue
        target_dsd[split] = ds_subset
    target_dsd.push_to_hub(f"{path}-thesis", config_name=config_name)

In [8]:
from bellem.musique.constants import ABLATION_RECORD_IDS

# Publish the thesis subset: examples of the "answerable" config whose ids are
# listed in ABLATION_RECORD_IDS (imported from bellem.musique.constants).
publish_thesis_variant("bdsaglam/musique", "answerable", ABLATION_RECORD_IDS)

Filter:   0%|          | 0/19938 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2417 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]