In [None]:
from dotenv import load_dotenv


# Load variables from a .env file into the process environment before any other cell runs
# (presumably the Hugging Face Hub credentials needed by the push_to_hub calls — confirm).
load_dotenv()

In [21]:
from bellem.utils import set_seed
import numpy as np
from datasets import load_dataset, DatasetDict, Dataset

# Fix the random seed once, up front. The shuffle() calls in the sampling helpers below pass
# no explicit seed, so they presumably depend on the global RNG state set here
# — TODO confirm that set_seed seeds NumPy's global generator.
set_seed(89)

## N-hop variants

In [22]:
def publish_nhop_variant(path: str, config_name: str, n_hop: int):
    """Publish a copy of a dataset restricted to questions with exactly ``n_hop`` hops.

    Each split of ``path``/``config_name`` is filtered down to the examples whose
    ``question_decomposition`` has length ``n_hop``. The filtered splits are pushed
    to ``f"{path}-{n_hop}hop"`` under the same config name.
    """
    def has_requested_hops(example):
        return len(example['question_decomposition']) == n_hop

    source_splits = load_dataset(path, config_name)
    filtered_splits = DatasetDict(
        {
            split_name: split_ds.filter(has_requested_hops)
            for split_name, split_ds in source_splits.items()
        }
    )
    filtered_splits.push_to_hub(f"{path}-{n_hop}hop", config_name=config_name)

In [23]:
# publish_nhop_variant("bdsaglam/musique", "default", 2)
# publish_nhop_variant("bdsaglam/musique", "answerable", 2)

## Mini version with equal distribution of number of hops

In [24]:
# Draw the same number of examples for every distinct hop count in the dataset.
def sample_evenly(dataset, n_samples):
    """Yield examples so that each hop count contributes an equal share.

    The hop count of an example is the length of its ``question_decomposition``
    and is stored in an added ``n_hops`` column, which the yielded examples keep.
    ``n_samples`` is integer-divided by the number of distinct hop counts; that many
    examples are taken from the shuffled subset of each hop count, in ascending
    hop-count order.
    """
    annotated = dataset.map(lambda example: {'n_hops': len(example['question_decomposition'])})
    hop_counts = np.unique(annotated['n_hops'])
    quota = n_samples // len(hop_counts)
    for hop_count in hop_counts:
        same_hop = annotated.filter(lambda example: example['n_hops'] == hop_count)
        shuffled = same_hop.shuffle()
        picked = shuffled.select(range(quota))
        for example in picked:
            yield example

In [25]:
def publish_mini_variant(path: str, config_name: str, n_samples: int):
    """Publish a small, hop-balanced sample of every split of a dataset.

    For each split of ``path``/``config_name``, ``sample_evenly`` draws examples
    (with ``n_samples`` as the per-split budget) and a new dataset is built from them.
    The result is pushed to ``f"{path}-mini"`` under the same config name.
    """
    source_splits = load_dataset(path, config_name)
    mini_splits = DatasetDict()
    for split_name, split_ds in source_splits.items():
        sampled_rows = [*sample_evenly(split_ds, n_samples)]
        mini_splits[split_name] = Dataset.from_list(sampled_rows)
    repo_id = f"{path}-mini"
    mini_splits.push_to_hub(repo_id, config_name=config_name)

In [26]:
# publish_mini_variant("bdsaglam/musique", "default", 300)
# publish_mini_variant("bdsaglam/musique", "answerable", 300)

In [27]:
# Load the mini variant ("answerable" config) from the Hub; its record ids are gathered
# below so that the sweep subset can avoid overlapping with it.
dsd = load_dataset("bdsaglam/musique-mini", "answerable")

In [28]:
# Ids of every record in the mini variant (train + validation), used as the exclusion
# list when sampling the sweep subset. list(...) keeps the concatenation valid even when
# column access returns a non-list sequence (as newer `datasets` releases do).
train_mini_ids = list(dsd['train']['id'])
val_mini_ids = list(dsd['validation']['id'])
mini_ids = train_mini_ids + val_mini_ids

In [30]:
# Samples with an equal share per n_hops value, skipping records whose id is in exclude_ids.
def sample_evenly_with_exclude(dataset, n_samples, exclude_ids):
    """Yield a hop-balanced sample of ``dataset`` that avoids the given record ids.

    Args:
        dataset: Dataset whose examples carry ``id`` and ``question_decomposition``.
        n_samples: Total budget; it is integer-divided by the number of distinct hop
            counts to get the number of examples taken per hop count.
        exclude_ids: Iterable of record ids that must not appear in the sample.

    Yields:
        Examples (including the added ``n_hops`` column), grouped by ascending hop count.
    """
    # Build the lookup set once: the membership test runs for every example of every
    # hop group, and a list would make each test a linear scan.
    excluded = set(exclude_ids)
    dataset = dataset.map(lambda x: {'n_hops': len(x['question_decomposition'])})
    n_hops = np.unique(dataset['n_hops'])
    samples_per_hop = n_samples // len(n_hops)
    for hop in n_hops:
        hop_samples = (
            dataset
            .filter(lambda x: x['n_hops'] == hop and x['id'] not in excluded)
            .shuffle()
            .select(range(samples_per_hop))
        )
        yield from hop_samples

def publish_sweep_variant(path: str, config_name: str, n_samples: int):
    """Publish a hop-balanced sample of every split that avoids the mini variant.

    Works like the mini publisher, but samples through ``sample_evenly_with_exclude``
    with the module-level ``mini_ids`` as the exclusion list, so the cell defining
    ``mini_ids`` must have been run first. The result is pushed to ``f"{path}-sweep"``
    under the same config name.
    """
    source_splits = load_dataset(path, config_name)
    sweep_splits = DatasetDict()
    for split_name, split_ds in source_splits.items():
        sampled_rows = [*sample_evenly_with_exclude(split_ds, n_samples, mini_ids)]
        sweep_splits[split_name] = Dataset.from_list(sampled_rows)
    repo_id = f"{path}-sweep"
    sweep_splits.push_to_hub(repo_id, config_name=config_name)

In [None]:
# publish_sweep_variant("bdsaglam/musique", "default", 300)
# publish_sweep_variant("bdsaglam/musique", "answerable", 300)

## The subset used in my thesis

In [7]:
def publish_thesis_variant(path: str, config_name: str, record_ids: list[str]):
    """Publish the subset of a dataset made up of the given record ids.

    Args:
        path: Hub path of the source dataset; the subset is pushed to ``f"{path}-thesis"``.
        config_name: Config to load from the source and to publish under.
        record_ids: Ids of the records to keep.

    Splits in which none of the requested ids occur are left out of the published dataset.
    """
    # Build the lookup set once: the membership test runs for every example of every
    # split, and a list would make each test a linear scan.
    wanted_ids = set(record_ids)
    dsd = load_dataset(path, config_name)
    target_dsd = DatasetDict()
    for split, ds in dsd.items():
        ds_subset = ds.filter(lambda x: x['id'] in wanted_ids)
        if not len(ds_subset):
            # Nothing from this split was requested; do not publish an empty split.
            continue
        target_dsd[split] = ds_subset
    target_dsd.push_to_hub(f"{path}-thesis", config_name=config_name)

In [None]:
# from bellem.musique.constants import ABLATION_RECORD_IDS

# publish_thesis_variant("bdsaglam/musique", "answerable", ABLATION_RECORD_IDS)