In [39]:
import os

# Request the CPU platform for JAX. The variable is set ahead of the `jax`
# import further down so that it is already in place when JAX is loaded.
os.environ["JAX_PLATFORMS"] = "cpu" 


from datasets import Dataset

from gensbi_examples.tasks import get_task

from jax import numpy as jnp

import json
from huggingface_hub import upload_file

# The per-task metadata dictionary is built in the cells below.


In [40]:
# Names of the benchmark tasks handled by this notebook.
tasks = [
    "two_moons",
    "slcp",
    "gaussian_linear_uniform",
    "gaussian_linear",
    "gaussian_mixture",
]

# Hugging Face Hub dataset repository used for the uploads and downloads.
repo_name = "aurelio-amerio/SBI-benchmarks"

In [41]:
# Record the `dim_data` / `dim_theta` entries of every task as plain Python scalars.
metadata = {}
for task_name in tasks:
    task_data = get_task(task_name).data
    metadata[task_name] = {
        "dim_data": task_data["dim_data"].item(),
        "dim_theta": task_data["dim_theta"].item(),
    }

# Write the collected values to a local JSON file.
file_path = "metadata.json"
with open(file_path, "w") as metadata_file:
    json.dump(metadata, metadata_file, indent=4)

./task_data/data_two_moons.npz already exists, skipping download.
./task_data/data_slcp.npz already exists, skipping download.
./task_data/data_gaussian_linear_uniform.npz already exists, skipping download.
./task_data/data_gaussian_linear.npz already exists, skipping download.
./task_data/data_gaussian_mixture.npz already exists, skipping download.


In [42]:
# Publish the local metadata file to the dataset repository.
upload_file(
    repo_id=repo_name,
    repo_type="dataset",
    path_in_repo="metadata.json",  # file name inside the repository
    path_or_fileobj=file_path,
)

CommitInfo(commit_url='https://huggingface.co/datasets/aurelio-amerio/SBI-benchmarks/commit/c764c682edf58b001210b1c453567b5989fe2a9d', commit_message='Upload metadata.json with huggingface_hub', commit_description='', oid='c764c682edf58b001210b1c453567b5989fe2a9d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/aurelio-amerio/SBI-benchmarks', endpoint='https://huggingface.co', repo_type='dataset', repo_id='aurelio-amerio/SBI-benchmarks'), pr_revision=None, pr_num=None)

In [43]:
# upload dataset function

def upload_dataset(task_name: str, repo_name: str, max_samples: int = 1_000_000, dtype=None) -> None:
    """Push one benchmark task to the Hugging Face Hub as three dataset splits.

    The task's arrays are loaded with ``get_task``, cast to ``dtype`` and pushed
    to ``repo_name`` under two configurations:

    * ``task_name``: split ``"train"`` holds the first ``max_samples`` rows of
      ``xs`` / ``thetas``; split ``"validation"`` holds every remaining row.
    * ``f"{task_name}_posterior"``: split ``"reference_posterior"`` holds
      ``reference_samples``, ``observations`` and ``true_parameters``.

    Args:
        task_name: Name of the task passed to ``get_task``; also used as the
            configuration name on the Hub.
        repo_name: Hub repository id that receives the splits.
        max_samples: Number of leading rows assigned to the training split.
            If the task holds no more rows than this, the validation split
            is empty.
        dtype: Dtype every array is cast to before upload. ``None`` (the
            default) selects ``jnp.float32``.

    Raises:
        ValueError: If ``max_samples`` is smaller than 1. A negative value
            would otherwise silently slice from the end of the arrays.
    """
    if max_samples < 1:
        raise ValueError(f"max_samples must be a positive integer, got {max_samples!r}")
    if dtype is None:
        dtype = jnp.float32

    data_dict = dict(get_task(task_name).data)

    def _cast(array):
        """Return ``array`` converted to the upload dtype."""
        return array.astype(dtype)

    # Build every dataset before the first push, so a missing key fails
    # before anything has been uploaded.
    dataset_train = Dataset.from_dict(
        {
            "xs": _cast(data_dict["xs"][:max_samples]),
            "thetas": _cast(data_dict["thetas"][:max_samples]),
        }
    )
    dataset_val = Dataset.from_dict(
        {
            "xs": _cast(data_dict["xs"][max_samples:]),
            "thetas": _cast(data_dict["thetas"][max_samples:]),
        }
    )
    dataset_reference_posterior = Dataset.from_dict(
        {
            "reference_samples": _cast(data_dict["reference_samples"]),
            "observations": _cast(data_dict["observations"]),
            "true_parameters": _cast(data_dict["true_parameters"]),
        }
    )

    dataset_train.push_to_hub(repo_name, config_name=task_name, split="train", private=False)
    dataset_val.push_to_hub(repo_name, config_name=task_name, split="validation", private=False)
    dataset_reference_posterior.push_to_hub(
        repo_name,
        config_name=f"{task_name}_posterior",
        split="reference_posterior",
        private=False,
    )

# upload datasets

In [44]:
# Upload the "two_moons" task to the repository named by the `repo_name` constant
# instead of repeating the repository id as a literal.
upload_dataset("two_moons", repo_name)

./task_data/data_two_moons.npz already exists, skipping download.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md:   0%|          | 0.00/379 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md:   0%|          | 0.00/503 [00:00<?, ?B/s]

In [45]:
# Upload the "gaussian_linear" task to the repository named by the `repo_name` constant
# instead of repeating the repository id as a literal.
upload_dataset("gaussian_linear", repo_name)

./task_data/data_gaussian_linear.npz already exists, skipping download.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

In [46]:
# Upload the "gaussian_linear_uniform" task to the repository named by the `repo_name`
# constant instead of repeating the repository id as a literal.
upload_dataset("gaussian_linear_uniform", repo_name)

./task_data/data_gaussian_linear_uniform.npz already exists, skipping download.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

In [47]:
# Upload the "gaussian_mixture" task to the repository named by the `repo_name` constant
# instead of repeating the repository id as a literal.
upload_dataset("gaussian_mixture", repo_name)

./task_data/data_gaussian_mixture.npz already exists, skipping download.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

In [48]:
# Upload the "slcp" task to the repository named by the `repo_name` constant
# instead of repeating the repository id as a literal.
upload_dataset("slcp", repo_name)

./task_data/data_slcp.npz already exists, skipping download.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

# read the dataset

In [6]:
from datasets import load_dataset
import grain

In [65]:
# Load both Hub configurations of the two_moons task from the repository named
# by `repo_name`, formatted for JAX.
dataset = load_dataset(repo_name, "two_moons").with_format("jax")
dataset_posteriors = load_dataset(repo_name, "two_moons_posterior").with_format("jax")

# Backward-compatible alias for the original, misspelled variable name.
datset_posteriors = dataset_posteriors

In [68]:
# Display the `thetas` column of the training split.
dataset["train"]["thetas"]

Column([Array([-0.07232666,  0.15797222], dtype=float32), Array([ 0.06684327, -0.0785135 ], dtype=float32), Array([0.7485914 , 0.79264987], dtype=float32), Array([0.4217006 , 0.38446832], dtype=float32), Array([-0.53844297, -0.78507245], dtype=float32)])

In [66]:
# Bind the training and validation splits to their own names.
df_train = dataset["train"]
df_val = dataset["validation"]

In [69]:
# Take the first 1000 training rows and display their `xs` column.
df_train[:1000]["xs"]

Array([[ 0.281683  ,  0.2242634 ],
       [ 0.2971876 , -0.03324421],
       [-0.75120413,  0.08253357],
       ...,
       [-0.06863332,  0.00407624],
       [-0.2576473 ,  0.0116563 ],
       [-0.3101945 , -0.05234248]], dtype=float32)

In [54]:
# Use the training split as a grain source, shuffle it with a fixed seed,
# then group the records into batches of 30.
dataset_grain = grain.MapDataset.source(df_train).shuffle(seed=42).batch(30)

In [55]:
# Create an iterator over the batched grain dataset.
df_it = iter(dataset_grain)

In [56]:
# Fetch the next batch from the iterator and display it.
next(df_it)

{'thetas': [array([ 0.467538  ,  0.88914347,  0.26116252, -0.52116048, -0.93691456,
         -0.31210661,  0.65375125, -0.33742607,  0.06828678, -0.28487289,
          0.20822608, -0.77626491, -0.05730355, -0.78603745,  0.10012674,
          0.5670855 ,  0.95254672, -0.954916  , -0.31865442, -0.6957866 ,
         -0.27683496, -0.2223177 , -0.49766672, -0.52744901,  0.39993548,
          0.24547338,  0.03779721,  0.24671066,  0.96027672, -0.80480754]),
  array([ 0.90814471,  0.32589257, -0.9304893 ,  0.7387073 ,  0.53239202,
          0.38518476, -0.60087037, -0.83268023,  0.9578861 ,  0.11601651,
          0.75384724, -0.02665102,  0.0180403 ,  0.20102799,  0.70654619,
         -0.69085133,  0.32683408, -0.78483844,  0.18504393, -0.74988341,
          0.91683877, -0.10444295,  0.60779667, -0.81385148,  0.8729471 ,
         -0.23154068,  0.52339888, -0.95439363,  0.87607026,  0.24402499])],
 'xs': [array([-0.6812014 , -0.5300777 , -0.12253505,  0.17347679,  0.02442998,
          0.28448

In [31]:
from huggingface_hub import hf_hub_download

In [36]:
# Download metadata.json from the dataset repository and parse it.
fname = hf_hub_download(
    repo_id=repo_name,
    filename="metadata.json",
    repo_type="dataset",
)
with open(fname, "r") as metadata_file:
    metadata = json.load(metadata_file)

In [38]:
# Display the `metadata` dictionary.
metadata

{'two_moons': {'dim_data': 2, 'dim_theta': 2},
 'slcp': {'dim_data': 8, 'dim_theta': 5},
 'gaussian_linear_uniform': {'dim_data': 10, 'dim_theta': 10},
 'gaussian_linear': {'dim_data': 10, 'dim_theta': 10},
 'gaussian_mixture': {'dim_data': 2, 'dim_theta': 2}}