## Create Dataset & Upload to Hub


In [1]:
from datasets import load_dataset
import os
import numpy as np
import aiohttp

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_data(
    dataset: str = "jg583/NSynth",
    num_train_samples: int = 41600,
    num_val_samples: int = 3200,
    num_test_samples: int = 128,
    mini: bool = False,
    num_proc: int = 5,
    seed: int = 42,
):
    """Load a dataset from the Hub, filter it, and subsample every split.

    Rows are kept only when ``instrument_source_str`` is not ``"synthetic"``
    and ``velocity`` is greater than 50. Each of the ``train``,
    ``validation`` and ``test`` splits is then shuffled with ``seed`` and
    truncated to the requested number of rows.

    Args:
        dataset: Hub repository id passed to ``load_dataset``.
        num_train_samples: Rows to keep in the ``train`` split.
        num_val_samples: Rows to keep in the ``validation`` split.
        num_test_samples: Rows to keep in the ``test`` split.
        mini: When True, ignore the three counts above and keep
            1280 / 256 / 128 rows instead.
        num_proc: Worker processes used for filtering.
        seed: Seed for the per-split shuffle that precedes truncation.

    Returns:
        The object returned by ``load_dataset`` with its three splits
        replaced by the filtered, shuffled and truncated versions.

    Note:
        ``select`` is given ``range(n)`` directly, so asking for more rows
        than survive filtering is an error rather than a silent truncation.
    """
    # SECURITY: trust_remote_code=True runs the loading script shipped in the
    # dataset repository on this machine; only use it with repos you trust.
    splits_by_name = load_dataset(
        dataset,
        trust_remote_code=True,
        cache_dir=os.path.join(os.getcwd(), "data"),
        storage_options={
            "client_kwargs": {"timeout": aiohttp.ClientTimeout(total=3600)}
        },  # https://github.com/huggingface/datasets/issues/7164
    )

    splits = ["train", "validation", "test"]

    def keep_example(instrument_source: str, velocity: int) -> bool:
        # drop synthetic instruments and quiet (velocity <= 50) notes
        return instrument_source != "synthetic" and velocity > 50

    # One pass per split instead of two; input_columns hands the predicate
    # only the two metadata columns rather than the whole example.
    for split in splits:
        splits_by_name[split] = splits_by_name[split].filter(
            keep_example,
            input_columns=["instrument_source_str", "velocity"],
            num_proc=num_proc,
        )

    num_samples = (num_train_samples, num_val_samples, num_test_samples)
    if mini:
        num_samples = (1280, 256, 128)

    # only keep N samples for each split
    for split, num_sample in zip(splits, num_samples):
        shuffled = splits_by_name[split].shuffle(seed=seed)
        splits_by_name[split] = shuffled.select(range(num_sample))

    return splits_by_name

In [3]:
# Build the filtered/subsampled dataset, then report one "<split>: <rows>" line per split.
nsynth = get_data(dataset="jg583/NSynth")
print(
    "\n".join(
        f"{name}: {len(nsynth[name])}" for name in ("train", "validation", "test")
    )
)

train: 41600
validation: 3200
test: 128


In [4]:
def chunk_data(data, seed: int = 42, num_proc: int = 5):
    """Crop every train/validation clip to a fixed-length random window.

    For each example a start offset in ``[0, 16_000)`` is drawn and
    ``x["audio"]["array"]`` is replaced by the ``int(16_000 * 2.1)``-sample
    slice beginning there. The offset is derived from ``(seed, split, row
    index)``, so the result is reproducible across runs and does not depend
    on how rows are distributed over worker processes.

    Args:
        data: Mapping of split name to dataset; must contain ``"train"`` and
            ``"validation"`` entries that expose ``.map``. Any other split
            (e.g. ``"test"``) is left untouched.
        seed: Base seed for the per-example start offsets (non-negative).
        num_proc: Worker processes passed to ``.map``.

    Returns:
        The same ``data`` object, with its ``"train"`` and ``"validation"``
        entries reassigned to the cropped datasets.
    """
    max_start_index = 16_000
    audio_length = int(16_000 * 2.1)

    def make_chunker(split_id: int):
        # randomly select a 2.1 second segment of audio starting within the
        # first second -- assumes 16 kHz audio; TODO confirm sample rate
        def chunk_audio(x, idx):
            # A fresh generator per example keeps the draw independent of
            # global RNG state, which forked workers would otherwise share.
            rng = np.random.default_rng([seed, split_id, idx])
            start = int(rng.integers(0, max_start_index))
            # NOTE: a clip shorter than start + audio_length yields a shorter slice.
            x["audio"]["array"] = x["audio"]["array"][start : start + audio_length]
            return x

        return chunk_audio

    for split_id, split in enumerate(("train", "validation")):
        data[split] = data[split].map(
            make_chunker(split_id), with_indices=True, num_proc=num_proc
        )

    return data

In [5]:
# chunk_data reassigns the "train"/"validation" entries on the object it is
# given and returns that same object, so `nsynth` and `nsynth_cut` refer to
# the same (cropped) data after this cell runs.
nsynth_cut = chunk_data(nsynth)

Map (num_proc=5): 100%|██████████| 3200/3200 [00:12<00:00, 246.29 examples/s] 


In [6]:
# Sanity check: train clips were cropped by chunk_data to int(16_000 * 2.1) samples.
nsynth_cut["train"][0]["audio"]["array"].shape

(33600,)

In [7]:
# Sanity check: validation clips were cropped the same way as train.
nsynth_cut["validation"][0]["audio"]["array"].shape

(33600,)

In [8]:
# chunk_data only maps "train" and "validation", so test clips keep their original length.
nsynth_cut["test"][0]["audio"]["array"].shape

(64000,)

In [9]:
# Upload the splits to the Hub dataset repository "aphamm/mamba-muse".
# NOTE(review): presumably needs a logged-in Hub token with write access — confirm.
nsynth_cut.push_to_hub("aphamm/mamba-muse")

Map: 100%|██████████| 6934/6934 [00:00<00:00, 8540.19 examples/s]s]
Creating parquet from Arrow format: 100%|██████████| 70/70 [00:00<00:00, 287.53ba/s]
Map: 100%|██████████| 6934/6934 [00:00<00:00, 7178.45 examples/s]31.52s/it]
Creating parquet from Arrow format: 100%|██████████| 70/70 [00:00<00:00, 286.42ba/s]
Map: 100%|██████████| 6933/6933 [00:00<00:00, 7623.72 examples/s]31.71s/it]
Creating parquet from Arrow format: 100%|██████████| 70/70 [00:00<00:00, 272.97ba/s]
Map: 100%|██████████| 6933/6933 [00:00<00:00, 7527.48 examples/s]31.13s/it]
Creating parquet from Arrow format: 100%|██████████| 70/70 [00:00<00:00, 257.99ba/s]
Map: 100%|██████████| 6933/6933 [00:00<00:00, 7658.68 examples/s]29.00s/it]
Creating parquet from Arrow format: 100%|██████████| 70/70 [00:00<00:00, 278.03ba/s]
Map: 100%|██████████| 6933/6933 [00:00<00:00, 7281.65 examples/s]28.70s/it]
Creating parquet from Arrow format: 100%|██████████| 70/70 [00:00<00:00, 250.82ba/s]
Uploading the dataset shards: 100%|███████

CommitInfo(commit_url='https://huggingface.co/datasets/aphamm/mamba-muse/commit/83384e5fe567aeab31d16856b1684beb52b7c500', commit_message='Upload dataset', commit_description='', oid='83384e5fe567aeab31d16856b1684beb52b7c500', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/aphamm/mamba-muse', endpoint='https://huggingface.co', repo_type='dataset', repo_id='aphamm/mamba-muse'), pr_revision=None, pr_num=None)