In [None]:
!pip install transformers datasets torchaudio torchcodec

Collecting torchcodec
  Downloading torchcodec-0.6.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (10 kB)
Downloading torchcodec-0.6.0-cp312-cp312-manylinux_2_28_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchcodec
Successfully installed torchcodec-0.6.0


In [None]:
from datasets import load_dataset
import random

In [None]:
# Colab-only: mount Google Drive so that files under /content/drive
# (the parquet shard and the output folders used below) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def add_duration(data):
    """Attach the clip length to one example.

    Reads ``data["audio"]`` (its ``"array"`` of samples and its
    ``"sampling_rate"``), stores ``number_of_samples / sampling_rate`` under
    ``data["duration"]`` and returns the same (mutated) mapping.
    """
    clip = data["audio"]
    sample_count = len(clip["array"])
    rate = clip["sampling_rate"]
    data["duration"] = sample_count / rate
    return data

In [None]:
def dataset_creation(dataset, dataset_sizes=[600, 3600, 36000]):
    """
    Create nested subsets of the dataset with given durations (in seconds).
    Example: [600, 3600, 36000] -> 10min ⊂ 1h ⊂ 10h

    The dataset is shuffled with a fixed seed (42) and then consumed in order.
    Every example is added to each subset whose target has not been reached
    yet, so each smaller subset is a prefix of the next larger one.

    Args:
        dataset: Object exposing ``shuffle(seed=...)``, ``select(indices)`` and
            column access ``dataset["duration"]`` yielding one duration in
            seconds per example (e.g. a Hugging Face ``Dataset`` that already
            has a ``duration`` column).
        dataset_sizes: Target total durations in seconds. They are sorted
            internally; the argument itself is never mutated.

    Returns:
        A list with one ``dataset.select(...)`` result per target, in ascending
        target order. If the data runs out before a target is reached, that
        subset simply contains everything that was available.
    """
    targets = sorted(dataset_sizes)
    n_targets = len(targets)

    subsets_indices = [[] for _ in targets]
    total_durations = [0.0] * n_targets

    dataset = dataset.shuffle(seed=42)

    # Read only the duration column: iterating whole rows would also load each
    # example's audio just to look at a single float.
    durations = dataset["duration"] if n_targets else []

    current_subset = 0  # index of the smallest target that is not yet satisfied

    for i, duration in enumerate(durations):
        # Add this example to *all* subsets that are still being filled
        for j in range(current_subset, n_targets):
            subsets_indices[j].append(i)
            total_durations[j] += duration

        # Skip past *every* target that is now satisfied. A single `if` would
        # advance only one step, so duplicate/close targets (or one long clip
        # crossing several thresholds) would receive an extra example.
        while (current_subset < n_targets
               and total_durations[current_subset] >= targets[current_subset]):
            current_subset += 1

        if current_subset == n_targets:
            break

    subsets = [dataset.select(indices) for indices in subsets_indices]

    for size, indices, total in zip(targets, subsets_indices, total_durations):
        print(f"Target {size/60:.1f} min -> Got {len(indices)} samples, {total/60:.2f} min")

    return subsets


In [None]:
def CreateDatasets(dataset_sizes=[600, 3600, 36000],
                   data_files="/content/drive/MyDrive/SP/0000.parquet"):
    """Load a parquet dataset and split it into nested duration-based subsets.

    Args:
        dataset_sizes: Target total durations in seconds, forwarded unchanged
            to ``dataset_creation``.
        data_files: Parquet file(s) handed to ``load_dataset("parquet", ...)``.
            Defaults to the shard this notebook was originally written against.

    Returns:
        Whatever ``dataset_creation`` returns: one subset per target size.
    """
    # Alternative source kept for reference:
    # dataset = load_dataset("librispeech_asr", "clean", split="train.100")
    dataset = load_dataset("parquet", data_files=data_files)["train"]

    # dataset_creation reads a per-example "duration" value, so compute it first.
    dataset = dataset.map(add_duration)

    return dataset_creation(dataset, dataset_sizes=dataset_sizes)

In [None]:
# Build all three nested subsets in one pass. The save cell further down uses
# `dataset10min`, `dataset1h` and `dataset10h`, so they must exist on a fresh kernel.
dataset10min, dataset1h, dataset10h = CreateDatasets()
# Kept for the cell that saves `dataset10d[0]`: a one-element list holding the 10-minute subset.
dataset10d = [dataset10min]

Target 10.0 min -> Got 46 samples, 10.15 min


In [None]:
# Reload the raw parquet shard for a manual check; the result is indexed by split name below.
dataset = load_dataset("parquet", data_files="/content/drive/MyDrive/SP/0000.parquet")

In [None]:
# Keep only the first 50 rows of the "train" split for inspection.
first_rows = range(50)
dataset = dataset["train"].select(first_rows)

In [None]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id', 'duration'],
    num_rows: 50
})

In [None]:
  # Run add_duration over every row so each example carries a "duration" value.
  # NOTE(review): this cell is indented by two spaces; it presumably only runs
  # because the notebook front end tolerates leading indentation — confirm, or dedent.
  dataset = dataset.map(add_duration)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
# Print every clip's duration. Reading the "duration" column directly avoids
# indexing whole rows (and their audio) and the hard-coded row count.
for duration in dataset["duration"]:
    print(duration)

14.53
16.085
13.295
11.125
14.08
13.685
13.33
15.75
13.72
15.685
14.155
13.185
15.57
12.965
14.15
14.84
15.35
14.335
15.73
8.305
14.36
12.965
15.08
14.9
15.31
14.185
2.96
15.29
15.035
15.125
15.105
15.555
13.79
13.975
15.745
8.605
13.825
14.39
9.145
8.73
15.78
17.21
13.43
10.365
15.725
11.23
4.02
16.315
16.36
14.47


In [None]:
save_path = "/content/drive/MyDrive/SP/librispeech_datasets"

# `dataset10d` was built with dataset_sizes=[600], so its only element is the
# 10-minute subset; save it under the matching name rather than "dataset_10h".
dataset10d[0].save_to_disk(f"{save_path}/dataset_10min")

Saving the dataset (0/1 shards):   0%|          | 0/46 [00:00<?, ? examples/s]

In [None]:
save_path = "/content/drive/MyDrive/SP/librispeech_datasets"

# Persist each nested subset in its own folder under save_path.
for folder, subset in (
    ("dataset_10min", dataset10min),
    ("dataset_1h", dataset1h),
    ("dataset_10h", dataset10h),
):
    subset.save_to_disk(f"{save_path}/{folder}")

print("Datasets saved successfully!")

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/291 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/2850 [00:00<?, ? examples/s]

Datasets saved successfully!


In [None]:
# Load the "validation" split of the "clean" LibriSpeech configuration.
valid_dataset_clean = load_dataset(
    "librispeech_asr",
    "clean",
    split="validation",
)

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

In [None]:
# Write the validation split under save_path (defined in an earlier cell).
valid_dataset_clean.save_to_disk(f"{save_path}/dataset_val_clean")

Saving the dataset (0/1 shards):   0%|          | 0/2703 [00:00<?, ? examples/s]

In [None]:
# Load the "test" split of the "clean" LibriSpeech configuration.
test_dataset_clean = load_dataset(
    "librispeech_asr",
    "clean",
    split="test",
)

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

In [None]:
# Write the test split under save_path (defined in an earlier cell).
test_dataset_clean.save_to_disk(f"{save_path}/dataset_test_clean")

Saving the dataset (0/1 shards):   0%|          | 0/2620 [00:00<?, ? examples/s]