In [None]:
from datasets import load_dataset, Audio, concatenate_datasets
from datasets import DatasetDict
import numpy as np
import librosa
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Data loading: fetch the "DTU54DL/common-accent" dataset with `load_dataset`.
ds = load_dataset("DTU54DL/common-accent")
# Cast the "audio" column of each split to an Audio feature with sampling_rate=16_000.
train_ds = ds["train"].cast_column("audio", Audio(sampling_rate=16_000))
test_ds  = ds["test"].cast_column("audio", Audio(sampling_rate=16_000))

# Merge both splits into one dataset; the following cells filter and sample from it.
full_ds = concatenate_datasets([train_ds, test_ds])
# Bare last expression so the notebook displays the dataset summary.
full_ds

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.42k [00:00<?, ?B/s]

(…)-00000-of-00001-fc6c4977ae9f62c1.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

(…)-00000-of-00001-53b88232efc0bf7e.parquet:   0%|          | 0.00/19.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/451 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'sentence', 'accent'],
    num_rows: 10451
})

In [None]:
# Keep only the six most frequent accents.
# Accent distribution over the combined (train + test) data.
accent_counts = Counter(full_ds["accent"])

# Per-accent counts as a DataFrame, most frequent first (for inspection).
df_accent = (
    pd.DataFrame.from_dict(accent_counts, orient="index", columns=["count"])
      .sort_values("count", ascending=False)
)

top6_accents = [acc for acc, _ in accent_counts.most_common(6)]
# `input_columns` hands only the accent value to the predicate, so the filter
# does not need to decode the audio of every example just to read its label.
filtered_ds = full_ds.filter(
    lambda accent: accent in top6_accents,
    input_columns=["accent"],
)

Filter:   0%|          | 0/10451 [00:00<?, ? examples/s]

## Optimizing the random sampling of 400 examples per accent

In [None]:
# The two most frequent accents. This cell only selects them; the timing cells
# below draw 400 random examples from each.
top2_accents = [acc for acc, _ in accent_counts.most_common(2)]

In [None]:
# Optimization
import time
from collections import defaultdict
import random
import pandas as pd
from datasets import concatenate_datasets, Dataset

In [None]:
# ─── ORIGINAL ─────────────────────────────────────────────
# Baseline: one full `.filter` pass over `filtered_ds` per accent, then shuffle
# and keep the first 400 rows.
t0 = time.time()
sampled = []
for acc in top2_accents:
    # The lambda reads `acc` when it is called; `.filter` is invoked inside this
    # iteration, so it compares against the current accent.
    acc_ds = filtered_ds.filter(lambda ex: ex["accent"] == acc)
    # Fixed seed keeps the 400-example draw reproducible.
    acc_sample = acc_ds.shuffle(seed=42).select(range(400))
    sampled.append(acc_sample)
# NOTE: `sampled` is reused later in the notebook to assemble `final_dataset`.
orig_ds = concatenate_datasets(sampled)
print(f"Original       : {time.time() - t0:.4f} s")

Filter:   0%|          | 0/10168 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10168 [00:00<?, ? examples/s]

Original       : 206.7360 s


In [None]:
# ─── METHOD 1: Single-pass index sampling ───────────────────
t1 = time.time()
# 1) bucket row indices per accent in one scan. Only the "accent" column is
#    read, so no audio has to be decoded while scanning.
idxs = defaultdict(list)
for i, a in enumerate(filtered_ds["accent"]):
    if a in top2_accents:
        idxs[a].append(i)
# 2) draw 400 indices per accent in Python (seeded for reproducibility);
#    random.sample raises ValueError if an accent has fewer than 400 rows.
random.seed(42)
all_idxs = []
for a in top2_accents:
    all_idxs += random.sample(idxs[a], 400)
# 3) one .select builds the sampled dataset from the chosen indices
sp_ds = filtered_ds.select(all_idxs)
print(f"Single-pass    : {time.time() - t1:.4f} s")


# ─── METHOD 2: Pandas group-by sample ──────────────────────
t2 = time.time()
df = filtered_ds.to_pandas()
# Keep the two target accents and draw 400 rows from each group (seeded).
df2 = (
    df[df["accent"].isin(top2_accents)]
      .groupby("accent")
      .sample(n=400, random_state=42)
)
# Pass the source schema so the columns keep their dataset feature types
# (notably the Audio feature of "audio"); without it the schema is re-inferred
# from the DataFrame and the result is not interchangeable with the other methods.
pd_ds = Dataset.from_pandas(df2, features=filtered_ds.features, preserve_index=False)
print(f"Pandas sample  : {time.time() - t2:.4f} s")


# ─── METHOD 3: Parallel HF pipeline ────────────────────────
# Same filter → shuffle → select recipe as the ORIGINAL cell, but the filter is
# run with num_proc=4.
t3 = time.time()
parts = []
for acc in top2_accents:
    part = (
        filtered_ds
          # keep rows of the current accent
          .filter(lambda ex: ex["accent"] == acc, num_proc=4)
          # seeded shuffle, then keep the first 400 rows
          .shuffle(seed=42)
          .select(range(400))
    )
    parts.append(part)
pp_ds = concatenate_datasets(parts)
print(f"HF parallel    : {time.time() - t3:.4f} s")

Single-pass    : 108.7374 s
Pandas sample  : 1.7670 s


Filter (num_proc=4):   0%|          | 0/10168 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/10168 [00:00<?, ? examples/s]

HF parallel    : 170.3046 s


In [None]:
# Combine into the final dataset: the 400-example draws of the two largest
# accents (`sampled`, built in the ORIGINAL timing cell) plus every example of
# the remaining accents.
# `input_columns` passes only the accent label to the predicate, so the filter
# does not decode audio.
other_ds = filtered_ds.filter(
    lambda accent: accent not in top2_accents,
    input_columns=["accent"],
)
final_dataset = concatenate_datasets(sampled + [other_ds])

In [None]:
# Display the summary (features and row count) of the assembled dataset.
final_dataset

Dataset({
    features: ['audio', 'sentence', 'accent'],
    num_rows: 1867
})

## Simple Cleaning optimization

In [None]:
# Numba and Cython Optimizations for Simple Cleaning

# 1) Install and load Cython in Colab
!pip install Cython
%load_ext Cython



In [None]:
# 2) Original cleaning for reference
def clean_py(data):
    """Clean one example: mix down to mono, trim silence, peak-normalise.

    Args:
        data: Mapping whose ``data["audio"]["array"]`` is a NumPy array of
            samples. Arrays with more than one dimension are averaged over
            axis 1.

    Returns:
        ``{"speech": array}`` where ``array`` is the trimmed signal divided by
        its largest absolute sample. If nothing non-zero is left after trimming
        (empty or all-zero signal) the trimmed signal is returned unscaled
        instead of raising or dividing by zero.
    """
    audio = data["audio"]["array"]
    # 1) Mono
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    # 2) Trim silence
    trimmed, _ = librosa.effects.trim(audio, top_db=30)
    # 3) Normalize; np.max raises on an empty array, so guard the size first
    peak = np.max(np.abs(trimmed)) if trimmed.size else 0.0
    if peak > 0:
        trimmed = trimmed / peak
    return {"speech": trimmed}

In [None]:
# 3) Numba-optimized cleaning
from numba import njit

@njit
def clean_nb_array(audio):
    """Trim quiet edges of a 1-D sample array and peak-normalise the rest.

    A sample counts as signal when its magnitude exceeds 1% of the global peak.
    The span from the first to the last such sample is copied and divided by
    the peak.

    Args:
        audio: 1-D NumPy float array of samples.

    Returns:
        A new array holding the trimmed, normalised samples. When the input has
        no non-zero sample, an unscaled copy is returned instead of dividing by
        zero.
    """
    # a) find peak
    peak = 0.0
    for x in audio:
        if abs(x) > peak:
            peak = abs(x)
    thresh = 0.01 * peak

    # b) find start and end indices
    start = 0
    for i in range(len(audio)):
        if abs(audio[i]) > thresh:
            start = i
            break
    end = len(audio)
    for i in range(len(audio) - 1, -1, -1):
        if abs(audio[i]) > thresh:
            end = i + 1
            break

    # c) slice and normalize. The peak sample itself exceeds the threshold, so
    #    it always lies inside [start, end): the slice maximum equals `peak`
    #    and does not have to be searched for again.
    out = audio[start:end].copy()
    if peak > 0.0:
        for i in range(len(out)):
            out[i] /= peak

    return out

def clean_nb(data):
    """Numba-backed cleaning of one example.

    Reads ``data["audio"]["array"]``, averages over axis 1 when the array has
    more than one dimension, and returns ``{"speech": ...}`` with the output of
    ``clean_nb_array``.
    """
    samples = data["audio"]["array"]
    # collapse multi-dimensional input to one dimension before the 1-D kernel
    mono = samples.mean(axis=1) if samples.ndim > 1 else samples
    return {"speech": clean_nb_array(mono)}


In [None]:
%%cython
import numpy as np
cimport numpy as np

def clean_cy_array(np.ndarray[np.float64_t, ndim=1] audio):
    """Trim quiet edges of a 1-D float64 array and peak-normalise the rest.

    A sample counts as signal when its magnitude exceeds 1% of the global peak.
    The span from the first to the last such sample is copied into a new array
    and divided by the peak.

    Args:
        audio: 1-D float64 NumPy array of samples.

    Returns:
        New float64 array with the trimmed, normalised samples. When the input
        has no non-zero sample, an unscaled copy is returned instead of
        dividing by zero.
    """
    cdef Py_ssize_t N = audio.shape[0]
    cdef Py_ssize_t i
    cdef Py_ssize_t start = 0
    cdef Py_ssize_t end = N
    cdef Py_ssize_t L
    cdef double x
    cdef double peak = 0.0
    cdef double thresh
    cdef np.ndarray[np.float64_t, ndim=1] out

    # a) find global peak
    for i in range(N):
        x = audio[i]
        if x < 0:
            x = -x
        if x > peak:
            peak = x
    thresh = 0.01 * peak

    # b) find trim start
    for i in range(N):
        x = audio[i]
        if x < 0:
            x = -x
        if x > thresh:
            start = i
            break

    # c) find trim end
    for i in range(N - 1, -1, -1):
        x = audio[i]
        if x < 0:
            x = -x
        if x > thresh:
            end = i + 1
            break

    # d) copy slice
    L = end - start
    out = np.empty(L, dtype=np.float64)
    for i in range(L):
        out[i] = audio[start + i]

    # e) normalize. The peak sample exceeds the threshold, so it lies inside
    #    [start, end) and the slice maximum equals `peak`; skip the division
    #    entirely for an all-zero (or empty) signal.
    if peak > 0.0:
        for i in range(L):
            out[i] /= peak

    return out


Content of stderr:
In file included from /usr/local/lib/python3.11/dist-packages/numpy/_core/include/numpy/ndarraytypes.h:1909,
                 from /usr/local/lib/python3.11/dist-packages/numpy/_core/include/numpy/ndarrayobject.h:12,
                 from /usr/local/lib/python3.11/dist-packages/numpy/_core/include/numpy/arrayobject.h:5,
                 from /root/.cache/ipython/cython/_cython_magic_d306d0061e5f0182267b943cd4b46f1a21dfa4c7.c:1250:
      |  ^~~~~~~

In [None]:
def clean_cy(data):
    """Cython-backed cleaning of one example.

    Args:
        data: Mapping whose ``data["audio"]["array"]`` is a NumPy array of
            samples. Arrays with more than one dimension are averaged over
            axis 1.

    Returns:
        ``{"speech": array}`` with the output of ``clean_cy_array``.
    """
    audio = data["audio"]["array"]
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    # clean_cy_array declares a float64 buffer and rejects any other dtype
    # (e.g. float32 audio); np.asarray is a no-op when the dtype already matches.
    audio = np.asarray(audio, dtype=np.float64)
    return {"speech": clean_cy_array(audio)}

In [None]:
# prepare dummy 1D audio: 16000*5 samples of seeded Gaussian noise (reproducible)
dummy = {"audio": {"array": np.random.default_rng(42).standard_normal(16000*5).astype(np.float64)}}

# Warm up every implementation, not only Numba: the first call of each can
# include one-off costs (JIT compilation, lazy imports) that would otherwise be
# charged to whichever variant happens to run cold.
_ = [fn(dummy) for fn in (clean_py, clean_nb, clean_cy)]

# perf_counter is the monotonic, high-resolution clock meant for short intervals
t_py = time.perf_counter(); _ = clean_py(dummy); print("Original:", time.perf_counter()-t_py)
t_nb = time.perf_counter(); _ = clean_nb(dummy); print("Numba  :", time.perf_counter()-t_nb)
t_cy = time.perf_counter(); _ = clean_cy(dummy); print("Cython :", time.perf_counter()-t_cy)

Original: 2.948168992996216
Numba  : 0.0006520748138427734
Cython : 0.0015382766723632812


## Optimization for segmentation

In [None]:
# Train / validation / test split
from datasets import concatenate_datasets
# Segment into 5 seconds (translated from the original comment; presumably the
# dataset loaded below is already cut into 5-second clips — TODO confirm)
from datasets import load_dataset
YAM = load_dataset("ZZZtong/common-accent-YAMNet")

# Pool every split of the loaded dataset before re-splitting it below.
all_ds = concatenate_datasets([split for split in YAM.values()])

# 80% train / 20% held out (seeded)
# NOTE: this rebinds `train_ds` (and `test_ds` below), which the data-loading
# cell at the top of the notebook also defines.
split1 = all_ds.train_test_split(test_size=0.2, seed=42)
train_ds = split1["train"]
temp_ds  = split1["test"]

# Halve the held-out 20%: 10% validation, 10% test
split2 = temp_ds.train_test_split(test_size=0.5, seed=42)
val_ds  = split2["train"]
test_ds = split2["test"]

# Report each split's size and its share of the pooled data.
print(f"Train: {len(train_ds)}  ({len(train_ds)/len(all_ds):.2%})")
print(f"Val:   {len(val_ds)}    ({len(val_ds)/len(all_ds):.2%})")
print(f"Test:  {len(test_ds)}   ({len(test_ds)/len(all_ds):.2%})")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/616 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/33.9M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/33.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2538 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/308 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/309 [00:00<?, ? examples/s]

Train: 2524  (80.00%)
Val:   315    (9.98%)
Test:  316   (10.02%)


In [None]:
# Timing Original vs Numba vs Cython on train_ds segmentation

import numpy as np
import time
import pandas as pd
import pyximport, numpy as np
from numba import njit

# Constants (assumes train_ds & all_ds are already defined)
# Window length in samples: 80,000, i.e. 5 s if the audio is 16 kHz
# (assumption — this cell does not resample; TODO confirm).
MAX_LEN = 5 * 16000
# Hop equals the window length, so consecutive windows do not overlap.
HOP     = MAX_LEN

# 1) Original segmentation timing
# `t0` first holds the start timestamp and is overwritten with the elapsed time.
t0 = time.time()
orig_count = 0
for ex in train_ds:
    arr = ex["audio"]["array"]
    L = len(arr)
    for start in range(0, L, HOP):
        chunk = arr[start:start+MAX_LEN]
        # zero-pad the last, shorter chunk up to MAX_LEN samples
        if len(chunk) < MAX_LEN:
            chunk = np.pad(chunk, (0, MAX_LEN-len(chunk)), mode="constant")
        # chunks are only counted, not stored
        orig_count += 1
# NOTE(review): the timed loop also includes reading each example from
# `train_ds`, not only the slicing/padding — confirm how much that contributes.
t0 = time.time() - t0

# 2) Numba-optimized segmentation
@njit
def segment_array_nb(arr, MAX_LEN):
    """Split a 1-D array into consecutive rows of ``MAX_LEN`` samples.

    Returns a 2-D array of shape ``(ceil(len(arr) / MAX_LEN), MAX_LEN)`` with
    the dtype of ``arr``; positions past the end of ``arr`` are filled with 0.
    """
    total = arr.shape[0]
    # ceiling division: a partial last window still gets its own row
    n_rows = (total + MAX_LEN - 1) // MAX_LEN
    res = np.empty((n_rows, MAX_LEN), dtype=arr.dtype)
    for row in range(n_rows):
        offset = row * MAX_LEN
        for col in range(MAX_LEN):
            src = offset + col
            if src < total:
                res[row, col] = arr[src]
            else:
                res[row, col] = 0.0
    return res

# warm-up: the first call triggers JIT compilation, which is kept out of the timing
_ = segment_array_nb(train_ds[0]["audio"]["array"], MAX_LEN)

# Time the Numba kernel over every training example and count the produced rows.
# `t1` first holds the start timestamp and is overwritten with the elapsed time.
t1 = time.time()
nb_count = 0
for ex in train_ds:
    mats = segment_array_nb(ex["audio"]["array"], MAX_LEN)
    nb_count += mats.shape[0]
t1 = time.time() - t1

# 3) Cython-optimized segmentation via pyximport
# The Cython source is kept in a string, written to seg_cy.pyx and imported
# through pyximport. The kernel declares a float64 1-D buffer argument.
cython_code = '''
import numpy as np
cimport numpy as np

def segment_array_cy(np.ndarray[np.float64_t, ndim=1] arr, int MAX_LEN):
    cdef Py_ssize_t n = arr.shape[0]
    cdef int hop = MAX_LEN
    cdef int segs = (n + hop - 1) // hop
    cdef np.ndarray[np.float64_t, ndim=2] res = np.empty((segs, MAX_LEN), dtype=np.float64)
    cdef Py_ssize_t i, j, idx
    for i in range(segs):
        for j in range(MAX_LEN):
            idx = i * hop + j
            res[i, j] = arr[idx] if idx < n else 0.0
    return res
'''
with open("seg_cy.pyx", "w") as f:
    f.write(cython_code)

# Install the pyximport import hook; the NumPy include directory is passed
# because the module uses `cimport numpy`.
pyximport.install(setup_args={"include_dirs":[np.get_include()]})
import seg_cy

# Time the Cython kernel over every training example and count the produced rows.
# `t2` first holds the start timestamp and is overwritten with the elapsed time.
t2 = time.time()
cy_count = 0
for ex in train_ds:
    mats = seg_cy.segment_array_cy(ex["audio"]["array"], MAX_LEN)
    cy_count += mats.shape[0]
t2 = time.time() - t2

# 4) Display results
# One row per method: number of segments produced and elapsed seconds
# (t0/t1/t2 hold the elapsed times measured above).
results = pd.DataFrame({
    "method":   ["Original Python", "Numba JIT", "Cython"],
    "segments": [orig_count, nb_count, cy_count],
    "time_sec": [t0, t1, t2]
})
from IPython.display import display
display(results)


Unnamed: 0,method,segments,time_sec
0,Original Python,2524,3.237111
1,Numba JIT,2524,2.801746
2,Cython,2524,2.456285


In [None]:
# Cast the "audio" column of the training split to an Audio feature with
# sampling_rate=16_000 (rebinds `train_ds`).
train_ds = train_ds.cast_column("audio", Audio(sampling_rate=16_000))

MAX_LEN   = 5 * 16_000
HOP       = MAX_LEN
audio_col = "audio"

def segment_batch(batch):
    """Cut every clip of a batch into fixed-length, zero-padded windows.

    Args:
        batch: Mapping with parallel lists under ``audio_col`` (dicts read via
            ``.get("array")`` / ``.get("sampling_rate")``), ``"sentence"`` and
            ``"accent"``.

    Returns:
        Mapping with the same three keys holding one entry per window. Windows
        start every ``HOP`` samples, are ``MAX_LEN`` samples long (the last one
        right-padded with zeros) and repeat the sentence and accent of their
        source clip. Clips whose array is ``None`` contribute nothing.
    """
    out_audio, out_sentences, out_accents = [], [], []

    rows = zip(batch[audio_col], batch["sentence"], batch["accent"])
    for audio_field, sentence, accent in rows:
        samples = audio_field.get("array")
        rate = audio_field.get("sampling_rate")
        if samples is None:
            continue
        for offset in range(0, len(samples), HOP):
            window = samples[offset:offset + MAX_LEN]
            shortfall = MAX_LEN - len(window)
            if shortfall > 0:
                # only the final window of a clip can come up short
                window = np.pad(window, (0, shortfall), mode="constant")
            out_audio.append({"array": window, "sampling_rate": rate})
            out_sentences.append(sentence)
            out_accents.append(accent)

    return {
        audio_col:  out_audio,
        "sentence": out_sentences,
        "accent":   out_accents,
    }

In [None]:
# Run this in your Colab **after** you’ve defined `train_ds`, `val_ds`, `test_ds`, and `all_ds`:

# 1) Ensure Cython is installed & load extension
!pip install Cython
%load_ext Cython

import numpy as np
import time
from numba import njit
import pyximport

# 2) Constants (reuse your variables)
# Same values as in the cell that defines `segment_batch`.
MAX_LEN   = 5 * 16_000
HOP       = MAX_LEN
audio_col = "audio"

# 3) Original `.map(segment_batch)` timing
# batch_size=1 hands one example per call; the original columns are removed so
# the output holds only what segment_batch returns.
# NOTE: this rebinds `orig_ds`, which the sampling benchmark earlier in the
# notebook also assigns.
start = time.time()
orig_ds = train_ds.map(
    segment_batch,
    batched=True,
    batch_size=1,
    remove_columns=train_ds.column_names,
)
orig_time = time.time() - start
print(f"Original map time: {orig_time:.4f} s, segments: {len(orig_ds)}")

# 4) Numba-optimized segment_batch

@njit
def segment_array_nb(arr, max_len):
    """Split a 1-D array into consecutive rows of ``max_len`` samples.

    Returns a 2-D array of shape ``(ceil(len(arr) / max_len), max_len)`` with
    the dtype of ``arr``; positions past the end of ``arr`` are filled with 0.
    """
    total = arr.shape[0]
    # ceiling division: a partial last window still gets its own row
    n_rows = (total + max_len - 1) // max_len
    out = np.empty((n_rows, max_len), dtype=arr.dtype)
    for row in range(n_rows):
        offset = row * max_len
        for col in range(max_len):
            src = offset + col
            if src < total:
                out[row, col] = arr[src]
            else:
                out[row, col] = 0.0
    return out

def segment_batch_nb(batch):
    """Batch segmentation that delegates the windowing to ``segment_array_nb``.

    Reads the parallel lists ``batch[audio_col]``, ``batch["sentence"]`` and
    ``batch["accent"]`` and returns the same three keys with one entry per
    ``MAX_LEN``-sample window. Clips whose array is ``None`` are skipped.
    """
    clips, sentences, accents = [], [], []
    examples = zip(batch[audio_col], batch["sentence"], batch["accent"])
    for audio_field, sentence, accent in examples:
        samples = audio_field["array"]
        rate = audio_field["sampling_rate"]
        if samples is None:
            continue
        windows = segment_array_nb(samples, MAX_LEN)
        # one output row per window, each repeating the clip's metadata
        clips.extend({"array": window, "sampling_rate": rate} for window in windows)
        sentences.extend([sentence] * len(windows))
        accents.extend([accent] * len(windows))
    return {audio_col: clips, "sentence": sentences, "accent": accents}

# Warm up JIT: call once on a single-example batch built from the first
# training row, so compilation is not part of the timed map below.
_ = segment_batch_nb({
    audio_col:     [train_ds[0][audio_col]],
    "sentence":    [train_ds[0]["sentence"]],
    "accent":      [train_ds[0]["accent"]],
})

# Same map settings as the original run, with the Numba-backed batch function.
start = time.time()
nb_ds = train_ds.map(
    segment_batch_nb,
    batched=True,
    batch_size=1,
    remove_columns=train_ds.column_names,
)
nb_time = time.time() - start
print(f"Numba map time:    {nb_time:.4f} s, segments: {len(nb_ds)}")

# 5) Cython-optimized segment_batch via pyximport
# Install the pyximport import hook; the NumPy include directory is passed
# because the module uses `cimport numpy`.
pyximport.install(setup_args={"include_dirs":[np.get_include()]})

# write Cython helper
# Same algorithm as the Cython kernel in the earlier timing cell; only the
# parameter and local names differ (max_len / out).
cy_code = '''
import numpy as np
cimport numpy as np

def segment_array_cy(np.ndarray[np.float64_t, ndim=1] arr, int max_len):
    cdef Py_ssize_t n = arr.shape[0]
    cdef int hop = max_len
    cdef int segs = (n + hop - 1) // hop
    cdef np.ndarray[np.float64_t, ndim=2] out = np.empty((segs, max_len), dtype=np.float64)
    cdef Py_ssize_t i, j, idx
    for i in range(segs):
        for j in range(max_len):
            idx = i * hop + j
            out[i, j] = arr[idx] if idx < n else 0.0
    return out
'''
with open("seg_cy.pyx", "w") as f:
    f.write(cy_code)

# NOTE: if `seg_cy` was already imported in this kernel session (the earlier
# timing cell imports it too), this statement reuses the cached module and the
# file written above is not rebuilt.
import seg_cy

def segment_batch_cy(batch):
    """Batch segmentation that delegates the windowing to ``seg_cy.segment_array_cy``.

    Reads the parallel lists ``batch[audio_col]``, ``batch["sentence"]`` and
    ``batch["accent"]`` and returns the same three keys with one entry per
    ``MAX_LEN``-sample window. Clips whose array is ``None`` are skipped.
    """
    result = {audio_col: [], "sentence": [], "accent": []}
    triples = zip(batch[audio_col], batch["sentence"], batch["accent"])
    for audio_field, sentence, accent in triples:
        samples = audio_field["array"]
        rate = audio_field["sampling_rate"]
        if samples is not None:
            # every row of the returned matrix is one fixed-length window
            for window in seg_cy.segment_array_cy(samples, MAX_LEN):
                result[audio_col].append({"array": window, "sampling_rate": rate})
                result["sentence"].append(sentence)
                result["accent"].append(accent)
    return result

# Warm up Cython: one call on a single-example batch built from the first
# training row before the timed map.
_ = segment_batch_cy({
    audio_col:     [train_ds[0][audio_col]],
    "sentence":    [train_ds[0]["sentence"]],
    "accent":      [train_ds[0]["accent"]],
})

# Same map settings as the original run, with the Cython-backed batch function.
start = time.time()
cy_ds = train_ds.map(
    segment_batch_cy,
    batched=True,
    batch_size=1,
    remove_columns=train_ds.column_names,
)
cy_time = time.time() - start
print(f"Cython map time:   {cy_time:.4f} s, segments: {len(cy_ds)}")

# 6) Summary
import pandas as pd
from IPython.display import display

# One row per map variant: elapsed seconds and number of produced segments.
# NOTE: this rebinds `df`, which the pandas sampling benchmark also assigns.
# NOTE(review): the recorded timings of the three variants are nearly
# identical, which suggests the segmentation kernel is not the bottleneck of
# `.map` here — confirm with a profiler before optimising it further.
df = pd.DataFrame({
    "method": ["Original", "Numba JIT", "Cython"],
    "time_sec": [orig_time, nb_time, cy_time],
    "segments": [len(orig_ds), len(nb_ds), len(cy_ds)]
})
display(df)


The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


Map:   0%|          | 0/2524 [00:00<?, ? examples/s]

Original map time: 272.6523 s, segments: 2524


Map:   0%|          | 0/2524 [00:00<?, ? examples/s]

Numba map time:    273.9930 s, segments: 2524


Map:   0%|          | 0/2524 [00:00<?, ? examples/s]

Cython map time:   271.7192 s, segments: 2524


Unnamed: 0,method,time_sec,segments
0,Original,272.652287,2524
1,Numba JIT,273.992969,2524
2,Cython,271.719178,2524


In [None]:
import torch
import torch.nn.functional as F

MAX_LEN = 5 * 16000
HOP     = MAX_LEN

def segment_batch_gpu(batch):
    """Cut every clip of a batch into zero-padded ``MAX_LEN``-sample windows with torch.

    Windows start every ``HOP`` samples. Clips may differ in length: each one
    yields ``ceil(len / HOP)`` windows, and its last window is right-padded
    with zeros. The tensor work runs on CUDA when available and on the CPU
    otherwise.

    Args:
        batch: Mapping with parallel lists ``"audio"`` (dicts holding
            ``"array"`` and ``"sampling_rate"``), ``"sentence"`` and
            ``"accent"``.

    Returns:
        Mapping with the same three keys and one entry per window. Window
        arrays are float32 NumPy arrays of length ``MAX_LEN``; each window
        carries the sampling rate, sentence and accent of its source clip.
        An empty batch, or a batch of empty clips, yields three empty lists.
    """
    # 1) one float32 tensor per clip (np.array copies, so inputs are never aliased)
    clips = [
        torch.from_numpy(np.array(a["array"], dtype=np.float32))
        for a in batch["audio"]
    ]

    # 2) windows per clip (ceiling division), matching range(0, len, HOP)
    seg_counts = [-(-clip.shape[0] // HOP) for clip in clips]
    max_segs = max(seg_counts, default=0)
    if max_segs == 0:
        # nothing to cut; unfold would fail on a zero-length axis
        return {"audio": [], "sentence": [], "accent": []}

    # 3) right-pad every clip to a common length that holds `max_segs` windows,
    #    then stack into one [B, padded_len] tensor on the chosen device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    padded_len = (max_segs - 1) * HOP + MAX_LEN
    audios = torch.stack(
        [F.pad(clip, (0, padded_len - clip.shape[0])) for clip in clips],
        dim=0,
    ).to(device)

    # 4) sliding windows: [B, max_segs, MAX_LEN], moved back to CPU / NumPy
    windows = audios.unfold(1, MAX_LEN, HOP).contiguous().cpu().numpy()

    # 5) keep only the windows each clip really has; shorter clips were padded
    #    to the longest one and would otherwise contribute all-zero windows
    out_audio, out_sent, out_acc = [], [], []
    rows = zip(windows, seg_counts, batch["audio"], batch["sentence"], batch["accent"])
    for clip_windows, n_segs, audio, sent, acc in rows:
        for segment in clip_windows[:n_segs]:
            out_audio.append({"array": segment, "sampling_rate": audio["sampling_rate"]})
        out_sent += [sent] * n_segs
        out_acc  += [acc]  * n_segs

    return {
      "audio":     out_audio,
      "sentence":  out_sent,
      "accent":    out_acc,
    }


In [None]:
import time
# Time the torch-based segmentation through `.map`, 16 examples per call.
t0 = time.time()
gpu_ds = train_ds.map(
    segment_batch_gpu,
    batched=True,
    batch_size=16,           # process 16 files at once
    remove_columns=train_ds.column_names,
)
print("GPU map took", time.time() - t0, "seconds")


Map:   0%|          | 0/2524 [00:00<?, ? examples/s]

GPU map took 198.4181296825409 seconds


In [None]:

# Same run with batch_size=1, for comparison with the batch_size=16 timing.
# NOTE: rebinds `gpu_ds` from the previous cell.
t1 = time.time()
gpu_ds = train_ds.map(
    segment_batch_gpu,
    batched=True,
    batch_size=1,           # process one file per call
    remove_columns=train_ds.column_names,
)
print("GPU map took", time.time() - t1, "seconds")


Map:   0%|          | 0/2524 [00:00<?, ? examples/s]

GPU map took 209.6043963432312 seconds
