In [54]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [36]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("Ko-Yin-Maung/mig-burmese-audio-transcription")
ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'speaker', 'sex', 'age', 'size', 'duration', 'title', 'category', 'type', 'bit_rate'],
        num_rows: 2682
    })
    test: Dataset({
        features: ['audio', 'transcription', 'speaker', 'sex', 'age', 'size', 'duration', 'title', 'category', 'type', 'bit_rate'],
        num_rows: 140
    })
})

In [37]:
ds = ds.remove_columns(['speaker', 'sex', 'age', 'size', 'duration', 'title', 'category', 'type', 'bit_rate'])

In [38]:
ds = ds['test']
ds

Dataset({
    features: ['audio', 'transcription'],
    num_rows: 140
})

In [48]:
from datasets import load_dataset, Audio

ds = load_dataset("Ko-Yin-Maung/mig-burmese-audio-transcription", split="train")

# Make sure audio column is cast without decoding so we get full path
ds = ds.cast_column("audio", Audio(decode=False))

# Check first example's audio info
print(ds[0]["audio"])


{'bytes': b'RIFF\x12[\x01\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\x80>\x00\x00\x00}\x00\x00\x02\x00\x10\x00data\xa4Z\x01\x00\n\x00\xc9\xff\xaa\xff\xd0\xffB\xffT\xffv\xfe\x9f\xfem\xfe&\xfen\xfeS\xfdS\xfd\x87\xfc\x1a\xfc\xb0\xfbZ\xfa\xf4\xf9\xc3\xf8U\xf9[\xfad\xfcZ\xfeU\xfe(\xfe\x82\xfd\xd6\xfdp\xfe\x87\xfeT\xfeb\xfe#\xff&\x01\xc1\x02\xf1\x03\x16\x04}\x03J\x03q\x02\xc1\x01\xc2\x00\xf0\xff\xcf\xff\xbe\xffo\x00\xc5\x00\xf8\x00(\x01\xa0\x00%\x00\xe8\xfe\xfc\xfd\\\xfdO\xfd\xb3\xfd/\xfe\xe7\xfe\x98\xff\x8c\x00\x0f\x01$\x019\x01\xfd\x00\x17\x01\x0b\x01 \x01{\x01\xe4\x01\xb9\x02\x93\x03F\x04\xc7\x04\x86\x04\x1e\x04}\x03\xd7\x02,\x02\x04\x02E\x02q\x02\xdc\x02\xc5\x02\xd2\x02\x81\x02B\x02\x02\x02\xea\x00"\x00\x13\x00)\x00\xed\x00\xe2\x00%\x01\x8e\x018\x01>\x01\xb6\x00\xd0\xff\x02\xff\xb5\xfe\xa7\xfe?\xfe6\xfeo\xfd\xbb\xfc\xeb\xfc\xa9\xfc8\xfc\xac\xfb\xb2\xfa\xb5\xf9x\xf9W\xf8\xb5\xf7\x02\xf7\xb3\xf5\x0f\xf5\xae\xf6\x00\xfa\x1a\xff"\x03\x0e\x03[\x02\x89\x01\xe9\x01\xc6\x02\n\x01\xd3\x00\xb3\x0

In [49]:
import io
import soundfile as sf
import numpy as np
from IPython.display import Audio

audio_bytes = ds[0]["audio"]["bytes"]
byte_io = io.BytesIO(audio_bytes)

# Read audio data and sample rate from bytes buffer
data, samplerate = sf.read(byte_io)

# Play audio using IPython.display.Audio
Audio(data, rate=samplerate)


In [51]:
import os
import csv

os.makedirs("audio_files", exist_ok=True)

with open("mig_burmese_metadata.tsv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["audio_path", "transcription"])

    for example in ds:
        filename = example["audio"]["path"]
        audio_bytes = example["audio"]["bytes"]
        transcription = example["transcription"].replace("\t", " ").replace("\n", " ")

        filepath = os.path.join("audio_files", filename)

        # Save bytes to wav file
        with open(filepath, "wb") as wav_file:
            wav_file.write(audio_bytes)

        writer.writerow([filepath, transcription])


In [52]:
import csv
from IPython.display import Audio

with open("mig_burmese_metadata.tsv", "r", encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter="\t")
    first = next(reader)

print("Sentence:", first["transcription"])
print("Audio path:", first["audio_path"])

Audio(first["audio_path"])


Sentence: ကျွန်တော် အစမှာပြောခဲ့တဲ့ လောကကြီး ၊ စကြာဝဠာကြီးက အရာရာကို
Audio path: audio_files/80-20-Hypothesis_147.wav


In [None]:
import csv
from pathlib import Path
import random
import numpy as np
import soundfile as sf
import nlpaug.augmenter.audio as naa

class AudioAugmentationPipeline:
    def __init__(self, sr):
        self.sr = sr
        self.augmenters = {
            'loudness': naa.LoudnessAug(),
            'crop': naa.CropAug(sampling_rate=sr),
            'mask': naa.MaskAug(sampling_rate=sr, zone=(0.0, 1.0), coverage=0.1, mask_with_noise=False, stateless=True),
            'noise': naa.NoiseAug(),
            'pitch': naa.PitchAug(sampling_rate=sr, factor=(2, 3)),
            'shift': naa.ShiftAug(sampling_rate=sr),
            'speed': naa.SpeedAug(zone=(0.0, 1.0), coverage=1.0, factor=(1.5, 1.5)),
            'vtlp': naa.VtlpAug(sampling_rate=sr),
        }

    def augment(self, data, augmenter_name):
        if augmenter_name not in self.augmenters:
            raise ValueError(f"Augmentation '{augmenter_name}' is not supported.")
        if len(data) < 100:
            raise ValueError("Audio too short.")
        augmented = self.augmenters[augmenter_name].augment(data)
        return np.array(augmented, dtype=np.float32)

class HF_Augmentation:
    def __init__(self, metadata_path, output_dir, sr=16000):
        self.metadata_path = Path(metadata_path)
        self.output_dir = Path(output_dir)
        self.sr = sr
        self.pipeline = AudioAugmentationPipeline(sr=sr)

        with open(metadata_path, "r", encoding="utf-8", newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            self.entries = [row for row in reader if len(row) >= 2]

    def augment(self, percent, methods):
        total = len(self.entries)
        sample_count = max(1, int(total * percent / 100))
        selected = random.sample(self.entries, sample_count)

        aug_meta_path = self.output_dir / "aug_metadata.txt"
        self.output_dir.mkdir(parents=True, exist_ok=True)

        for method in methods:
            method_dir = self.output_dir / method
            method_dir.mkdir(exist_ok=True)

            with open(aug_meta_path, "w", encoding="utf-8") as meta_out:
                for wav_path_str, text in selected:
                    wav_path = Path(wav_path_str).resolve()

                    if not wav_path.exists():
                        print(f"Warning: file not found: {wav_path}")
                        continue

                    data, sr = sf.read(wav_path)
                    if sr != self.sr:
                        print(f"Warning: sample rate mismatch for {wav_path}: expected {self.sr}, got {sr}")

                    augmented = self.pipeline.augment(data, method)

                    if augmented.ndim == 2:
                        augmented = augmented.T
                        if augmented.shape[1] == 1:
                            augmented = augmented.squeeze()

                    max_val = np.max(np.abs(augmented))
                    if max_val > 0:
                        augmented = augmented / max_val * 0.8

                    out_filename = f"{method}_{wav_path.name}"
                    out_path = method_dir / out_filename
                    sf.write(out_path, augmented.astype(np.float32), self.sr, format="WAV", subtype="PCM_16")

                    meta_out.write(f"{out_path.resolve()}\t{text.strip()}\n")

                    print(f"Augmented: {out_path}")


In [None]:
pipeline = HF_Augmentation("/content/mig_burmese_metadata.tsv", "output_of_aug")
pipeline.augment(1, ["noise", "pitch"])

Augmented: output_of_aug/noise/noise_ChitOoNyo_Wait_for_me_256.wav
Augmented: output_of_aug/noise/noise_80-20-Hypothesis_112.wav
Augmented: output_of_aug/noise/noise_The_brain_that_continues_to_function_when_people_are_about_to_die_124.wav
Augmented: output_of_aug/noise/noise_UNu_Does_hell_have_horns_426.wav
Augmented: output_of_aug/noise/noise_UNu_Does_hell_have_horns_488.wav
Augmented: output_of_aug/noise/noise_ChitOoNyo_Wait_for_me_168.wav
Augmented: output_of_aug/noise/noise_UNu_Does_hell_have_horns_216.wav
Augmented: output_of_aug/noise/noise_King_Wan_Landon_Diary_419.wav
Augmented: output_of_aug/noise/noise_ChitOoNyo_Wait_for_me_330.wav
Augmented: output_of_aug/noise/noise_UNu_Does_hell_have_horns_168.wav
Augmented: output_of_aug/noise/noise_UNu_Does_hell_have_horns_506.wav
Augmented: output_of_aug/noise/noise_ChitOoNyo_Wait_for_me_283.wav
Augmented: output_of_aug/noise/noise_Studying_how_trees_ommunicate_with_each_other_056.wav
Augmented: output_of_aug/noise/noise_King_Wan_Lando