In [1]:
!pip install nlpaug



In [1]:
from datasets import load_dataset, Audio

# Load the "train" split of the Burmese audio transcription dataset.
ds = load_dataset("Ko-Yin-Maung/mig-burmese-audio-transcription", split="train")

# Turn off automatic decoding of the "audio" column (the original note here
# read "Decode Problem"). Later cells read the raw "bytes" / "path" fields of
# each example and decode or save them themselves.
ds = ds.cast_column("audio", Audio(decode=False))

#print(ds[0]["audio"])


  from .autonotebook import tqdm as notebook_tqdm
Using the latest cached version of the dataset since Ko-Yin-Maung/mig-burmese-audio-transcription couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\ASUS\.cache\huggingface\datasets\Ko-Yin-Maung___mig-burmese-audio-transcription\default\0.0.0\fe2cc61628db7751570b2772d5dc875f0ca14f18 (last modified on Thu Jul 10 15:36:23 2025).


In [2]:
import io
import soundfile as sf
import numpy as np
from IPython.display import Audio

# Sanity check: decode the first example's raw bytes and play it inline.
audio_bytes = ds[0]["audio"]["bytes"]
byte_io = io.BytesIO(audio_bytes)

data, samplerate = sf.read(byte_io)

# soundfile returns multi-channel audio as (frames, channels) whereas
# IPython's Audio expects (channels, frames). `.T` is a no-op for mono (1-D)
# data, so this is safe for both cases.
Audio(data.T, rate=samplerate)  # Test audio


In [3]:
import os
import csv

os.makedirs("audio_files", exist_ok=True)

# Export every example as an audio file under audio_files/ and record one
# TSV row per example: (audio_path, transcription).
with open("mig_burmese_metadata.tsv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["audio_path", "transcription"])

    for i, example in enumerate(ds):
        audio_bytes = example["audio"]["bytes"]

        # Keep each transcription on a single physical TSV line.
        transcription = (
            example["transcription"]
            .replace("\t", " ")
            .replace("\r", " ")
            .replace("\n", " ")
        )

        # Keep only the last path component: an absolute path would make
        # os.path.join() discard "audio_files", and a nested one would point
        # at a directory that does not exist.
        filename = os.path.basename(example["audio"].get("path") or "")
        if not filename:
            # Missing or empty path: fall back to an index-based name.
            filename = f"audio_{i}.wav"

        filepath = os.path.join("audio_files", filename)

        with open(filepath, "wb") as wav_file:
            wav_file.write(audio_bytes)

        writer.writerow([filepath, transcription])


In [None]:
#@markdown Alternative export cell. It used to raise an error when an example had no audio path; it now falls back to a generated file name, like the export cell above.
import os
import csv

os.makedirs("audio_files", exist_ok=True)

with open("mig_burmese_metadata.tsv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["audio_path", "transcription"])

    for i, example in enumerate(ds):
        audio_bytes = example["audio"]["bytes"]

        # Keep each transcription on a single physical TSV line.
        transcription = (
            example["transcription"]
            .replace("\t", " ")
            .replace("\r", " ")
            .replace("\n", " ")
        )

        # The "path" field may be missing/None (os.path.join would raise) or
        # contain directories; keep only the file name and fall back to an
        # index-based name when there is none.
        filename = os.path.basename(example["audio"].get("path") or "")
        if not filename:
            filename = f"audio_{i}.wav"

        filepath = os.path.join("audio_files", filename)

        with open(filepath, "wb") as wav_file:
            wav_file.write(audio_bytes)

        writer.writerow([filepath, transcription])


In [5]:
import csv
from IPython.display import Audio

# Pull just the first data row out of the metadata TSV; DictReader uses the
# header line for the keys.
with open("mig_burmese_metadata.tsv", "r", encoding="utf-8") as f:
    first = next(csv.DictReader(f, delimiter="\t"))

# Show the transcription and the stored path for that row.
for label, column in (("Sentence:", "transcription"), ("Audio path:", "audio_path")):
    print(label, first[column])

# Last expression of the cell: render an inline player for the saved file.
Audio(first["audio_path"])


Sentence: ကျွန်တော် အစမှာပြောခဲ့တဲ့ လောကကြီး ၊ စကြာဝဠာကြီးက အရာရာကို
Audio path: audio_files\80-20-Hypothesis_147.wav


In [None]:
import csv
from pathlib import Path
import random
import numpy as np
import soundfile as sf
import nlpaug.augmenter.audio as naa

class AudioAugmentationPipeline:
    """Holds a named set of nlpaug audio augmenters and applies one on request."""

    def __init__(self, sr):
        """Create the augmenter registry.

        Args:
            sr: Sampling rate handed to every augmenter that takes a
                ``sampling_rate`` argument.
        """
        self.sr = sr

        registry = {}
        registry['loudness'] = naa.LoudnessAug()
        registry['crop'] = naa.CropAug(sampling_rate=sr)
        registry['mask'] = naa.MaskAug(
            sampling_rate=sr,
            zone=(0.0, 1.0),
            coverage=0.1,
            mask_with_noise=False,
            stateless=True,
        )
        registry['noise'] = naa.NoiseAug()
        registry['pitch'] = naa.PitchAug(sampling_rate=sr, factor=(2, 3))
        registry['shift'] = naa.ShiftAug(sampling_rate=sr)
        registry['speed'] = naa.SpeedAug(zone=(0.0, 1.0), coverage=1.0, factor=(1.5, 1.5))
        registry['vtlp'] = naa.VtlpAug(sampling_rate=sr)
        self.augmenters = registry

    def augment(self, data, augmenter_name):
        """Run one registered augmenter over ``data``.

        Args:
            data: Audio samples; must have a length of at least 100.
            augmenter_name: Key of the augmenter in ``self.augmenters``.

        Returns:
            The augmenter's output converted to a float32 NumPy array.

        Raises:
            ValueError: If the name is not registered, or if ``data`` has
                fewer than 100 elements.
        """
        chosen = self.augmenters.get(augmenter_name)
        if chosen is None:
            raise ValueError(f"Augmentation '{augmenter_name}' is not supported.")

        sample_total = len(data)
        if sample_total < 100:
            raise ValueError("Audio too short.")

        result = chosen.augment(data)
        return np.array(result, dtype=np.float32)

class HF_Augmentation:
    """Augment a random subset of the audio files listed in a TSV metadata file.

    The metadata file is tab-separated with the audio path in the first column
    and the transcription in the second.
    """

    def __init__(self, metadata_path, output_dir, sr=16000):
        """Load the metadata rows and build the augmentation pipeline.

        Args:
            metadata_path: Path to the tab-separated metadata file.
            output_dir: Directory that receives the augmented audio files and
                the ``aug_metadata.txt`` index.
            sr: Sampling rate used to configure the augmenters.
        """
        self.metadata_path = Path(metadata_path)
        self.output_dir = Path(output_dir)
        self.sr = sr
        self.pipeline = AudioAugmentationPipeline(sr=sr)

        with open(self.metadata_path, "r", encoding="utf-8", newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            rows = [row for row in reader if len(row) >= 2]

        # The export cell writes an "audio_path<TAB>transcription" header row;
        # it is not a sample and must not be drawn by random.sample().
        if rows and rows[0][0].strip().lower() == "audio_path":
            rows = rows[1:]
        self.entries = rows

    def augment(self, percent, methods):
        """Augment ``percent`` % of the entries with every method in ``methods``.

        For each method, augmented files are written to
        ``<output_dir>/<method>/<method>_<original name>`` and one line
        ``<absolute path>\\t<transcription>`` is appended to
        ``<output_dir>/aug_metadata.txt`` (the file is recreated per call and
        holds the rows of all methods).

        Args:
            percent: Percentage of entries to sample. At least one entry is
                used when any exist, and never more than are available.
            methods: Iterable of augmenter names known to the pipeline.

        Raises:
            ValueError: Propagated from the pipeline for an unsupported method
                name or for audio that is too short.
        """
        total = len(self.entries)
        if total == 0:
            # random.sample() would raise on an empty population.
            print("Warning: no metadata entries to augment.")
            return

        # Clamp to the population size so percent > 100 does not raise.
        sample_count = min(total, max(1, int(total * percent / 100)))
        selected = random.sample(self.entries, sample_count)

        self.output_dir.mkdir(parents=True, exist_ok=True)
        aug_meta_path = self.output_dir / "aug_metadata.txt"

        # Open the index once for the whole run: reopening it in "w" mode for
        # every method would keep only the last method's rows.
        with open(aug_meta_path, "w", encoding="utf-8") as meta_out:
            for method in methods:
                method_dir = self.output_dir / method
                method_dir.mkdir(exist_ok=True)

                for row in selected:
                    # Rows may carry extra columns; only the first two are used.
                    wav_path_str, text = row[0], row[1]
                    wav_path = Path(wav_path_str).resolve()

                    if not wav_path.exists():
                        print(f"Warning: file not found: {wav_path}")
                        continue

                    data, sr = sf.read(wav_path)
                    if sr != self.sr:
                        print(f"Warning: sample rate mismatch for {wav_path}: expected {self.sr}, got {sr}")

                    augmented = self.pipeline.augment(data, method)

                    # The pipeline may hand back a 2-D array (presumably when
                    # the augmenter wraps the signal in a list -- confirm for
                    # the installed nlpaug version); flip it to
                    # (frames, channels) and flatten a single channel.
                    if augmented.ndim == 2:
                        augmented = augmented.T
                        if augmented.shape[1] == 1:
                            augmented = augmented.squeeze()

                    # Peak-normalise to 0.8 to avoid clipping in PCM_16.
                    # NOTE(review): this likely undoes much of a pure gain
                    # change such as 'loudness' -- confirm that is intended.
                    max_val = np.max(np.abs(augmented))
                    if max_val > 0:
                        augmented = augmented / max_val * 0.8

                    out_filename = f"{method}_{wav_path.name}"
                    out_path = method_dir / out_filename
                    # Nothing is resampled, so stamp the file with the rate it
                    # was actually read at; using self.sr on a mismatch would
                    # change playback speed and pitch.
                    sf.write(out_path, augmented.astype(np.float32), sr, format="WAV", subtype="PCM_16")

                    meta_out.write(f"{out_path.resolve()}\t{text.strip()}\n")

                    print(f"Augmented: {out_path}")


In [None]:
# Use the same relative metadata path the export cell writes, so this cell
# works from the notebook's working directory and not only under Colab's
# /content directory.
pipeline = HF_Augmentation("mig_burmese_metadata.tsv", "output_of_aug")

# Augment 1% of the entries with the "noise" and "pitch" augmenters.
pipeline.augment(1, ["noise", "pitch"])

Augmented: output_of_aug/noise/noise_ChitOoNyo_Wait_for_me_256.wav
Augmented: output_of_aug/noise/noise_80-20-Hypothesis_112.wav
Augmented: output_of_aug/noise/noise_The_brain_that_continues_to_function_when_people_are_about_to_die_124.wav
Augmented: output_of_aug/noise/noise_UNu_Does_hell_have_horns_426.wav
Augmented: output_of_aug/noise/noise_UNu_Does_hell_have_horns_488.wav
Augmented: output_of_aug/noise/noise_ChitOoNyo_Wait_for_me_168.wav
Augmented: output_of_aug/noise/noise_UNu_Does_hell_have_horns_216.wav
Augmented: output_of_aug/noise/noise_King_Wan_Landon_Diary_419.wav
Augmented: output_of_aug/noise/noise_ChitOoNyo_Wait_for_me_330.wav
Augmented: output_of_aug/noise/noise_UNu_Does_hell_have_horns_168.wav
Augmented: output_of_aug/noise/noise_UNu_Does_hell_have_horns_506.wav
Augmented: output_of_aug/noise/noise_ChitOoNyo_Wait_for_me_283.wav
Augmented: output_of_aug/noise/noise_Studying_how_trees_ommunicate_with_each_other_056.wav
Augmented: output_of_aug/noise/noise_King_Wan_Lando