In [1]:
from datasets import load_dataset

# Identifier of the dataset to load; kept in one place so it is easy to swap.
DATASET_ID = 'ganga4364/stt_tibetan_dialects_data'

# Load the dataset; the bare `ds` on the last line displays its splits.
ds = load_dataset(DATASET_ID)
ds

DatasetDict({
    train: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__'],
        num_rows: 112870
    })
    validation: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__'],
        num_rows: 8000
    })
})

In [2]:
# Keep only Kham-dialect rows whose transcript is not blank.
KHAM_DEPTS = ("STT_KH", "STT_KH_AB")


def _is_kham_with_transcript(example):
    """Return True when `dept` is one of KHAM_DEPTS and `uni` is not empty after stripping."""
    return example["dept"] in KHAM_DEPTS and example["uni"].strip() != ''


ds = ds.filter(_is_kham_with_transcript)
ds

DatasetDict({
    train: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__'],
        num_rows: 67273
    })
    validation: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__'],
        num_rows: 4000
    })
})

In [6]:
import librosa
import requests
import tempfile

def prepare_dataset(batch, timeout=30.0):
    """Download one example's audio, decode it at 16 kHz and attach the transcript.

    Args:
        batch: A single example (mapping) that has a ``"url"`` key pointing at
            the audio file and a ``"uni"`` key holding the transcript text.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``. The
            default keeps a stalled download from blocking a worker forever.

    Returns:
        The same mapping with two keys added: ``"audio"`` (a dict with the
        decoded ``"array"`` and its ``"sampling_rate"``) and ``"transcript"``
        (a copy of ``batch["uni"]``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # requests never times out on its own, so an explicit limit is required;
    # otherwise one hung connection stalls the whole `map` job.
    response = requests.get(batch["url"], timeout=timeout)
    response.raise_for_status()

    # librosa is given a file path, so the payload is spooled to a temporary
    # file that is removed automatically when the `with` block exits.
    with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
        tmp.write(response.content)
        tmp.flush()  # make sure the bytes are on disk before reopening by name

        # Decode the file, asking librosa for a 16 kHz sampling rate.
        waveform, sr = librosa.load(tmp.name, sr=16000)

    batch["audio"] = {"array": waveform, "sampling_rate": sr}
    batch["transcript"] = batch["uni"]
    return batch

# Run prepare_dataset over every example, spread across worker processes.
NUM_PROC = 4
ds = ds.map(function=prepare_dataset, num_proc=NUM_PROC)

Map (num_proc=4):   0%|          | 0/67273 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [7]:
# Persist the processed dataset so later runs can reload it from this directory.
OUTPUT_DIR = 'kham_asr_dataset'
ds.save_to_disk(OUTPUT_DIR)

Saving the dataset (0/34 shards):   0%|          | 0/67273 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]