In [1]:
from datasets import load_from_disk, Audio
from transformers import Wav2Vec2ForPreTraining, Wav2Vec2FeatureExtractor, Trainer, TrainingArguments

# Load the dataset saved on disk and keep only its 'train' split.
dataset_dict = load_from_disk("../Data/kham_asr_dataset")
dataset = dataset_dict['train']
dataset

Loading dataset from disk:   0%|          | 0/34 [00:00<?, ?it/s]

Dataset({
    features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__', 'audio', 'transcript'],
    num_rows: 67273
})

In [2]:
# Peek at the first record (note: this renders the full decoded audio array).
first_example = dataset[0]
first_example

{'file_name': 'STT_KH0058_0123_605150_to_609454',
 'uni': 'ཁྱེད་རང་ཚོ་ཚང་མ་དེ་ག་རེ་རེད་ཟེར་ཡོང་དུས་ཙམ་པ་ཡ། མི་གཞུངས་དྲང་པོ་རེད་ཟེར། ཨེ་ནས།  ',
 'wylie': 'khyed rang tsho tshang ma de ga re red zer yong dus tsam pa ya/_mi gzhungs drang po red zer/_e nas/__',
 'url': 'https://d38pmlk0v88drf.cloudfront.net/wav16k/STT_KH0058_0123_605150_to_609454.wav',
 'dept': 'STT_KH',
 'grade': 2,
 'char_len': 81,
 'audio_len': 4.304,
 '__index_level_0__': 0,
 'audio': {'array': [-0.00079345703125,
   -0.000823974609375,
   -0.002227783203125,
   -0.002197265625,
   3.0517578125e-05,
   -3.0517578125e-05,
   -0.002593994140625,
   -0.00274658203125,
   -0.001861572265625,
   -0.002044677734375,
   -0.001434326171875,
   -0.001007080078125,
   -0.001861572265625,
   -0.0013427734375,
   -0.000579833984375,
   -0.0008544921875,
   -0.000823974609375,
   -0.000579833984375,
   -0.00054931640625,
   -0.0018310546875,
   -0.00189208984375,
   -0.001708984375,
   -0.00115966796875,
   -0.000946044921875,
   -

In [3]:

# Build the feature extractor from the wav2vec2-base checkpoint's preprocessor config.
checkpoint = "facebook/wav2vec2-base"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(checkpoint)

# Preprocessing with feature extractor
def preprocess_function(batch, min_length=16000, sampling_rate=16000, extractor=None):
    """Run the feature extractor over a batch of waveforms, dropping short clips.

    Meant to be used with ``Dataset.map(..., batched=True)`` where all input
    columns are removed, so the number of returned rows may be smaller than
    the number of rows in ``batch``.

    Args:
        batch: Mapping with an ``"audio"`` column; each entry is a dict whose
            ``"array"`` value holds the waveform samples.
        min_length: Minimum number of samples a clip must have to be kept.
            The default (16000) corresponds to 1 second at 16 kHz.
        sampling_rate: Sampling rate reported to the feature extractor.
        extractor: Callable used to process the waveforms. When ``None`` the
            notebook-level ``feature_extractor`` is used.

    Returns:
        ``{"input_values": values}`` with one *unpadded* entry per kept clip,
        or ``{"input_values": []}`` when every clip in the batch was too short.
    """
    # Keep only clips that are long enough.
    kept_arrays = [
        audio["array"]
        for audio in batch["audio"]
        if len(audio["array"]) >= min_length
    ]

    if not kept_arrays:
        return {"input_values": []}

    if extractor is None:
        extractor = feature_extractor

    # Deliberately no padding and no tensor conversion here: padding to the
    # longest clip of the batch would be baked into the stored examples
    # (hiding each clip's real length and inflating the saved dataset).
    # Padding is left to the data collator at training time.
    inputs = extractor(kept_arrays, sampling_rate=sampling_rate)

    return {"input_values": inputs.input_values}

# Apply the preprocessing in batches of 32, replacing every original column
# with whatever preprocess_function returns.
original_columns = dataset.column_names
dataset = dataset.map(
    preprocess_function,
    remove_columns=original_columns,
    batched=True,
    batch_size=32,
)

# Keep only rows whose processed waveform is non-empty.
dataset = dataset.filter(lambda example: len(example["input_values"]) > 0)

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

In [4]:
# Write the preprocessed dataset to a local directory.
output_dir = "kham_asr_preprocessed"
dataset.save_to_disk(output_dir)

Saving the dataset (0/63 shards):   0%|          | 0/67273 [00:00<?, ? examples/s]