In [1]:
from datasets import load_from_disk, Audio
from transformers import Wav2Vec2ForPreTraining, Wav2Vec2FeatureExtractor, Trainer, TrainingArguments

# Read the corpus saved on disk and keep only its "train" split; it is
# treated as unlabeled audio from here on.
dataset = load_from_disk(dataset_path="../../kham_asr_dataset")["train"]
# Bare expression: the notebook renders the split's columns and row count.
dataset

Loading dataset from disk:   0%|          | 0/34 [00:00<?, ?it/s]

Dataset({
    features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__', 'audio', 'transcript'],
    num_rows: 67273
})

In [None]:

# Feature extractor shared by the preprocessing function below; its
# configuration is taken from the named pretrained checkpoint.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    pretrained_model_name_or_path="openpecha/Garchen_Rinpoche_stt"
)

# Preprocessing with feature extractor
def preprocess_function(batch, min_duration_s=1.0, sampling_rate=16000):
    """Convert a batch of decoded audio clips into feature-extractor outputs.

    Meant to be used with ``Dataset.map(..., batched=True)``: clips shorter
    than ``min_duration_s`` are dropped, so the returned batch may contain
    fewer rows than the input batch.

    Args:
        batch: Mapping with an ``"audio"`` column. Each entry must expose
            ``"array"`` (the waveform samples) and ``"sampling_rate"``.
        min_duration_s: Minimum clip duration in seconds; shorter clips are
            discarded. Defaults to 1 second.
        sampling_rate: Sampling rate in Hz that every clip is required to
            have. It is also forwarded to the feature extractor.

    Returns:
        ``{"input_values": [...]}`` holding one *unpadded* sequence per kept
        clip, or ``{"input_values": []}`` when no clip survives the filter.

    Raises:
        ValueError: If a clip's ``"sampling_rate"`` differs from
            ``sampling_rate``. Processing such a clip would silently yield
            features at the wrong time scale.
    """
    # Minimum number of samples a clip needs in order to be kept.
    min_length = int(min_duration_s * sampling_rate)

    audio_arrays = []
    for audio in batch["audio"]:
        clip_rate = audio["sampling_rate"]
        if clip_rate != sampling_rate:
            raise ValueError(
                f"Expected audio sampled at {sampling_rate} Hz but got {clip_rate} Hz; "
                "resample the audio column before preprocessing."
            )
        waveform = audio["array"]
        if len(waveform) >= min_length:
            audio_arrays.append(waveform)

    # Nothing left after filtering: an empty column makes a batched map emit
    # zero rows for this batch.
    if not audio_arrays:
        return {"input_values": []}

    # No padding and no tensor conversion here: each clip is stored at its own
    # length so the result does not depend on the map batch size, and padding
    # is left to the data collator at training time.
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=sampling_rate,
        padding=False,
    )

    return {"input_values": inputs.input_values}

# Decode every clip at the 16 kHz rate that preprocess_function assumes.
# Clips stored at another rate are resampled on access instead of being
# silently interpreted at the wrong rate.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# batched=True allows preprocess_function to return fewer rows than it
# receives; batch_size=1 hands it one clip per call. All original columns are
# removed, so only "input_values" remains in the result.
dataset = dataset.map(
    preprocess_function,
    batched=True,
    batch_size=1,
    remove_columns=dataset.column_names,
)

# No follow-up filter is needed: when preprocess_function rejects a clip it
# returns an empty "input_values" list, which adds no row to the mapped
# dataset, so every remaining row is already non-empty.

Map:   0%|          | 0/67273 [00:00<?, ? examples/s]

In [None]:
# Persist the preprocessed dataset so later runs can load it from disk.
dataset.save_to_disk(dataset_path="../Data/kham_pretrain_preprocessed")