In [11]:
!pip install --quiet "transformers>=4.38" "peft>=0.10" "accelerate>=0.28" "torch>=2.2" "datasets" "evaluate" "librosa"

In [None]:

import torch
import pandas as pd
from transformers import pipeline, AutoModelForSpeechSeq2Seq, WhisperProcessor
from peft import PeftModel
import os


# Base checkpoint identifier and the PEFT adapter checkpoint that is applied on top of it.
model_id = "ixxan/whisper-small-uyghur-common-voice"
peft_model_path = "results/whisper-small-uyghur-finetuned-peft/checkpoint-2000"

print(f"Loading fine-tuned model from: {peft_model_path}")

# Load the base speech-to-text model, attach the adapter stored at
# `peft_model_path`, then merge and unload the adapter so that `model` is the
# merged result rather than the PEFT wrapper.
base_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
model = PeftModel.from_pretrained(base_model, peft_model_path).merge_and_unload()
print(" Fine-tuned model loaded and merged.")


# Processor (tokenizer + feature extractor) taken from the same base checkpoint
# as the model, configured for the transcription task.
processor = WhisperProcessor.from_pretrained(model_id, task="transcribe")

# Choose the inference backend at runtime instead of assuming Apple Silicon, so
# the notebook also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    inference_device = "mps"
elif torch.cuda.is_available():
    inference_device = "cuda"
else:
    inference_device = "cpu"

# Half precision on accelerators, full precision on CPU.
inference_dtype = torch.float32 if inference_device == "cpu" else torch.float16

# The model above was loaded without an explicit dtype. Cast it here so the
# weights and the dtype handed to the pipeline always agree.
# NOTE(review): passing `torch_dtype` to `pipeline()` together with an
# already-instantiated model does not appear to convert the weights on every
# transformers release, which can lead to a Half/Float mismatch or to the
# requested dtype being ignored - confirm against the installed version.
model = model.to(dtype=inference_dtype)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=inference_dtype,
    device=inference_device,
)
# Message kept verbatim; the device actually selected is `inference_device`.
print(" New inference pipeline created for Mac.")


print("Starting BATCHED transcription...")

# Read the test manifest and resolve every listed file relative to the dataset root.
BASE_PATH = "the-uyghur-voice-cup/"
test_df = pd.read_csv(os.path.join(BASE_PATH, "test.csv"))
audio_paths = []
for relative_path in test_df['filepath']:
    audio_paths.append(os.path.join(BASE_PATH, relative_path))

# Run the whole list through the ASR pipeline, two files per batch, forcing
# the "transcribe" task at generation time.
decoding_options = {"task": "transcribe"}
transcriptions = pipe(audio_paths, batch_size=2, generate_kwargs=decoding_options)

# Keep only the recognised text of each result, in the same order as the manifest.
final_texts = []
for result in transcriptions:
    final_texts.append(result['text'])

# Pair each ID with its transcription and write the submission file.
submission_columns = {"ID": test_df['ID'], "transcription": final_texts}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv("submission.csv", index=False)
print("\n New submission file 'submission.csv' has been created!")

Loading fine-tuned model from: results/whisper-small-uyghur-finetuned-peft/checkpoint-2000
 Fine-tuned model loaded and merged.


Device set to use mps


 New inference pipeline created for Mac.
Starting BATCHED transcription...
