In [None]:
'''
Remember to copy vocab.json, merges.txt, and the other tokenizer/processor files into the checkpoint folder if they are stored in a different folder.
'''

In [None]:
import librosa
import soundfile as sf
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

In [None]:
#Directory holding the fine-tuned checkpoint (weights plus processor/tokenizer files)
model_dir = "/path/to/your/model/checkpoint-5000"

#Restore the fine-tuned Whisper weights from the checkpoint directory
model = WhisperForConditionalGeneration.from_pretrained(model_dir)

#Restore the matching processor from the same directory; it is used below
#through its feature_extractor and tokenizer attributes
processor = WhisperProcessor.from_pretrained(model_dir)


In [None]:
#Load your audio file (soundfile returns multi-channel audio as a 2D array
#shaped (frames, channels); mono audio comes back as a 1D array)
audio_input, original_sample_rate = sf.read("/path/to/audio/test.wav")

#Down-mix to mono BEFORE resampling: librosa.resample operates along the last
#axis by default, which for a (frames, channels) array is the channel axis
#rather than time, so resampling a stereo array directly would corrupt it.
audio_mono = audio_input.mean(axis=1) if audio_input.ndim > 1 else audio_input

#Resample the audio to 16000 Hz
audio_input_16000 = librosa.resample(audio_mono, orig_sr=original_sample_rate, target_sr=16000)


In [None]:
#Ensure the audio array is 1D: if a second (channel) axis is still present,
#average across it so the feature extractor receives a single waveform
if audio_input_16000.ndim > 1:
    audio_input_16000 = audio_input_16000.mean(axis=1)

#Preprocess the audio input using the feature extractor directly;
#return_tensors="pt" asks for PyTorch tensors
input_features = processor.feature_extractor(
    audio_input_16000,
    sampling_rate=16000,
    return_tensors="pt",
).input_features

#Prepare decoder input ids (start of sequence token), shaped (1, 1) for a batch of one
decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])

#Perform inference; no_grad disables gradient tracking, which is not needed here
with torch.no_grad():
    generated_ids = model.generate(
        #Passed positionally: Whisper's main input is named `input_features`,
        #and the `inputs=` keyword is deprecated in newer Transformers releases.
        input_features,
        decoder_input_ids=decoder_input_ids,
        max_length=225,
        num_beams=5,
        early_stopping=True
    )


In [None]:
#Map the generated token ids back to text, leaving out special tokens
tokenizer = processor.tokenizer
transcription = tokenizer.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)

#Show the first entry of the decoded batch
print("Transcription:", transcription[0])
