In [1]:
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from safetensors import safe_open
import librosa

# Base checkpoint: provides the architecture, plus the starting value of every
# parameter that the local weights file does not override.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")  # Use any model architecture

safetensor_path = "./model/model.safetensors"  # Path to your saved safetensors file

# Read every tensor stored in the safetensors file into a plain dict on the CPU.
with safe_open(safetensor_path, framework="pt", device="cpu") as f:
    tensors = {k: f.get_tensor(k) for k in f.keys()}

# strict=False tolerates key mismatches, so report them instead of discarding
# the result: otherwise a file whose keys match nothing would silently leave
# the base checkpoint weights in place.
# NOTE(review): a weight that is tied to another parameter may be listed as
# missing even though it was effectively loaded - confirm before treating a
# reported key as an error.
load_result = model.load_state_dict(tensors, strict=False)
if load_result.missing_keys:
    print("Model keys not found in", safetensor_path, "->", load_result.missing_keys)
if load_result.unexpected_keys:
    print("Keys in", safetensor_path, "not used by the model ->", load_result.unexpected_keys)

processor = WhisperProcessor.from_pretrained("openai/whisper-small")

def transcribe_audio(file_path, language=None, task=None):
    """Transcribe one audio file with the notebook's Whisper model.

    Uses the module-level ``model`` and ``processor`` objects.

    Args:
        file_path: Path of the audio file. It is read with ``librosa.load``
            at a 16 kHz sampling rate.
        language: Optional language forwarded to ``model.generate``. When
            ``None`` (default) the argument is not passed at all, so
            generation behaves exactly as it did before this parameter
            existed.
        task: Optional task forwarded to ``model.generate`` (for example
            ``"transcribe"`` or ``"translate"``). When ``None`` (default)
            the argument is not passed.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    audio_array, sampling_rate = librosa.load(file_path, sr=16000)

    input_features = processor(
        audio_array, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features

    # Forward only the options the caller actually set, so the default call
    # stays identical to model.generate(input_features).
    generate_kwargs = {}
    if language is not None:
        generate_kwargs["language"] = language
    if task is not None:
        generate_kwargs["task"] = task

    with torch.no_grad():
        generated_ids = model.generate(input_features, **generate_kwargs)

    # batch_decode returns one string per sequence; a single clip yields one.
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Example usage: run the local model on a single clip and print the text.
audio_file_path = "./train_marathimale_00020.wav"  # Replace with your audio file path
result = transcribe_audio(audio_file_path)
print(f"Transcription: {result}")

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcription:  Clubs and balls, and cities, grew to be only memories. 


In [None]:
import requests

import os

API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3-turbo"

# Read the token from the HF_TOKEN environment variable so a real key never has
# to be saved inside the notebook. The "XXXX" placeholder is used only when the
# variable is unset.
headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', 'XXXX')}"}  # Set HF_TOKEN, or replace XXXX with your API key

def query(filename, timeout=120):
    """POST the raw bytes of a file to ``API_URL`` and return the parsed JSON reply.

    Uses the module-level ``API_URL`` and ``headers``.

    Args:
        filename: Path of the file whose bytes are sent as the request body.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.
            Pass ``None`` to wait indefinitely, which is what the call did
            before this parameter was added.

    Returns:
        The result of ``response.json()``. The HTTP status code is not
        checked, so the body is returned as-is whatever the status.
    """
    with open(filename, "rb") as f:
        data = f.read()
    # Without a timeout, requests can block forever if the endpoint never answers.
    response = requests.post(API_URL, headers=headers, data=data, timeout=timeout)
    return response.json()

# Send the sample clip to the hosted model and keep the parsed JSON reply.
output = query(filename="train_marathimale_00020.wav")

In [9]:
# Display the value that query() returned for the sample clip.
output

{'text': ' Clubs and walls and cities grew to be only memories.'}