In [51]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

# Load the Whisper checkpoint and its matching processor.
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
# Clear forced_decoder_ids on the model config for this first run.
model.config.forced_decoder_ids = None

# Load the dummy dataset and turn its first audio clip into model input features.
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = ds[0]["audio"]
input_features = processor(
    sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt"
).input_features

# Generate token ids once; they are decoded twice below.
predicted_ids = model.generate(input_features)

# Decode with special tokens kept. This used to be assigned to `transcription` and
# overwritten by the very next statement, so the result could never be inspected;
# it now lives under its own name.
transcription_with_special_tokens = processor.batch_decode(predicted_ids, skip_special_tokens=False)

# Decode with special tokens stripped; `transcription` ends with the same value as before.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)


In [90]:
import torch
import torchaudio
from transformers import WhisperProcessor, pipeline
from datasets import load_dataset
import evaluate
import tempfile
import os

# Step 1: fetch the "test" split of the Tamil ("ta") configuration of Common Voice 11.0.
dataset = load_dataset(
    path="mozilla-foundation/common_voice_11_0",
    name="ta",
    split="test",
)


In [64]:
# Display the "audio" entry of the first example (the captured output shows its
# 'path', 'array' and 'sampling_rate' fields).
dataset[0]["audio"]

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/256dec25b50e8072243364b2a76958f3978dd0a3272161faa2272dcf50c8fa20/ta_test_0/common_voice_ta_26622621.mp3',
 'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        3.34414472e-05, 2.36578981e-05, 1.05088511e-05]),
 'sampling_rate': 48000}

In [91]:
import torchaudio

def speech_file_to_array_fn(batch):
    """Load the audio file referenced by ``batch["path"]`` and attach it to the row.

    Adds three keys to ``batch`` and returns it:

    * ``"speech"``: the first channel of the loaded waveform, converted with ``.numpy()``
    * ``"sampling_rate"``: the rate reported by ``torchaudio.load``
    * ``"target_text"``: a copy of ``batch["sentence"]``
    """
    waveform, native_rate = torchaudio.load(batch["path"])

    # Only channel 0 is kept; any further channels are ignored.
    first_channel = waveform[0]
    batch["speech"] = first_channel.numpy()
    batch["sampling_rate"] = native_rate

    reference_text = batch["sentence"]
    batch["target_text"] = reference_text
    return batch


In [92]:
# Load every clip from disk and drop all original columns, so only the keys added by
# speech_file_to_array_fn remain on the mapped dataset.
dataset = dataset.map(
    function=speech_file_to_array_fn,
    remove_columns=dataset.column_names,
)

In [93]:
import librosa
import numpy as np

def resample(batch, target_sr=16_000):
    """Resample ``batch["speech"]`` to ``target_sr`` Hz and record the new rate.

    Args:
        batch: Mapping with a ``"speech"`` sequence of samples and, normally, the
            ``"sampling_rate"`` those samples were recorded at.
        target_sr: Output sampling rate in Hz. Defaults to 16 000, the value that
            was previously hard-coded.

    Returns:
        The same ``batch`` with ``"speech"`` replaced by the resampled signal and
        ``"sampling_rate"`` set to ``target_sr``.
    """
    speech_array = np.asarray(batch["speech"])

    # Use the rate stored with the row instead of assuming 48 kHz: a clip recorded at
    # another rate, or a row that has already been through this function, would
    # otherwise be resampled by the wrong ratio. The old 48 kHz constant is kept only
    # as a fallback for rows that carry no rate at all.
    orig_sr = batch["sampling_rate"] if "sampling_rate" in batch else 48_000

    batch["speech"] = librosa.resample(speech_array, orig_sr=orig_sr, target_sr=target_sr)
    batch["sampling_rate"] = target_sr
    return batch

In [94]:
# Apply `resample` to every example, spreading the work over four worker processes.
dataset = dataset.map(
    function=resample,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/11815 [00:00<?, ? examples/s]

In [98]:
# Pick the example at index 2 of the processed dataset for a single-clip check.
sample = dataset[2]

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [102]:
# Reload the checkpoint and set the decoder prompt ids for Tamil ("ta") transcription
# on the model config.
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="ta", task="transcribe")

# Convert the previously selected `sample` into model input features.
input_features = processor(
    sample["speech"],
    sampling_rate=sample["sampling_rate"],
    return_tensors="pt",
).input_features

# Generate token ids.
predicted_ids = model.generate(input_features)

# Decode and print twice: first with special tokens kept, then with them stripped.
# `transcription` ends up holding the stripped version.
for skip_special_tokens in (False, True):
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=skip_special_tokens)
    print(transcription)

['<|startoftranscript|><|ta|><|transcribe|><|notimestamps|> அந்த பாட்டின் இரண்டாவது பகத்தில் இதை வண்ணது திரிவலை ஆடலை சொல்லுகிறார்']
[' அந்த பாட்டின் இரண்டாவது பகத்தில் இதை வண்ணது திரிவலை ஆடலை சொல்லுகிறார்']


In [None]:
def transcribe(batch):
    """Transcribe one example with the notebook-level ``processor`` and ``model``.

    Args:
        batch: Mapping with ``"speech"`` (audio samples) and ``"sampling_rate"``.

    Returns:
        The same ``batch`` with a ``"transcription"`` key holding whatever
        ``processor.batch_decode(..., skip_special_tokens=True)`` returns
        (a list of strings in the outputs captured earlier in this notebook).
    """
    input_features = processor(
        batch["speech"], sampling_rate=batch["sampling_rate"], return_tensors="pt"
    ).input_features

    # Keep the inputs on the same device as the model. Without this, generation
    # fails with a device mismatch as soon as the model is moved off the CPU;
    # when the model is on the CPU this is a no-op.
    input_features = input_features.to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    # Decode token ids to text, dropping special tokens.
    batch["transcription"] = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return batch