# Automatic Speech Recognition

In [13]:
# Restrict transformers log output to errors only so the cell outputs below
# are not cluttered with library messages.
from transformers.utils import logging
logging.set_verbosity_error()
# IPythonAudio renders an inline audio player; librosa is used to load the
# audio clips from disk.
from IPython.display import Audio as IPythonAudio
import librosa

Testing OpenAI's Whisper Model

In [None]:
# Load the pretrained Whisper "base" checkpoint. The processor prepares the
# audio features and decodes token ids; the model generates the token ids.
from transformers import WhisperProcessor, WhisperForConditionalGeneration

whisper_checkpoint = "openai/whisper-base"
processor = WhisperProcessor.from_pretrained(whisper_checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(whisper_checkpoint)

In [39]:
# Clear the decoder's forced prompt ids on the model config so that no
# language/task tokens are forced; the model then predicts the language and
# task itself.
# NOTE(review): newer transformers releases may read this setting from
# `model.generation_config` rather than `model.config` — confirm this
# assignment still takes effect with the installed version.
model.config.forced_decoder_ids = None

In [18]:
# Transcribe an English clip; no forced decoder ids are passed to `generate`
# in this cell.
audio_path = "pue_599_long_935_eng-005000-008544-2-1.wav"
y, sr = librosa.load(audio_path, sr=16000)  # request a 16 kHz sample rate
display(IPythonAudio(y, rate=sr))  # inline player for listening to the clip

# Convert the waveform into the PyTorch input features the model consumes.
encoded_audio = processor(y, sampling_rate=sr, return_tensors="pt")
input_features = encoded_audio.input_features

# Generate token ids, then decode them to text with special tokens dropped.
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)

[" to announce the phone's price and release date the effort come as phone cells are slowing."]


In [32]:
# now setting context tokens for decoder forced
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")

y, sr = librosa.load("ch_512_long_708_eng-005000-009088-2-1.wav", sr=16000)
display(IPythonAudio(y, rate=sr))

input_features = processor(y, sampling_rate=sr, return_tensors="pt").input_features

# generate token ids
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)

# decode token ids to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)

[' The Earth Trojan is elusive. Today scientists have only discussed one Earth Trojan asteroid.']


In [43]:
# Same forced-decoding flow, this time on Hindi speech: the decoder prompt
# pins the language to Hindi and the task to transcription.
forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="hindi",
    task="transcribe",
)

audio_path = "ahd_48_long_112_hin-002500-005500-2-1.wav"
y, sr = librosa.load(audio_path, sr=16000)  # request a 16 kHz sample rate
display(IPythonAudio(y, rate=sr))  # inline player for listening to the clip

# Convert the waveform into the PyTorch input features the model consumes.
encoded_audio = processor(y, sampling_rate=sr, return_tensors="pt")
input_features = encoded_audio.input_features

# Generate token ids with the forced prompt applied.
predicted_ids = model.generate(
    input_features,
    forced_decoder_ids=forced_decoder_ids,
)

# Decode the token ids to text with special tokens dropped.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)

[' चाहने एक बाज लगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा नगा']


So OpenAI's Whisper (the `whisper-base` checkpoint used here) fails on this Hindi sample: the output collapses into a repeated-token loop.
Let's try **Meta's MMS model**.

In [None]:
# Load Meta's MMS checkpoint: a processor plus a Wav2Vec2 CTC model.
import torch
from transformers import AutoProcessor, Wav2Vec2ForCTC

model_id = "facebook/mms-1b-all"

# NOTE: these assignments rebind `processor` and `model`, replacing the
# Whisper objects created earlier in the notebook.
processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

In [None]:
# Prepare MMS for Hindi: point the tokenizer at the "hin" target language and
# load the matching adapter into the model.
# Debug aid: print(processor.tokenizer.vocab.keys()) — presumably lists the
# available target-language codes; verify before relying on it.
target_lang = "hin"
processor.tokenizer.set_target_lang(target_lang)
model.load_adapter(target_lang)

In [14]:
# Transcribe the same Hindi clip that was given to Whisper, now with MMS.
y, sr = librosa.load("ahd_48_long_112_hin-002500-005500-2-1.wav", sr=16000)
display(IPythonAudio(y, rate=sr))  # inline player for listening to the clip

# Pass the rate returned by the loader instead of repeating the literal, so
# the two values cannot drift apart if the load rate is ever changed (this
# also matches how the Whisper cells call the processor).
inputs = processor(y, sampling_rate=sr, return_tensors="pt")

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(**inputs).logits

# Greedy decoding: argmax over the last axis of the logits, keep the first
# entry (presumably the only item in the batch), then decode the ids to text.
ids = torch.argmax(outputs, dim=-1)[0]
transcription = processor.decode(ids)
print(transcription)

उन्हों ने कहा क्या इंडरसन यह कहना चाहते है कि यदि आप इंग्लैंड में रन बनाते हैं तो ही आप पर अच्छे बल्ले बाज होने का ठप्पा लगेगा
