# Speech to Text Using the OpenAI Whisper Pre-trained Model

Load the Model

In [1]:
import whisper

# Load the pre-trained "base" Whisper checkpoint; every later cell reuses it.
model = whisper.load_model(name="base")

Set Up the SpeechRecognition Library and Record the Audio

In [67]:
import speech_recognition as sr

# Recognizer instance used for ambient-noise calibration and audio capture.
r = sr.Recognizer()
# The input device index is machine-specific; list the candidates with
# sr.Microphone.list_microphone_names() and adjust if index 1 is wrong.
mic = sr.Microphone(device_index=1)

# Record from the selected microphone. Recording stops automatically once
# silence is recognized.
with mic as source:
    r.adjust_for_ambient_noise(source)
    print("Start Talking")
    audio = r.listen(source)
print("Done Recording")

# Save the captured audio as a WAV file. The with-block closes the file on
# exit, so no explicit close() call is needed afterwards.
with open("input_audio.wav", "wb") as wav_file:
    wav_file.write(audio.get_wav_data())
print("Recording File Saved")


Start Talking
Done Recording
Recording File Saved


Load the Audio and Pad or Trim It to a 30-Second Span

In [68]:
# Read the saved recording, then pad or trim it in a single expression.
theAudio = whisper.pad_or_trim(whisper.load_audio("input_audio.wav"))

Compute the Log-Mel Spectrogram and Move It to the Same Device as the Model

In [69]:
# Build the log-Mel spectrogram first, then place it on the model's device.
mel_spectrogram = whisper.log_mel_spectrogram(theAudio)
m = mel_spectrogram.to(model.device)

Detect the Language of the Recorded Audio

In [70]:
# detect_language returns a pair; only the per-language probabilities are used.
_, probs = model.detect_language(m)
# Pick the language key with the highest probability.
detected_language = max(probs, key=probs.get)
print(f"Detected language: {detected_language}")

Detected language: id


Decode the Audio

In [71]:
# Decode the spectrogram with half-precision (fp16) explicitly disabled.
options = whisper.DecodingOptions(fp16=False)
output = whisper.decode(model, m, options=options)

Get the Text from Converted Audio

In [72]:
# Pull the recognized text off the decoding result and show it.
transcription = output.text
print(transcription)

Halo, abri nyanyi! Waaah! Oh maaagaaah!


Delete File After Extracting the Text

In [73]:
import os

# Remove the temporary recording. Skip the call when the file is already gone
# so this cell can be re-run without raising FileNotFoundError.
if os.path.exists("input_audio.wav"):
    os.remove("input_audio.wav")