In [25]:

!pip install torch transformers librosa soundfile
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import librosa
import numpy as np



In [26]:
# Checkpoint identifier shared by the two from_pretrained() calls below.
model_name = "openai/whisper-large"  # Use the large model for best accuracy
# Processor: used by transcribe_audio to turn raw samples into model inputs
# and to decode generated token ids back into text.
processor = WhisperProcessor.from_pretrained(model_name)
# Model: used by transcribe_audio through model.generate(...).
model = WhisperForConditionalGeneration.from_pretrained(model_name)

In [27]:
def load_audio(file_path):
    """Read an audio file with librosa, requesting a 16 kHz sampling rate.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        The audio samples returned by ``librosa.load``; the sampling rate that
        librosa returns alongside them is discarded.
    """
    # librosa.load returns (samples, rate); only the samples are needed here.
    return librosa.load(file_path, sr=16000)[0]

In [28]:
def split_audio(audio, chunk_length=30, sr=16000):
    """Split an audio signal into consecutive, non-overlapping chunks.

    Args:
        audio: Sliceable sequence of samples (e.g. the array returned by
            ``load_audio``).
        chunk_length: Length of each chunk in seconds. May be fractional.
        sr: Sampling rate of ``audio`` in Hz. Defaults to 16000, the rate
            ``load_audio`` requests.

    Returns:
        A list of slices of ``audio``. Every chunk holds
        ``int(chunk_length * sr)`` samples except possibly the last one, which
        holds whatever remains. An empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_length * sr`` amounts to less than one sample.
    """
    # range() needs an integer step, so truncate to whole samples; this also
    # makes fractional chunk lengths (e.g. 0.5 s) usable.
    chunk_size = int(chunk_length * sr)
    if chunk_size <= 0:
        # A negative step would otherwise silently produce no chunks at all.
        raise ValueError(
            f"chunk_length * sr must be at least one sample, got {chunk_length} * {sr}"
        )
    return [audio[start:start + chunk_size] for start in range(0, len(audio), chunk_size)]

In [29]:
def transcribe_audio(audio, language=None):
    """Transcribe one audio clip with the module-level Whisper ``model``.

    Args:
        audio: Audio samples; they are handed to ``processor`` with
            ``sampling_rate=16000``, so they are expected to be sampled at
            16 kHz (as ``load_audio`` requests).
        language: Optional language forwarded to ``model.generate`` (e.g.
            ``"fr"``). When falsy, no language argument is passed and the
            choice is left to the model.

    Returns:
        The decoded text of the first sequence returned by ``model.generate``,
        with special tokens stripped.
    """
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

    # Put the features on the model's device and dtype so the call keeps
    # working if the model has been moved to a GPU or cast to half precision.
    # This is a no-op for the default CPU / float32 setup.
    input_features = inputs.input_features.to(device=model.device, dtype=model.dtype)

    # Only forward `language` when the caller supplied one.
    generate_kwargs = {"language": language} if language else {}
    predicted_ids = model.generate(input_features, **generate_kwargs)

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

In [30]:
def transcribe_any_audio(file_path, language=None):
    """Transcribe an audio file of any length.

    The file is read with ``load_audio``. Audio of at most 30 * 16000 samples
    is transcribed in one call; longer audio is cut up with ``split_audio``
    and each chunk is transcribed in order.

    Args:
        file_path: Path of the audio file to transcribe.
        language: Optional language forwarded to ``transcribe_audio``.

    Returns:
        The transcription text. For long audio, the per-chunk texts are joined
        with single spaces and the result is stripped of surrounding
        whitespace.
    """
    samples = load_audio(file_path)

    # Short clip: a single pass is enough, so return straight away.
    if not len(samples) > 30 * 16000:
        print("Transcribing short audio...")
        return transcribe_audio(samples, language=language)

    # Long clip: transcribe chunk by chunk, reporting progress as we go.
    pieces = split_audio(samples)
    total = len(pieces)
    print(f"Transcribing long audio in {total} chunks...")

    texts = []
    for number, piece in enumerate(pieces, start=1):
        print(f"Transcribing chunk {number} of {total}...")
        texts.append(transcribe_audio(piece, language=language))

    return " ".join(texts).strip()

In [31]:

# Input file handed to transcribe_any_audio below.
# NOTE(review): this is an absolute, machine-specific Windows path; point it at
# your own audio file before running the notebook.
audio_path = "C:\\Users\\bibha\\Downloads\\STT_Whisper_audio (online-audio-converter.com).mp3"

In [32]:
# Language passed to transcribe_any_audio (optional, e.g., "fr" for French,
# "es" for Spanish, "hi" for Hindi).
language = None  # Set to None for auto-detection.

In [33]:
# Run the whole pipeline on the configured file and print the resulting text.
transcription = transcribe_any_audio(audio_path, language=language)
print("Full Transcription:", transcription)


Transcribing long audio in 3 chunks...
Transcribing chunk 1 of 3...
Transcribing chunk 2 of 3...
Transcribing chunk 3 of 3...
Full Transcription: Hello everyone, my name is Biba Kumari. My project name is STT Whisper. This project demonstrates how to use Whispers model via Hugging Phase Transformers to convert MP3 audio files into text implemented in Python using a Jupyter notebook for ease of understanding and experimentation. It includes steps for loading and pre-processing audio files, transcribing them using the Whisper model and evaluating the model's  performance on custom audio inputs. The Whisper model supports the transcription and can automatically detect the language of the audio, making it versatile for applications like transcribing meetings, generating subtitles or aiding language learning. Ideal for AI ML enthusiasts, developers and researchers, this project provides a user-friendly, well-documented implementation that can be customized for specific  cases such as fine t