In [50]:
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import librosa

In [52]:
# Whisper checkpoint identifier. The processor and the model are both loaded
# from this same name so that they belong to the same checkpoint.
model_name = "openai/whisper-small"
# Processor: used below to turn raw audio into model input features and to
# decode generated token ids back into text.
processor = WhisperProcessor.from_pretrained(model_name)
# Model: its generate() call produces the token ids for a chunk of audio.
model = WhisperForConditionalGeneration.from_pretrained(model_name)

In [53]:
def load_audio(file_path):
    """Read an audio file and return its samples resampled to 16 kHz.

    Args:
        file_path: Path of the audio file, handed to ``librosa.load``.

    Returns:
        The sample data returned by ``librosa.load``. The sample rate that
        ``librosa.load`` reports alongside it is discarded, since it was
        requested explicitly as 16000.
    """
    samples, _ = librosa.load(file_path, sr=16000)
    return samples

In [56]:
def split_audio(audio, chunk_length=30, sr=16000):
    """Split an audio sequence into consecutive fixed-length chunks.

    Args:
        audio: Sliceable sequence of samples (anything that supports
            ``len()`` and slicing, e.g. a list or a 1-D array).
        chunk_length: Length of each chunk in seconds. May be fractional.
        sr: Sample rate in samples per second, used to convert
            ``chunk_length`` into a sample count. Defaults to 16000.

    Returns:
        A list of slices of ``audio`` in their original order. Every chunk
        holds ``int(chunk_length * sr)`` samples except possibly the last,
        which holds the remainder. Empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_length * sr`` amounts to less than one sample
            (zero or negative chunk length).
    """
    # range() needs an integer step, so a fractional chunk_length (e.g. 7.5 s)
    # is truncated to a whole number of samples instead of raising TypeError.
    chunk_size = int(chunk_length * sr)
    if chunk_size <= 0:
        # A zero step would make range() fail and a negative one would
        # silently produce no chunks at all; reject both explicitly.
        raise ValueError(
            f"chunk_length * sr must be at least one sample, got {chunk_length} * {sr}"
        )
    return [
        audio[start:start + chunk_size]
        for start in range(0, len(audio), chunk_size)
    ]

In [58]:
def transcribe_audio(audio):
    """Transcribe one chunk of audio with the module-level Whisper model.

    Args:
        audio: Audio samples for a single chunk. They are passed to the
            module-level ``processor`` with ``sampling_rate=16000``.

    Returns:
        The decoded text of the first sequence produced by
        ``model.generate``, with special tokens removed.
    """
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
    # Put the features on whatever device the model lives on, so this keeps
    # working if the model is moved to a GPU. When both are already on the
    # same device (e.g. CPU) this is a no-op.
    input_features = inputs.input_features.to(model.device)
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    # A single chunk was submitted, so only the first decoded entry is returned.
    return transcription[0]

In [60]:
def transcribe_long_audio(file_path):
    """Transcribe an audio file of arbitrary length, chunk by chunk.

    The file is loaded with ``load_audio``, cut into chunks with
    ``split_audio`` (using its default chunk length), and every chunk is
    transcribed with ``transcribe_audio``. One progress line is printed per
    chunk.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The chunk transcriptions joined by single spaces, in chunk order.
        Returns an empty string when there are no chunks or no chunk
        produced any text.
    """
    # Load the audio
    audio = load_audio(file_path)

    # Split the audio into chunks
    chunks = split_audio(audio)

    # Transcribe each chunk
    # NOTE(review): chunks are cut at fixed positions, so a sentence (or a
    # word) that straddles a boundary ends up split across two transcriptions.
    parts = []
    for i, chunk in enumerate(chunks):
        print(f"Transcribing chunk {i + 1} of {len(chunks)}...")
        # Strip each piece before joining: decoded chunk text can carry its
        # own surrounding whitespace, which previously showed up as double
        # spaces at the chunk seams of the combined transcription.
        text = transcribe_audio(chunk).strip()
        if text:
            parts.append(text)

    return " ".join(parts)

In [62]:
# Location of the MP3 file to transcribe. Point this at your own file before
# running: the value below is specific to the original author's machine.
# (Raw string, so the Windows backslashes need no escaping.)
audio_path = r"C:\Users\bibha\Downloads\STT_Whisper_audio (online-audio-converter.com).mp3"


In [64]:
# Transcribe the long audio file: runs the load -> split -> per-chunk
# transcription pipeline, which prints one progress line per chunk.
transcription = transcribe_long_audio(audio_path)
# Show the combined text of all chunks.
print("Full Transcription:", transcription)

Transcribing chunk 1 of 3...
Transcribing chunk 2 of 3...
Transcribing chunk 3 of 3...
Full Transcription: Hello everyone, my name is Biba Kumari. My project name is STT Whisper. This project demonstrates how to use Whisper's model via Hugging Phase Transformers to convert MP3 audio files into text implemented in Python using a Jupyter notebook for ease of understanding and experimentation. It includes steps for loading and pre-processing audio files, transcribing them using the Whisper model and evaluating the model's  performance on custom audio inputs. The whisper model supports the transcription and can automatically detect the language of the audio, making it versatile for applications like transcribing meetings, generating subtitles, or aiding language learning. Ideal for AI ML enthusiasts, developers, and researchers, this project provides a user-friendly well-documented implementation that can be customized for specific users.  cases such as fine tuning for domain specific task