In [1]:
!pip install transformers datasets torchaudio librosa pydub

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torchaudio
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor, MBartForConditionalGeneration, MBartTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os
import librosa

In [3]:
# Function to load and preprocess audio
def load_audio(file_path, target_sampling_rate=16000):
    audio, sr = librosa.load(file_path, sr=target_sampling_rate)
    return audio, sr

In [4]:
# Load Whisper model for speech recognition
def load_whisper_model():
    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
    return processor, model

In [5]:
# Perform speech-to-text
def speech_to_text(audio_path, processor, model):
    audio, sr = load_audio(audio_path)
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt").input_features
    predicted_ids = model.generate(inputs)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription

In [6]:
# Load mBART model for text translation
def load_translation_model():
    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
    return model, tokenizer

In [7]:
# Perform text translation
def translate_text(input_text, tokenizer, model, src_lang, tgt_lang):
    tokenizer.src_lang = src_lang
    inputs = tokenizer(input_text, return_tensors="pt")
    generated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang])
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    return translated_text

In [8]:
# Speech translation pipeline
def speech_translation_pipeline(audio_path, src_lang, tgt_lang):
    # Step 1: Load models
    whisper_processor, whisper_model = load_whisper_model()
    mbart_model, mbart_tokenizer = load_translation_model()

    # Step 2: Perform Speech-to-Text
    transcription = speech_to_text(audio_path, whisper_processor, whisper_model)
    print(f"Transcription: {transcription}")

    # Step 3: Perform Translation
    translated_text = translate_text(transcription, mbart_tokenizer, mbart_model, src_lang, tgt_lang)
    print(f"Translated Text: {translated_text}")

    return transcription, translated_text

In [12]:
# Test with an example audio file
if __name__ == "__main__":
    # Upload audio file (must be in .wav format)
    from google.colab import files
    uploaded = files.upload()
    audio_file = list(uploaded.keys())[0]

    # Define source and target languages
    src_language = "en_XX"  # English
    tgt_language = "fr_XX"  # French

    # Run the pipeline
    transcription, translation = speech_translation_pipeline(audio_file, src_language, tgt_language)

    print("\nSpeech Translation Completed!")

Saving WhatsApp Ptt 2024-12-06 at 10.43.18 AM.ogg to WhatsApp Ptt 2024-12-06 at 10.43.18 AM.ogg


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


Transcription:  Hello, how are you?
Translated Text: Bonjour, comment va-t-il?

Speech Translation Completed!
