In [1]:
!pip install --upgrade pip
!pip install --upgrade transformers datasets[audio] accelerate




In [6]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, MarianMTModel, MarianTokenizer
from datasets import load_dataset

In [8]:
# Choose the compute device together with its precision:
# half precision on the first CUDA device when available, full precision on CPU otherwise.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Checkpoint used for speech recognition
model_id = "openai/whisper-large-v3-turbo"

# Load the speech-to-text model and move it onto the selected device
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor supplies both the tokenizer and the feature extractor used below
processor = AutoProcessor.from_pretrained(model_id)

# Build the ASR pipeline around the already-loaded model and processor parts
asr_pipe_options = dict(
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)
asr_pipe = pipeline("automatic-speech-recognition", **asr_pipe_options)

# Translation model/tokenizer loader
def load_translation_model(source_lang='en', target_lang='fr'):
    """Load a MarianMT model and its tokenizer for one language pair.

    The checkpoint name is built as ``Helsinki-NLP/opus-mt-<source_lang>-<target_lang>``.

    Args:
        source_lang: Language code inserted as the source part of the checkpoint name.
        target_lang: Language code inserted as the target part of the checkpoint name.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been moved to the
        module-level ``device``.
    """
    checkpoint = f'Helsinki-NLP/opus-mt-{source_lang}-{target_lang}'

    # Model first, then tokenizer, both from the same checkpoint
    mt_model = MarianMTModel.from_pretrained(checkpoint)
    mt_model = mt_model.to(device)
    mt_tokenizer = MarianTokenizer.from_pretrained(checkpoint)

    return mt_model, mt_tokenizer

# Instantiate the translation model and tokenizer used below (English -> French here)
translation_model, translation_tokenizer = load_translation_model(source_lang='en', target_lang='fr')

# Speech -> text -> translated text
def transcribe_and_translate(audio_sample, generate_kwargs):
    """Transcribe an audio sample with ``asr_pipe`` and translate the text.

    The transcription and the translation are printed as they become available.

    Args:
        audio_sample: Input handed unchanged to ``asr_pipe``.
        generate_kwargs: Accepted for the caller's convenience; it is not read
            anywhere in this function body.

    Returns:
        ``(transcription, translation)`` on success, or ``(None, None)`` if any
        step raised an exception (the error message is printed instead).
    """
    try:
        # Speech recognition: keep only the text field of the pipeline result
        text = asr_pipe(audio_sample)["text"]
        print("Transcription:", text)

        # Tokenize the transcript and move every tensor to the model's device
        encoded = translation_tokenizer(text, return_tensors="pt", padding=True)
        on_device = {}
        for field, tensor in encoded.items():
            on_device[field] = tensor.to(device)

        # Translate and decode; only the first decoded sequence is used
        output_ids = translation_model.generate(**on_device)
        decoded = translation_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        first_translation = decoded[0]
        print("Translation:", first_translation)  # Assuming one output
    except Exception as err:
        print(f"Error processing audio: {err}")
        return None, None
    else:
        return text, first_translation

# Generation settings handed to transcribe_and_translate as its second argument
generate_kwargs = dict(
    max_new_tokens=448,
    num_beams=4,
    temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
)

# Audio file to process -- update with your audio file path
sample = "/content/sample2.flac"

# Left as the final expression so the notebook displays the returned tuple
transcribe_and_translate(sample, generate_kwargs)

# Note: If you encounter a warning about "sacremoses", you can install it using:
# !pip install sacremoses



Transcription:  before he had time to answer a much encumbered vera burst into the room with the question i say can i leave these here these were a small black pig and a lusty specimen of black-red game-cock
Translation: avant qu'il n'ait eu le temps de répondre à une vera beaucoup grevée a éclaté dans la pièce avec la question je dis que je peux laisser ceux-là ici ce sont un petit cochon noir et un spécimen lustré de noir-rouge game-cock


(' before he had time to answer a much encumbered vera burst into the room with the question i say can i leave these here these were a small black pig and a lusty specimen of black-red game-cock',
 "avant qu'il n'ait eu le temps de répondre à une vera beaucoup grevée a éclaté dans la pièce avec la question je dis que je peux laisser ceux-là ici ce sont un petit cochon noir et un spécimen lustré de noir-rouge game-cock")