In [None]:
# !pip install datasets gradio

In [2]:
from transformers import pipeline, VitsModel, VitsTokenizer
from datasets import load_dataset
import torch
import gradio as gr
import numpy as np
from IPython.display import Audio

In [None]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# load speech translation checkpoint
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)

# load text-to-speech checkpoint and speaker embeddings
# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
# using VITS MMS TTS instead of T5 TTS

model = VitsModel.from_pretrained("facebook/mms-tts-deu")

tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-deu")

def translate(audio):
    try:
        outputs = asr_pipe(audio, generate_kwargs={"task": "translate", "return_timestamps": True})
        return outputs["text"]
    except Exception as e:
        print(f"Error in translation: {e}")
        return "Error during translation"


def synthesise(text):
    try:
        inputs = tokenizer(text, return_tensors="pt")
        input_ids = inputs["input_ids"]
        with torch.no_grad():
            outputs = model(input_ids)
        speech = outputs["waveform"]
        speech = speech.cpu()
        return speech.squeeze()
    except Exception as e:
        print(f"Error in synthesis: {e}")
        return None



def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    print('translated text:\t', translated_text)
    if translated_text == "Error during translation":
        return None, None # Return None for both outputs in case of translation error.

    synthesised_speech = synthesise(translated_text)

    if synthesised_speech is None:
        return None, None # Return None for both outputs in case of synthesis error.

    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return 16000, synthesised_speech


title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in German. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:

![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

demo = gr.Blocks()

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Microphone(type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

# demo.launch(debug=True, height=600)
demo.launch(height=600)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://57a2466a511e038e48.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




translated text:	  Reputation won't last if you don't do your job. The clinic is part of your job. I want you to do your job. As the philosopher Jagger once said, you can't always get what you want. Oh, I looked into that philosopher you quoted. Jagger. And you're right. You can't always get what you want. But as it turns out, if you try sometimes, you get what you need. You come to this age, come to power.


### Testing the translation - synthesis methods

In [12]:
translated_text = translate("./ycagwyw.wav")
print('translated text:\t', translated_text)



translated text:	  Reputation won't last if you don't do your job. The clinic is part of your job. I want you to do your job. As the philosopher Jagger once said, you can't always get what you want. Oh, I looked into that philosopher you quoted. Jagger. And you're right. You can't always get what you want. But as it turns out, if you try sometimes, you get what you need. You come to this age, come to power.


In [10]:
text_example = ("Du kannst nicht immer bekommen, was du willst, aber wenn du es manchmal versuchst, wirst du feststellen, dass du bekommst, was du brauchst.")

synthesised_speech = synthesise(text_example)

Audio(synthesised_speech, rate=16000)