In [None]:
pip install transformers gradio datasets

In [14]:
from transformers import VitsModel, VitsTokenizer
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
from transformers import pipeline
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import gradio as gr
import numpy as np

In [None]:
# Checkpoint identifiers for the three stages of the cascade.
ASR_CHECKPOINT = "openai/whisper-base"
TRANSLATION_CHECKPOINT = "facebook/mbart-large-50-many-to-many-mmt"
TTS_CHECKPOINT = "facebook/mms-tts-mal"

# Stage 1: automatic speech recognition pipeline.
asr_pipe = pipeline("automatic-speech-recognition", model=ASR_CHECKPOINT)

# Stage 2: text translation model and its tokenizer; the tokenizer's source
# language code is fixed to "en_XX" for every later call.
trnsl_tokenizer = MBart50TokenizerFast.from_pretrained(TRANSLATION_CHECKPOINT)
trnsl_tokenizer.src_lang = "en_XX"
trnsl_model = MBartForConditionalGeneration.from_pretrained(TRANSLATION_CHECKPOINT)

# Stage 3: text-to-speech model and its tokenizer.
tts_tokenizer = VitsTokenizer.from_pretrained(TTS_CHECKPOINT)
tts_model = VitsModel.from_pretrained(TTS_CHECKPOINT)

In [16]:
# Integer sample format for generated audio and the largest value it can hold.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max

# Log of translations, keyed by the transcribed text; written to by
# speech_to_speech_translation.
translations = dict()

def transcribe(audio):
    """Run the ASR pipeline on ``audio`` and return the recognised text.

    The pipeline is invoked with the ``"translate"`` generation task and a cap
    of 256 newly generated tokens.

    Args:
        audio: Input accepted by ``asr_pipe``; the Gradio inputs in this
            notebook pass a file path.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    # `max_new_tokens` goes inside `generate_kwargs`: passing it as a top-level
    # pipeline argument is deprecated in recent Transformers releases, while
    # this form is accepted by old and new versions alike.
    result = asr_pipe(
        audio,
        generate_kwargs={"task": "translate", "max_new_tokens": 256},
    )
    return result["text"]

def translate(english):
    """Translate ``english`` with the mBART model, forcing ``"ml_IN"`` as target.

    Args:
        english: Text to translate; tokenized with ``trnsl_tokenizer``.

    Returns:
        The result of ``batch_decode`` on the generated token ids with special
        tokens skipped. The decoded batch is returned as-is, not unwrapped.
    """
    model_inputs = trnsl_tokenizer(english, return_tensors="pt")
    # Forcing the first generated token to the language code selects the
    # output language.
    target_lang_id = trnsl_tokenizer.lang_code_to_id["ml_IN"]
    output_ids = trnsl_model.generate(**model_inputs, forced_bos_token_id=target_lang_id)
    return trnsl_tokenizer.batch_decode(output_ids, skip_special_tokens=True)

def synthesise(text):
    """Generate a speech waveform for ``text`` with the TTS model.

    Args:
        text: Text handed to ``tts_tokenizer``.

    Returns:
        The model output's ``waveform`` tensor, detached from the autograd graph.
    """
    model_inputs = tts_tokenizer(text=text, return_tensors="pt")
    model_output = tts_model(**model_inputs)
    return model_output.waveform.detach()


def speech_to_speech_translation(audio):
    """Transcribe ``audio``, translate the text, and synthesise it as speech.

    The transcription/translation pair is also recorded in the module-level
    ``translations`` dict, keyed by the transcribed text.

    Args:
        audio: Input forwarded to ``transcribe``.

    Returns:
        A ``(sampling_rate, samples)`` tuple, where ``samples`` is a squeezed
        NumPy array of ``target_dtype`` values, as expected by a Gradio
        ``Audio`` output of type ``"numpy"``.
    """
    english_text = transcribe(audio)
    translated_text = translate(english_text)

    translations[english_text] = translated_text

    waveform = synthesise(translated_text).numpy().squeeze()

    # Clamp to [-1, 1] before scaling: samples outside that range would
    # otherwise overflow the integer cast and produce loud artefacts.
    pcm = (np.clip(waveform, -1.0, 1.0) * max_range).astype(target_dtype)

    # Read the rate from the TTS model config rather than hard-coding it, so it
    # stays correct if the checkpoint is swapped.
    return tts_model.config.sampling_rate, pcm

In [None]:
title = "Cascaded STST"


def _build_translation_interface(source):
    """Create one speech-to-speech translation interface fed from ``source``.

    Args:
        source: Value for the input ``gr.Audio`` component's ``sources``
            argument (``"microphone"`` or ``"upload"`` in this notebook).

    Returns:
        A ``gr.Interface`` that calls ``speech_to_speech_translation`` on the
        recorded/uploaded file path and plays back the generated audio.
    """
    return gr.Interface(
        fn=speech_to_speech_translation,
        inputs=gr.Audio(sources=source, type="filepath"),
        outputs=gr.Audio(label="Generated Speech", type="numpy"),
        title=title,  # previously defined but never shown in the UI
    )


demo = gr.Blocks()

mic_translate = _build_translation_interface("microphone")
file_translate = _build_translation_interface("upload")

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

# debug=True keeps the cell running and prints errors to the cell output.
demo.launch(debug=True)