In [1]:
#Install All the Required Dependencies
#!pip install --upgrade pip 
#!pip install torch torchvision torchaudio
#!pip install transformers ipywidgets gradio --upgrade
#!pip install --upgrade transformers accelerate
#!pip install --upgrade gradio
#!pip install nltk
#!pip install soundfile
#!pip install librosa numpy jiwer nltk
#!pip install huggingface_hub

In [2]:
#Import Required Libraries
from transformers import pipeline
from jiwer import wer
from transformers import VitsModel, AutoTokenizer, set_seed
import torch
import soundfile as sf
import librosa
from scipy.spatial.distance import euclidean
import numpy as np
import string
import os
import numpy as np
import librosa

In [3]:
#import os
#Select GPU if available
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set PyTorch CUDA memory allocation strategy
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
#print(os.environ.get("PYTORCH_CUDA_ALLOC_CONF")) # Verify that it's set
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"
#print(os.environ.get("PYTORCH_CUDA_ALLOC_CONF")) # Verify that it's set

#Empty GPU Cache
#torch.cuda.empty_cache()
#torch.cuda.memory_summary(device=None, abbreviated=False)
#torch.backends.cudnn.benchmark = True


In [4]:
#Define the HTML banner (title and one-line summary) for the app.
#It is rendered at the top of the interface via gr.Markdown(description).
description = """
<div style="text-align: center;">
    <h1 style="color: #FF0000;">ASPMIR-MULTILINGUAL-TEXT2SPEECH-TESTBED</h1>
</div>

<div style="text-align: center;">
<h3 style="color: #D2691E;">This Tool Allows Developers and Researchers to Carry Out Text2Speech Synthesis with Open Sourced Pretrained/Finetuned Text-to-Speech(TTS) AI Models.</h3>
</div>
"""

In [5]:
#Define Speech Synthesis Function
# Maps each language offered in the UI to its MMS-TTS checkpoint name.
_MMS_TTS_CHECKPOINTS = {
    "Yoruba": "facebook/mms-tts-yor",
    "English": "facebook/mms-tts-eng",
    "French": "facebook/mms-tts-fra",
    "Hausa": "facebook/mms-tts-hau",
    "Igbo": "facebook/mms-tts-ibo",
}

# (model, tokenizer) pairs keyed by checkpoint name, so repeated requests for
# the same language do not re-instantiate the model on every button click.
_MMS_TTS_CACHE = {}


def _load_mms_tts(checkpoint):
    """Return the (model, tokenizer) pair for `checkpoint`, loading it on first use."""
    if checkpoint not in _MMS_TTS_CACHE:
        _MMS_TTS_CACHE[checkpoint] = (
            VitsModel.from_pretrained(checkpoint),
            AutoTokenizer.from_pretrained(checkpoint),
        )
    return _MMS_TTS_CACHE[checkpoint]


def speech_synthesis(modelName, langName, langText):
    """Synthesize speech for `langText` and write it to 'ttsOutput.wav'.

    Args:
        modelName: Model family selected in the UI. Must contain
            "facebook/mms-tts" (case-insensitive).
        langName: One of the keys of `_MMS_TTS_CHECKPOINTS`
            ("Yoruba", "English", "French", "Hausa", "Igbo").
        langText: Text to be spoken in the selected language.

    Returns:
        The path of the written audio file, 'ttsOutput.wav'.

    Raises:
        ValueError: If the model is unsupported, the language is not one of
            the supported languages, or the text is empty/whitespace.
    """
    # Validate everything up front so bad UI input produces a clear error
    # instead of an AttributeError/UnboundLocalError further down.
    # `modelName` is None when nothing has been selected in the dropdown.
    if "facebook/mms-tts" not in (modelName or "").lower():
        raise ValueError("Unsupported model")

    checkpoint = _MMS_TTS_CHECKPOINTS.get(langName)
    if checkpoint is None:
        supported = ", ".join(sorted(_MMS_TTS_CHECKPOINTS))
        raise ValueError(
            f"Unsupported language: {langName!r}. Supported languages: {supported}"
        )

    if not langText or not langText.strip():
        raise ValueError("Input text is empty")

    ttsModel, tokenizer = _load_mms_tts(checkpoint)
    ttsInputs = tokenizer(langText, return_tensors="pt")

    set_seed(555)  # make deterministic
    with torch.no_grad():
        ttsOutput = ttsModel(**ttsInputs).waveform

    # Convert the first (only) waveform in the batch to a numpy array.
    ttsWaveform = ttsOutput.cpu().numpy()[0]

    # Save the waveform using the rate the model was trained with rather than
    # a hard-coded 16000.
    sf.write('ttsOutput.wav', ttsWaveform, ttsModel.config.sampling_rate)
    return 'ttsOutput.wav'

In [6]:
def reset_fields():
    """Return the blank values used to clear the form.

    The 4-tuple is ordered as: model name, language name, input text,
    audio output.
    """
    blank = ""
    no_audio = None
    return blank, blank, blank, no_audio

In [7]:
#Build and Launch the User Interface using Gradio
import gradio as gr
from IPython.display import Audio

# Lay out the app: banner, model/language selectors, text input, the
# generate button with its audio output, and a reset button.
with gr.Blocks() as interface:
    gr.Markdown(description)
    with gr.Row():
        modelName = gr.Dropdown(
            ["facebook/mms-tts"],
            label="Select TTS Model",
            allow_custom_value=True,
        )
    with gr.Row():
        langName = gr.Dropdown(
            [
                "Yoruba",   # Lang1
                "Hausa",    # Lang2
                "Igbo",     # Lang3
                "English",  # Lang4
                "French",   # Lang5
            ],
            label="Select Language for Text2Speech Synthesis",
            allow_custom_value=True,
        )
    with gr.Row():
        inputText = gr.Textbox(
            placeholder="Enter Text for Selected Language Here...",
            label="Input Text",
            lines=6,
        )
    with gr.Row():
        btn = gr.Button("Generate Speech")
    with gr.Row():
        outputAudio = gr.Audio(
            type="filepath",
            label="Click Play/Pause to Generate Speech for Current Text and Reset Button for New Language Selection/Text Input",
        )
    with gr.Row():
        reset_btn = gr.Button("Reset")

    # "Generate Speech" runs the synthesis and loads the returned file path
    # into the audio player.
    btn.click(
        fn=speech_synthesis,
        inputs=[modelName, langName, inputText],
        outputs=outputAudio,
    )
    # "Reset" clears all three inputs and the audio output.
    reset_btn.click(
        fn=reset_fields,
        inputs=[],
        outputs=[modelName, langName, inputText, outputAudio],
    )

# share=True also requests a temporary public URL in addition to the local one.
interface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7868
* Running on public URL: https://6234ab7db336428169.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




config.json:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/145M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/374 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/47.0 [00:00<?, ?B/s]