In [17]:
# Required Imports
import torch
import torchaudio
from transformers import pipeline
import IPython.display as ipd
import os
import datetime

In [18]:
# Supported languages (you can expand this).
# Each entry pairs the language name accepted by load_tts_model with the model
# id that is handed to the text-to-speech pipeline.
# NOTE(review): confirm every checkpoint id below is actually published before
# relying on it.
LANG_MODELS = dict(
    [
        ("english", "facebook/mms-tts-eng"),
        ("french", "facebook/mms-tts-fra"),
        ("german", "facebook/mms-tts-deu"),
        ("arabic", "facebook/mms-tts-ara"),
        ("spanish", "facebook/mms-tts-spa"),
        ("japanese", "facebook/mms-tts-jpn"),
    ]
)

In [19]:
# Load the TTS model
def load_tts_model(language="english"):
    """Create the text-to-speech pipeline for one of the supported languages.

    Args:
        language: Key into ``LANG_MODELS`` selecting which model to load.

    Returns:
        The object returned by ``pipeline("text-to-speech", model=...)``.

    Raises:
        ValueError: If ``language`` is not a key of ``LANG_MODELS``.
    """
    is_supported = language in LANG_MODELS
    if not is_supported:
        raise ValueError(f"Unsupported language: {language}")

    checkpoint = LANG_MODELS[language]
    print(f"🔄 Loading TTS model for language: {language} ({checkpoint})")
    synthesizer = pipeline("text-to-speech", model=checkpoint)
    print("✅ Model successfully loaded.")
    return synthesizer

In [20]:
# Text-to-Speech Synthesis Function
def synthesize_and_save_audio(text, tts_model, output_dir="outputs", filename=None, play_audio=True, save=True):
    """Synthesize speech for ``text`` and optionally save and/or play it.

    Args:
        text: Non-empty string to synthesize.
        tts_model: Callable that takes the text and returns a mapping with an
            "audio" entry (waveform samples) and a "sampling_rate" entry.
        output_dir: Directory the WAV file is written to. It is created on
            demand, and only when ``save`` is true.
        filename: Name of the WAV file inside ``output_dir``. When None, a
            name of the form ``tts_output_YYYYMMDD_HHMMSS.wav`` is generated.
        play_audio: When true, display an inline audio player for the result.
        save: When true, write the waveform to disk with ``torchaudio.save``.

    Returns:
        The path of the saved file when ``save`` is true, otherwise None.

    Raises:
        ValueError: If ``text`` is not a string or is empty/whitespace only.
    """
    # Reject None / non-string input with the same error as blank text rather
    # than failing with an AttributeError on ``.strip()``.
    if not isinstance(text, str) or not text.strip():
        raise ValueError("❌ Text cannot be empty.")

    print(f"\n🗣️ Synthesizing the following text:\n\"{text}\"")
    output = tts_model(text)

    # Convert to a float32 tensor laid out as (channels, samples)
    audio_tensor = torch.as_tensor(output["audio"], dtype=torch.float32)
    if audio_tensor.ndimension() == 3:
        audio_tensor = audio_tensor.squeeze(0)  # Remove batch dimension
    if audio_tensor.ndimension() == 1:
        audio_tensor = audio_tensor.unsqueeze(0)  # Make it mono
    sample_rate = output["sampling_rate"]

    # Only touch the filesystem when the caller asked for a file; a
    # play-only call must not leave an empty output directory behind.
    full_path = None
    if save:
        os.makedirs(output_dir, exist_ok=True)
        if filename is None:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"tts_output_{timestamp}.wav"
        full_path = os.path.join(output_dir, filename)
        torchaudio.save(full_path, audio_tensor, sample_rate)
        print(f"💾 Audio saved to: {full_path}")

    # Playback: from the saved file when there is one, else from memory.
    if play_audio:
        audio_source = full_path if save else audio_tensor.numpy()
        # ipd.display is used instead of the bare ``display`` name, which is
        # only injected as a builtin inside an IPython session.
        ipd.display(ipd.Audio(audio_source, rate=sample_rate))

    return full_path

In [21]:
# Example Usage: load one model, then synthesize, save, and play one sentence.
# Load model (the key must be an entry of LANG_MODELS; load_tts_model raises
# ValueError for anything else).
language = "english"  # Change to 'french', 'german', etc.
tts = load_tts_model(language)

# Input your text
text_input = "Hello! This is an improved text-to-speech system with better structure and more features."

# Generate and play audio. With the default arguments the clip is written to
# the "outputs" directory under a timestamped file name and shown as an inline
# player; audio_path receives the path of the saved file.
audio_path = synthesize_and_save_audio(text_input, tts_model=tts)

🔄 Loading TTS model for language: english (facebook/mms-tts-eng)


Device set to use cpu


✅ Model successfully loaded.

🗣️ Synthesizing the following text:
"Hello! This is an improved text-to-speech system with better structure and more features."
💾 Audio saved to: outputs\tts_output_20250724_002855.wav


In [22]:
# Batch TTS synthesis for multiple phrases
def synthesize_batch(texts, language="english"):
    """Synthesize every phrase in ``texts`` with one model for ``language``.

    The model is loaded once via ``load_tts_model`` and reused for all
    phrases. Each phrase is passed to ``synthesize_and_save_audio`` with the
    file name ``batch_<n>.wav``, where ``n`` counts from 1.

    Args:
        texts: Iterable of phrases to synthesize, in order.
        language: Language key forwarded to ``load_tts_model``.
    """
    tts_model = load_tts_model(language)
    for position, phrase in enumerate(texts, start=1):
        print(f"\n📄 Text #{position}")
        synthesize_and_save_audio(phrase, tts_model, filename=f"batch_{position}.wav")

In [23]:
# Sample phrases for the batch demo. synthesize_batch processes them in order
# and names the resulting files batch_1.wav, batch_2.wav, ... so re-running
# this cell overwrites the files from the previous run.
texts = [
    "Good morning!",
    "How are you today?",
    "This is an automated voice message.",
    "We support multiple languages now!"
]
# The model for the chosen language is loaded once inside synthesize_batch.
synthesize_batch(texts, language="english")

🔄 Loading TTS model for language: english (facebook/mms-tts-eng)


Device set to use cpu


✅ Model successfully loaded.

📄 Text #1

🗣️ Synthesizing the following text:
"Good morning!"
💾 Audio saved to: outputs\batch_1.wav



📄 Text #2

🗣️ Synthesizing the following text:
"How are you today?"
💾 Audio saved to: outputs\batch_2.wav



📄 Text #3

🗣️ Synthesizing the following text:
"This is an automated voice message."
💾 Audio saved to: outputs\batch_3.wav



📄 Text #4

🗣️ Synthesizing the following text:
"We support multiple languages now!"
💾 Audio saved to: outputs\batch_4.wav
