In [2]:
import torch
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsAudioConfig, XttsArgs
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.config import BaseAudioConfig
from TTS.api import TTS
import os
from langdetect import detect

# Register the XTTS configuration classes as safe globals with torch's
# serialization module. This runs before the model is constructed below;
# presumably it is what lets the XTTS checkpoint be unpickled under torch's
# restricted (weights-only) loading -- confirm against the installed torch.
_xtts_safe_classes = [
    XttsConfig, XttsAudioConfig, XttsArgs, BaseDatasetConfig, BaseAudioConfig,
]
torch.serialization.add_safe_globals(_xtts_safe_classes)

# Build the XTTS v2 synthesizer on CPU (gpu=False), announcing progress on
# stdout before and after construction.
print("Loading XTTS model...")
tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    gpu=False,
)
print("Model loaded successfully!")

# Make sure the folder that receives the generated audio exists
# (no error if it is already there).
os.makedirs("XTTS_outputs", exist_ok=True)

# Speaker names to choose from, grouped by voice type.
female_speakers = ["Ana Florence"]
male_speakers = [
    "Marcos Rudaski",
    "Luis Moray",
]

# Text to synthesize
text = "Online resources offer a variety of English texts suitable for different proficiency levels."
# text = "Je m’appelle Jessica. Je suis une fille, je suis française et j’ai treize ans."

# Language codes that may be passed on to XTTS, keyed by the code langdetect
# reports. Defined outside the try block so it exists even when detection
# fails. 'hi' (Hindi) is included because XTTS v2 lists it as supported.
lang_map = {
    'en': 'en', 'es': 'es', 'fr': 'fr', 'de': 'de', 'it': 'it',
    'pt': 'pt', 'pl': 'pl', 'tr': 'tr', 'ru': 'ru', 'nl': 'nl',
    'cs': 'cs', 'ar': 'ar', 'zh-cn': 'zh-cn', 'ja': 'ja', 'hu': 'hu',
    'ko': 'ko', 'hi': 'hi',
}

# Auto-detect language. Detection is best-effort: any failure falls back to
# English. `except Exception` (rather than a bare `except:`) keeps Ctrl-C and
# SystemExit working, and only the call that can actually fail is guarded.
try:
    lang_code = detect(text)
except Exception as exc:
    language = "en"
    print(f"Could not detect language ({exc}), using English as default")
else:
    if lang_code in lang_map:
        language = lang_map[lang_code]
        print(f"Detected language: {language}")
    else:
        # Do not claim English was detected when it is only the fallback.
        language = "en"
        print(
            f"Detected language '{lang_code}' is not supported by XTTS, "
            "using English as default"
        )


# Voice selection: the first entry of each speaker list is that voice type's
# default, and chosen_SPEAKER is the one actually used for synthesis.
MaleSpeaker = male_speakers[0]  # for male voice
FemaleSpeaker = female_speakers[0]  # for female voice
chosen_SPEAKER = FemaleSpeaker  # Change to MaleSpeaker for a male voice

# Generate speech and write it to a WAV file in the output directory.
# The message reports chosen_SPEAKER so it stays correct when the choice
# above is changed (it previously always printed FemaleSpeaker).
print(f"\nGenerating speech with {chosen_SPEAKER}...")
tts.tts_to_file(
    text=text,
    file_path="XTTS_outputs/mohamed_test.wav",
    speaker=chosen_SPEAKER,
    language=language,
    split_sentences=True,
    # speaker_wav = ['My voice simple.wav']
)


Loading XTTS model...
 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts
Model loaded successfully!
Detected language: en

Generating speech with Ana Florence...
 > Text splitted to sentences.
['Online resources offer a variety of English texts suitable for different proficiency levels.']
 > Processing time: 13.579684257507324
 > Real-time factor: 2.0996861177355863


'XTTS_outputs/mohamed_test.wav'