# TTS Model Test Notebook

This notebook tests the current TTS model implementation using XTTS v2 with voice cloning capabilities.

## Current Model Configuration
- **Model**: `tts_models/multilingual/multi-dataset/xtts_v2`
- **Voice Cloning**: Enabled with custom voice samples
- **Multi-language Support**: 17 languages supported by XTTS v2 (see the model card for the full list)
- **Emotion Control**: Basic emotion tags supported


In [1]:
# Import required libraries
import os
import sys
import logging
import numpy as np
import soundfile as sf
import IPython.display as ipd
from pathlib import Path
import time
import json

# Add the app directory to Python path.
# Guarded so that re-running this cell does not keep appending duplicate
# './app' entries to sys.path.
if './app' not in sys.path:
    sys.path.append('./app')

# Import our TTS service (expected to be importable via the './app' entry above)
from xtts_service_v2 import TTSService

# Setup logging: timestamped records at INFO level on the root logger, plus a
# notebook-scoped logger object for use by the cells below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


  from .autonotebook import tqdm as notebook_tqdm


## 1. Initialize TTS Service

Initialize the TTS service with the current XTTS v2 model configuration.


In [2]:
# Initialize TTS service
print("Initializing TTS Service...")
start_time = time.time()

# Only the constructor call lives inside the `try`, so that an error raised
# while *reporting* (a missing attribute, a config value json cannot encode)
# is not mislabelled as an initialization failure.
try:
    tts_service = TTSService(logger=logger)
except Exception as e:
    print(f"❌ Failed to initialize TTS Service: {e}")
    raise
else:
    init_time = time.time() - start_time
    print(f"✅ TTS Service initialized successfully in {init_time:.2f} seconds")
    print(f"Model: {tts_service.model_name}")
    print(f"XTTS Version: {tts_service.xtts_version}")
    # default=str: fall back to str() for any config value that is not
    # natively JSON-serializable instead of raising TypeError.
    print(f"Configuration: {json.dumps(tts_service.config, indent=2, default=str)}")


2025-11-18 13:32:27,048 - DEBUG - (logger = <Logger __main__ (DEBUG)>, model_name = 'tts_models/multilingual/multi-dataset/xtts_v2')
2025-11-18 13:32:27,049 - DEBUG - Loading config from config.yaml
2025-11-18 13:32:27,051 - DEBUG - (): Config = {'tau': 0.3, 'gpt_cond_len': 3, 'top_k': 3, 'top_p': 5, 'decoder_iterations': -1, 'use_deepspeed': True, 'max_audio_duration': 30.0, 'expected_wpm': 150, 'model_name': 'tts_models--multilingual--multi-dataset--tts_models--multilingual--multi-dataset--xtts_v2'}
2025-11-18 13:32:27,051 - DEBUG - (): Model name: tts_models/multilingual/multi-dataset/xtts_v2
2025-11-18 13:32:27,051 - DEBUG - (): XTTS version: 2
2025-11-18 13:32:27,051 - DEBUG - Initializing TTSService with model: tts_models/multilingual/multi-dataset/xtts_v2
2025-11-18 13:32:27,052 - DEBUG - Starting model initialization
2025-11-18 13:32:27,052 - INFO - Model already exists, using cached version
2025-11-18 13:32:27,054 - DEBUG - (use_deepspeed = False)


Initializing TTS Service...
TTS version: 0.26.2


2025-11-18 13:32:38,988 - DEBUG - Model moved to CUDA
2025-11-18 13:32:38,992 - DEBUG - Model initialized successfully
2025-11-18 13:32:38,994 - DEBUG - (voice_name = 'dsp', voice_path = 'data/voices/dsp/dsp_01.wav')
2025-11-18 13:32:38,994 - DEBUG - Loading single audio file: data/voices/dsp/dsp_01.wav
2025-11-18 13:32:38,995 - DEBUG - Loaded voice: Voice(name=dsp, file_paths=['data/voices/dsp/dsp_01.wav'], audios=(1, 124155))
2025-11-18 13:32:38,996 - DEBUG - Using batman_01.wav for working notebook compatibility
2025-11-18 13:32:38,996 - DEBUG - (voice_name = 'batman', voice_path = 'data/voices/batman/batman_01.wav')
2025-11-18 13:32:38,997 - DEBUG - Loading single audio file: data/voices/batman/batman_01.wav
2025-11-18 13:32:38,998 - DEBUG - Loaded voice: Voice(name=batman, file_paths=['data/voices/batman/batman_01.wav'], audios=(1, 169701))
2025-11-18 13:32:38,999 - DEBUG - (voice_name = 'biden', voice_path = 'data/voices/biden/biden_01.wav')
2025-11-18 13:32:38,999 - DEBUG - Load

✅ TTS Service initialized successfully in 11.97 seconds
Model: tts_models/multilingual/multi-dataset/xtts_v2
XTTS Version: 2
Configuration: {
  "tau": 0.3,
  "gpt_cond_len": 3,
  "top_k": 3,
  "top_p": 5,
  "decoder_iterations": -1,
  "use_deepspeed": true,
  "max_audio_duration": 30.0,
  "expected_wpm": 150,
  "model_name": "tts_models--multilingual--multi-dataset--tts_models--multilingual--multi-dataset--xtts_v2"
}


## 2. Check Available Voices

List all available voices loaded from the `data/voices` directory.


In [3]:
# Ask the service which voices it loaded, then describe each one:
# its 1-based position, the source files, and the shape of its audio array.
available_voices = tts_service.get_voices()
print(f"Available voices ({len(available_voices)}):")
for position, voice_name in enumerate(available_voices, start=1):
    loaded_voice = tts_service.voices[voice_name]
    print(f"  {position}. {voice_name}")
    print(f"     Files: {loaded_voice.file_paths}")
    print(f"     Audio shape: {loaded_voice.audios.shape}")
    print()


Available voices (7):
  1. dsp
     Files: ['data/voices/dsp/dsp_01.wav']
     Audio shape: (1, 124155)

  2. batman
     Files: ['data/voices/batman/batman_01.wav']
     Audio shape: (1, 169701)

  3. biden
     Files: ['data/voices/biden/biden_01.wav']
     Audio shape: (1, 191078)

  4. trump_cp
     Files: ['data/voices/trump_cp/1.wav']
     Audio shape: (1, 635874, 2)

  5. major
     Files: ['data/voices/major/gits.wav']
     Audio shape: (1, 875956)

  6. trump
     Files: ['data/voices/trump/trump_01.wav']
     Audio shape: (1, 730596)

  7. loli
     Files: ['data/voices/loli/loli_01.wav']
     Audio shape: (1, 79926)



## 3. Test Basic Speech Generation

Test basic text-to-speech generation with different voices.


In [4]:
# Sentence used for the single-voice smoke test
test_text = "Hello, this is a test of the text-to-speech system using voice cloning technology."

# Smoke test: synthesize the sentence with the first voice in the list,
# report timing / size, write a WAV file and embed an audio player.
if not available_voices:
    print("❌ No voices available for testing")
else:
    test_voice = available_voices[0]
    print(f"Testing speech generation with voice: {test_voice}")

    try:
        start_time = time.time()
        audio_data, sample_rate = tts_service.generate_speech(
            language="en",
            voice_name=test_voice,
            text=test_text,
        )
        generation_time = time.time() - start_time

        print(f"✅ Speech generated successfully in {generation_time:.2f} seconds")
        print(f"Audio length: {len(audio_data)} samples")
        print(f"Sample rate: {sample_rate} Hz")
        print(f"Duration: {len(audio_data) / sample_rate:.2f} seconds")

        # Persist the result next to the notebook
        output_file = f"test_output_{test_voice}.wav"
        sf.write(output_file, audio_data, sample_rate)
        print(f"Audio saved to: {output_file}")

        # Inline player for listening in the notebook
        display(ipd.Audio(audio_data, rate=sample_rate))

    except Exception as exc:
        # Best-effort cell: report the failure with a full traceback
        # instead of aborting the notebook run.
        print(f"❌ Speech generation failed: {exc}")
        import traceback
        traceback.print_exc()


2025-11-18 13:32:51,891 - DEBUG - Generating speech: 'Hello, this is a test of the text-to-speech system...'


Testing speech generation with voice: dsp
✅ Speech generated successfully in 1.55 seconds
Audio length: 116992 samples
Sample rate: 24000 Hz
Duration: 4.87 seconds
Audio saved to: test_output_dsp.wav


## 4. Test Multiple Voices

Generate speech with different voices to compare quality and characteristics.


In [5]:
# Synthesize one short sentence with several voices for side-by-side listening
test_text_short = "This is a voice comparison test."
voices_to_test = available_voices[:5]  # at most the first five voices

print(f"Testing {len(voices_to_test)} voices...\n")

for candidate_voice in voices_to_test:
    print(f"Testing voice: {candidate_voice}")
    try:
        start_time = time.time()
        audio_data, sample_rate = tts_service.generate_speech(
            language="en",
            voice_name=candidate_voice,
            text=test_text_short,
        )
        generation_time = time.time() - start_time

        print(f"  ✅ Generated in {generation_time:.2f}s, Duration: {len(audio_data) / sample_rate:.2f}s")

        # One WAV file per voice
        output_file = f"voice_test_{candidate_voice}.wav"
        sf.write(output_file, audio_data, sample_rate)

        # Label the player so the clips can be told apart in the output
        print(f"  Voice: {candidate_voice}")
        display(ipd.Audio(audio_data, rate=sample_rate))
        print()

    except Exception as exc:
        # A failing voice is reported and skipped; the loop continues
        print(f"  ❌ Failed: {exc}")
        print()


2025-11-18 13:32:56,908 - DEBUG - Generating speech: 'This is a voice comparison test....'


Testing 5 voices...

Testing voice: dsp
  ✅ Generated in 4.10s, Duration: 16.38s
  Voice: dsp


2025-11-18 13:33:01,024 - DEBUG - Generating speech: 'This is a voice comparison test....'



Testing voice: batman
  ✅ Generated in 2.35s, Duration: 9.00s
  Voice: batman


2025-11-18 13:33:03,391 - DEBUG - Generating speech: 'This is a voice comparison test....'



Testing voice: biden
  ✅ Generated in 1.93s, Duration: 7.10s
  Voice: biden


2025-11-18 13:33:05,339 - DEBUG - Generating speech: 'This is a voice comparison test....'



Testing voice: trump_cp
  ✅ Generated in 1.27s, Duration: 4.54s
  Voice: trump_cp


2025-11-18 13:33:06,618 - DEBUG - Generating speech: 'This is a voice comparison test....'



Testing voice: major
  ✅ Generated in 2.29s, Duration: 8.26s
  Voice: major





## 5. Test Multi-language Support

Test the model's multi-language capabilities with different languages.


In [None]:
# (language code, sentence written in that language) pairs to synthesize
language_tests = [
    ("en", "Hello, this is a test in English."),
    ("es", "Hola, esta es una prueba en español."),
    ("fr", "Bonjour, ceci est un test en français."),
    ("de", "Hallo, das ist ein Test auf Deutsch."),
    ("it", "Ciao, questo è un test in italiano.")
]

if available_voices:
    test_voice = available_voices[0]  # Use first available voice
    print(f"Testing multi-language support with voice: {test_voice}\n")

    # Same voice for every language so only the language varies
    for language, sentence in language_tests:
        print(f"Testing language: {language}")
        print(f"Text: {sentence}")

        try:
            start_time = time.time()
            audio_data, sample_rate = tts_service.generate_speech(
                language=language,
                voice_name=test_voice,
                text=sentence,
            )
            generation_time = time.time() - start_time

            print(f"  ✅ Generated in {generation_time:.2f}s")

            # File name encodes both the language and the voice
            output_file = f"lang_test_{language}_{test_voice}.wav"
            sf.write(output_file, audio_data, sample_rate)

            display(ipd.Audio(audio_data, rate=sample_rate))
            print()

        except Exception as exc:
            # Report and move on to the next language
            print(f"  ❌ Failed: {exc}")
            print()


## 6. Test Parameter Variations

Test different generation parameters to understand their effects.


In [None]:
# Sentence synthesized under every parameter combination
test_text = "This is a parameter variation test."

if available_voices:
    test_voice = available_voices[0]

    # Each case carries a display name and the extra keyword arguments that
    # are forwarded to generate_speech; "Default" forwards none.
    parameter_tests = [
        {"name": label, "params": overrides}
        for label, overrides in (
            ("Default", {}),
            ("High Temperature", {"tau": 0.9}),
            ("Low Temperature", {"tau": 0.3}),
            ("High GPT Cond Len", {"gpt_cond_len": 6}),
            ("Low GPT Cond Len", {"gpt_cond_len": 1}),
            ("High Top-K", {"top_k": 10}),
            ("Low Top-K", {"top_k": 1}),
        )
    ]

    print(f"Testing parameter variations with voice: {test_voice}\n")

    for case in parameter_tests:
        print(f"Testing: {case['name']}")
        print(f"Parameters: {case['params']}")

        try:
            start_time = time.time()
            audio_data, sample_rate = tts_service.generate_speech(
                text=test_text,
                voice_name=test_voice,
                language="en",
                **case['params']
            )
            generation_time = time.time() - start_time

            print(f"  ✅ Generated in {generation_time:.2f}s")

            # File name: case name lower-cased with spaces turned into underscores
            slug = case['name'].replace(' ', '_').lower()
            output_file = f"param_test_{slug}.wav"
            sf.write(output_file, audio_data, sample_rate)

            display(ipd.Audio(audio_data, rate=sample_rate))
            print()

        except Exception as exc:
            # Report and continue with the next parameter set
            print(f"  ❌ Failed: {exc}")
            print()


## 7. Test Emotion Tags

Test emotion control by prefixing the text with a parenthesized emotion tag such as `(happy)`; the untagged sentence serves as the neutral baseline.


In [None]:
# Sentence that every emotion variant is built from
base_text = "This is an emotional speech test."

# (label, text) pairs: the untagged sentence first, then the same sentence
# prefixed with "(<emotion>) " for each tag.
emotion_tests = [("neutral", base_text)] + [
    (tag, f"({tag}) {base_text}")
    for tag in ("happy", "sad", "angry", "excited", "whisper")
]

if available_voices:
    test_voice = available_voices[0]
    print(f"Testing emotion tags with voice: {test_voice}\n")

    for label, tagged_text in emotion_tests:
        print(f"Testing emotion: {label}")
        print(f"Text: {tagged_text}")

        try:
            start_time = time.time()
            audio_data, sample_rate = tts_service.generate_speech(
                language="en",
                voice_name=test_voice,
                text=tagged_text,
            )
            generation_time = time.time() - start_time

            print(f"  ✅ Generated in {generation_time:.2f}s")

            # One WAV file per emotion label
            output_file = f"emotion_test_{label}.wav"
            sf.write(output_file, audio_data, sample_rate)

            display(ipd.Audio(audio_data, rate=sample_rate))
            print()

        except Exception as exc:
            # Report and continue with the next emotion
            print(f"  ❌ Failed: {exc}")
            print()


## 8. Performance Benchmarks

Benchmark the model performance with different text lengths and complexity.


In [None]:
# Performance benchmarks: (category label, text) pairs of increasing length / complexity
benchmark_texts = [
    ("Short", "Hello world."),
    ("Medium", "This is a medium length sentence for testing speech generation performance."),
    ("Long", "This is a much longer text that contains multiple sentences and should test the model's ability to handle extended speech generation. It includes various words and punctuation marks to simulate real-world usage scenarios."),
    ("Complex", "The quick brown fox jumps over the lazy dog. This pangram contains every letter of the alphabet and is often used for testing. Numbers like 1, 2, 3, and symbols like @, #, $ can also be challenging for text-to-speech systems.")
]

if available_voices:
    test_voice = available_voices[0]
    print(f"Performance benchmarks with voice: {test_voice}\n")

    # One metrics dict per successfully generated text
    results = []

    for category, text in benchmark_texts:
        print(f"Testing {category} text ({len(text)} characters):")
        print(f"Text: {text[:100]}{'...' if len(text) > 100 else ''}")

        try:
            # perf_counter() is a monotonic, high-resolution clock meant for
            # measuring elapsed time; time.time() can jump if the system
            # clock is adjusted.
            start_time = time.perf_counter()
            audio_data, sample_rate = tts_service.generate_speech(
                text=text,
                voice_name=test_voice,
                language="en"
            )
            generation_time = time.perf_counter() - start_time
            audio_duration = len(audio_data) / sample_rate

            # Calculate metrics
            chars_per_second = len(text) / generation_time
            # Seconds of audio produced per second of wall-clock time
            # (higher is faster). Note this is audio_duration / generation_time,
            # i.e. the inverse of the ratio some tools report as "RTF".
            real_time_factor = audio_duration / generation_time

            result = {
                'category': category,
                'text_length': len(text),
                'generation_time': generation_time,
                'audio_duration': audio_duration,
                'chars_per_second': chars_per_second,
                'real_time_factor': real_time_factor
            }
            results.append(result)

            print(f"  ✅ Generated in {generation_time:.2f}s")
            print(f"  Audio duration: {audio_duration:.2f}s")
            print(f"  Speed: {chars_per_second:.1f} chars/sec")
            print(f"  Real-time factor: {real_time_factor:.2f}x")

            # Save audio
            output_file = f"benchmark_{category.lower()}.wav"
            sf.write(output_file, audio_data, sample_rate)
            print()

        except Exception as e:
            # A failing text is reported and left out of the summary
            print(f"  ❌ Failed: {e}")
            print()

    # Summary
    if results:
        print("\n📊 Performance Summary:")
        # Header and rows share the same fixed column widths so the table
        # lines up regardless of how the front end renders tab stops.
        header = (
            f"{'Category':<10}{'Chars':>8}{'Gen Time':>11}"
            f"{'Audio Dur':>12}{'Chars/sec':>12}{'RT Factor':>12}"
        )
        print(header)
        print("-" * len(header))
        for r in results:
            print(
                f"{r['category']:<10}{r['text_length']:>8}"
                f"{r['generation_time']:>11.2f}{r['audio_duration']:>12.2f}"
                f"{r['chars_per_second']:>12.1f}{r['real_time_factor']:>12.2f}"
            )

        avg_chars_per_sec = sum(r['chars_per_second'] for r in results) / len(results)
        avg_rt_factor = sum(r['real_time_factor'] for r in results) / len(results)
        print(f"\nAverage: {avg_chars_per_sec:.1f} chars/sec, {avg_rt_factor:.2f}x real-time")


## 9. Model Information Summary

Display comprehensive information about the current model setup.


In [None]:
# Print a consolidated report: model identity, CUDA status, configuration,
# per-voice statistics and (when present) the loaded model object.
print("🔍 TTS Model Information Summary")
print("=" * 50)

print(f"Model Name: {tts_service.model_name}")
print(f"XTTS Version: {tts_service.xtts_version}")
print(f"Available Voices: {len(available_voices)}")

# GPU status as reported by torch
import torch
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA Device: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1024**3 to display it in GB
    print(f"CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# Every key/value pair of the service configuration
print("\n📋 Configuration:")
for option, setting in tts_service.config.items():
    print(f"  {option}: {setting}")

# Per-voice statistics
print("\n🎤 Voice Details:")
for voice_name in available_voices:
    voice_entry = tts_service.voices[voice_name]
    print(f"  {voice_name}:")
    print(f"    Files: {len(voice_entry.file_paths)}")
    print(f"    Audio samples: {voice_entry.audios.shape[0]}")
    if voice_entry.audios.shape[0] > 0:
        # Mean length along the first axis of each stored audio entry
        avg_length = np.mean(list(map(len, voice_entry.audios)))
        # NOTE(review): 22050 is a hard-coded sample rate used to convert
        # samples to seconds — confirm it matches the rate the voice
        # references were actually loaded at.
        print(f"    Avg sample length: {avg_length:.0f} samples ({avg_length/22050:.2f}s)")

# Model object details, only when the service exposes a non-None `model`
loaded_model = getattr(tts_service, 'model', None)
if loaded_model is not None:
    print("\n🏗️ Model Architecture:")
    print(f"  Model type: {type(loaded_model).__name__}")
    if hasattr(loaded_model, 'device'):
        print(f"  Device: {loaded_model.device}")

print("\n✅ Model test completed successfully!")


## 10. Cleanup

Clean up generated test files (optional).


In [None]:
# Optional: Clean up generated test files
import glob

# Interactive confirmation: only an answer of "y" (case-insensitive,
# surrounding whitespace ignored) triggers deletion.
cleanup = input("Do you want to clean up generated test files? (y/n): ").lower().strip() == 'y'

if cleanup:
    # The patterns overlap ("*test*.wav" also matches e.g. "voice_test_*.wav"),
    # so collect the matches in a set: each file is listed, counted and removed
    # exactly once instead of failing on a second os.remove of the same path.
    cleanup_patterns = [
        "*test*.wav",
        "benchmark*.wav",
        "emotion*.wav",
        "lang_test*.wav",
        "param_test*.wav",
        "voice_test*.wav",
    ]
    test_files = sorted({
        path
        for pattern in cleanup_patterns
        for path in glob.glob(pattern)
    })

    if test_files:
        print(f"Cleaning up {len(test_files)} test files...")
        for file in test_files:
            try:
                os.remove(file)
                print(f"  Removed: {file}")
            except OSError as e:
                # Best effort: report the file that could not be removed and continue
                print(f"  Failed to remove {file}: {e}")
        print("Cleanup completed.")
    else:
        print("No test files found to clean up.")
else:
    print("Test files preserved for manual review.")
