# TTS Comparison Test: Live API vs Regular TTS API

This notebook compares latency and quality between:
- Live API (WebSocket-based, gemini-live-2.5-flash-preview-native-audio)
- Regular TTS API (REST-based, gemini-2.5-flash-tts)

Testing with 3 Indonesian-language prompts, loaded from `test.txt` (one prompt per non-empty line).

## Setup and Configuration

In [None]:
# Install required packages
# websockets supplies the client used for the Live API connection;
# google-cloud-texttospeech supplies the client used for the regular TTS API.
%pip install --upgrade --quiet websockets google-cloud-texttospeech

In [None]:
# Import libraries
import base64
import json
import time
from datetime import datetime
from IPython.display import Audio, Markdown, display
import numpy as np
from websockets.asyncio.client import connect
from google.api_core.client_options import ClientOptions
from google.cloud import texttospeech_v1beta1 as texttospeech

In [None]:
# Configuration
# Google Cloud project and the locations used by the two APIs under test.
PROJECT_ID = "my-project-0004-346516"
LOCATION = "us-central1"
TTS_LOCATION = "global"

# Live API configuration
# Derive the host from LOCATION so the WebSocket endpoint and the model
# resource path below always refer to the same location.
HOST = (
    f"{LOCATION}-aiplatform.googleapis.com"
    if LOCATION != "global"
    else "aiplatform.googleapis.com"
)
SERVICE_URL = f"wss://{HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent"
LIVE_MODEL_ID = "gemini-live-2.5-flash-preview-native-audio"
LIVE_MODEL = f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{LIVE_MODEL_ID}"

# Regular TTS configuration
TTS_MODEL = "gemini-2.5-flash-tts"
VOICE = "Aoede"
LANGUAGE_CODE = "id-ID"

# The "global" location uses the un-prefixed endpoint; any other location
# is prefixed onto the host name.
API_ENDPOINT = (
    f"{TTS_LOCATION}-texttospeech.googleapis.com"
    if TTS_LOCATION != "global"
    else "texttospeech.googleapis.com"
)

# Shared client reused by every regular TTS request in this notebook.
tts_client = texttospeech.TextToSpeechClient(
    client_options=ClientOptions(api_endpoint=API_ENDPOINT)
)

In [None]:
# Get access token for Live API
# The IPython `!` syntax runs the gcloud command in a shell and captures its
# output lines; test_live_api reads the token from bearer_token[0].
bearer_token = !gcloud auth application-default print-access-token
# NOTE(review): this message is printed unconditionally; nothing here checks
# that the gcloud command actually succeeded.
print("Access token obtained")

## Load Test Prompts

In [None]:
# Read the prompts: one per non-blank line of test.txt, surrounding
# whitespace removed.
with open('test.txt', 'r', encoding='utf-8') as prompt_file:
    trimmed_lines = (raw_line.strip() for raw_line in prompt_file)
    test_prompts = [text for text in trimmed_lines if text]

print(f"Loaded {len(test_prompts)} test prompts\n")

# Echo each prompt, shortening anything longer than 100 characters.
for number, prompt_text in enumerate(test_prompts, start=1):
    if len(prompt_text) > 100:
        print(f"Prompt {number}: {prompt_text[:100]}...")
    else:
        print(f"Prompt {number}: {prompt_text}")
    print()

## Test 1: Live API (WebSocket)

Testing all 3 prompts using the Live API with timing measurements

In [None]:
# Live API Test Function
async def test_live_api(text_input, prompt_num):
    """Send one prompt through the Live API WebSocket and time the response.

    Opens a WebSocket to SERVICE_URL (authenticated with the module-level
    ``bearer_token``), configures a session that returns audio, sends
    ``text_input`` as a single user turn, and collects the streamed audio
    until the server marks the turn complete. Timings, the audio player and
    any output transcription are rendered in the notebook.

    Args:
        text_input: The prompt text to send.
        prompt_num: Number of the prompt, used for display and in the result.

    Returns:
        A dict with ``prompt_num``, ``method`` ("Live API"),
        ``total_latency`` (seconds from function entry, including connection
        and session setup, until the stream ends), ``time_to_first_chunk``
        (seconds from sending the prompt to receiving the first message that
        carries audio, or ``None`` if no audio arrived), ``start_time`` and
        ``end_time`` (formatted local timestamps).
    """
    # perf_counter is monotonic, so durations are immune to wall-clock jumps;
    # the human-readable timestamps still come from datetime.now().
    start_time = time.perf_counter()
    start_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")

    # Only add an ellipsis when the prompt was actually truncated.
    input_preview = (
        text_input if len(text_input) <= 200 else f"{text_input[:200]}..."
    )

    display(Markdown(f"### Live API - Prompt {prompt_num}"))
    display(Markdown(f"**Start Time:** {start_timestamp}"))
    display(Markdown(f"**Input:** {input_preview}"))

    GENERATION_CONFIG = {
        "response_modalities": ["AUDIO"],
    }

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {bearer_token[0]}",
    }

    responses = []
    output_transcriptions = []
    first_chunk_time = None

    async with connect(SERVICE_URL, additional_headers=headers) as ws:
        # Setup the session
        await ws.send(
            json.dumps(
                {
                    "setup": {
                        "model": LIVE_MODEL,
                        "generation_config": GENERATION_CONFIG,
                        "input_audio_transcription": {},
                        "output_audio_transcription": {},
                    }
                }
            )
        )

        # Consume the setup response before sending content; parsing it makes
        # a malformed handshake reply fail here rather than later.
        json.loads(await ws.recv(decode=False))

        # Send text message
        msg = {
            "client_content": {
                "turns": [{"role": "user", "parts": [{"text": text_input}]}],
                "turn_complete": True,
            }
        }

        request_sent_time = time.perf_counter()
        await ws.send(json.dumps(msg))

        # Receive chunks of server response
        async for raw_response in ws:
            received_at = time.perf_counter()

            # json.loads accepts both str (text frame) and bytes (binary
            # frame), so no explicit decode is needed.
            response = json.loads(raw_response)
            server_content = response.get("serverContent")
            if server_content is None:
                break

            output_transcription = server_content.get("outputTranscription")
            if output_transcription is not None:
                text = output_transcription.get("text")
                if text is not None:
                    output_transcriptions.append(text)

            model_turn = server_content.get("modelTurn") or {}
            for part in model_turn.get("parts") or []:
                # Skip parts that carry no inline audio payload.
                inline_data = part.get("inlineData")
                if not inline_data or not inline_data.get("data"):
                    continue
                # Stamp the first message that actually contains audio, so the
                # metric is not skewed by transcription-only messages.
                if first_chunk_time is None:
                    first_chunk_time = received_at
                pcm_data = base64.b64decode(inline_data["data"])
                responses.append(np.frombuffer(pcm_data, dtype=np.int16))

            # End of turn
            if server_content.get("turnComplete"):
                break

    end_time = time.perf_counter()
    end_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")

    total_latency = end_time - start_time
    time_to_first_chunk = (
        first_chunk_time - request_sent_time
        if first_chunk_time is not None
        else None
    )

    display(Markdown(f"**End Time:** {end_timestamp}"))
    display(Markdown(f"**Total Latency:** {total_latency:.3f} seconds"))
    # Compare against None so a legitimate 0.0 would still be shown.
    if time_to_first_chunk is not None:
        display(Markdown(f"**Time to First Chunk:** {time_to_first_chunk:.3f} seconds"))

    if responses:
        # Samples were decoded as int16 and are played back at 24000 Hz.
        display(Audio(np.concatenate(responses), rate=24000, autoplay=False))

    if output_transcriptions:
        display(Markdown(f"**Output transcription:** {''.join(output_transcriptions)}"))

    return {
        'prompt_num': prompt_num,
        'method': 'Live API',
        'total_latency': total_latency,
        'time_to_first_chunk': time_to_first_chunk,
        'start_time': start_timestamp,
        'end_time': end_timestamp
    }

In [None]:
# Run Live API tests
live_api_results = []

for i, prompt in enumerate(test_prompts, 1):
    result = await test_live_api(prompt, i)
    live_api_results.append(result)
    display(Markdown("---"))

## Test 2: Regular TTS API (REST)

Testing all 3 prompts using the regular TTS API with timing measurements

In [None]:
# Regular TTS API Test Function
def test_regular_tts(text_input, prompt_num):
    """Synthesize one prompt with the regular TTS API and time the call.

    Sends ``text_input`` to the module-level ``tts_client`` using the
    configured VOICE, LANGUAGE_CODE and TTS_MODEL, requests MP3 output, and
    renders the timings and an audio player in the notebook.

    Args:
        text_input: The text to synthesize.
        prompt_num: Number of the prompt, used for display and in the result.

    Returns:
        A dict with ``prompt_num``, ``method`` ("Regular TTS"),
        ``total_latency`` (seconds from function entry until the synthesis
        call returns), ``start_time`` and ``end_time`` (formatted local
        timestamps).
    """
    # perf_counter is monotonic, so the duration is immune to wall-clock
    # jumps; the human-readable timestamps still come from datetime.now().
    start_time = time.perf_counter()
    start_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")

    # Only add an ellipsis when the prompt was actually truncated.
    input_preview = (
        text_input if len(text_input) <= 200 else f"{text_input[:200]}..."
    )

    display(Markdown(f"### Regular TTS API - Prompt {prompt_num}"))
    display(Markdown(f"**Start Time:** {start_timestamp}"))
    display(Markdown(f"**Input:** {input_preview}"))

    voice = texttospeech.VoiceSelectionParams(
        name=VOICE, language_code=LANGUAGE_CODE, model_name=TTS_MODEL
    )

    # Perform the text-to-speech request
    response = tts_client.synthesize_speech(
        input=texttospeech.SynthesisInput(text=text_input),
        voice=voice,
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3
        ),
    )

    end_time = time.perf_counter()
    end_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")

    total_latency = end_time - start_time

    display(Markdown(f"**End Time:** {end_timestamp}"))
    display(Markdown(f"**Total Latency:** {total_latency:.3f} seconds"))

    # Play the generated audio
    display(Audio(response.audio_content, autoplay=False))

    return {
        'prompt_num': prompt_num,
        'method': 'Regular TTS',
        'total_latency': total_latency,
        'start_time': start_timestamp,
        'end_time': end_timestamp
    }

In [None]:
# Execute the regular TTS test once per prompt, in order, keeping every result.
regular_tts_results = []

for prompt_num, prompt_text in enumerate(test_prompts, start=1):
    regular_tts_results.append(test_regular_tts(prompt_text, prompt_num))
    # Horizontal rule separating one prompt's output from the next.
    display(Markdown("---"))

## Latency Comparison Results

In [None]:
# Display comparison table
display(Markdown("## Summary Comparison"))
display(Markdown("\n### Latency Results\n"))

table_rows = [
    "| Prompt | Live API (s) | Regular TTS (s) | Difference (s) | Faster Method |",
    "|--------|--------------|-----------------|----------------|---------------|",
]

# Pair the results positionally. zip stops at the shorter list, so a
# partially completed run produces a shorter table instead of an IndexError.
for live_result, regular_result in zip(live_api_results, regular_tts_results):
    live_latency = live_result['total_latency']
    regular_latency = regular_result['total_latency']
    diff = abs(live_latency - regular_latency)
    faster = "Live API" if live_latency < regular_latency else "Regular TTS"

    table_rows.append(
        f"| {live_result['prompt_num']} | {live_latency:.3f} | {regular_latency:.3f} | {diff:.3f} | {faster} |"
    )

# Add average row (skipped when either run produced no results, which would
# otherwise divide by zero).
if live_api_results and regular_tts_results:
    avg_live = sum(r['total_latency'] for r in live_api_results) / len(live_api_results)
    avg_regular = sum(r['total_latency'] for r in regular_tts_results) / len(regular_tts_results)
    avg_diff = abs(avg_live - avg_regular)
    avg_faster = "Live API" if avg_live < avg_regular else "Regular TTS"

    table_rows.append(
        f"| **Average** | **{avg_live:.3f}** | **{avg_regular:.3f}** | **{avg_diff:.3f}** | **{avg_faster}** |"
    )

table = "\n".join(table_rows) + "\n"
display(Markdown(table))

# Display time to first chunk for Live API
display(Markdown("\n### Live API - Time to First Chunk\n"))
ttfc_rows = [
    "| Prompt | Time to First Chunk (s) |",
    "|--------|-------------------------|",
]

ttfc_values = []
for result in live_api_results:
    # The key is always present but holds None when no chunk was received,
    # so test the value itself rather than relying on a .get() default.
    ttfc = result.get('time_to_first_chunk')
    if ttfc is None:
        ttfc_str = "N/A"
    else:
        ttfc_str = f"{ttfc:.3f}"
        ttfc_values.append(ttfc)
    ttfc_rows.append(f"| {result['prompt_num']} | {ttfc_str} |")

# Only report an average when every prompt produced a measurement.
if ttfc_values and len(ttfc_values) == len(live_api_results):
    avg_ttfc = sum(ttfc_values) / len(ttfc_values)
    ttfc_rows.append(f"| **Average** | **{avg_ttfc:.3f}** |")

ttfc_table = "\n".join(ttfc_rows) + "\n"
display(Markdown(ttfc_table))

## Conclusions

Key metrics to consider:

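1. **Total Latency**: Time from the start of each test call until the complete audio has been received (for the Live API this includes opening the WebSocket connection and session setup)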
2. **Time to First Chunk** (Live API only): How quickly the first audio chunk arrives
3. **Voice Quality**: Subjective evaluation (listen to the audio samples above)

The Live API's streaming capability means it can start playing audio before the entire response is generated, which can feel faster to end users even if total latency is similar.