# Echolancer Demo Notebook

This notebook demonstrates how to use the Echolancer model for text-to-speech synthesis.

In [None]:
# Import required libraries
import sys
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
import json

# Add project root to path.
# NOTE: abspath('.') resolves against the kernel's current working directory,
# so this entry only points at the project root when the notebook is started
# from that directory. The `model` / `utils` imports below are presumably
# found through this entry — confirm if they are installed as packages instead.
project_root = os.path.abspath('.')
sys.path.insert(0, project_root)

# Import Echolancer modules
from model import Echolancer
from utils import get_model, load_checkpoint, get_param_num
from utils.metrics import compute_mae
from utils.visualization import plot_spectrogram, plot_attention_map

## 1. Load the Model

In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# Model hyperparameters (should match the training configuration).
# The whole mapping is later unpacked into get_model(**config).
config = dict(
    # Vocabulary and channel sizes
    vocab_size=100,
    mel_channels=80,
    emotion_channels=256,
    speaker_channels=32,
    # Speaker settings
    multi_speaker=False,
    n_speaker=1,
    # Encoder
    encoder_hidden=256,
    encoder_head=4,
    encoder_layer=4,
    # Decoder
    decoder_hidden=256,
    decoder_layer=4,
    decoder_head=4,
    # Dropout
    encoder_dropout=0.1,
    decoder_dropout=0.1,
    # ALiBi
    use_alibi=True,
    alibi_alpha=1.0,
    # Activation
    activation='relu',
)

print("Creating Echolancer model...")

In [None]:
# Build the model from the configuration and place it on the selected device
model = get_model(**config).to(device)

print("Model created successfully")
print("Number of parameters: {:.2f}M".format(get_param_num(model) / 1e6))

## 2. Simple Inference

In [None]:
def tokenize_text_demo(text, vocab_size=100):
    """Map text to a list of integer token IDs (demonstration only).

    The text is lower-cased and scanned character by character:

    * an alphanumeric character maps to ``ord(char) % (vocab_size - 10)``
    * a space maps to ``vocab_size - 1``
    * every other character (punctuation, newlines, ...) is dropped

    The Unicode code point is used instead of the built-in ``hash()`` because
    string hashes are salted per interpreter process, which made the token IDs
    differ after every kernel restart.

    Args:
        text: Input string to tokenize.
        vocab_size: Size of the token ID space; must be greater than 10.

    Returns:
        List of token IDs in ``[0, vocab_size)``, or ``[0]`` when no character
        of ``text`` produces a token.

    Raises:
        ValueError: If ``vocab_size`` is not greater than 10 (the modulus used
            for alphanumeric characters would be zero or negative).
    """
    if vocab_size <= 10:
        raise ValueError(f"vocab_size must be greater than 10, got {vocab_size}")

    alnum_buckets = vocab_size - 10
    space_id = vocab_size - 1

    tokens = []
    for char in text.lower():
        if char.isalnum():
            # Deterministic across runs, unlike hash(char)
            tokens.append(ord(char) % alnum_buckets)
        elif char == ' ':
            tokens.append(space_id)
    return tokens if tokens else [0]

In [None]:
@torch.no_grad()
def simple_inference(model, text, speaker_id=0, emotion="neutral", max_length=100):
    """Run ``model.infer`` on a single text prompt with gradients disabled.

    The text is tokenized with ``tokenize_text_demo`` using
    ``config['vocab_size']``, wrapped into batch-of-one tensors on the
    notebook-level ``device``, and passed to ``model.infer`` together with a
    random ``(1, 768)`` emotion embedding. The model is switched to eval mode
    before the call. Progress is reported with ``print``.

    Args:
        model: Object providing ``eval()`` and ``infer(...)``.
        text: Prompt to tokenize.
        speaker_id: Value placed in the one-element speaker tensor.
        emotion: Not read by this function; the emotion embedding is random.
        max_length: Forwarded to ``model.infer`` as ``max_length``.

    Returns:
        The first element of the ``model.infer`` result, moved to the CPU and
        converted to a NumPy array.
    """
    # Tokenize and echo the prompt (at most the first 20 IDs are shown)
    token_ids = tokenize_text_demo(text, config['vocab_size'])
    preview_suffix = '...' if len(token_ids) > 20 else ''
    print(f"Input text: '{text}'")
    print(f"Token IDs: {token_ids[:20]}{preview_suffix}")

    # Batch-of-one integer inputs on the target device
    long_on_device = dict(dtype=torch.long, device=device)
    text_batch = torch.tensor([token_ids], **long_on_device)
    length_batch = torch.tensor([len(token_ids)], **long_on_device)
    speaker_batch = torch.tensor([speaker_id], **long_on_device)

    # Random stand-in for an emotion embedding (demo purposes)
    emotion_embedding = torch.randn(1, 768, device=device)

    model.eval()
    generated = model.infer(
        speaker_batch,
        text_batch,
        length_batch,
        em_hidden=emotion_embedding,
        max_length=max_length,
    )

    first_sequence = generated[0]
    print(f"Generated {len(first_sequence)} tokens")

    return first_sequence.cpu().numpy()

In [None]:
# Test inference with sample text.
# The returned array is kept in the notebook-level `tokens` variable, which the
# visualization cell further down reads.
# NOTE(review): no checkpoint is loaded in the cells above, so unless get_model
# restores weights itself this output comes from untrained parameters — confirm.
sample_text = "Hello, welcome to Echolancer!"
tokens = simple_inference(model, sample_text)

## 3. Model Information

In [None]:
def print_model_info(model):
    """Print a parameter and architecture summary of ``model`` to stdout.

    Reads ``model.encoder`` and ``model.decoder`` (their ``d_model``,
    ``num_heads`` and layer containers), ``model.emotion_channels`` and
    ``model.multi_speaker``. Parameter counts come from ``get_param_num`` and
    are shown in millions with two decimals.
    """
    def millions(module):
        # Parameter count of `module`, scaled to millions
        return get_param_num(module) / 1e6

    print("=== Model Information ===")
    print(f"Total parameters: {millions(model):.2f}M")
    print()

    print("=== Component Breakdown ===")
    encoder = model.encoder
    print(f"Encoder parameters: {millions(encoder):.2f}M")
    decoder = model.decoder
    print(f"Decoder parameters: {millions(decoder):.2f}M")
    print()

    print("=== Model Architecture ===")
    print(f"Encoder hidden size: {encoder.d_model}")
    print(f"Encoder heads: {encoder.num_heads}")
    encoder_layers = encoder.encoder.layers
    print(f"Encoder layers: {len(encoder_layers)}")
    print(f"Decoder hidden size: {decoder.d_model}")
    print(f"Decoder heads: {decoder.num_heads}")
    print(f"Decoder layers: {len(decoder.dec.layers)}")
    print()

    print("=== Features ===")
    # The flag is read from the first encoder layer's self-attention module;
    # a missing attribute is reported as False.
    first_self_attn = encoder_layers[0].self_attn
    alibi_enabled = getattr(first_self_attn, 'use_alibi', False)
    print(f"ALiBi support: {alibi_enabled}")
    print(f"Emotion conditioning: {model.emotion_channels > 0}")
    print(f"Multi-speaker support: {model.multi_speaker}")

In [None]:
# Print the parameter counts and architecture summary for the model created above
print_model_info(model)

## 4. Visualization Demo

In [None]:
# Create a simple visualization
def visualize_generated_tokens(tokens, max_display=50):
    """Plot the leading token IDs and print their mean and standard deviation.

    Only the first ``max_display`` entries of ``tokens`` are plotted, and the
    printed statistics are computed over that same truncated slice.
    """
    # Keep the plot readable by truncating long sequences
    shown = tokens[:max_display]

    _, ax = plt.subplots(figsize=(15, 4))

    # Token ID against position
    ax.plot(shown, marker='o', linestyle='-', markersize=4)
    ax.set_title('Generated Tokens')
    ax.set_xlabel('Position')
    ax.set_ylabel('Token ID')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    mean_id, std_id = np.mean(shown), np.std(shown)
    print("Statistics:")
    print(f"  Mean token ID: {mean_id:.2f}")
    print(f"  Std token ID: {std_id:.2f}")

In [None]:
# Visualize the generated tokens (the `tokens` array returned by the sample inference cell)
visualize_generated_tokens(tokens)

## 5. Performance Testing

In [None]:
def benchmark_model(model, text="This is a benchmark test sentence.", iterations=5):
    """Benchmark the model inference speed and print a short report.

    The text is tokenized with ``tokenize_text_demo`` using
    ``config['vocab_size']`` (the same vocabulary size ``simple_inference``
    uses), wrapped into batch-of-one tensors on the notebook-level ``device``,
    and passed to ``model.infer`` with ``max_length=100``. Two untimed warmup
    calls are followed by ``iterations`` timed calls; the mean and standard
    deviation of the wall-clock times are printed.

    Args:
        model: Object providing ``eval()`` and ``infer(...)``; it is switched
            to eval mode.
        text: Sentence used as the benchmark input.
        iterations: Number of timed calls; must be at least 1.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    import time

    if iterations < 1:
        raise ValueError(f"iterations must be at least 1, got {iterations}")

    tokens = tokenize_text_demo(text, config['vocab_size'])
    texts = torch.tensor([tokens], dtype=torch.long, device=device)
    src_lens = torch.tensor([len(tokens)], dtype=torch.long, device=device)
    speakers = torch.tensor([0], dtype=torch.long, device=device)
    # Random 768-dim emotion embedding, same shape simple_inference passes.
    # NOTE(review): confirm 768 is the size model.infer expects for em_hidden.
    em_hidden = torch.randn(1, 768, device=device)

    def _wait_for_device():
        # CUDA kernels run asynchronously; without a synchronize the clock
        # could be read before the GPU has finished the work being timed.
        if texts.is_cuda:
            torch.cuda.synchronize(texts.device)

    def _run_once():
        return model.infer(speakers, texts, src_lens, em_hidden=em_hidden, max_length=100)

    model.eval()
    times = []

    with torch.no_grad():
        # Warmup (not timed)
        for _ in range(2):
            _run_once()

        # Benchmark
        for _ in range(iterations):
            _wait_for_device()
            # perf_counter is monotonic and high-resolution, unlike time.time()
            start_time = time.perf_counter()
            _run_once()
            _wait_for_device()
            times.append(time.perf_counter() - start_time)

    avg_time = np.mean(times)
    std_time = np.std(times)
    # Guard against a zero average on very coarse clocks
    throughput = 1 / avg_time if avg_time > 0 else float('inf')

    print("=== Performance Benchmark ===")
    print(f"Average inference time: {avg_time:.4f}s ± {std_time:.4f}s")
    print(f"Inference throughput: {throughput:.2f} samples/sec")
    print(f"Iterations: {iterations}")

In [None]:
# Run benchmark with the default sentence and iteration count
benchmark_model(model)

## Conclusion

This notebook demonstrated:

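1. **Model Loading**: How to create an Echolancer model from a configuration (no checkpoint is loaded in this notebook; restore trained weights before expecting meaningful output)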
2. **Simple Inference**: Basic inference that generates output tokens from text (no audio is rendered in this notebook)
3. **Model Information**: Displaying model architecture details
4. **Visualization**: Plotting generated tokens
5. **Performance Testing**: Benchmarking inference speed

The Echolancer model provides a full transformer-based text-to-speech pipeline with features like:
- Non-autoregressive text encoder
- Autoregressive spectrogram decoder
- Emotion conditioning
- Multi-speaker support
- ALiBi (Attention with Linear Biases)
- Mixed precision training support