# IndicF5 Streaming TTS Demo

**Features:**
- 🎧 **Streaming audio** - Playback starts after the first sentence!
- 📝 **Examples included** - Pre-loaded reference audio clips

**Prerequisites:**
1. GPU runtime: Runtime → Change runtime type → T4 GPU
2. Request access: https://huggingface.co/ai4bharat/IndicF5
3. Get HF token: https://huggingface.co/settings/tokens

In [None]:
# Fail fast when no CUDA device is visible, instead of failing later when the
# model is moved to 'cuda'.
import torch

if not torch.cuda.is_available():
    raise RuntimeError('❌ GPU not available!')
print(f'✅ GPU: {torch.cuda.get_device_name(0)}')

In [None]:
# Install (RESTART RUNTIME AFTER)
!pip uninstall -y numpy scipy -q
!pip install numpy==1.26.4 scipy -q
!pip install 'transformers<4.50' accelerate -q
!pip install git+https://github.com/ai4bharat/IndicF5.git -q
!pip install gradio torchcodec soundfile requests -q
print('\n‚ö†Ô∏è RESTART RUNTIME! Then skip this cell.')

In [None]:
# Log in to the Hugging Face Hub. The prerequisites above list requesting
# access to ai4bharat/IndicF5 and creating an HF token before this step.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import torch, gradio as gr, tempfile, soundfile as sf, numpy as np, requests, io, re
from transformers import AutoModel

# Load audio from URL
def load_audio_url(url, timeout=30):
    """Download an audio file and return it as ``(sample_rate, samples)``.

    Args:
        url: HTTP(S) address of an audio file that ``soundfile`` can decode.
        timeout: Seconds to wait for the server before giving up, so a stalled
            download cannot hang the notebook indefinitely.

    Returns:
        A ``(sample_rate, samples)`` tuple. float64 samples are rescaled to
        int16; samples of any other dtype are returned as decoded.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    r = requests.get(url, timeout=timeout)
    # Without this check an HTML error page would be handed to the decoder.
    r.raise_for_status()
    data, sr = sf.read(io.BytesIO(r.content))
    if data.dtype == np.float64:
        # A full-scale sample (+1.0) scales to 32768, one past the int16
        # maximum; clip before casting so it saturates instead of wrapping.
        data = np.clip(data * 32768, -32768, 32767).astype(np.int16)
    return sr, data

# Split text into sentences (supports multiple Indic scripts)
def split_sentences(text):
    """Split ``text`` into trimmed, non-empty sentences.

    Sentence boundaries are runs of ``.``, ``!``, ``?``, newlines, the danda
    (U+0964) and the double danda (U+0965). The terminators themselves are
    dropped from the result.

    Args:
        text: Input string; ``None`` or an empty string yields ``[]``.

    Returns:
        A list of sentence strings without surrounding whitespace.
    """
    if not text:
        return []
    # The dandas are written as escapes so the pattern cannot be corrupted by
    # a re-encoding of this file.
    pattern = r'[.!?\u0964\u0965\n]+'
    parts = re.split(pattern, text)
    return [p.strip() for p in parts if p.strip()]

# Built-in demo examples. Each entry holds a display name, the URL of a
# reference clip, the transcript of that clip ('ref_text') and a sample text
# to synthesize ('synth').
# NOTE(review): the non-ASCII strings below look mis-encoded (UTF-8 bytes read
# as Mac Roman). They are kept exactly as found; confirm against the upstream
# prompts before relying on them.
EXAMPLES = [
    {
        'name': 'PAN_F (Happy)',
        'url': 'https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav',
        'ref_text': '‡®á‡©±‡®ï ‡®ó‡©ç‡®∞‡®æ‡®π‡®ï ‡®®‡©á ‡®∏‡®æ‡®°‡©Ä ‡®¨‡©á‡®Æ‡®ø‡®∏‡®æ‡®≤ ‡®∏‡©á‡®µ‡®æ ‡®¨‡®æ‡®∞‡©á ‡®¶‡®ø‡®≤‡©ã‡®Ç‡®ó‡®µ‡®æ‡®π‡©Ä ‡®¶‡®ø‡©±‡®§‡©Ä‡•§',
        'synth': '‡§Æ‡•à‡§Ç ‡§¨‡§ø‡§®‡§æ ‡§ï‡§ø‡§∏‡•Ä ‡§ö‡§ø‡§Ç‡§§‡§æ ‡§ï‡•á ‡§Ö‡§™‡§®‡•á ‡§¶‡•ã‡§∏‡•ç‡§§‡•ã‡§Ç ‡§ï‡•ã ‡§≠‡•á‡§ú‡§§‡§æ ‡§π‡•Ç‡§Å‡•§ ‡§µ‡§π ‡§®‡§ø‡§∂‡•ç‡§ö‡§ø‡§§ ‡§∞‡•Ç‡§™ ‡§∏‡•á ‡§â‡§®‡§ï‡•Ä ‡§Æ‡§¶‡§¶ ‡§ï‡§∞‡•á‡§ó‡§æ‡•§ ‡§Ø‡§π ‡§¨‡§π‡•Å‡§§ ‡§Ö‡§ö‡•ç‡§õ‡•Ä ‡§¨‡§æ‡§§ ‡§π‡•à‡•§',
    },
    {
        'name': 'TAM_F (Happy)',
        'url': 'https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/TAM_F_HAPPY_00001.wav',
        'ref_text': '‡Æ®‡Ææ‡Æ©‡Øç ‡Æ®‡ØÜ‡Æ©‡Æö‡Øç‡Æö ‡ÆÆ‡Ææ‡Æ§‡Æø‡Æ∞‡Æø‡ÆØ‡Øá ‡ÆÖ‡ÆÆ‡Øá‡Æö‡Ææ‡Æ©‡Øç‡Æ≤ ‡Æ™‡ØÜ‡Æ∞‡Æø‡ÆØ ‡Æ§‡Æ≥‡Øç‡Æ≥‡ØÅ‡Æ™‡Æü‡Æø ‡Æµ‡Æ®‡Øç‡Æ§‡Æø‡Æ∞‡ØÅ‡Æï‡Øç‡Æï‡ØÅ.',
        'synth': '‡¥≠‡¥ï‡µç‡¥∑‡¥£‡¥§‡µç‡¥§‡¥ø‡¥®‡µç ‡¥∂‡µá‡¥∑‡¥Ç ‡¥§‡µà‡¥∞‡µç ‡¥∏‡¥æ‡¥¶‡¥Ç ‡¥ï‡¥¥‡¥ø‡¥ö‡µç‡¥ö‡¥æ‡µΩ ‡¥®‡¥≤‡µç‡¥≤‡¥§‡¥æ‡¥£‡µç. ‡¥á‡¥§‡µç ‡¥Ü‡¥∞‡µã‡¥ó‡µç‡¥Ø‡¥§‡µç‡¥§‡¥ø‡¥®‡µç ‡¥®‡¥≤‡µç‡¥≤‡¥§‡¥æ‡¥£‡µç.',
    },
    {
        'name': 'KAN_F (Happy)',
        'url': 'https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav',
        'ref_text': '‡≤®‡≤Æ‡≥ç‚Äå ‡≤´‡≥ç‡≤∞‡≤ø‡≤ú‡≥ç‡≤ú‡≤≤‡≥ç‡≤≤‡≤ø ‡≤ï‡≥Ç‡≤≤‡≤ø‡≤Ç‡≤ó‡≥ç‚Äå ‡≤∏‡≤Æ‡≤∏‡≥ç‡≤Ø‡≥Ü ‡≤Ü‡≤ó‡≤ø‡≤§‡≥ç‡≤§‡≥Å.',
        'synth': '‡¶ö‡ßá‡¶®‡ßç‡¶®‡¶æ‡¶á‡¶Ø‡¶º‡ßá‡¶∞ ‡¶Ö‡¶ü‡ßã‡¶§‡ßá ‡¶≤‡ßã‡¶ï‡ßá‡¶∞‡¶æ ‡¶ñ‡¶æ‡¶¨‡¶æ‡¶∞ ‡¶≠‡¶æ‡¶ó ‡¶ï‡¶∞‡ßá ‡¶ñ‡¶æ‡¶Ø‡¶º‡•§ ‡¶è‡¶ü‡¶æ ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶ñ‡ßÅ‡¶¨ ‡¶≠‡¶æ‡¶≤‡ßã ‡¶≤‡¶æ‡¶ó‡ßá‡•§ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑‡ßá‡¶∞ ‡¶è‡¶á ‡¶≠‡¶æ‡¶≤‡ßã‡¶¨‡¶æ‡¶∏‡¶æ ‡¶¶‡ßá‡¶ñ‡ßá ‡¶Æ‡¶® ‡¶≠‡¶∞‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º‡•§',
    },
]

# Download every reference clip once at startup and cache the decoded audio
# on its EXAMPLES entry under 'sr' and 'data'.
print('Loading examples...')
for ex in EXAMPLES:
    ex['sr'], ex['data'] = load_audio_url(ex['url'])
print('✅ Examples loaded')

# Load the IndicF5 checkpoint from the Hub and move it to the GPU.
# trust_remote_code=True runs the model code shipped in that repository.
print('Loading IndicF5...')
model = AutoModel.from_pretrained('ai4bharat/IndicF5', trust_remote_code=True)
model = model.to('cuda')
print('✅ Model loaded')

# STREAMING synthesis - yields audio progressively
def synthesize_streaming(text, ref_audio, ref_text):
    """Synthesize ``text`` one sentence at a time, yielding audio as it grows.

    Args:
        text: Text to speak; it is split with ``split_sentences``.
        ref_audio: ``(sample_rate, samples)`` tuple for the reference voice
            clip, or ``None``.
        ref_text: Transcript of the reference clip.

    Yields:
        ``None`` once when an input is missing or the text contains no
        sentences. Otherwise, after each sentence, ``(24000, samples)`` where
        ``samples`` is a copy of all audio generated so far.
    """
    import os  # local import: only needed to build the temp-file path

    if not text or ref_audio is None or not ref_text:
        yield None
        return

    sentences = split_sentences(text)
    if not sentences:
        # Only punctuation/whitespace was entered: nothing to synthesize.
        yield None
        return

    sr, data = ref_audio
    total = len(sentences)
    print(f'[STREAMING] {total} sentences to process')

    # Hard-coded output sample rate.
    # NOTE(review): presumably the model's native rate - confirm.
    output_sr = 24000

    # Keep the reference WAV in a temporary directory so it is deleted when
    # the generator finishes or is closed, rather than left on disk.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Save reference audio once; every sentence reuses the same file.
        ref_path = os.path.join(tmp_dir, 'reference.wav')
        sf.write(ref_path, data, sr)

        all_audio = np.array([], dtype=np.float32)

        for i, sentence in enumerate(sentences, start=1):
            print(f'[{i}/{total}] Generating: {sentence[:50]}...')

            # Generate audio for this sentence
            chunk = model(sentence, ref_audio_path=ref_path, ref_text=ref_text)

            # Convert int16 PCM to float32 in [-1, 1)
            if chunk.dtype == np.int16:
                chunk = chunk.astype(np.float32) / 32768.0

            # Accumulate audio
            all_audio = np.concatenate([all_audio, chunk])

            # Yield accumulated audio so far.
            # NOTE(review): the output component is created with
            # streaming=True; if Gradio appends each yielded value as a new
            # chunk, yielding the cumulative buffer replays earlier
            # sentences - confirm, and yield only `chunk` if so.
            print(f'[{i}/{total}] Yielding {len(all_audio)} samples')
            yield (output_sr, all_audio.copy())

    print('[STREAMING] Complete!')

def load_example(name):
    """Look up a built-in example by its display name.

    Returns ``((sample_rate, samples), ref_text, synth_text)`` for the first
    entry of ``EXAMPLES`` whose ``'name'`` equals ``name``; when nothing
    matches, returns ``(None, '', '')``.
    """
    for candidate in EXAMPLES:
        if candidate['name'] != name:
            continue
        reference_audio = (candidate['sr'], candidate['data'])
        return reference_audio, candidate['ref_text'], candidate['synth']
    return None, '', ''

# Build Gradio UI
with gr.Blocks(title='IndicF5 Streaming') as app:
    gr.Markdown('# 🎧 IndicF5 Streaming TTS\n*Audio starts playing after first sentence!*')

    # Choosing an example fills the reference audio, reference text and
    # synthesis text fields below.
    example_dropdown = gr.Dropdown([e['name'] for e in EXAMPLES], label='📂 Load Example')

    with gr.Row():
        with gr.Column():
            txt = gr.Textbox(label='Text to synthesize', lines=4, placeholder='Enter text with multiple sentences...')
            # type='numpy' hands the handler a (sample_rate, samples) tuple.
            ref = gr.Audio(label='Reference Audio', type='numpy')
            ref_txt = gr.Textbox(label='Reference Text')
            btn = gr.Button('🎤 Generate (Streaming)', variant='primary')

        # STREAMING audio output: fed by the synthesize_streaming generator.
        out = gr.Audio(label='Output (streams as sentences complete)', streaming=True, autoplay=True)

    example_dropdown.change(load_example, [example_dropdown], [ref, ref_txt, txt])
    btn.click(synthesize_streaming, [txt, ref, ref_txt], [out])

# share=True requests a public share link; debug=True keeps the cell running
# and prints errors to the cell output.
app.launch(share=True, debug=True)