# IndicF5 Streaming TTS Demo

**Features:** 🎧 Streaming audio - plays progressively!

**Setup:** T4 GPU + HF access to https://huggingface.co/ai4bharat/IndicF5

In [None]:
import torch
# Fail fast when no CUDA device is visible: the model-loading cell further
# down moves the model to 'cuda'.
assert torch.cuda.is_available(), '‚ùå GPU!'
# Report the name of CUDA device 0.
print(f'‚úÖ GPU: {torch.cuda.get_device_name(0)}')

In [None]:
# Dependency setup cell (notebook shell magics).
# RESTART THE RUNTIME AFTER THIS CELL (presumably so the reinstalled
# numpy/scipy are the versions that actually get imported - confirm).
# Remove the preinstalled numpy/scipy before installing the versions below.
!pip uninstall -y numpy scipy -q
# numpy is pinned to 1.26.4; scipy is reinstalled unpinned.
# NOTE(review): the reason for the pin is not stated here - presumably
# compatibility with IndicF5's dependencies; confirm.
!pip install numpy==1.26.4 scipy -q
# transformers is capped below 4.50 (reason not stated - confirm).
!pip install 'transformers<4.50' accelerate -q
# IndicF5 is installed straight from its GitHub repository.
!pip install git+https://github.com/ai4bharat/IndicF5.git -q
# Packages used by the demo cell: UI, audio I/O and HTTP download.
!pip install gradio torchcodec soundfile requests -q
print('‚ö†Ô∏è RESTART RUNTIME!')

In [None]:
# Authenticate with Hugging Face from inside the notebook; the setup note at
# the top of this file says access to ai4bharat/IndicF5 is required.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import torch, gradio as gr, tempfile, soundfile as sf, numpy as np, requests, io, re, os
from transformers import AutoModel

def load_audio_url(url, timeout=30):
    """Download an audio file and decode it into ``(sample_rate, samples)``.

    Args:
        url: HTTP(S) location of an audio file that ``soundfile`` can decode.
        timeout: Seconds to wait for the server before giving up, so a
            stalled download cannot hang the notebook indefinitely.

    Returns:
        A ``(sample_rate, samples)`` tuple. Floating-point samples are
        converted to ``int16``; samples of any other dtype are returned
        unchanged.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP failures here instead of letting the audio decoder choke
    # on an error page.
    response.raise_for_status()

    data, sr = sf.read(io.BytesIO(response.content))
    if np.issubdtype(data.dtype, np.floating):
        # A full-scale sample of +1.0 scales to 32768, which does not fit in
        # int16 and would wrap around to -32768; clip before casting.
        data = np.clip(data * 32768, -32768, 32767).astype(np.int16)
    return sr, data

# Sentence terminators: ASCII '.', '!', '?', newline, plus the danda (U+0964)
# and double danda (U+0965). The two non-ASCII characters are written as
# escapes so the pattern does not depend on how this file is encoded.
_SENTENCE_BOUNDARY_RE = re.compile(r'[.!?\u0964\u0965\n]+')


def split_sentences(text):
    """Split ``text`` into sentences on terminal punctuation and newlines.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of the non-empty pieces, each stripped of surrounding
        whitespace. The terminators themselves are dropped.
    """
    if not text:
        return []
    return [
        piece.strip()
        for piece in _SENTENCE_BOUNDARY_RE.split(text)
        if piece.strip()
    ]

# Built-in demo presets. Each entry carries:
#   'name'     - label shown in the example dropdown and matched by load_example
#   'url'      - location of the reference clip, downloaded at startup
#   'ref_text' - text paired with the reference clip (presumably its transcript)
#   'synth'    - text that load_example places in the input text box
# The startup loop below also adds 'sr' and 'data' (the decoded clip) to each entry.
# NOTE(review): the 'ref_text' and 'synth' strings look like UTF-8 text that was
# decoded with the wrong codec - confirm against the original notebook.
EXAMPLES = [
    {'name': 'PAN_F', 'url': 'https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav',
     'ref_text': '‡®á‡©±‡®ï ‡®ó‡©ç‡®∞‡®æ‡®π‡®ï ‡®®‡©á ‡®∏‡®æ‡®°‡©Ä ‡®¨‡©á‡®Æ‡®ø‡®∏‡®æ‡®≤ ‡®∏‡©á‡®µ‡®æ ‡®¨‡®æ‡®∞‡©á ‡®¶‡®ø‡®≤‡©ã‡®Ç‡®ó‡®µ‡®æ‡®π‡©Ä ‡®¶‡®ø‡©±‡®§‡©Ä‡•§',
     'synth': '‡§Æ‡•à‡§Ç ‡§¨‡§ø‡§®‡§æ ‡§ï‡§ø‡§∏‡•Ä ‡§ö‡§ø‡§Ç‡§§‡§æ ‡§ï‡•á ‡§Ö‡§™‡§®‡•á ‡§¶‡•ã‡§∏‡•ç‡§§‡•ã‡§Ç ‡§ï‡•ã ‡§≠‡•á‡§ú‡§§‡§æ ‡§π‡•Ç‡§Å‡•§ ‡§µ‡§π ‡§®‡§ø‡§∂‡•ç‡§ö‡§ø‡§§ ‡§∞‡•Ç‡§™ ‡§∏‡•á ‡§Æ‡§¶‡§¶ ‡§ï‡§∞‡•á‡§ó‡§æ‡•§ ‡§Ø‡§π ‡§¨‡§π‡•Å‡§§ ‡§Ö‡§ö‡•ç‡§õ‡•Ä ‡§¨‡§æ‡§§ ‡§π‡•à‡•§'},
]

# Download and decode every example's reference clip once at startup and
# store the result on the example entry itself under 'sr' and 'data'.
print('Loading examples...')
for ex in EXAMPLES:
    ex.update(zip(('sr', 'data'), load_audio_url(ex['url'])))

# Load the IndicF5 checkpoint (it ships custom model code, hence
# trust_remote_code=True) and then move it onto the GPU.
print('Loading IndicF5...')
model = AutoModel.from_pretrained('ai4bharat/IndicF5', trust_remote_code=True)
model = model.to('cuda')
print('‚úÖ Ready!')

# Sample rate used when writing each synthesized chunk to disk.
# NOTE(review): presumably the model's native output rate - confirm.
_OUTPUT_SAMPLE_RATE = 24000


# STREAMING: yield one WAV file per sentence so playback can start early.
def synthesize_streaming(text, ref_audio, ref_text):
    """Synthesize ``text`` sentence by sentence, yielding a WAV path per sentence.

    Args:
        text: Text to speak; it is split with ``split_sentences``.
        ref_audio: ``(sample_rate, samples)`` tuple holding the reference
            voice clip, or ``None``.
        ref_text: Text associated with the reference clip.

    Yields:
        The path of a temporary WAV file holding the audio for one sentence,
        in sentence order. Nothing is yielded when any argument is missing
        or empty.
    """
    if not text or ref_audio is None or not ref_text:
        return

    sr, data = ref_audio
    sentences = split_sentences(text)
    total = len(sentences)
    print(f'[STREAM] {total} sentences')

    # Reserve a path for the reference clip. The handle is closed right away
    # so it is not leaked and the file is not open twice while it is written.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as ref_file:
        ref_path = ref_file.name

    try:
        sf.write(ref_path, data, sr)

        for i, sentence in enumerate(sentences, start=1):
            print(f'[{i}/{total}] {sentence[:40]}...')

            # Generate audio for this sentence.
            chunk = model(sentence, ref_audio_path=ref_path, ref_text=ref_text)

            # Bring int16 output into the [-1, 1) float range before writing.
            if chunk.dtype == np.int16:
                chunk = chunk.astype(np.float32) / 32768.0

            # Each chunk gets its own file. It is intentionally not deleted
            # here because the consumer reads it after the yield.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as chunk_file:
                chunk_path = chunk_file.name
            sf.write(chunk_path, chunk, _OUTPUT_SAMPLE_RATE)

            print(f'[{i}] Yielding chunk: {len(chunk)} samples')
            yield chunk_path

        print('[STREAM] Done!')
    finally:
        # Runs on normal completion, on error, and when the generator is
        # closed early, so the reference clip never outlives the request.
        try:
            os.remove(ref_path)
        except OSError:
            # Best-effort cleanup: a file that is already gone is not an error.
            pass

def load_example(name):
    """Look up the example called ``name`` and return its UI field values.

    Returns ``((sr, data), ref_text, synth)`` for the first entry in
    ``EXAMPLES`` whose 'name' equals ``name``, or ``(None, '', '')`` when
    no entry matches.
    """
    for candidate in EXAMPLES:
        if candidate['name'] == name:
            reference_audio = (candidate['sr'], candidate['data'])
            return reference_audio, candidate['ref_text'], candidate['synth']
    return None, '', ''

# Build the demo UI: an example picker on top, then a row with the input
# column (text, reference audio, reference text, button) next to the output
# audio player.
with gr.Blocks(title='IndicF5') as app:
    gr.Markdown('# üéß IndicF5 Streaming TTS')
    
    # One dropdown choice per entry in EXAMPLES, labelled by its 'name'.
    dd = gr.Dropdown([e['name'] for e in EXAMPLES], label='Example')
    
    with gr.Row():
        with gr.Column():
            txt = gr.Textbox(label='Text', lines=3)
            # type='numpy' makes the handler receive the clip as the
            # (sample_rate, samples) tuple that synthesize_streaming unpacks.
            ref = gr.Audio(label='Reference Audio', type='numpy')
            ref_txt = gr.Textbox(label='Reference Text')
            btn = gr.Button('üé§ Generate', variant='primary')
        
        # STREAMING: the handler yields WAV file paths one at a time and this
        # streaming, autoplaying player receives each one as it is produced.
        out = gr.Audio(label='Output', streaming=True, autoplay=True)
    
    # Picking an example fills the reference audio, reference text and input text.
    dd.change(load_example, [dd], [ref, ref_txt, txt])
    # The button runs the generator; every yielded path is sent to `out`.
    btn.click(synthesize_streaming, [txt, ref, ref_txt], [out])

# share=True asks Gradio for a public share link; debug=True is Gradio's
# debug mode (see the Gradio launch() documentation for exact behavior).
app.launch(share=True, debug=True)