# IndicF5 Streaming TTS (Speed Optimized)

**Optimizations:**
- 🔥 Model warm-up
- ⚡ **NFE Steps = 16** (default 32) → 2x faster!
- 🚀 Sway sampling = -1

**Setup:** T4 GPU + HF access

In [None]:
import torch
# Stop immediately when no CUDA device is visible: the model is moved to
# 'cuda' later in this notebook, so nothing below can run without one.
has_gpu = torch.cuda.is_available()
assert has_gpu, '‚ùå GPU!'
gpu_name = torch.cuda.get_device_name(0)
print(f'‚úÖ GPU: {gpu_name}')

In [None]:
# RESTART RUNTIME AFTER
# Installs the notebook's dependencies with pip (IPython `!` shell commands).
# numpy and scipy are removed first and numpy is then pinned to 1.26.4;
# transformers is held below 4.50 and IndicF5 is installed from its Git repo.
# NOTE(review): presumably the restart is required so the re-installed
# numpy/scipy replace the copies already imported by the runtime - confirm.
!pip uninstall -y numpy scipy -q
!pip install numpy==1.26.4 scipy -q
!pip install 'transformers<4.50' accelerate -q
!pip install git+https://github.com/ai4bharat/IndicF5.git -q
!pip install gradio torchcodec soundfile requests -q
# Reminder printed at the end of the cell output.
print('‚ö†Ô∏è RESTART RUNTIME!')

In [None]:
# Log in to the Hugging Face Hub from inside the notebook. The model is later
# fetched from the Hub repo 'ai4bharat/IndicF5' via AutoModel.from_pretrained.
# NOTE(review): presumably that repo requires an authenticated account with
# access granted - confirm on the model page.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import torch, gradio as gr, tempfile, soundfile as sf, numpy as np, requests, io, re, os, time
from transformers import AutoModel

# SPEED SETTINGS
# Both values are injected as the default `nfe_step` / `sway_sampling_coef`
# keyword arguments by the infer_process patch further down in this file.
NFE_STEPS = 16  # Default 32 - lower = faster but less quality (try 8-16 for speed)
SWAY_COEF = -1  # Sway sampling coefficient for speed

# Turn on cuDNN benchmark mode (see the PyTorch `torch.backends.cudnn` docs).
torch.backends.cudnn.benchmark = True

def load_audio_url(url):
    """Download an audio file and return it as ``(sample_rate, samples)``.

    Args:
        url: HTTP(S) address of an audio file that ``soundfile`` can decode.

    Returns:
        A ``(sr, data)`` tuple. When the decoded samples are float64 they are
        scaled by 32768, clipped to the int16 range and returned as int16;
        any other dtype is returned unchanged.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within 30 seconds.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get(url, timeout=30)
    # Surface HTTP failures here rather than feeding an error page to the decoder.
    r.raise_for_status()
    data, sr = sf.read(io.BytesIO(r.content))
    if data.dtype == np.float64:
        # Clip before casting: a full-scale sample of 1.0 scales to 32768,
        # which is one past the largest int16 value and would not survive the cast.
        data = np.clip(data * 32768, -32768, 32767).astype(np.int16)
    return sr, data

# Sentence terminators: ASCII '.', '!', '?', newline, and the danda (U+0964)
# and double danda (U+0965) used by Indic scripts. The two dandas are written
# as \u escapes so the pattern cannot be corrupted by a file re-encoding.
_SENTENCE_BREAKS = re.compile(r'[.!?\u0964\u0965\n]+')


def split_sentences(text):
    """Split ``text`` into trimmed, non-empty sentences.

    Args:
        text: The input string; ``None`` or an empty string yields ``[]``.

    Returns:
        A list of sentence strings with surrounding whitespace removed and
        without their terminating punctuation. Runs of consecutive
        terminators count as a single break.
    """
    if not text:
        return []
    parts = _SENTENCE_BREAKS.split(text)
    return [p.strip() for p in parts if p.strip()]

# Built-in voice prompts offered in the UI. Each entry holds:
#   'name'     - label shown in the example dropdown
#   'url'      - location of the reference WAV in the IndicF5 GitHub repo
#   'ref_text' - text paired with that reference recording
#   'synth'    - sample text placed in the "Text to Synthesize" box
# The loading loop below adds 'sr', 'data' and 'ref_path' to every entry.
# NOTE(review): the 'ref_text' / 'synth' literals in this copy of the file look
# mis-decoded (UTF-8 bytes read with a legacy code page) - restore them from
# the original notebook before relying on them.
EXAMPLES = [
    {'name': 'PAN_F (Happy)', 'url': 'https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav',
     'ref_text': '‡®á‡©±‡®ï ‡®ó‡©ç‡®∞‡®æ‡®π‡®ï ‡®®‡©á ‡®∏‡®æ‡®°‡©Ä ‡®¨‡©á‡®Æ‡®ø‡®∏‡®æ‡®≤ ‡®∏‡©á‡®µ‡®æ ‡®¨‡®æ‡®∞‡©á ‡®¶‡®ø‡®≤‡©ã‡®Ç‡®ó‡®µ‡®æ‡®π‡©Ä ‡®¶‡®ø‡©±‡®§‡©Ä ‡®ú‡®ø‡®∏ ‡®®‡®æ‡®≤ ‡®∏‡®æ‡®®‡©Ç‡©∞ ‡®Ö‡®®‡©∞‡®¶ ‡®Æ‡®π‡®ø‡®∏‡©Ç‡®∏ ‡®π‡©ã‡®á‡®Ü‡•§',
     'synth': '‡§Æ‡•à‡§Ç ‡§¨‡§ø‡§®‡§æ ‡§ï‡§ø‡§∏‡•Ä ‡§ö‡§ø‡§Ç‡§§‡§æ ‡§ï‡•á ‡§Ö‡§™‡§®‡•á ‡§¶‡•ã‡§∏‡•ç‡§§‡•ã‡§Ç ‡§ï‡•ã ‡§Ö‡§™‡§®‡•á ‡§ë‡§ü‡•ã‡§Æ‡•ã‡§¨‡§æ‡§á‡§≤ ‡§è‡§ï‡•ç‡§∏‡§™‡§∞‡•ç‡§ü ‡§ï‡•á ‡§™‡§æ‡§∏ ‡§≠‡•á‡§ú ‡§¶‡•á‡§§‡§æ ‡§π‡•Ç‡§Å ‡§ï‡•ç‡§Ø‡•ã‡§Ç‡§ï‡§ø ‡§Æ‡•à‡§Ç ‡§ú‡§æ‡§®‡§§‡§æ ‡§π‡•Ç‡§Å ‡§ï‡§ø ‡§µ‡§π ‡§®‡§ø‡§∂‡•ç‡§ö‡§ø‡§§ ‡§∞‡•Ç‡§™ ‡§∏‡•á ‡§â‡§®‡§ï‡•Ä ‡§∏‡§≠‡•Ä ‡§ú‡§∞‡•Ç‡§∞‡§§‡•ã‡§Ç ‡§™‡§∞ ‡§ñ‡§∞‡§æ ‡§â‡§§‡§∞‡•á‡§ó‡§æ‡•§'},
    {'name': 'TAM_F (Happy)', 'url': 'https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/TAM_F_HAPPY_00001.wav',
     'ref_text': '‡Æ®‡Ææ‡Æ©‡Øç ‡Æ®‡ØÜ‡Æ©‡Æö‡Øç‡Æö ‡ÆÆ‡Ææ‡Æ§‡Æø‡Æ∞‡Æø‡ÆØ‡Øá ‡ÆÖ‡ÆÆ‡Øá‡Æö‡Ææ‡Æ©‡Øç‡Æ≤ ‡Æ™‡ØÜ‡Æ∞‡Æø‡ÆØ ‡Æ§‡Æ≥‡Øç‡Æ≥‡ØÅ‡Æ™‡Æü‡Æø ‡Æµ‡Æ®‡Øç‡Æ§‡Æø‡Æ∞‡ØÅ‡Æï‡Øç‡Æï‡ØÅ. ‡Æï‡ÆÆ‡Øç‡ÆÆ‡Æø ‡Æï‡Ææ‡Æö‡ØÅ‡Æï‡Øç‡Æï‡Øá ‡ÆÖ‡Æ®‡Øç‡Æ§‡Æ™‡Øç ‡Æ™‡ØÅ‡Æ§‡ØÅ ‡Æö‡Øá‡ÆÆ‡Øç‡Æö‡Æô‡Øç ‡ÆÆ‡Ææ‡Æü‡Æ≤ ‡Æµ‡Ææ‡Æô‡Øç‡Æï‡Æø‡Æü‡Æ≤‡Ææ‡ÆÆ‡Øç.',
     'synth': '‡¥≠‡¥ï‡µç‡¥∑‡¥£‡¥§‡µç‡¥§‡¥ø‡¥®‡µç ‡¥∂‡µá‡¥∑‡¥Ç ‡¥§‡µà‡¥∞‡µç ‡¥∏‡¥æ‡¥¶‡¥Ç ‡¥ï‡¥¥‡¥ø‡¥ö‡µç‡¥ö‡¥æ‡µΩ ‡¥í‡¥∞‡µÅ ‡¥â‡¥∑‡¥æ‡¥±‡¥æ‡¥£‡µç!'},
    {'name': 'MAR_F (WIKI)', 'url': 'https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_F_WIKI_00001.wav',
     'ref_text': '‡§¶‡§ø‡§ó‡§Ç‡§ü‡§∞‡§æ‡§µ‡•ç‡§¶‡§æ‡§∞‡•á ‡§Ö‡§Ç‡§§‡§∞‡§æ‡§≥ ‡§ï‡§ï‡•ç‡§∑‡•á‡§§‡§≤‡§æ ‡§ï‡§ö‡§∞‡§æ ‡§ö‡§ø‡§®‡•ç‡§π‡§ø‡§§ ‡§ï‡§∞‡§£‡•ç‡§Ø‡§æ‡§∏‡§æ‡§†‡•Ä ‡§™‡•ç‡§∞‡§Ø‡§§‡•ç‡§® ‡§ï‡•á‡§≤‡•á ‡§ú‡§æ‡§§ ‡§Ü‡§π‡•á.',
     'synth': '‡§™‡•ç‡§∞‡§æ‡§∞‡§Ç‡§≠‡§ø‡§ï ‡§Ö‡§Ç‡§ï‡•Å‡§∞ ‡§õ‡•á‡§¶‡§ï. ‡§Æ‡•Ä ‡§∏‡•ã‡§≤‡§æ‡§™‡•Ç‡§∞ ‡§ú‡§ø‡§≤‡•ç‡§π‡•ç‡§Ø‡§æ‡§§‡•Ä‡§≤ ‡§Æ‡§æ‡§≥‡§∂‡§ø‡§∞‡§∏ ‡§§‡§æ‡§≤‡•Å‡§ï‡•ç‡§Ø‡§æ‡§§‡•Ä‡§≤ ‡§∂‡•á‡§§‡§ï‡§∞‡•Ä ‡§ó‡§£‡§™‡§§ ‡§™‡§æ‡§ü‡•Ä‡§≤ ‡§¨‡•ã‡§≤‡§§‡•ã‡§Ø.'},
    {'name': 'MAR_M (WIKI)', 'url': 'https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_M_WIKI_00001.wav',
     'ref_text': '‡§Ø‡§æ ‡§™‡•ç‡§∞‡§•‡§æ‡§≤‡§æ ‡§è‡§ï‡•ã‡§£‡•Ä‡§∏‡§∂‡•á ‡§™‡§Ç‡§ö‡§æ‡§§‡§∞ ‡§à‡§∏‡§µ‡•Ä ‡§™‡§æ‡§∏‡•Ç‡§® ‡§≠‡§æ‡§∞‡§§‡•Ä‡§Ø ‡§¶‡§Ç‡§° ‡§∏‡§Ç‡§π‡§ø‡§§‡§æ‡§ö‡•Ä ‡§ß‡§æ‡§∞‡§æ ‡§ö‡§æ‡§∞‡§∂‡•á ‡§Ö‡§†‡•ç‡§†‡§æ‡§µ‡•Ä‡§∏ ‡§Ü‡§£‡§ø ‡§ö‡§æ‡§∞‡§∂‡•á ‡§è‡§ï‡•ã‡§£‡§§‡•Ä‡§∏‡§ö‡•ç‡§Ø‡§æ ‡§Ö‡§®‡•ç‡§§‡§∞‡•ç‡§ó‡§§ ‡§®‡§ø‡§∑‡•á‡§ß ‡§ï‡•á‡§≤‡§æ.',
     'synth': '‡§ú‡•Ä‡§µ‡§æ‡§£‡•Ç ‡§ï‡§∞‡§™‡§æ. ‡§Æ‡•Ä ‡§Ö‡§π‡§Æ‡§¶‡§®‡§ó‡§∞ ‡§ú‡§ø‡§≤‡•ç‡§π‡•ç‡§Ø‡§æ‡§§‡•Ä‡§≤ ‡§∞‡§æ‡§π‡•Å‡§∞‡•Ä ‡§ó‡§æ‡§µ‡§æ‡§§‡•Ç‡§® ‡§¨‡§æ‡§≥‡§æ‡§∏‡§æ‡§π‡•á‡§¨ ‡§ú‡§æ‡§ß‡§µ ‡§¨‡•ã‡§≤‡§§‡•ã‡§Ø.'},
    {'name': 'KAN_F (Happy)', 'url': 'https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav',
     'ref_text': '‡≤®‡≤Æ‡≥ç‚Äå ‡≤´‡≥ç‡≤∞‡≤ø‡≤ú‡≥ç‡≤ú‡≤≤‡≥ç‡≤≤‡≤ø  ‡≤ï‡≥Ç‡≤≤‡≤ø‡≤Ç‡≤ó‡≥ç‚Äå ‡≤∏‡≤Æ‡≤∏‡≥ç‡≤Ø‡≥Ü ‡≤Ü‡≤ó‡≤ø ‡≤®‡≤æ‡≤®‡≥ç‚Äå ‡≤≠‡≤æ‡≤≥ ‡≤¶‡≤ø‡≤®‡≤¶‡≤ø‡≤Ç‡≤¶ ‡≤í‡≤¶‡≥ç‡≤¶‡≤æ‡≤°‡≥ç‡≤§‡≤ø‡≤¶‡≥ç‡≤¶‡≥Ü.',
     'synth': '‡¶ö‡ßá‡¶®‡ßç‡¶®‡¶æ‡¶á‡¶Ø‡¶º‡ßá‡¶∞ ‡¶∂‡ßá‡¶Ø‡¶º‡¶æ‡¶∞‡ßá‡¶∞ ‡¶Ö‡¶ü‡ßã‡¶∞ ‡¶Ø‡¶æ‡¶§‡ßç‡¶∞‡ßÄ‡¶¶‡ßá‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ‡¶ñ‡¶æ‡¶¨‡¶æ‡¶∞ ‡¶≠‡¶æ‡¶ó ‡¶ï‡¶∞‡ßá ‡¶ñ‡¶æ‡¶ì‡¶Ø‡¶º‡¶æ‡¶ü‡¶æ ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶õ‡ßá ‡¶Æ‡¶® ‡¶ñ‡ßÅ‡¶¨ ‡¶≠‡¶æ‡¶≤‡ßã ‡¶ï‡¶∞‡ßá ‡¶¶‡ßá‡¶ì‡¶Ø‡¶º‡¶æ ‡¶è‡¶ï‡¶ü‡¶æ ‡¶¨‡¶ø‡¶∑‡¶Ø‡¶º‡•§'},
]

# Download every example's reference audio once and keep a local WAV copy,
# because the model is given a file path (ref_audio_path), not an array.
print('Loading examples...')
for ex in EXAMPLES:
    ex['sr'], ex['data'] = load_audio_url(ex['url'])
    # Reserve a unique path, then close the handle before soundfile opens the
    # same path; delete=False keeps the file on disk for the whole session.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        ex['ref_path'] = tmp.name
    sf.write(ex['ref_path'], ex['data'], ex['sr'])
print(f'‚úÖ {len(EXAMPLES)} examples loaded')

# Load the IndicF5 checkpoint from the Hugging Face Hub and move it to the GPU.
# trust_remote_code=True lets transformers run the model code shipped in that
# Hub repository, so only use it with a repository you trust.
print('Loading IndicF5...')
model = AutoModel.from_pretrained('ai4bharat/IndicF5', trust_remote_code=True).to('cuda')
print('‚úÖ Model loaded')

# Patch the infer settings for speed
# Replaces f5_tts.infer.utils_infer.infer_process with a wrapper that supplies
# NFE_STEPS / SWAY_COEF as defaults; a caller that passes nfe_step or
# sway_sampling_coef explicitly by keyword still overrides them.
# NOTE(review): the patch only reaches code that looks the function up as
# `utils_infer.infer_process` at call time. Code that imported the function by
# name before this point keeps the original - confirm which one the model uses.
try:
    from f5_tts.infer import utils_infer
    # Patch default NFE steps
    original_infer = utils_infer.infer_process
    def fast_infer(*args, nfe_step=NFE_STEPS, sway_sampling_coef=SWAY_COEF, **kwargs):
        return original_infer(*args, nfe_step=nfe_step, sway_sampling_coef=sway_sampling_coef, **kwargs)
    utils_infer.infer_process = fast_infer
    print(f'‚úÖ Patched infer: nfe_step={NFE_STEPS}, sway={SWAY_COEF}')
except Exception as e:
    # Best effort: if the patch fails the notebook carries on with the
    # library's own defaults and only reports the reason.
    print(f'‚ö†Ô∏è Could not patch infer settings: {e}')

# WARM-UP
# Run one throwaway synthesis with the first example's reference audio so the
# one-time start-up cost is paid here rather than on the first user request.
# The result is discarded; only the elapsed time is reported.
print('üî• Warming up...')
warmup_start = time.time()
with torch.inference_mode():
    _ = model('Hello', ref_audio_path=EXAMPLES[0]['ref_path'], ref_text=EXAMPLES[0]['ref_text'])
print(f'‚úÖ Warm-up done in {time.time() - warmup_start:.1f}s')

# Path of the reference WAV belonging to the most recently selected example,
# or None before any example has been chosen. Kept in a one-element list so
# load_example can update it by item assignment without a `global` statement.
current_ref_path = [None]

def _write_temp_wav(samples, sample_rate):
    """Write ``samples`` to a new temporary ``.wav`` file and return its path."""
    # Reserve the path and close the handle first so no descriptor stays open
    # while soundfile writes to the same path.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as handle:
        path = handle.name
    sf.write(path, samples, sample_rate)
    return path


def _cached_example_path(sr, data):
    """Return the selected example's WAV path if ``(sr, data)`` is its audio.

    Returns ``None`` when no example is selected, its file is gone, or the
    audio passed in differs from the example (e.g. the user uploaded their own
    recording after picking an example).
    """
    cached = current_ref_path[0]
    if not cached or not os.path.exists(cached):
        return None
    for ex in EXAMPLES:
        if ex.get('ref_path') == cached:
            same_audio = ex['sr'] == sr and np.array_equal(ex['data'], data)
            return cached if same_audio else None
    return None


def synthesize_streaming(text, ref_audio, ref_text):
    """Synthesize ``text`` sentence by sentence, yielding one WAV path per sentence.

    Args:
        text: Text to speak; it is split with ``split_sentences``.
        ref_audio: ``(sample_rate, samples)`` tuple of the reference voice, or
            ``None``.
        ref_text: Text paired with the reference audio.

    Yields:
        The path of a temporary ``.wav`` file holding the audio for each
        sentence, in order. Nothing is yielded when any argument is missing.
    """
    if not text or ref_audio is None or not ref_text:
        return

    sr, data = ref_audio
    sentences = split_sentences(text)
    print(f'[STREAM] {len(sentences)} sentences')

    # Reuse the example's file only when the audio really is that example;
    # otherwise a stale selection would override the audio supplied here.
    ref_path = _cached_example_path(sr, data)
    wrote_ref = ref_path is None
    if wrote_ref:
        ref_path = _write_temp_wav(data, sr)

    out_sr = 24000  # sample rate used for writing and timing every chunk
    total = len(sentences)
    try:
        for i, sentence in enumerate(sentences):
            start = time.perf_counter()
            print(f'[{i+1}/{total}] Generating...')

            with torch.inference_mode():
                chunk = model(sentence, ref_audio_path=ref_path, ref_text=ref_text)

            # Normalize int16 output to float32 in [-1, 1) before writing.
            if chunk.dtype == np.int16:
                chunk = chunk.astype(np.float32) / 32768.0

            chunk_path = _write_temp_wav(chunk, out_sr)

            elapsed = time.perf_counter() - start
            audio_len = len(chunk) / out_sr
            # An empty chunk has zero duration; avoid dividing by it.
            rtf = elapsed / audio_len if audio_len else float('inf')
            print(f'[{i+1}/{total}] {elapsed:.1f}s for {audio_len:.1f}s audio (RTF: {rtf:.2f})')
            yield chunk_path
    finally:
        # Remove the per-request reference copy; example files are shared and kept.
        if wrote_ref:
            try:
                os.remove(ref_path)
            except OSError:
                pass

    print('[STREAM] Complete!')

def load_example(name):
    """Look up a built-in example by name and return its UI field values.

    Records the example's reference WAV path in ``current_ref_path`` as a side
    effect.

    Args:
        name: The ``'name'`` of an entry in ``EXAMPLES``.

    Returns:
        ``((sr, data), ref_text, synth)`` for the first matching example, or
        ``(None, '', '')`` when no example has that name.
    """
    for candidate in EXAMPLES:
        if candidate['name'] != name:
            continue
        if not candidate:
            break
        current_ref_path[0] = candidate['ref_path']
        reference_audio = (candidate['sr'], candidate['data'])
        return (reference_audio, candidate['ref_text'], candidate['synth'])
    return (None, '', '')

# Build the Gradio UI: a heading, an example picker, then one row holding a
# column of inputs next to the streamed audio output.
with gr.Blocks(title='IndicF5 Fast') as app:
    gr.Markdown('# üöÄ IndicF5 Streaming TTS (Speed Optimized)')
    gr.Markdown(f'**Settings:** NFE={NFE_STEPS} (default 32) | Target: ~4-6s per 4s audio')
    
    # Dropdown listing the names of the built-in examples.
    dd = gr.Dropdown([e['name'] for e in EXAMPLES], label='üìÇ Choose example')
    
    with gr.Row():
        with gr.Column():
            txt = gr.Textbox(label='Text to Synthesize', lines=4)
            # type='numpy' matches the (sr, data) tuple unpacked in synthesize_streaming.
            ref = gr.Audio(label='Reference Audio', type='numpy')
            ref_txt = gr.Textbox(label='Reference Text', lines=2)
            btn = gr.Button('üé§ Generate', variant='primary')
        # Output player that receives each value yielded by synthesize_streaming.
        out = gr.Audio(label='Output', streaming=True, autoplay=True)
    
    # Choosing an example fills the reference audio, reference text and text box.
    dd.change(load_example, [dd], [ref, ref_txt, txt])
    # The button runs the generator; its yielded file paths go to the output player.
    btn.click(synthesize_streaming, [txt, ref, ref_txt], [out])

# share=True asks Gradio for a public share link; debug=True is passed as well.
app.launch(share=True, debug=True)