# IndicF5 Gradio Demo

This notebook requires a **GPU runtime**. Go to **Runtime → Change runtime type → T4 GPU** before running.

**IMPORTANT**: After running the install cell, you MUST restart the runtime before running the app cell!

In [None]:
# Fail fast if PyTorch cannot see a CUDA device, so the problem surfaces
# before any time is spent installing packages or loading the model.
import torch

gpu_ready = torch.cuda.is_available()
if gpu_ready:
    print(f'✅ GPU available: {torch.cuda.get_device_name(0)}')
else:
    raise RuntimeError('❌ GPU not available! Go to Runtime → Change runtime type → T4 GPU')

In [None]:
# Install dependencies (RESTART RUNTIME AFTER THIS CELL)
!pip uninstall -y numpy scipy
!pip install numpy==1.26.4
!pip install scipy
!pip install 'transformers<4.50' accelerate
!pip install git+https://github.com/ai4bharat/IndicF5.git
!pip install gradio torchcodec soundfile
print('\n\n⚠️ NOW RESTART THE RUNTIME! Go to Runtime → Restart runtime, then skip this cell and run the next one.')

In [None]:
# Run this cell AFTER restarting the runtime
import torch
import gradio as gr
import tempfile
import soundfile as sf
import numpy as np
from transformers import AutoModel

# Hugging Face repository that hosts the IndicF5 checkpoint and its model code.
repo_id = 'ai4bharat/IndicF5'

# Prefer the GPU; fall back to CPU so the cell still runs without CUDA.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print('Loading IndicF5 model...')
# trust_remote_code=True lets transformers execute the Python code shipped in
# the repository — only keep it for repositories you trust.
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True).to(device)
print(f'✅ Model loaded on {device}')

def synthesize_speech(text, ref_audio, ref_text):
    """Synthesize ``text`` using a reference recording and its transcript.

    Args:
        text: Text to synthesize.
        ref_audio: ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type='numpy')``, or ``None`` when nothing was provided.
        ref_text: Transcript of the reference recording.

    Returns:
        A ``(24000, samples)`` tuple for the output ``gr.Audio`` component.
        int16 model output is rescaled to float32; other dtypes are returned
        unchanged.

    Raises:
        gr.Error: If the text, reference audio, or reference text is missing
            (whitespace-only strings count as missing).
    """
    has_text = bool(text and text.strip())
    has_ref_text = bool(ref_text and ref_text.strip())
    if not has_text or ref_audio is None or not has_ref_text:
        # The output component is gr.Audio, so returning a plain string would
        # be treated as a file path. gr.Error shows the message in the UI.
        raise gr.Error('Error: Provide all inputs.')

    sample_rate, audio_data = ref_audio

    # The model takes the reference as a file path, so stage it on disk.
    # A TemporaryDirectory removes the file once inference is done; the
    # previous NamedTemporaryFile(delete=False) left one WAV behind per call.
    # NOTE(review): assumes the model reads the file before returning — confirm.
    with tempfile.TemporaryDirectory() as tmp_dir:
        ref_path = f'{tmp_dir}/reference.wav'
        sf.write(ref_path, audio_data, sample_rate)
        audio_out = model(text, ref_audio_path=ref_path, ref_text=ref_text)

    # Rescale 16-bit PCM to floating point by dividing by 2**15.
    if audio_out.dtype == np.int16:
        audio_out = audio_out.astype(np.float32) / 32768.0

    # NOTE(review): 24000 Hz is hard-coded; presumably the model's native
    # output rate — confirm against the IndicF5 model card.
    return 24000, audio_out

# Two-column layout: inputs and the trigger button on the left, the
# synthesized audio on the right.
with gr.Blocks(title='IndicF5 TTS') as iface:
    gr.Markdown('# IndicF5 Text-to-Speech')
    with gr.Row():
        with gr.Column():
            target_text_box = gr.Textbox(label='Text to synthesize', lines=3)
            # type='numpy' hands the handler a (sample_rate, samples) tuple.
            reference_clip = gr.Audio(label='Reference Audio', type='numpy')
            reference_text_box = gr.Textbox(label='Reference Text')
            generate_button = gr.Button('Generate', variant='primary')
        with gr.Column():
            generated_clip = gr.Audio(label='Output')

    # Wire the button to the synthesis handler; argument order must match
    # synthesize_speech(text, ref_audio, ref_text).
    generate_button.click(
        fn=synthesize_speech,
        inputs=[target_text_box, reference_clip, reference_text_box],
        outputs=[generated_clip],
    )

# share=True asks Gradio for a public link; debug=True keeps the cell
# attached so handler output and errors show up in the notebook.
iface.launch(share=True, debug=True)