# Kaggle's secrets management to securely access your Hugging Face token!

# 1️⃣ Install kokoro


In [None]:
!pip install -q kokoro>=0.8.2 soundfile

# 2️⃣ Install espeak, used for English OOD fallback and some non-English languages


In [None]:

!apt-get -qq -y install espeak-ng > /dev/null 2>&1
# 🇪🇸 'e' => Spanish es
# 🇫🇷 'f' => French fr-fr
# 🇮🇳 'h' => Hindi hi
# 🇮🇹 'i' => Italian it
# 🇧🇷 'p' => Brazilian Portuguese pt-br

# 3️⃣ Initalize a pipeline

In [None]:
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
# 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
# 🇯🇵 'j' => Japanese: pip install misaki[ja]
# 🇨🇳 'z' => Mandarin Chinese: pip install misaki[zh]
pipeline = KPipeline(lang_code='hi') # <= make sure lang_code matches voice (hi or h for hindi)

# Write your text here after '''
text = '''


पंजाब के एक छोटे से गाँव में एक पुरानी हवेली थी।


'''

# 4️⃣ Generate, display, and save audio files in a loop.

In [None]:
generator = pipeline(
    text, voice='hm_omega', # <= change voice here voice code (Male => hm_omega, hm_psi and Female=> hf_alpha, hf_beta)
    speed=1, split_pattern=r'\n+' #speed 0.1 to 1
)
for i, (gs, ps, audio) in enumerate(generator):
    print(i)  # i => index
    print(gs) # gs => graphemes/text
    print(ps) # ps => phonemes
    display(Audio(data=audio, rate=24000, autoplay=i==0))
    sf.write(f'{i}.wav', audio, 24000) # save each audio file

# 5️⃣ Added gradio tunnel for gui interface 

In [12]:
!pip install kokoro>=0.8.2


In [9]:
!pip install gradio




In [20]:
import gradio as gr
from kokoro import KPipeline

# Initialize pipeline
pipeline = KPipeline(lang_code="hi")

# Function to generate speech
def hindi_tts(text, voice, speed):
    try:
        print(f"Processing: {text} | Voice: {voice} | Speed: {speed}")
        generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
        audio_data = None
        
        # Generate speech
        for gs, ps, audio in generator:
            audio_data = audio  # Store last generated audio
        
        # 🔥 Fix: Ensure audio_data is converted to NumPy array
        if audio_data is not None:
            return (24000, audio_data.numpy())  # Convert to NumPy
        else:
            return "Error: No audio generated. Try different input."
    
    except Exception as e:
        print(f"Gradio Error: {e}")
        return f"Error: {str(e)}"

# Gradio UI
iface = gr.Interface(
    fn=hindi_tts,
    inputs=[
        gr.Textbox(label="Enter Hindi Text", placeholder="हिंदी में कुछ लिखें..."),
        gr.Radio(choices=["hm_omega", "hm_psi", "hf_alpha", "hf_beta"], label="Select Voice", value="hf_alpha"),
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, label="Speech Speed", value=1.0)
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Hindi Text-to-Speech",
    description="Enter Hindi text and generate speech using Kokoro TTS.",
    allow_flagging="never"
)

# Launch Gradio
iface.launch(share=True)




  WeightNorm.apply(module, name, dim)


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://14d13c0d320e09e8b7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# working directory 

In [22]:
import torch
import numpy as np
import soundfile as sf
import tempfile
import gradio as gr
from transformers import pipeline

# ✅ Load Hugging Face SpeechT5 TTS Model
hindi_tts = pipeline("text-to-speech", model="microsoft/speecht5_tts")

# ✅ Load RVC Model (Replace with your model path)
rvc_model_path = "/kaggle/input/datsets/DeepMale.pth"
device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    rvc_model = torch.load(rvc_model_path, map_location=device)
    rvc_model.eval()  # Set model to evaluation mode
    print("✅ RVC Model Loaded Successfully!")
except Exception as e:
    print(f"⚠️ Error Loading RVC Model: {str(e)}")

def apply_rvc_conversion(audio_array, sr=24000):
    """
    Apply RVC voice conversion to the generated TTS audio.
    """
    try:
        audio_tensor = torch.tensor(audio_array, dtype=torch.float32).to(device)

        # Ensure correct shape (batch, channels, samples)
        if len(audio_tensor.shape) == 1:
            audio_tensor = audio_tensor.unsqueeze(0)

        # Apply voice conversion
        with torch.no_grad():
            converted_audio = rvc_model(audio_tensor)

        # Convert tensor back to numpy array
        converted_audio_np = converted_audio.squeeze(0).cpu().numpy()

        # Save converted audio
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        sf.write(temp_file.name, converted_audio_np, sr)

        return temp_file.name  # Return file path
    
    except Exception as e:
        return f"Error in RVC conversion: {str(e)}"

def generate_tts_and_convert(text, apply_rvc):
    """
    Generate Hindi TTS and optionally apply RVC voice conversion.
    """
    try:
        # Generate TTS audio
        generator = hindi_tts(text)
        audio_data = np.array(generator["audio"], dtype=np.float32)

        # Apply RVC if selected
        if apply_rvc:
            return apply_rvc_conversion(audio_data)
        
        # Save and return TTS audio
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        sf.write(temp_file.name, audio_data, 24000)
        return temp_file.name
    
    except Exception as e:
        return f"Error: {str(e)}"

# ✅ Gradio Interface
iface = gr.Interface(
    fn=generate_tts_and_convert,
    inputs=[
        gr.Textbox(label="Enter Hindi Text"),
        gr.Checkbox(label="Apply RVC Voice Conversion", value=False)
    ],
    outputs=gr.File(label="Download Processed Speech"),
    title="Hindi TTS with RVC Voice Cloning",
    description="Enter text, generate Hindi speech, and apply RVC voice cloning."
)

iface.launch()


config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Device set to use cpu


config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/50.7M [00:00<?, ?B/s]

  rvc_model = torch.load(rvc_model_path, map_location=device)


⚠️ Error Loading RVC Model: 'collections.OrderedDict' object has no attribute 'eval'
* Running on local URL:  http://127.0.0.1:7861
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://22edd91a37f2438c8d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 2113, in process_api
    data = await self.postprocess_data(block_fn, result["prediction"], state)
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1919, in postprocess_data
    prediction_value = block.postprocess(prediction_value)
  File "/usr/local/lib/python3.10/dist-packages/gradio/components/file.py", line 223, in postprocess
    size=Path(value).stat().st_size,
  File "/usr/lib/python3.10/pathlib.py", line 1097, in stat
    return self._accessor.stat(self, follow_symlinks=follow_symlinks)
FileNotFoundError: [Errno 2] No such file or directory: 'Error: `speak

In [24]:
import gradio as gr
import numpy as np
from kokoro import KPipeline

# Initialize pipeline
pipeline = KPipeline(lang_code="hi")

# Function to generate speech
def hindi_tts(text, voice, speed):
    try:
        print(f"Processing: {text} | Voice: {voice} | Speed: {speed}")
        generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
        audio_data = None
        
        # Generate speech and extract audio data
        for gs, ps, audio in generator:
            audio_data = audio  # Store last generated audio
        
        # Ensure valid audio output
        if audio_data is not None:
            audio_np = np.array(audio_data)  # Convert to NumPy array
            return (24000, audio_np)  # Return as (sample_rate, audio)
        else:
            return "Error: No audio generated. Try different input."
    
    except Exception as e:
        print(f"Gradio Error: {e}")
        return f"Error: {str(e)}"

# Gradio UI
iface = gr.Interface(
    fn=hindi_tts,
    inputs=[
        gr.Textbox(label="Enter Hindi Text", placeholder="हिंदी में कुछ लिखें..."),
        gr.Radio(choices=["hm_omega", "hm_psi", "hf_alpha", "hf_beta"], label="Select Voice", value="hf_alpha"),
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, label="Speech Speed", value=1.0)
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Hindi Text-to-Speech",
    description="Enter Hindi text and generate speech using Kokoro TTS.",
    allow_flagging="never"
)

# Launch Gradio
iface.launch(share=True)




  WeightNorm.apply(module, name, dim)


* Running on local URL:  http://127.0.0.1:7863
* Running on public URL: https://67d5081efcb80ddd48.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Processing: पंजाब के एक छोटे से गाँव में एक पुरानी हवेली थी। | Voice: hm_omega | Speed: 1


hm_omega.pt:   0%|          | 0.00/523k [00:00<?, ?B/s]

