# Zonos TTS Model - For Google Colab - Gradio with Emotion Settings
## * MUST USE L4 or A100 GPU

## Zonos by Zyphra is a cutting-edge text-to-speech model with fast, crisp, clean voice cloning. Licensed under Apache 2.0.

This notebook clones the [Zonos repository](https://github.com/Zyphra/Zonos) and installs the required system and Python dependencies. Wait until the cell has finished, then click *Restart session* before moving on to the Gradio section.

In [None]:
# @title Initialize Zonos
# Refresh the apt package lists and install eSpeak NG (required for phonemization)
!apt update && apt install -y espeak-ng

# Clone the Zonos repository from GitHub and make it the working directory
!git clone https://github.com/Zyphra/Zonos.git
%cd Zonos

# Install Zonos in editable mode, first plain and then with its optional
# "compile" extras. uv is installed as well, but the installs below go
# through pip rather than uv.
!pip install -U uv
!pip install -e .
!pip install --no-build-isolation -e .[compile]

# Pin specific numpy / scipy / scikit-learn versions and install triton.
# NOTE(review): verify that scipy 1.13.3 is actually published on PyPI; if it
# is not, that install step fails and the pin needs adjusting.
!pip install numpy==1.24.4
!pip install scipy==1.13.3
!pip install scikit-learn==1.6.1
!pip install triton

# The runtime session must be restarted after this cell finishes, before
# running the Gradio cell.

## Gradio section

This cell launches the Gradio application. Click the public link URL printed in the cell output to open Gradio in a new browser tab, then select the model, write the text you want spoken, upload a sample voice, and adjust the settings to your liking.

The first run takes longer because the models must be loaded and the uploaded voice analyzed before processing; subsequent generations are much faster.

In [None]:
# @title Launch Gradio with settings
import torch
import torchaudio
import gradio as gr
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU,
# then load the hybrid Zonos checkpoint onto the chosen device.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device=device)

# Name of the checkpoint currently held in the module-level ``model``.
# The initial value mirrors the checkpoint loaded at start-up; keep the two
# in sync if the start-up checkpoint is ever changed.
_active_model_name = "Zyphra/Zonos-v0.1-hybrid"


def _select_model(model_choice):
    """Return the model for *model_choice*, swapping checkpoints if needed."""
    global model, _active_model_name
    if model_choice and model_choice != _active_model_name:
        # Load the replacement before dropping the old model so that a failed
        # load leaves the previously working model in place.
        replacement = Zonos.from_pretrained(model_choice, device=device)
        model = replacement
        _active_model_name = model_choice
        # Hand the memory of the discarded checkpoint back to the allocator.
        torch.cuda.empty_cache()
    return model


def _normalized_emotions(weights):
    """Return the emotion weights as a float tensor that sums to 1."""
    values = torch.tensor(
        [float(w) for w in weights], dtype=torch.float32, device=device
    )
    total = values.sum()
    if total <= 0:
        # Every slider is at zero: dividing would produce NaNs, so fall back
        # to an even mix of all emotions instead.
        return torch.full_like(values, 1.0 / values.numel())
    return values / total


def _resolve_seed(seed, randomize_seed):
    """Return the integer seed to use, drawing a random one when requested."""
    # ``seed`` is None when the number field was cleared in the UI.
    if randomize_seed or seed is None or seed == -1:
        return torch.randint(0, 1000000, (1,)).item()
    return int(seed)


def generate_speech(
    model_choice,
    text,
    language,
    speaker_audio,
    prefix_audio,
    happiness,
    sadness,
    disgust,
    fear,
    surprise,
    anger,
    other,
    neutral,
    vq_score,
    fmax,
    pitch_std,
    speaking_rate,
    dnsmos_ovrl,
    denoise_speaker,
    cfg_scale,
    seed,
    randomize_seed
):
    """Synthesize speech for *text* and write it to ``output.wav``.

    Args:
        model_choice: Checkpoint name to generate with. When it differs from
            the checkpoint currently loaded, that checkpoint is loaded and
            replaces the module-level ``model``.
        text: Text to speak.
        language: Language code forwarded to ``make_cond_dict``.
        speaker_audio: Path to a voice sample, or None. When None, no speaker
            embedding is computed and ``speaker=None`` is passed on
            (presumably the model then uses an unconditioned voice -- confirm).
        prefix_audio: Optional path to audio the generation should continue
            from; it is encoded with the model's autoencoder and passed to
            ``generate`` as ``audio_prefix_codes``.
        happiness, sadness, disgust, fear, surprise, anger, other, neutral:
            Emotion weights; they are normalized to sum to 1.
        vq_score, fmax, pitch_std, speaking_rate, dnsmos_ovrl:
            Conditioning values forwarded to ``make_cond_dict``.
        denoise_speaker: Forwarded as ``speaker_noised``.
        cfg_scale: Forwarded to ``model.generate``.
        seed: Seed for ``torch.manual_seed``; None or -1 means "pick one".
        randomize_seed: When true, a fresh random seed is always drawn.

    Returns:
        A ``(output_path, seed)`` tuple: the path of the written WAV file and
        the integer seed that was used.
    """
    active_model = _select_model(model_choice)

    # The speaker sample is optional: only embed it when one was uploaded.
    speaker = None
    if speaker_audio is not None:
        wav, sampling_rate = torchaudio.load(speaker_audio)
        speaker = active_model.make_speaker_embedding(wav, sampling_rate)

    # Encode the optional prefix audio so generation can continue from it.
    audio_prefix_codes = None
    if prefix_audio is not None:
        prefix_wav, prefix_rate = torchaudio.load(prefix_audio)
        prefix_wav = prefix_wav.mean(0, keepdim=True)  # downmix to mono
        prefix_wav = active_model.autoencoder.preprocess(prefix_wav, prefix_rate)
        prefix_wav = prefix_wav.to(device, dtype=torch.float32)
        audio_prefix_codes = active_model.autoencoder.encode(
            prefix_wav.unsqueeze(0)
        )

    emotion_values = _normalized_emotions([
        happiness, sadness, disgust, fear,
        surprise, anger, other, neutral
    ])

    # NOTE(review): vq_score is passed as a single number; the parameter name
    # ``vqscore_8`` suggests eight values may be expected -- confirm.
    cond_dict = make_cond_dict(
        text=text,
        speaker=speaker,
        language=language,
        emotion=emotion_values,
        vqscore_8=vq_score,
        fmax=fmax,
        pitch_std=pitch_std,
        speaking_rate=speaking_rate,
        dnsmos_ovrl=dnsmos_ovrl,
        speaker_noised=denoise_speaker
    )

    # Seed right before generation so the reported seed reproduces the run.
    seed = _resolve_seed(seed, randomize_seed)
    torch.manual_seed(seed)

    conditioning = active_model.prepare_conditioning(cond_dict)
    codes = active_model.generate(
        conditioning,
        audio_prefix_codes=audio_prefix_codes,
        cfg_scale=cfg_scale
    )
    wavs = active_model.autoencoder.decode(codes).cpu()

    # Save the first (only) item of the batch and report the seed used.
    output_path = "output.wav"
    torchaudio.save(output_path, wavs[0], active_model.autoencoder.sampling_rate)
    return output_path, seed

# Language codes offered in the "Language" dropdown, kept in their original
# order. Written as one whitespace-separated string and split into a list.
LANGUAGES = """
    af am an ar as az ba bg bn bpy bs ca cmn
    cs cy da de el en-029 en-gb en-gb-scotland en-gb-x-gbclan
    en-gb-x-gbcwmd en-gb-x-rp en-us eo es es-419 et eu fa
    fa-latn fi fr-be fr-ch fr-fr ga gd gn grc gu hak
    hi hr ht hu hy hyw ia id is it ja jbo ka
    kk kl kn ko kok ku ky la lfn lt lv mi mk
    ml mr ms mt my nb nci ne nl om or pa pap
    pl pt pt-br py quc ro ru ru-lv sd shn si sk
    sl sq sr sv sw ta te tn tr tt ur uz vi
    vi-vn-x-central vi-vn-x-south yue
""".split()

# Build the Gradio interface: inputs on the left, tabbed settings plus the
# generate button and audio output on the right.
with gr.Blocks(title="Zonos Text-to-Speech") as demo:
    gr.Markdown("# Zonos Text-to-Speech Generator")

    with gr.Row():
        # Left column: model, text, language and the two audio inputs
        with gr.Column():
            model_choice = gr.Dropdown(
                choices=["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"],
                value="Zyphra/Zonos-v0.1-hybrid",
                label="Model Type"
            )
            text = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter text to convert to speech...",
                value="Hello, this is a test of the Zonos text to speech system."
            )
            language = gr.Dropdown(
                choices=LANGUAGES,
                value="en-us",
                label="Language"
            )
            speaker_audio = gr.Audio(
                type="filepath",
                label="Speaker Voice Sample"
            )
            prefix_audio = gr.Audio(
                type="filepath",
                label="Prefix Audio (continue from this audio optional)",
                visible=True
            )

        # Right column: tabbed settings, then the button and the result
        with gr.Column():
            with gr.Tab("Emotions"):
                happiness = gr.Slider(0, 1, value=0.2, label="Happiness")
                sadness = gr.Slider(0, 1, value=0.05, label="Sadness")
                disgust = gr.Slider(0, 1, value=0.05, label="Disgust")
                fear = gr.Slider(0, 1, value=0.05, label="Fear")
                surprise = gr.Slider(0, 1, value=0.05, label="Surprise")
                anger = gr.Slider(0, 1, value=0.05, label="Anger")
                other = gr.Slider(0, 1, value=0.1, label="Other")
                neutral = gr.Slider(0, 1, value=0.2, label="Neutral")

            with gr.Tab("Voice Parameters"):
                vq_score = gr.Slider(0, 1, value=0.78, label="VQ Score")
                # NOTE(review): capped at 24000 Hz to match the range exposed
                # by the upstream Zonos demo; the previous 48000 looks like a
                # sample rate rather than a maximum frequency -- confirm
                # against the Zonos conditioning documentation.
                fmax = gr.Slider(1000, 24000, value=24000, label="Fmax (Hz)")
                pitch_std = gr.Slider(0, 100, value=45, label="Pitch Std")
                speaking_rate = gr.Slider(0, 30, value=15, label="Speaking Rate")
                dnsmos_ovrl = gr.Slider(1, 5, value=4, label="DNSMOS Overall")
                denoise_speaker = gr.Checkbox(label="Denoise Speaker?", value=False)

            with gr.Tab("Generation Settings"):
                cfg_scale = gr.Slider(0, 10, value=2, label="CFG Scale")
                # precision=0 makes the fields integer-valued, since seeds
                # are whole numbers.
                seed = gr.Number(value=-1, precision=0, label="Seed (-1 for random)")
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                output_seed = gr.Number(
                    label="Last Used Seed", precision=0, interactive=False
                )

            with gr.Column():
                generate_btn = gr.Button("Generate Speech")
                output_audio = gr.Audio(label="Generated Speech")

    # Wire the button: the inputs are listed in the same order as the
    # parameters of generate_speech, and its two return values fill the
    # audio player and the "Last Used Seed" field.
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            model_choice, text, language, speaker_audio, prefix_audio,
            happiness, sadness, disgust, fear, surprise, anger, other, neutral,
            vq_score, fmax, pitch_std, speaking_rate, dnsmos_ovrl, denoise_speaker,
            cfg_scale, seed, randomize_seed
        ],
        outputs=[output_audio, output_seed]
    )

# Start the app.
demo.launch(
    share=True,   # create a publicly shareable link
    debug=True,   # block the cell so errors are printed in the Colab output
    quiet=True,   # suppress most of Gradio's start-up print statements
)