# Zonos TTS Notebook
This notebook sets up and runs the Zonos Text-to-Speech (TTS) model.
It includes installation, setup, and inference steps.

## Setup and Installation
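The following cells clone the Zonos repository and install its dependencies. The clone, `cd`, and system-package cells are commented out; uncomment them on a first run. The install cells assume the working directory is the repository root.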

In [None]:
# Optional first-run step: uncomment to fetch the Zonos sources from GitHub.
# !git clone https://github.com/Zyphra/Zonos.git

In [None]:
# Optional first-run step: uncomment to switch into the cloned repository.
# The editable installs below use `-e .`, i.e. the current directory.
# %cd Zonos

In [None]:
# Optional system dependency (Debian/Ubuntu `apt`): uncomment to install espeak-ng.
# NOTE(review): presumably needed by Zonos for phonemization -- confirm against the project README.
# !sudo apt install -y espeak-ng

In [None]:
!pip install -U uv

In [None]:
# Sync the project environment with uv: first the base dependencies,
# then again including the optional `compile` extra.
# NOTE(review): presumably must be run from the repository root -- confirm.
!uv sync
!uv sync --extra compile

In [None]:
!uv pip install -e .
!uv pip install -e .[compile]

In [None]:
!pip install mamba-ssm

In [None]:
!pip install -e .
!pip install --no-build-isolation -e .[compile]

In [None]:
import torch
import torchaudio
from zonos.model import Zonos
from IPython.display import Audio
from zonos.conditioning import make_cond_dict

# Prefer the GPU, but fall back to the CPU so the notebook still loads the model
# on machines without CUDA instead of failing outright.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=device)

In [None]:
def generate_speech_from_text(text, mp3_path='assets/liza.mp3', audio_file="sample.wav", language="en-us"):
    """Synthesize `text` using a speaker embedding from a reference recording and save it.

    Uses the notebook-level `model` object, so the model-loading cell must have
    been run before calling this function.

    Args:
        text: Text to be spoken.
        mp3_path: Path of the reference audio file loaded with `torchaudio.load`
            to build the speaker embedding.
        audio_file: Path the generated audio is written to.
        language: Language code forwarded to `make_cond_dict`. Defaults to
            "en-us", the value that was previously hard-coded.

    Returns:
        The `audio_file` path that was written.
    """
    # Load the reference audio and derive the speaker embedding from it
    wav, sampling_rate = torchaudio.load(mp3_path)
    speaker = model.make_speaker_embedding(wav, sampling_rate)

    # Build the conditioning inputs for generation
    cond_dict = make_cond_dict(text=text, speaker=speaker, language=language)
    conditioning = model.prepare_conditioning(cond_dict)

    codes = model.generate(conditioning)

    # Decode the generated codes to audio, move to CPU, and save the first item
    # at the autoencoder's sampling rate
    wavs = model.autoencoder.decode(codes).cpu()
    torchaudio.save(audio_file, wavs[0], model.autoencoder.sampling_rate)

    return audio_file

In [4]:
speech = """You don't even think to call me "Godfather." You come into my house on the day my daughter is to be married and you ask me to do murder - for money."""
audio_file = generate_speech_from_text(text=speech)
# Play back the file that was actually written, rather than a hard-coded name
# that silently goes stale if a different output path is used.
Audio(audio_file)