In [1]:
# Fast install, might break in the future.
!pip install 'sphn<0.2'
!pip install --no-deps "moshi==0.2.10"
# Slow install (will download torch and cuda), but future proof.
# !pip install "moshi==0.2.10"

Collecting sphn<0.2
  Downloading sphn-0.1.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading sphn-0.1.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sphn
Successfully installed sphn-0.1.12
Collecting moshi==0.2.10
  Downloading moshi-0.2.10-py3-none-any.whl.metadata (8.2 kB)
Downloading moshi-0.2.10-py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.3/107.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: moshi
Successfully installed moshi-0.2.10


In [2]:
import argparse
import sys

import numpy as np
import torch
from moshi.models.loaders import CheckpointInfo
from moshi.models.tts import DEFAULT_DSM_TTS_REPO, DEFAULT_DSM_TTS_VOICE_REPO, TTSModel

from IPython.display import display, Audio

In [7]:
# Configuration
# Text to synthesize. Write contractions with the plain ASCII apostrophe ('),
# not the acute accent (´), which is a different character.
text = "Hey Jose! How are you? I hope you had a nice day. Looks like you want to live in Italy, I'm sure you can make it."
# Voice sample used for conditioning; it is resolved later via `get_voice_path`.
# NOTE(review): presumably a path relative to the voice repository printed below — confirm.
voice = "expresso/ex03-ex01_happy_001_channel1_334s.wav"
print(f"See https://huggingface.co/{DEFAULT_DSM_TTS_VOICE_REPO} for available voices.")

See https://huggingface.co/kyutai/tts-voices for available voices.


In [10]:
# Fetch the checkpoint description from the default TTS repo and build the model on the GPU.
checkpoint_info = CheckpointInfo.from_hf_repo(DEFAULT_DSM_TTS_REPO)
tts_model = TTSModel.from_checkpoint_info(
    checkpoint_info,
    n_q=32,
    temp=0.6,
    device=torch.device("cuda"),
)

# Turn the text into a script. For a dialog, pass several turns that alternate
# between the speakers: [text_speaker_1, text_speaker_2, text_2_speaker_1, ...]
entries = tts_model.prepare_script([text], padding_between=1)

# Resolve the voice and build the conditioning for generation.
# For a dialog, the list below should contain two voices.
voice_path = tts_model.get_voice_path(voice)
# The CFG coefficient is passed as a condition: the model was trained with CFG
# distillation, so no actual CFG is performed at inference time.
condition_attributes = tts_model.make_condition_attributes([voice_path], cfg_coef=2.0)

In [11]:
print("Generating audio...")

# Decoded audio chunks, appended in the order the callback receives frames.
pcms = []
def _on_frame(frame):
    """Decode one generated frame with Mimi and append the audio to `pcms`.

    Prints the number of chunks decoded so far as a single-line progress
    indicator. A frame containing any -1 entry is skipped without decoding.
    NOTE(review): -1 presumably marks positions that are not generated yet,
    and index 0 of the second axis presumably is a non-audio stream — confirm
    against moshi.
    """
    print("Step", len(pcms), end="\r")
    if (frame == -1).any():
        return
    decoded = tts_model.mimi.decode(frame[:, 1:, :])
    samples = decoded.cpu().numpy()[0, 0]
    # Clamp to [-1, 1] before storing the chunk.
    pcms.append(np.clip(samples, -1, 1))

# Batched generation: extend both lists in parallel to synthesize several audios at once.
all_entries = [entries]
all_condition_attributes = [condition_attributes]
# The streaming context is sized to the number of scripts being generated.
with tts_model.mimi.streaming(len(all_entries)):
    # `result` is not used by the later cells shown here; it is kept for inspection.
    result = tts_model.generate(all_entries, all_condition_attributes, on_frame=_on_frame)

print("Done generating.")
# np.concatenate raises an opaque "need at least one array to concatenate" error
# on an empty list, so fail with an actionable message when nothing was decoded.
if not pcms:
    raise ValueError("No audio frames were generated; check that `text` is not empty.")
# Join the per-frame chunks along the last axis into one waveform.
audio = np.concatenate(pcms, axis=-1)

Generating audio...
Done generating.


In [12]:
# Play the generated waveform inline at Mimi's sample rate, starting automatically.
display(Audio(audio, rate=tts_model.mimi.sample_rate, autoplay=True))