In [6]:
import torch
# Raise TorchDynamo's per-function compile cache limit to 64 entries
# (presumably so varying input shapes keep being compiled — TODO confirm the need).
torch._dynamo.config.cache_size_limit = 64
# Do not abort on TorchDynamo compilation errors; per the PyTorch docs the
# affected code then runs uncompiled instead.
torch._dynamo.config.suppress_errors = True
# 'high' lets float32 matmuls use faster reduced-precision kernels where the
# hardware supports them (see torch.set_float32_matmul_precision docs).
torch.set_float32_matmul_precision('high')

import ChatTTS
from IPython.display import Audio

## Load Models

In [7]:
# Create the ChatTTS entry object and load its model weights with default settings.
chat = ChatTTS.Chat()
chat.load_models()

# Pass force_redownload=True if the published weights have been updated.
# chat.load_models(force_redownload=True)

# If you downloaded the weights manually, set source='local' and point
# local_path at the directory that contains them.
# chat.load_models(source='local', local_path='YOUR LOCAL PATH')

INFO:ChatTTS.core:Load from cache: /home/cash/.cache/huggingface/hub/models--2Noise--ChatTTS/snapshots/d7474137acb4f988874e5d57ad88d81bcb7e10b6
INFO:ChatTTS.core:use cuda:0
INFO:ChatTTS.core:vocos loaded.
INFO:ChatTTS.core:dvae loaded.
INFO:ChatTTS.core:gpt loaded.
INFO:ChatTTS.core:decoder loaded.
INFO:ChatTTS.core:tokenizer loaded.
INFO:ChatTTS.core:All initialized.


## Inference

### Batch infer

In [9]:
# Two sample prompts (one English, one Chinese); each is repeated three times
# so the batch below contains six entries: English first, then Chinese.
english_prompt = "So we found being competitive and collaborative was a huge way of staying motivated towards our goals, so one person to call when you fall off, one person who gets you back on then one person to actually do the activity with."
chinese_prompt = "我觉得像我们这些写程序的人，他，我觉得多多少少可能会对开源有一种情怀在吧我觉得开源是一个很好的形式。现在其实最先进的技术掌握在一些公司的手里的话，就他们并不会轻易的开放给所有的人用。"

texts = [english_prompt for _ in range(3)] + [chinese_prompt for _ in range(3)]
        
# Synthesize the whole list in one call; the result is indexed per prompt
# in the playback cells below (wavs[0], wavs[3]).
wavs = chat.infer(texts)

INFO:ChatTTS.core:All initialized.
  torch.has_cuda,
  torch.has_cudnn,
  torch.has_mps,
  torch.has_mkldnn,
 29%|██▊       | 110/384 [01:40<04:11,  1.09it/s]
 43%|████▎     | 888/2048 [00:32<00:42, 27.08it/s] 


In [10]:
# Play the first waveform of the batch at a 24 kHz sample rate; autoplay
# starts playback as soon as the widget is rendered.
Audio(wavs[0], rate=24_000, autoplay=True)

In [5]:
# Play the fourth waveform of the batch — presumably the first Chinese prompt,
# assuming the outputs keep the order of `texts` (TODO confirm).
Audio(wavs[3], rate=24_000, autoplay=True)

### Custom params

In [12]:
# Options forwarded to the audio-generation step: a speed control tag and a
# low sampling temperature.
params_infer_code = dict(prompt='[speed_5]', temperature=0.3)
# Options forwarded to the text-refinement step: control tags only.
params_refine_text = dict(prompt='[oral_2][laugh_0][break_6]')

zero123_blurb = 'Stable Zero123 is an AI-powered model for generating novel views of 3D objects with improved quality. Released for non-commercial and research purposes, it uses an improved dataset and elevation conditioning for higher-quality predictions.'

# A single string is passed here; the result is still indexed with [0] when played.
wav = chat.infer(
    zero123_blurb,
    params_refine_text=params_refine_text,
    params_infer_code=params_infer_code,
)

INFO:ChatTTS.core:All initialized.
 23%|██▎       | 88/384 [00:00<00:01, 168.78it/s]
 35%|███▌      | 725/2048 [00:04<00:08, 165.13it/s]


In [13]:
# `wav` comes from a single-string infer call but is still a sequence, so play element 0.
Audio(wav[0], rate=24_000, autoplay=True)

### Fix random speaker

In [16]:
# Sample one speaker embedding and pass it explicitly; `rand_spk` is reused by
# a later cell so both syntheses share the same speaker.
rand_spk = chat.sample_random_speaker()
params_infer_code = dict(spk_emb=rand_spk)

zero123_blurb = 'Stable Zero123 is an AI-powered model for generating novel views of 3D objects with improved quality. Released for non-commercial and research purposes, it uses an improved dataset and elevation conditioning for higher-quality predictions.'

# `params_refine_text` is the dict defined in the "Custom params" cell.
wav = chat.infer(
    zero123_blurb,
    params_refine_text=params_refine_text,
    params_infer_code=params_infer_code,
)

INFO:ChatTTS.core:All initialized.
 23%|██▎       | 87/384 [00:00<00:01, 164.96it/s]
 37%|███▋      | 764/2048 [00:04<00:07, 163.35it/s]


In [18]:
# Play the clip generated with the fixed speaker embedding (element 0 of the result).
Audio(wav[0], rate=24_000, autoplay=True)

### Two-stage control

In [22]:
text = "So we found being competitive and collaborative was a huge way of staying motivated towards our goals, so one person to call when you fall off, one person who gets you back on then one person to actually do the activity with."
# Stage one: run only the text-refinement step. The cell's displayed value is
# a list holding the refined text with control tokens such as [uv_break] inserted.
chat.infer(text, refine_text_only=True)

INFO:ChatTTS.core:All initialized.
 23%|██▎       | 88/384 [00:00<00:01, 175.77it/s]


['so we found [uv_break] being competitive and collaborative [uv_break] was a huge way [uv_break] of [uv_break] staying motivated towards our goals, [uv_break] so one person to call [uv_break] when you fall off, [uv_break] so one person who gets you back on [uv_break] then [uv_break] one person [uv_break] to actually do the activity with.']

In [25]:
# Stage two: supply text that already contains hand-placed [uv_break] tokens
# and skip the refinement step so the tokens are used as written.
text = 'so we found being competitive and collaborative [uv_break] was a huge way of staying [uv_break] motivated towards our goals, [uv_break] so [uv_break] one person to call [uv_break] when you fall off, [uv_break] one person who [uv_break] gets you back [uv_break] on then [uv_break] one person [uv_break] to actually do the activity with.'
wav = chat.infer(text, skip_refine_text=True)
# Last expression of the cell: renders the audio player for the first clip.
Audio(wav[0], rate=24_000, autoplay=True)

INFO:ChatTTS.core:All initialized.
  1%|          | 19/2048 [00:00<00:11, 183.25it/s]

 46%|████▌     | 941/2048 [00:06<00:07, 155.63it/s]


## LLM Call

In [45]:
import os

from ChatTTS.experimental.llm import llm_api

# Read the DeepSeek credential from the environment so that no secret is ever
# stored in the notebook. Any key that was previously committed in this cell
# must be treated as compromised and revoked.
API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
if not API_KEY:
    # Fail here with a clear message instead of sending an unauthenticated request later.
    raise RuntimeError(
        "DEEPSEEK_API_KEY is not set; export it before running the LLM cells."
    )

# Client used by the next cell through client.call(...).
client = llm_api(
    api_key=API_KEY,
    base_url="https://api.deepseek.com",
    model="deepseek-chat",
)

In [46]:
user_question = 'Why is the sky blue?'
# First call: send the user's question to the LLM and show the reply.
text = client.call(user_question, prompt_version = 'deepseek')
print(text)
# Second call: feed the reply back through the LLM before synthesis.
# NOTE(review): this reuses the same prompt_version as the first call — confirm
# whether a different (e.g. text-normalisation) prompt was intended here.
text = client.call(text, prompt_version = 'deepseek')
print(text)

INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 400 Bad Request"


BadRequestError: Error code: 400 - {'detail': 'Model Not Exist'}

In [42]:
# Reuse the speaker embedding sampled in the "random speaker" cell and pair it
# with a low sampling temperature.
params_infer_code = dict(spk_emb=rand_spk, temperature=0.3)

# `text` is whatever the preceding LLM cell left in the kernel namespace.
wav = chat.infer(
    text,
    params_infer_code=params_infer_code,
)
Audio(wav[0], rate=24_000, autoplay=True)

INFO:ChatTTS.core:All initialized.
 20%|██        | 78/384 [00:00<00:01, 169.29it/s]
  1%|          | 21/2048 [00:00<00:09, 206.17it/s]

 35%|███▍      | 712/2048 [00:04<00:07, 167.55it/s]
