In [40]:
from bark.generation import load_codec_model, generate_text_semantic
from encodec.utils import convert_audio

import torchaudio
import torch

# Pick the compute device: use the GPU when torch can see one, otherwise fall
# back to the CPU so the notebook still runs on machines without CUDA.
# Override by hand ('cuda' or 'cpu') if you need to force a device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the codec model; it is used below to resample the reference audio and
# to encode it into discrete codes.
model = load_codec_model(use_gpu=(device == 'cuda'))

# 说明

    经过测试：使用 huangjiahong_1.wav 作为参考音频时，生成音频的音色并不好；改用 t21.wav 作为参考音频时，生成音频的音色是男声。两种情况下效果都不好。

In [2]:
# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
from hubert.hubert_manager import HuBERTManager
# Make sure the HuBERT base model and its custom tokenizer are available
# locally; per the recorded output, missing files are downloaded on first run.
hubert_manager = HuBERTManager()
hubert_manager.make_sure_hubert_installed()
# Last expression of the cell: the notebook displays its return value
# (the recorded output shows 'data/models/hubert/tokenizer.pth').
hubert_manager.make_sure_tokenizer_installed()

Downloading HuBERT base model
Downloaded HuBERT
Downloading HuBERT custom tokenizer


quantifier_hubert_base_ls960_14.pth:   0%|          | 0.00/104M [00:00<?, ?B/s]

Downloaded tokenizer


'data/models/hubert/tokenizer.pth'

In [3]:
# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer 
# Load HuBERT for semantic tokens
from hubert.pre_kmeans_hubert import CustomHubert
from hubert.customtokenizer import CustomTokenizer

# HuBERT feature extractor: build it from the local checkpoint first, then
# move it onto the selected device.
hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt')
hubert_model = hubert_model.to(device)

# Custom tokenizer restored from its checkpoint (automatically uses the right
# layers), then placed on the same device as the HuBERT model.
tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth')
tokenizer = tokenizer.to(device)

In [25]:
# Reference audio for the voice to clone (keep it under 13 seconds).
# Alternative speaker: '/mnt/cephfs/hjh/common_dataset/tts/chinese/huangjiahong_wavs/huangjiahong_2.wav'
audio_filepath = '/mnt/cephfs/hjh/common_dataset/tts/chinese/tingting_wavs/t21.wav'
# Read the file, convert it to the codec model's sample rate and channel
# count, and move the result to the compute device in one step.
wav, sr = torchaudio.load(audio_filepath)
wav = convert_audio(wav, sr, model.sample_rate, model.channels).to(device)

In [26]:
# Run HuBERT on the prepared waveform, passing along the sample rate the
# waveform was converted to.
semantic_vectors = hubert_model.forward(
    wav,
    input_sample_hz=model.sample_rate,
)
# Turn the HuBERT output into semantic tokens with the custom tokenizer.
semantic_tokens = tokenizer.get_token(
    semantic_vectors,
)

In [27]:
# Extract discrete codes from EnCodec
with torch.no_grad():
    # The encoder is given a batch, so add a leading batch axis of size 1.
    encoded_frames = model.encode(wav.unsqueeze(0))
# Take element [0] of every encoded frame and join them along the last axis.
# Only the batch axis added above is removed: a bare .squeeze() would also
# collapse any other size-1 axis (e.g. a clip that encodes to a single step),
# which would break the 2-D indexing `codes[:2, :]` used when saving.
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze(0)  # [n_q, T]

In [28]:
# Move the codec codes and the semantic tokens to the CPU as NumPy arrays.
# Both names are rebound in place, so guard on the type: without the guard,
# re-running this cell after the conversion fails because NumPy arrays have
# no .cpu() method.
if isinstance(codes, torch.Tensor):
    codes = codes.cpu().numpy()
if isinstance(semantic_tokens, torch.Tensor):
    semantic_tokens = semantic_tokens.cpu().numpy()

In [29]:
import numpy as np
# Name under which the cloned voice is stored; pick whatever you like.
voice_name = 'tt'
# The prompt archive is written into bark's prompt assets folder.
output_path = f'bark/assets/prompts/{voice_name}.npz'
# The fine prompt keeps the full code array; the coarse prompt keeps only its
# first two rows. The semantic prompt is the token array computed earlier.
np.savez(
    output_path,
    fine_prompt=codes,
    coarse_prompt=codes[:2, :],
    semantic_prompt=semantic_tokens,
)

In [30]:
# That's it! Now you can head over to the generate.ipynb and use your voice_name for the 'history_prompt'

In [31]:
# Here's the generation code, copy-pasted for convenience

In [32]:
from bark.api import generate_audio
from transformers import BertTokenizer
from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic

# Enter your prompt and speaker here
text_prompt = "Hello, my name is Serpy. And, uh — and I like pizza. [laughs]"
# Must match the name the cloning cell saved under bark/assets/prompts/
# ('tt'). The previous value "hjh" referred to a prompt file that the visible
# cells never write, so generation ignored the freshly cloned voice and
# depended on a leftover file from an earlier session.
voice_name = "tt" # use your custom voice name here if you have one

# 说明

关于下面的 preload_models：如果自己从 Hugging Face 下载模型，然后把模型放到 models 目录下，会报错；所以不要自己下载，而是直接运行下面的代码，让它自行下载。

In [33]:
# Download (when needed) and load all models in one call.
preload_models(
    path="models",
    force_reload=False,
    # GPU flags: every stage is asked to use the GPU
    text_use_gpu=True,
    coarse_use_gpu=True,
    fine_use_gpu=True,
    codec_use_gpu=True,
    # size flags: the small variants are turned off for every stage
    text_use_small=False,
    coarse_use_small=False,
    fine_use_small=False,
)

In [34]:
# One-call generation: the text prompt plus the chosen voice as history
# prompt, with both temperatures set to 0.7.
audio_array = generate_audio(
    text_prompt,
    history_prompt=voice_name,
    text_temp=0.7,
    waveform_temp=0.7,
)

100%|██████████| 100/100 [00:04<00:00, 20.58it/s]
100%|██████████| 22/22 [00:14<00:00,  1.47it/s]


In [35]:
# Staged generation for finer control: each stage consumes the previous
# stage's output and is conditioned on the same voice prompt.

# Stage 1: text -> semantic tokens
x_semantic = generate_text_semantic(text_prompt, history_prompt=voice_name, temp=0.7, top_k=50, top_p=0.95)

# Stage 2: semantic tokens -> coarse output (same sampling settings as stage 1)
x_coarse_gen = generate_coarse(x_semantic, history_prompt=voice_name, temp=0.7, top_k=50, top_p=0.95)

# Stage 3: coarse -> fine output, sampled at a lower temperature
x_fine_gen = generate_fine(x_coarse_gen, history_prompt=voice_name, temp=0.5)

# Stage 4: decode the fine output into the final audio array
audio_array = codec_decode(x_fine_gen)

100%|██████████| 100/100 [00:24<00:00,  4.00it/s]
100%|██████████| 33/33 [01:49<00:00,  3.32s/it]


In [36]:
from IPython.display import Audio
# Play the generated audio inline at bark's sample rate. Keep this as the
# last expression of the cell: the notebook only renders the player for the
# value the cell evaluates to.
Audio(audio_array, rate=SAMPLE_RATE)

In [39]:
from scipy.io.wavfile import write as write_wav
# Write the generated audio to disk as a WAV file and report where it went.
filepath = "/tmp/audio.wav"  # change this to your desired output path
write_wav(filepath, SAMPLE_RATE, audio_array)
print(f'save to: {filepath}')

save to: /tmp/audio.wav
