[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/bark-colab/blob/main/bark_clone_colab.ipynb)

In [None]:
# Adapted from https://github.com/serp-ai/bark-with-voice-clone/blob/main/clone_voice.ipynb
# Alternative single-step install straight from GitHub (disabled):
# !pip install -q git+https://github.com/camenduru/bark.git
# Clone the bark fork and install it from the local checkout; the %cd must come
# first because `pip install .` installs whatever is in the current directory.
!git clone https://github.com/camenduru/bark
%cd /content/bark
!pip install .

# Install pre-release (nightly) torch/torchvision/torchaudio wheels from the cu118
# index, then pin gradio to 3.27.0.
!pip install -q --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
!pip install -q gradio==3.27.0

# Install aria2, then download three checkpoint files into /root/.cache/suno/bark_v0
# (presumably the cache directory bark loads its models from -- confirm against bark.generation).
# aria2c flags: -c resumes a partial download, -x 16 / -s 16 allow up to 16
# connections / splits per file, -k 1M sets the minimum split size.
!apt -y install -qq aria2
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/camenduru/bark_v0/resolve/main/6285677e88715abde42a9924db939b3b.pt -d /root/.cache/suno/bark_v0 -o 6285677e88715abde42a9924db939b3b.pt
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/camenduru/bark_v0/resolve/main/751d4d3d562e63ead5311ebe2a5f45a8.pt -d /root/.cache/suno/bark_v0 -o 751d4d3d562e63ead5311ebe2a5f45a8.pt
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/camenduru/bark_v0/resolve/main/e32937063d7ccececc61b2d2a3bb0a13.pt -d /root/.cache/suno/bark_v0 -o e32937063d7ccececc61b2d2a3bb0a13.pt

from bark.generation import load_codec_model, generate_text_semantic
from encodec.utils import convert_audio
import torchaudio
import torch

# Choose the device once and use it for both the codec model and the input
# tensor. A hard-coded 'cuda' makes `.to(device)` fail on a CPU-only runtime.
# Override manually with 'cuda' or 'cpu' if needed.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = load_codec_model(use_gpu=(device == 'cuda'))

# Load and pre-process the audio waveform
audio_filepath = '/content/audio.wav' # the audio you want to clone (will get truncated so 5-10 seconds is probably fine, existing samples that I checked are around 7 seconds)
wav, sr = torchaudio.load(audio_filepath)
# Convert to the sample rate and channel count the codec model reports.
wav = convert_audio(wav, sr, model.sample_rate, model.channels)
# Add a leading batch axis of size 1 and move the tensor to the chosen device.
wav = wav.unsqueeze(0).to(device)

# Extract discrete codes from EnCodec
with torch.no_grad():
    encoded_frames = model.encode(wav)
# Concatenate the per-frame code tensors along the last (time) axis, then drop
# ONLY the batch axis added above. A bare .squeeze() would also collapse any
# other size-1 axis (e.g. a single time step) and break the [n_q, T] layout.
codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1).squeeze(0)  # [n_q, T]


In [None]:
import numpy as np
import torch

text = "Transcription of the audio you are cloning"

# get seconds of audio
seconds = wav.shape[-1] / model.sample_rate
# generate semantic tokens, capped at the duration of the reference clip
semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds, top_k=50, top_p=.95, temp=0.7)

# move codes to cpu as a numpy array; the isinstance guard keeps this cell
# re-runnable, since after the first run `codes` is already an ndarray and
# has no .cpu() method
if isinstance(codes, torch.Tensor):
    codes = codes.cpu().numpy()

voice_name = 'output' # whatever you want the name of the voice to be
output_path = f'/content/bark/bark/assets/prompts/{voice_name}.npz'
# Store the three prompt arrays under the keys used here: all codebooks as the
# fine prompt, the first two codebook rows as the coarse prompt, and the
# generated semantic tokens.
np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)

In [None]:
from bark.api import generate_audio
from transformers import BertTokenizer
from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic

from IPython.display import Audio

# Voice preset and the text it should speak
voice_name = "output" # use your custom voice name here if you have one
text_prompt = "Hello, my name is Serpy. And, uh — and I like pizza. [laughs]"

# load the tokenizer (not referenced again in this cell; kept so the name
# stays available to any later cells)
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# download and load all models: every *_use_gpu flag on, every *_use_small
# flag off, and no forced reload
preload_kwargs = {
    "text_use_gpu": True,
    "text_use_small": False,
    "coarse_use_gpu": True,
    "coarse_use_small": False,
    "fine_use_gpu": True,
    "fine_use_small": False,
    "codec_use_gpu": True,
    "force_reload": False,
}
preload_models(**preload_kwargs)

# synthesize speech for the prompt using the saved voice as history
audio_array = generate_audio(
    text_prompt,
    history_prompt=voice_name,
    text_temp=0.7,
    waveform_temp=0.7,
)

# play audio (must stay the last expression so the notebook renders the player)
Audio(audio_array, rate=SAMPLE_RATE)