In [None]:
"""
    Using pipeline
"""

In [15]:
from transformers import pipeline
from IPython.display import Audio

In [16]:
# Build a text-to-speech pipeline around the "suno/bark-small" checkpoint.
synthesiser = pipeline(task="text-to-speech", model="suno/bark-small")



In [17]:
# Synthesise one sentence; `forward_params` carries do_sample=True into the call.
# The result is indexed below by "audio" and "sampling_rate".
speech = synthesiser(
    "Hello, my dog is cooler than you!",
    forward_params=dict(do_sample=True),
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [29]:
# Pull the waveform and its sampling rate out of the pipeline result,
# then render an inline audio player as the cell output.
audio, sampling_rate = speech["audio"], speech["sampling_rate"]
Audio(data=audio, rate=sampling_rate)

In [34]:
"""
    Using AutoProcessor and AutoModel
"""

from transformers import AutoModel, AutoProcessor

# Processor and model are loaded from the same checkpoint.
checkpoint = "suno/bark-small"
processor = AutoProcessor.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Encode a one-sentence batch, asking for PyTorch tensors ("pt").
inputs = processor(text=["Today is such a beautiful day."], return_tensors="pt")

In [26]:
# Print the sample rate stored on the generation config, overwrite it with
# 24000, and print it again.
# NOTE(review): the recorded output below (16000 twice) cannot come from this
# code as written -- it looks stale; re-run the cell on a fresh kernel.
generation_config = model.generation_config
print(generation_config.sample_rate)
generation_config.sample_rate = 24000
print(generation_config.sample_rate)

16000
16000


In [35]:
# Generate the waveform from the encoded inputs with sampling enabled.
speech_values = model.generate(
    **inputs,
    do_sample=True,
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [36]:
# Playback rate comes from the model's generation config.
sampling_rate = model.generation_config.sample_rate
# Bring the generated tensor to the CPU, convert it to NumPy and drop size-1 axes.
waveform_cpu = speech_values.cpu()
audio_array = waveform_cpu.numpy().squeeze()
Audio(data=audio_array, rate=sampling_rate)

In [37]:
"""
    Using BarkModel with BetterTransformer
"""

from transformers import AutoProcessor, BarkModel
import torch

# Use the GPU when PyTorch reports one as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

processor = AutoProcessor.from_pretrained("suno/bark-small")
# Load Bark, move it to the chosen device, then convert it with to_bettertransformer().
model = BarkModel.from_pretrained("suno/bark-small").to(device).to_bettertransformer()

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [41]:
# NOTE(review): the preset name suggests an English speaker while the prompt is
# German -- confirm this pairing is intended.
voice_preset = "v2/en_speaker_6"

# The model was moved to `device` when it was loaded, so the processor output
# must live on the same device; leaving it on the CPU breaks generation on a
# CUDA machine. On a CPU-only machine this call is a no-op.
# NOTE(review): confirm the nested voice-preset tensors are moved as well on
# the installed transformers version.
inputs = processor(
    "Der freundliche Hund spielt gerne im Park.",
    voice_preset=voice_preset,
).to(model.device)

audio_array = model.generate(**inputs)
# Back to the CPU as a NumPy array with size-1 axes removed, ready for playback/saving.
audio_array = audio_array.cpu().numpy().squeeze()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [42]:
from IPython.display import Audio

# Play the generated waveform at the rate stored on the model's generation config.
sampling_rate = model.generation_config.sample_rate
Audio(data=audio_array, rate=sampling_rate)

In [44]:
# Import the submodule explicitly instead of relying on `import scipy` to
# expose `scipy.io.wavfile`; this still binds the top-level `scipy` name.
import scipy.io.wavfile

# Write the waveform at the rate stored on the model's generation config (the
# same value the playback cell uses) rather than a hard-coded 24000, so the
# saved file and the in-notebook player cannot drift apart.
scipy.io.wavfile.write(
    "003_sample.wav",
    rate=model.generation_config.sample_rate,
    data=audio_array,
)

In [1]:
from bark import SAMPLE_RATE, generate_audio, preload_models

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Prompt to synthesise with the bark package.
text_prompt = """
- Cristiano Ronaldo, one of the greatest footballers of all time, has won numerous accolades and continues to impress fans worldwide with his skill and dedication to the sport.
"""

# preload_models() runs before generation; generate_audio() returns the waveform.
preload_models()
audio_array = generate_audio(text_prompt)

No GPU being used. Careful, inference might be very slow!
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 571/571 [00:33<00:00, 17.07it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [01:59<00:00,  4.11s/it]


In [10]:
from IPython.display import Audio

# Play the generated waveform at the SAMPLE_RATE imported from the bark package.
Audio(data=audio_array, rate=SAMPLE_RATE)

In [11]:
import scipy

# Save the generated waveform as a WAV file at the SAMPLE_RATE imported from bark.
output_path = "003_sample.wav"
scipy.io.wavfile.write(output_path, SAMPLE_RATE, audio_array)

In [12]:
import librosa
# This cell calls scipy.io.wavfile, so import it here instead of relying on an
# earlier cell having done so.
import scipy.io.wavfile

# Reload the saved file at 16 kHz and overwrite it with the resampled signal.
# Note that this replaces the file written by the previous cell.
speech_array, sampling_rate = librosa.load("003_sample.wav", sr=16_000)
scipy.io.wavfile.write("003_sample.wav", rate=sampling_rate, data=speech_array)

# Play what was just written: the resampled signal at its new rate. The
# previous version replayed the original `audio_array` at SAMPLE_RATE, so the
# result of the resampling was never actually heard.
Audio(speech_array, rate=sampling_rate)