In [1]:
from IPython.display import Audio
from scipy.io.wavfile import write as write_wav


from bark.api import generate_audio
from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic

In [2]:
# Checkpoint files for the three fine-tuned Bark sub-models (semantic, coarse, fine).
# These are handed to preload_models() in the next cell.
semantic_path = "semantic_output/pytorch_model.bin" # set to None if you don't want to use finetuned semantic
coarse_path = "coarse_output/pytorch_model.bin" # set to None if you don't want to use finetuned coarse
fine_path = "fine_output/pytorch_model.bin" # set to None if you don't want to use finetuned fine

In [3]:
# Load the text (semantic), coarse, fine and codec models before any generation,
# pointing the first three at the fine-tuned checkpoint paths configured above.
# Every stage is requested on GPU and with the full-size (not "small") variant.
preload_models(
    text_use_gpu=True,
    text_use_small=False,
    text_model_path=semantic_path,
    coarse_use_gpu=True,
    coarse_use_small=False,
    coarse_model_path=coarse_path,
    fine_use_gpu=True,
    fine_use_small=False,
    fine_model_path=fine_path,
    codec_use_gpu=True,
    force_reload=False,
    # NOTE(review): presumably the directory used for the base model files when a
    # *_model_path is None -- confirm against the bark fork in use.
    path="models"
)



In [4]:
# # simple generation
# text_prompt = "So now for this, you still using GPU, are you still using GPU?"
# filepath = "output/profclone_116_simple.wav" # change this to your desired output path
# audio_array = generate_audio(text_prompt, history_prompt=None, text_temp=0.7, waveform_temp=0.7)
# write_wav(filepath, SAMPLE_RATE, audio_array)

In [5]:
# Audio(audio_array, rate=SAMPLE_RATE)

In [6]:
def generate_with_settings(
    text_prompt,
    semantic_temp=0.7,
    semantic_top_k=50,
    semantic_top_p=0.95,
    coarse_temp=0.7,
    coarse_top_k=50,
    coarse_top_p=0.95,
    fine_temp=0.5,
    voice_name=None,
    use_semantic_history_prompt=True,
    use_coarse_history_prompt=True,
    use_fine_history_prompt=True,
    output_full=True,
):
    """Run the semantic -> coarse -> fine stages one by one and decode the result.

    Unlike a single high-level generate call, every stage gets its own sampling
    settings and can individually opt in or out of the voice history prompt.

    Args:
        text_prompt: Text handed to ``generate_text_semantic``.
        semantic_temp, semantic_top_k, semantic_top_p: Sampling settings
            forwarded to the semantic stage as ``temp``/``top_k``/``top_p``.
        coarse_temp, coarse_top_k, coarse_top_p: Same, for ``generate_coarse``.
        fine_temp: ``temp`` forwarded to ``generate_fine``.
        voice_name: History prompt passed to each stage whose
            ``use_*_history_prompt`` flag is truthy; other stages get ``None``.
        use_semantic_history_prompt, use_coarse_history_prompt,
        use_fine_history_prompt: Per-stage switches for ``voice_name``.
        output_full: When truthy, also return the intermediate stage outputs.

    Returns:
        ``codec_decode(fine_output)`` when ``output_full`` is falsy; otherwise a
        tuple ``(full_generation, decoded)`` where ``full_generation`` is a dict
        with keys ``'semantic_prompt'``, ``'coarse_prompt'`` and ``'fine_prompt'``.
    """

    def _history_for(stage_enabled):
        # A stage only sees the voice prompt when its switch is on.
        return voice_name if stage_enabled else None

    # Stage 1: text -> semantic tokens.
    semantic_tokens = generate_text_semantic(
        text_prompt,
        history_prompt=_history_for(use_semantic_history_prompt),
        temp=semantic_temp,
        top_k=semantic_top_k,
        top_p=semantic_top_p,
    )

    # Stage 2: semantic -> coarse output.
    coarse_tokens = generate_coarse(
        semantic_tokens,
        history_prompt=_history_for(use_coarse_history_prompt),
        temp=coarse_temp,
        top_k=coarse_top_k,
        top_p=coarse_top_p,
    )

    # Stage 3: coarse -> fine output.
    fine_tokens = generate_fine(
        coarse_tokens,
        history_prompt=_history_for(use_fine_history_prompt),
        temp=fine_temp,
    )

    if not output_full:
        return codec_decode(fine_tokens)

    # The keys match the fields of a saved history prompt, so the dict can be
    # stored and reused as a voice for later generations.
    stage_outputs = {
        'semantic_prompt': semantic_tokens,
        'coarse_prompt': coarse_tokens,
        'fine_prompt': fine_tokens,
    }
    return stage_outputs, codec_decode(fine_tokens)

In [7]:
# text_prompt_20 = "One burner represents your family one is your friends the third is your health and the fourth is your work."

# filepath_20 = "output/aayushclone_20_control.wav" 

In [8]:
# audio_array = generate_with_settings(
#     text_prompt_20,
#     semantic_temp=0.7,
#     semantic_top_k=50,
#     semantic_top_p=0.99,
#     coarse_temp=0.7,
#     coarse_top_k=50,
#     coarse_top_p=0.95,
#     fine_temp=0.5,
#     voice_name="datasets/clip aayush/tokens/20.npz",
#     use_semantic_history_prompt=False,
#     use_coarse_history_prompt=True,
#     use_fine_history_prompt=True,
#     output_full=False
# )

# write_wav(filepath_20, SAMPLE_RATE, audio_array)

# Audio(audio_array, rate=SAMPLE_RATE)

In [9]:
# Sentence synthesized in the next cell, and the WAV file its audio is written to.
text_prompt_39 = "A familiar man called me back saying he thought I left something in our mailbox."

file_path_39 = "output/aayushclone_000039_unseen.wav"

In [10]:
# Synthesize prompt 39 using the voice tokens of clip 0000000039. The semantic
# stage runs without a history prompt; coarse and fine are conditioned on the voice.
audio_array = generate_with_settings(
    text_prompt_39,
    voice_name="datasets/aayush voice/tokens/0000000039.npz",
    use_semantic_history_prompt=False,
    use_coarse_history_prompt=True,
    use_fine_history_prompt=True,
    semantic_temp=0.7, semantic_top_k=50, semantic_top_p=0.99,
    coarse_temp=0.7, coarse_top_k=50, coarse_top_p=0.95,
    fine_temp=0.5,
    output_full=False,  # only the decoded audio, not the intermediate stage outputs
)

# Save the clip, then keep the player as the last expression so the cell renders it.
write_wav(file_path_39, SAMPLE_RATE, audio_array)
Audio(audio_array, rate=SAMPLE_RATE)

100%|██████████| 100/100 [04:49<00:00,  2.90s/it]
100%|██████████| 36/36 [1:28:26<00:00, 147.41s/it]


In [11]:
# Sentence synthesized in the next cell, and the WAV file its audio is written to.
text_prompt_32 = "when they had all gone and come back he called them together to describe what they had seen."

filepath_32 = "output/aayushclone_000032_unseen.wav" 

In [12]:
# Synthesize prompt 32 using the voice tokens of clip 0000000032. The semantic
# stage runs without a history prompt; coarse and fine are conditioned on the voice.
audio_array = generate_with_settings(
    text_prompt_32,
    voice_name="datasets/aayush voice/tokens/0000000032.npz",
    use_semantic_history_prompt=False,
    use_coarse_history_prompt=True,
    use_fine_history_prompt=True,
    semantic_temp=0.8, semantic_top_k=50, semantic_top_p=0.99,
    coarse_temp=0.7, coarse_top_k=50, coarse_top_p=0.95,
    fine_temp=0.5,
    output_full=False,  # only the decoded audio, not the intermediate stage outputs
)

# Save the clip, then keep the player as the last expression so the cell renders it.
write_wav(filepath_32, SAMPLE_RATE, audio_array)
Audio(audio_array, rate=SAMPLE_RATE)

100%|██████████| 100/100 [02:15<00:00,  1.36s/it]
100%|██████████| 21/21 [50:02<00:00, 142.98s/it]


In [13]:
# Sentence synthesized in the next cell, and the WAV file its audio is written to.
text_prompt_13 = "The development is set to transform an old industrial site into an interconnected smart city."

filepath_13 = "output/aayushclone_000013_unseen.wav" 

In [14]:
# Synthesize prompt 13 using the voice tokens of clip 0000000013. The semantic
# stage runs without a history prompt; coarse and fine are conditioned on the voice.
audio_array = generate_with_settings(
    text_prompt_13,
    voice_name="datasets/aayush voice/tokens/0000000013.npz",
    use_semantic_history_prompt=False,
    use_coarse_history_prompt=True,
    use_fine_history_prompt=True,
    semantic_temp=0.8, semantic_top_k=50, semantic_top_p=0.99,
    coarse_temp=0.7, coarse_top_k=50, coarse_top_p=0.95,
    fine_temp=0.5,
    output_full=False,  # only the decoded audio, not the intermediate stage outputs
)

# Save the clip, then keep the player as the last expression so the cell renders it.
write_wav(filepath_13, SAMPLE_RATE, audio_array)
Audio(audio_array, rate=SAMPLE_RATE)

100%|██████████| 100/100 [01:31<00:00,  1.09it/s]
100%|██████████| 17/17 [40:38<00:00, 143.43s/it]


In [15]:
# Sentence synthesized in the next cell, and the WAV file its audio is written to.
text_prompt_29 = "It is so nice that every time you look at a sunflower, the whole world starts to smile."

filepath_29 = "output/aayushclone_000029_unseen.wav" 

In [16]:
# Synthesize prompt 29 using the voice tokens of clip 0000000029. The semantic
# stage runs without a history prompt; coarse and fine are conditioned on the voice.
# This run uses a higher semantic (0.9) and fine (0.6) temperature than the earlier clips.
audio_array = generate_with_settings(
    text_prompt_29,
    voice_name="datasets/aayush voice/tokens/0000000029.npz",
    use_semantic_history_prompt=False,
    use_coarse_history_prompt=True,
    use_fine_history_prompt=True,
    semantic_temp=0.9, semantic_top_k=50, semantic_top_p=0.99,
    coarse_temp=0.7, coarse_top_k=50, coarse_top_p=0.95,
    fine_temp=0.6,
    output_full=False,  # only the decoded audio, not the intermediate stage outputs
)

# Save the clip, then keep the player as the last expression so the cell renders it.
write_wav(filepath_29, SAMPLE_RATE, audio_array)
Audio(audio_array, rate=SAMPLE_RATE)

100%|██████████| 100/100 [03:02<00:00,  1.83s/it]
100%|██████████| 27/27 [1:05:43<00:00, 146.04s/it]


In [17]:
# Second set of fine-tuned checkpoints (the *_output_prof directories).
# These are handed to the second preload_models() call in the next cell.
semantic_path_prof = "semantic_output_prof/pytorch_model.bin" # set to None if you don't want to use finetuned semantic
coarse_path_prof = "coarse_output_prof/pytorch_model.bin" # set to None if you don't want to use finetuned coarse
fine_path_prof = "fine_output_prof/pytorch_model.bin" # set to None if you don't want to use finetuned fine

In [18]:
# Swap the loaded models over to the second ("prof") set of fine-tuned checkpoints.
#
# force_reload must be True here: bark keeps already-loaded models in a cache
# keyed by model type and only loads a checkpoint when that key is missing or a
# reload is forced. The earlier preload_models() call in this notebook already
# populated the cache, so with force_reload=False this call would silently keep
# the first set of weights and every later cell would generate with the wrong voice.
preload_models(
    text_use_gpu=True,
    text_use_small=False,
    text_model_path=semantic_path_prof,
    coarse_use_gpu=True,
    coarse_use_small=False,
    coarse_model_path=coarse_path_prof,
    fine_use_gpu=True,
    fine_use_small=False,
    fine_model_path=fine_path_prof,
    codec_use_gpu=True,
    force_reload=True,
    path="models"
)

In [19]:
# Sentence synthesized in the next cell, and the WAV file its audio is written to.
text_prompt_49 = "so yes so thats the reason I said that next time please do host it from desktop not here did you got that part"

filepath_49 = "output/professorclone_49_control.wav" 

In [20]:
# Synthesize prompt 49 using the voice tokens of professor clip 49. The semantic
# stage runs without a history prompt; coarse and fine are conditioned on the voice.
audio_array = generate_with_settings(
    text_prompt_49,
    voice_name="datasets/60 clips professor audio/tokens/49.npz",
    use_semantic_history_prompt=False,
    use_coarse_history_prompt=True,
    use_fine_history_prompt=True,
    semantic_temp=0.8, semantic_top_k=50, semantic_top_p=0.99,
    coarse_temp=0.7, coarse_top_k=50, coarse_top_p=0.95,
    fine_temp=0.5,
    output_full=False,  # only the decoded audio, not the intermediate stage outputs
)

# Save the clip, then keep the player as the last expression so the cell renders it.
write_wav(filepath_49, SAMPLE_RATE, audio_array)
Audio(audio_array, rate=SAMPLE_RATE)

100%|██████████| 100/100 [05:10<00:00,  3.11s/it]
100%|██████████| 37/37 [1:31:56<00:00, 149.09s/it]


In [24]:
# Sentence synthesized in the next cell, and the WAV file its audio is written to.
text_prompt_235 = "okay so I hope next time you guys can really collect all the data set from our case score and then from that part you guys really can start to train right"

filepath_235 = "output/professorclone_235_unseen.wav" 

In [25]:
# Synthesize prompt 235 using the voice tokens of meeting clip 235. The semantic
# stage runs without a history prompt; coarse and fine are conditioned on the voice.
# This run uses a slightly higher coarse (0.75) and fine (0.6) temperature.
audio_array = generate_with_settings(
    text_prompt_235,
    voice_name="datasets/independent study meeting voices/tokens/235.npz",
    use_semantic_history_prompt=False,
    use_coarse_history_prompt=True,
    use_fine_history_prompt=True,
    semantic_temp=0.8, semantic_top_k=50, semantic_top_p=0.99,
    coarse_temp=0.75, coarse_top_k=50, coarse_top_p=0.95,
    fine_temp=0.6,
    output_full=False,  # only the decoded audio, not the intermediate stage outputs
)

# Save the clip, then keep the player as the last expression so the cell renders it.
write_wav(filepath_235, SAMPLE_RATE, audio_array)
Audio(audio_array, rate=SAMPLE_RATE)

100%|██████████| 100/100 [04:20<00:00,  2.60s/it]
100%|██████████| 36/36 [1:46:27<00:00, 177.43s/it]


In [26]:

# Sentence synthesized in the next cell, and the WAV file its audio is written to.
text_prompt_246 = "you need to discuss with them why they can run it from the windows system I guess is probably you got something wrong definitely you should be able to run it in your Mac and Windows systems for the model right"

filepath_246 = "output/professorclone_246_unseen.wav" 

In [27]:
# Synthesize prompt 246 using the voice tokens of meeting clip 246. The semantic
# stage runs without a history prompt; coarse and fine are conditioned on the voice.
audio_array = generate_with_settings(
    text_prompt_246,
    voice_name="datasets/independent study meeting voices/tokens/246.npz",
    use_semantic_history_prompt=False,
    use_coarse_history_prompt=True,
    use_fine_history_prompt=True,
    semantic_temp=0.7, semantic_top_k=50, semantic_top_p=0.99,
    coarse_temp=0.7, coarse_top_k=50, coarse_top_p=0.95,
    fine_temp=0.5,
    output_full=False,  # only the decoded audio, not the intermediate stage outputs
)

# Save the clip, then keep the player as the last expression so the cell renders it.
write_wav(filepath_246, SAMPLE_RATE, audio_array)
Audio(audio_array, rate=SAMPLE_RATE)

100%|██████████| 100/100 [10:29<00:00,  6.30s/it]
100%|██████████| 38/38 [1:39:05<00:00, 156.45s/it]


In [28]:
# import re
# def split_and_recombine_text(text, desired_length=100, max_length=150):
#     # from https://github.com/neonbjb/tortoise-tts
#     """Split text it into chunks of a desired length trying to keep sentences intact."""
#     # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
#     text = re.sub(r"\n\n+", "\n", text)
#     text = re.sub(r"\s+", " ", text)
#     text = re.sub(r"[“”]", '"', text)

#     rv = []
#     in_quote = False
#     current = ""
#     split_pos = []
#     pos = -1
#     end_pos = len(text) - 1

#     def seek(delta):
#         nonlocal pos, in_quote, current
#         is_neg = delta < 0
#         for _ in range(abs(delta)):
#             if is_neg:
#                 pos -= 1
#                 current = current[:-1]
#             else:
#                 pos += 1
#                 current += text[pos]
#             if text[pos] == '"':
#                 in_quote = not in_quote
#         return text[pos]

#     def peek(delta):
#         p = pos + delta
#         return text[p] if p < end_pos and p >= 0 else ""

#     def commit():
#         nonlocal rv, current, split_pos
#         rv.append(current)
#         current = ""
#         split_pos = []

#     while pos < end_pos:
#         c = seek(1)
#         # do we need to force a split?
#         if len(current) >= max_length:
#             if len(split_pos) > 0 and len(current) > (desired_length / 2):
#                 # we have at least one sentence and we are over half the desired length, seek back to the last split
#                 d = pos - split_pos[-1]
#                 seek(-d)
#             else:
#                 # no full sentences, seek back until we are not in the middle of a word and split there
#                 while c not in "!?.\n " and pos > 0 and len(current) > desired_length:
#                     c = seek(-1)
#             commit()
#         # check for sentence boundaries
#         elif not in_quote and (c in "!?\n" or (c == "." and peek(1) in "\n ")):
#             # seek forward if we have consecutive boundary markers but still within the max length
#             while (
#                 pos < len(text) - 1 and len(current) < max_length and peek(1) in "!?."
#             ):
#                 c = seek(1)
#             split_pos.append(pos)
#             if len(current) >= desired_length:
#                 commit()
#         # treat end of quote as a boundary if its followed by a space or newline
#         elif in_quote and peek(1) == '"' and peek(2) in "\n ":
#             seek(2)
#             split_pos.append(pos)
#     rv.append(current)

#     # clean up, remove lines with only whitespace or punctuation
#     rv = [s.strip() for s in rv]
#     rv = [s for s in rv if len(s) > 0 and not re.match(r"^[\s\.,;:!?]*$", s)]

#     return rv

# def generate_with_settings(text_prompt, semantic_temp=0.7, semantic_top_k=50, semantic_top_p=0.95, coarse_temp=0.7, coarse_top_k=50, coarse_top_p=0.95, fine_temp=0.5, voice_name=None, use_semantic_history_prompt=True, use_coarse_history_prompt=True, use_fine_history_prompt=True, output_full=False):
#     # generation with more control
#     x_semantic = generate_text_semantic(
#         text_prompt,
#         history_prompt=voice_name if use_semantic_history_prompt else None,
#         temp=semantic_temp,
#         top_k=semantic_top_k,
#         top_p=semantic_top_p,
#     )

#     x_coarse_gen = generate_coarse(
#         x_semantic,
#         history_prompt=voice_name if use_coarse_history_prompt else None,
#         temp=coarse_temp,
#         top_k=coarse_top_k,
#         top_p=coarse_top_p,
#     )
#     x_fine_gen = generate_fine(
#         x_coarse_gen,
#         history_prompt=voice_name if use_fine_history_prompt else None,
#         temp=fine_temp,
#     )

#     if output_full:
#         full_generation = {
#             'semantic_prompt': x_semantic,
#             'coarse_prompt': x_coarse_gen,
#             'fine_prompt': x_fine_gen,
#         }
#         return full_generation, codec_decode(x_fine_gen)
#     return codec_decode(x_fine_gen)

In [29]:
# # Chunk the text into smaller pieces then combine the generated audio
# from time import time
# from tqdm.auto import tqdm
# from IPython.display import Audio
# from scipy.io.wavfile import write as write_wav
# import os
# import numpy as np

In [30]:
# prompt_166 = "Okay, so next time you can show me a results about only about training and only about training. Only about training and the test so separate them so that we can know because we usually expect that is a training part it should be very good right but it may be very poor. Let's say how poor it is."


# # generation settings
# voice_name_166 = "datasets/60 clips professor audio/tokens/166.npz"
# out_filepath_166 = 'output/prof_prompt_simplify_test_166.wav'

In [31]:
# semantic_temp = 0.7
# semantic_top_k = 100
# semantic_top_p = 0.99

# coarse_temp = 0.7
# coarse_top_k = 100
# coarse_top_p = 0.95

# fine_temp = 0.7

# use_semantic_history_prompt = True
# use_coarse_history_prompt = True
# use_fine_history_prompt = True

# use_last_generation_as_history = False

# texts = split_and_recombine_text(prompt_166)

# all_parts = []
# for i, text in tqdm(enumerate(texts), total=len(texts)):
#     full_generation, audio_array = generate_with_settings(
#         text,
#         semantic_temp=semantic_temp,
#         semantic_top_k=semantic_top_k,
#         semantic_top_p=semantic_top_p,
#         coarse_temp=coarse_temp,
#         coarse_top_k=coarse_top_k,
#         coarse_top_p=coarse_top_p,
#         fine_temp=fine_temp,
#         voice_name=voice_name_166,
#         use_semantic_history_prompt=use_semantic_history_prompt,
#         use_coarse_history_prompt=use_coarse_history_prompt,
#         use_fine_history_prompt=use_fine_history_prompt,
#         output_full=True
#     )
#     # if use_last_generation_as_history:
#     #     # save to npz
#     #     os.makedirs('_temp', exist_ok=True)
#     #     np.savez_compressed(
#     #         '_temp/history.npz',
#     #         semantic_prompt=full_generation['semantic_prompt'],
#     #         coarse_prompt=full_generation['coarse_prompt'],
#     #         fine_prompt=full_generation['fine_prompt'],
#     #     )
#     #     voice_name = '_temp/history.npz'
#     # write_wav(out_filepath_166.replace('.wav', f'_{i}') + '.wav', SAMPLE_RATE, audio_array)

# # save audio
# write_wav(out_filepath_166, SAMPLE_RATE, audio_array)

# # play audio
# Audio(audio_array, rate=SAMPLE_RATE)

In [32]:
# text_prompt_13 = "offered herself as a guide. On that first afternoon, she showed us around Melbourne and shouted coffee. The following morning, she picked us up at our hotel and drove us into what she called the bush. I expected a wasteland of dust and human bones, but it was nothing like that. When Australians say the bush, they mean the woods, the forest."

# filepath_13 = 'output/aayushclone_prompt_13.wav'

In [33]:
# audio_array = generate_with_settings(
#     text_prompt_13,
#     semantic_temp=0.95,
#     semantic_top_k=50,
#     semantic_top_p=0.99,
#     coarse_temp=0.8,
#     coarse_top_k=50,
#     coarse_top_p=0.95,
#     fine_temp=0.7,
#     voice_name="datasets/clip aayush/tokens/13.npz",
#     use_semantic_history_prompt=False,
#     use_coarse_history_prompt=True,
#     use_fine_history_prompt=True,
#     output_full=False
# )

# write_wav(filepath_13, SAMPLE_RATE, audio_array)

# Audio(audio_array, rate=SAMPLE_RATE)

In [34]:
# text_prompt_1 = "He made this observation at the home of my cousin, Juan."

# filepath_1 = 'output/aayushclone_prompt_1.wav'

In [35]:
# audio_array = generate_with_settings(
#     text_prompt_1,
#     semantic_temp=0.5,
#     semantic_top_k=50,
#     semantic_top_p=0.99,
#     coarse_temp=0.5,
#     coarse_top_k=50,
#     coarse_top_p=0.95,
#     fine_temp=0.5,
#     voice_name="datasets/clip aayush/tokens/1.npz",
#     use_semantic_history_prompt=False,
#     use_coarse_history_prompt=True,
#     use_fine_history_prompt=True,
#     output_full=False
# )

# write_wav(filepath_1, SAMPLE_RATE, audio_array)

# Audio(audio_array, rate=SAMPLE_RATE)

In [36]:
# text_prompt_11 = "You can also say it's my south or I'll south the next round, she told us."

# filepath_11 = 'output/aayushclone_prompt_11.wav'

In [37]:
# audio_array = generate_with_settings(
#     text_prompt_11,
#     semantic_temp=0.9,
#     semantic_top_k=50,
#     semantic_top_p=0.99,
#     coarse_temp=0.8,
#     coarse_top_k=50,
#     coarse_top_p=0.95,
#     fine_temp=0.6,
#     voice_name="datasets/clip aayush/tokens/11.npz",
#     use_semantic_history_prompt=False,
#     use_coarse_history_prompt=True,
#     use_fine_history_prompt=True,
#     output_full=False
# )

# write_wav(filepath_11, SAMPLE_RATE, audio_array)

# Audio(audio_array, rate=SAMPLE_RATE)

In [38]:
# text_prompt_24 = "family. After that she switched off her health. How about you?"

# filepath_24 = 'output/aayushclone_prompt_24.wav'

In [39]:
# audio_array = generate_with_settings(
#     text_prompt_24,
#     semantic_temp=0.8,
#     semantic_top_k=50,
#     semantic_top_p=0.99,
#     coarse_temp=0.7,
#     coarse_top_k=50,
#     coarse_top_p=0.95,
#     fine_temp=0.5,
#     voice_name="datasets/clip aayush/tokens/24.npz",
#     use_semantic_history_prompt=False,
#     use_coarse_history_prompt=True,
#     use_fine_history_prompt=True,
#     output_full=False
# )

# write_wav(filepath_24, SAMPLE_RATE, audio_array)

# Audio(audio_array, rate=SAMPLE_RATE)

In [40]:
# text_prompt_22 = "Pat has her own business, a good one, that's allowing her to retire at 55."

# filepath_22 = 'output/aayushclone_prompt_22.wav'

In [41]:
# audio_array = generate_with_settings(
#     text_prompt_22,
#     semantic_temp=0.9,
#     semantic_top_k=50,
#     semantic_top_p=0.99,
#     coarse_temp=0.7,
#     coarse_top_k=50,
#     coarse_top_p=0.95,
#     fine_temp=0.7,
#     voice_name="datasets/clip aayush/tokens/22.npz",
#     use_semantic_history_prompt=False,
#     use_coarse_history_prompt=True,
#     use_fine_history_prompt=True,
#     output_full=False
# )

# write_wav(filepath_22, SAMPLE_RATE, audio_array)

# Audio(audio_array, rate=SAMPLE_RATE)