In [1]:
import torch
torch.manual_seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.cuda.set_device(0)


import random
random.seed(0)

import numpy as np
np.random.seed(0)

%cd ..

# load packages
import time
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa
from nltk.tokenize import word_tokenize
import soundfile as sf
from pathlib import Path

from models import *
from utils import *
from text_utils import TextCleaner
textcleaner = TextCleaner()

%matplotlib inline

SAMPLING_RATE = 24000

to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300
)
mean, std = -4, 4


def length_to_mask(lengths):
    mask = (
        torch.arange(lengths.max())
        .unsqueeze(0)
        .expand(lengths.shape[0], -1)
        .type_as(lengths)
    )
    mask = torch.gt(mask + 1, lengths.unsqueeze(1))
    return mask


def preprocess(wave):
    wave_tensor = torch.from_numpy(wave).float()
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
    return mel_tensor


def compute_style(path):
    wave, sr = librosa.load(path, sr=SAMPLING_RATE)
    audio, index = librosa.effects.trim(wave, top_db=30)
    if sr != SAMPLING_RATE:
        audio = librosa.resample(audio, sr, SAMPLING_RATE)
    mel_tensor = preprocess(audio).to(device)

    with torch.no_grad():
        ref_s = model.style_encoder(mel_tensor.unsqueeze(1))
        ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))

    return torch.cat([ref_s, ref_p], dim=1)

/media/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2
177


In [2]:
config_list = {
    "en": {"config": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Models/EN-Multi-ID-Althaf-emphasis/config_ft_en_multi_id_althaf_sw_victoria.yml",
           "checkpoint": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Models/EN-Multi-ID-Althaf-emphasis/epoch_2nd_00029.pth"},
    "sw_multi": {"config": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Models/EN-Multi-ID-Althaf-SW-Victoria/config_ft_en_multi_id_althaf_sw_victoria.yml",
           "checkpoint": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Models/EN-Multi-ID-Althaf-SW-Victoria/epoch_2nd_00019.pth"},
    "sw_victoria": {"config": "./Models/SW-Bible-Victoria-20-epochs/config_ft_sw_bible_victoria.yml",
           "checkpoint": "./Models/SW-Bible-Victoria-20-epochs/epoch_2nd_00009.pth"},
    "sw_althaf": {"config": "./Models/SW-Bible-Althaf-20-epochs/config_ft_sw_bible_althaf.yml",
           "checkpoint": "./Models/SW-Bible-Althaf-20-epochs/epoch_2nd_00019.pth"},
    "lj": {"config": "./Models/LJSpeech/config.yml",
           "checkpoint": "./Models/LJSpeech/epoch_2nd_00100.pth"}
}

name = "sw_multi"

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

config = yaml.safe_load(open(
    config_list[name]["config"]
    ))

# load pretrained ASR model
ASR_config = config.get("ASR_config", False)
ASR_path = config.get("ASR_path", False)
print(ASR_config)
print(ASR_path)
text_aligner = load_ASR_models(ASR_path, ASR_config)

# load pretrained F0 model
F0_path = config.get("F0_path", False)
pitch_extractor = load_F0_models(F0_path)

# load BERT model
from Utils.PLBERT.util import load_plbert

BERT_path = config.get("PLBERT_dir", False)
plbert = load_plbert(BERT_path)

model_params = recursive_munch(config["model_params"])
model = build_model(model_params, text_aligner, pitch_extractor, plbert)
_ = [model[key].eval() for key in model]
_ = [model[key].to(device) for key in model]

params_whole = torch.load(config_list[name]["checkpoint"], map_location="cpu")
params = params_whole["net"]

for key in model:
    if key in params:
        print("%s loaded" % key)
        try:
            model[key].load_state_dict(params[key])
        except:
            from collections import OrderedDict

            state_dict = params[key]
            
            new_state_dict = OrderedDict()
            for k, v in state_dict.items():
                name = k[7:]  # remove `module.`
                new_state_dict[name] = v
            # load params
            model[key].load_state_dict(new_state_dict, strict=False)
#             except:
#                 _load(params[key], model[key])
_ = [model[key].eval() for key in model]


from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule

sampler = DiffusionSampler(
    model.diffusion.diffusion,
    sampler=ADPM2Sampler(),
    sigma_schedule=KarrasSchedule(
        sigma_min=0.0001, sigma_max=3.0, rho=9.0
    ),  # empirical parameters
    clamp=False,
)

Utils/ASR/config.yml
Utils/ASR/epoch_00080.pth




bert loaded
bert_encoder loaded
predictor loaded
decoder loaded
text_encoder loaded
predictor_encoder loaded
style_encoder loaded
diffusion loaded
text_aligner loaded
pitch_extractor loaded
mpd loaded
msd loaded
wd loaded


## Synthesize Speech


In [4]:
from pathlib import Path
import os
import soundfile as sf
import phonemizer


def inference(
    text,
    ref_s,
    global_phonemizer,
    alpha=0.3,
    beta=0.7,
    diffusion_steps=5,
    embedding_scale=1,
    phonemes=False,
):
    text = text.strip()
    if phonemes:
        ps = text
    else:
        ps = global_phonemizer.phonemize([text])[0]
    print(f"ps: {ps}")
    # ps = word_tokenize(ps[0])
    # ps = " ".join(ps)
    tokens = textcleaner(ps)
    tokens.insert(0, 0)
    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)

    with torch.no_grad():
        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
        text_mask = length_to_mask(input_lengths).to(device)

        t_en = model.text_encoder(tokens, input_lengths, text_mask)
        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

        s_pred = sampler(
            noise=torch.randn((1, 256)).unsqueeze(1).to(device),
            embedding=bert_dur,
            embedding_scale=embedding_scale,
            features=ref_s,
            num_steps=diffusion_steps,
        ).squeeze(1)

        s = s_pred[:, 128:]
        ref = s_pred[:, :128]

        ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
        s = beta * s + (1 - beta) * ref_s[:, 128:]

        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)

        x, _ = model.predictor.lstm(d)
        duration = model.predictor.duration_proj(x)

        duration = torch.sigmoid(duration).sum(axis=-1)
        pred_dur = torch.round(duration.squeeze()).clamp(min=1)

        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
        c_frame = 0
        for i in range(pred_aln_trg.size(0)):
            pred_aln_trg[i, c_frame : c_frame + int(pred_dur[i].data)] = 1
            c_frame += int(pred_dur[i].data)

        # encode prosody
        en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
        if model_params.decoder.type == "hifigan":
            asr_new = torch.zeros_like(en)
            asr_new[:, :, 0] = en[:, :, 0]
            asr_new[:, :, 1:] = en[:, :, 0:-1]
            en = asr_new

        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)

        asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
        if model_params.decoder.type == "hifigan":
            asr_new = torch.zeros_like(asr)
            asr_new[:, :, 0] = asr[:, :, 0]
            asr_new[:, :, 1:] = asr[:, :, 0:-1]
            asr = asr_new

        out = model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))

    return (
        out.squeeze().cpu().numpy()[..., :-50]
    )  # weird pulse at the end of the model, need to be fixed later

def synthesize_speech(reference_dicts, output_directory, file_name, language, phonemes=False, alpha=0.3, beta=0.7, diffusion_steps = 10, embedding_scale=1):
    global_phonemizer = phonemizer.backend.EspeakBackend(
        language=language, preserve_punctuation=True, with_stress=True
    )
    start = time.time()
    noise = torch.randn(1, 1, 256).to(device)
    idx = 0
    for text, path in reference_dicts.items():
        try:
            # Convert string path to Path object for easier manipulation
            ref_s = compute_style(path)
            path = Path(path)
            # Create the output directory based on the reference path
            os.makedirs(output_directory, exist_ok=True)

            wav = inference(
                text,
                ref_s,
                global_phonemizer,
                alpha=alpha,
                beta=beta,
                diffusion_steps=diffusion_steps,
                embedding_scale=embedding_scale,
                phonemes=phonemes,
            )

            rtf = (time.time() - start) / (len(wav) / SAMPLING_RATE)
            print(f"RTF = {rtf:5f}")
            import IPython.display as ipd

            # print(k + " Synthesized:")
            # display(ipd.Audio(wav, rate=SAMPLING_RATE, normalize=False))

            sf.write(f"{output_directory}/{text}.wav", wav, SAMPLING_RATE)
            print(f"Audio saved to: {output_directory}/{text}.wav")
            idx += 1
        except Exception as e:
            print(e)

### US MADISON


In [14]:
reference_dicts = {
    """Learning is a journey filled with curiosity, wonder, and discovery. Every child deserves the opportunity to explore and grow at their own pace. By fostering a love for reading and numbers, we empower them to reach their fullest potential.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-US-Madison/concatenated_audio.wav",
}
synthesize_speech(reference_dicts, "./Demo/en-US-Madison/madison", "madison_reference", language="en-us", alpha=0.3, beta=0.7, embedding_scale=2)

In [None]:
### EXCLAMATION
reference_dicts = {
    """This spicy dish packs a punch!""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/madison-althaf-exclamations/wavs/line_0001.wav",

}

synthesize_speech(reference_dicts, "./Demo/en-US-Madison/madison2althaf/exclamation/synthesized", "exclamation", language="en-us", alpha=0.5, beta=0.0, embedding_scale=1)

### QUESTION
# reference = "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-US-Madison/madison2althaf/madison_althaf_eleven_labs_reference.mp3"
# reference_dicts = {
#     """Did anyone remember to check the schedule?""": reference,
#     """Could this really be the best solution?""": reference,
#     """Who can I reach out to if I have further questions?""": reference,
#     """How did you come up with that idea?""": reference,
#     """Should we double-check the details before finalising?""": reference,
# }

# synthesize_speech(reference_dicts, "./Demo/en-US-Madison/madison2althaf/question/synthesized", "question", language="en-us", alpha=0.7, beta=1.0, embedding_scale=2)

### UK THALIA


In [84]:
reference_dicts = {
    """ StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. """: "/home/s44504/StyleTTS2/Demo/en-UK-Thalia/concatenated_audio.wav",
}

In [None]:
# reference = "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-UK-Thalia/thalia2althaf/exclamation/eleven_thalia_exclamation_concat.wav"
reference_dicts = {
    "The holiday decorations are stunningly festive!": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/thalia-althaf-exclamations/wavs/line_0001.wav",
}

synthesize_speech(reference_dicts, "./Demo/en-UK-Thalia/thalia2althaf/exclamation/synthesized", "exclamation", language="en-gb", alpha=0.3, beta=0.0, embedding_scale=1)

# reference = "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-UK-Thalia/thalia2althaf/thalia_althaf_eleven_labs.mp3"
# reference_dicts = {
#     """Did anyone remember to check the schedule?""": reference,
#     """Could this really be the best solution?""": reference,
#     """Who can I reach out to if I have further questions?""": reference,
#     """How did you come up with that idea?""": reference,
#     """Should we double-check the details before finalising?""": reference,
# }

# synthesize_speech(reference_dicts, "./Demo/en-UK-Thalia/thalia2althaf/question/synthesized", "question", language="en-gb", alpha=0.7, beta=0.9, embedding_scale=2)

### AU Zak


In [14]:
import pandas as pd

### Sentence
# reference = "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-AU-Zak/eleven_labs_zak2althaf_reference.mp3"
# reference_dicts = {
#     """Hi there!""": reference,
#     """Oh no!""": reference,
# }

# synthesize_speech(reference_dicts, "./Demo/en-AU-Zak/zak", "zak", language="en-gb", alpha=0.7, beta=0.9, embedding_scale=1)

 
# # EXCLAMATION
# reference = "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-AU-Zak/eleven_labs_zak2althaf_reference.mp3"
# reference_dicts = {
#     """Congratulation on your achievement!""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-AU-Zak/zak2althaf-exclamation/eleven-labs/Congratulations on your achievement!!!!.mp3",
#     """You did an amazing job!""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-AU-Zak/zak2althaf-exclamation/eleven-labs/You did an amazing job!!!!!.mp3",
#     # """Look out, there’s a car coming!""": reference,
#     # """You’ve got to try this recipe; it’s incredible!""": reference,
#     # """Let’s go on an adventure!""": reference,
# }

# synthesize_speech(reference_dicts, "./Demo/en-AU-Zak/zak2althaf-exclamation/synthesized", "exclamation", language="en-gb", alpha=0.5, beta=0.0, embedding_scale=1)

### QUESTION
# reference = "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-AU-Zak/zak2althaf-question/eleven-labs/ElevenLabs_2024-11-01T15_51_57_zak-althaf_ivc_s81_sb100_se39_b_m2.mp3"
# reference_dicts = {
#     """Did anyone remember to check the schedule?""": reference,
#     """Could this really be the best solution?""": reference,
#     """Who can I reach out to if I have further questions?""": reference,
#     """How did you come up with that idea?""": reference,
#     """Should we double-check the details before finalising?""": reference,
# }

# synthesize_speech(reference_dicts, "./Demo/en-AU-Zak/zak2althaf-question/synthesized", "zak_althaf_question", language="en-gb", alpha=0.3, beta=0.7, embedding_scale=1)

# SINGLE WORD
# reference = "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-AU-Zak/eleven_labs_zak2althaf_reference.mp3"
# reference_dicts = {"suddenly.": reference}

# Short Sentence
short_sentence = pd.read_csv("/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/eleven_labs/audio/zak-althaf-short-sentences/metadata.csv")

reference_dicts = {}
for datum in short_sentence[:20].iterrows():
    text = datum[1]["text"]
    path_to_audio = datum[1]["path_to_audio"]
    path_to_audio = "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/eleven_labs/audio/zak-althaf-short-sentences/" + path_to_audio
    reference_dicts[text] =path_to_audio

synthesize_speech(reference_dicts, "./Demo/en-AU-Zak/zak2althaf-short-sentences/synthesized", "zak_althaf", language="en-gb", alpha=0.5, beta=0.3, embedding_scale=1)

ps: ɪt sˈɛd nˈəʊ. 
RTF = 0.043451
Audio saved to: ./Demo/en-AU-Zak/zak2althaf-short-sentences/synthesized/It said no..wav
ps: ɐ wˈʊlf kˈʌmz. 
RTF = 0.069540
Audio saved to: ./Demo/en-AU-Zak/zak2althaf-short-sentences/synthesized/A wolf comes..wav
ps: ɐn ʌmbɹˈɛlə? 
RTF = 0.126999
Audio saved to: ./Demo/en-AU-Zak/zak2althaf-short-sentences/synthesized/An umbrella?.wav
Calculated padded input size per channel: (5 x 4). Kernel size: (5 x 5). Kernel size can't be greater than actual input size
Calculated padded input size per channel: (5 x 4). Kernel size: (5 x 5). Kernel size can't be greater than actual input size
ps: ðə kˈat ɹˈan. 
RTF = 0.174621
Audio saved to: ./Demo/en-AU-Zak/zak2althaf-short-sentences/synthesized/The cat ran..wav
ps: dˈaɹən sˈɛd. 
RTF = 0.206162
Audio saved to: ./Demo/en-AU-Zak/zak2althaf-short-sentences/synthesized/Darren said..wav
ps: mˈʌm ɪz ɡlˈad. 
RTF = 0.246738
Audio saved to: ./Demo/en-AU-Zak/zak2althaf-short-sentences/synthesized/Mum is glad..wav
Calculated p

#### Emphasis


In [None]:
reference_dicts = {
    """ kəŋ"ɡɹˈat"ʃʊlˈeɪʃən "ˌɒn" "jɔːɹ" ɐt"ʃˈiːv"mənt!!! """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-AU-Zak/eleven_labs_zak2althaf_reference.mp3",
    """ "aɪ" "kˈɑːnt" bɪ"lˈiːv" "juː" "dˈɪd" "ɪt!!!" """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-AU-Zak/eleven_labs_zak2althaf_reference.mp3",
    # """lˈʊk ˈaʊt, ðeəz ɐ kˈɑː kˈʌmɪŋ!  """: "/home/s44504/StyleTTS2/Demo/en-AU-Zak/en-AU-Zak_0.wav",
    # """juːv ɡɒt tə tɹˈaɪ ðɪs ɹˈɛsɪpˌiː; ɪts ɪŋkɹˈɛdɪbəl!  """: "/home/s44504/StyleTTS2/Demo/en-AU-Zak/en-AU-Zak_0.wav",
    # """lˈɛts ɡˌəʊ ˌɒn ɐn ɐdvˈɛntʃə!  """: "/home/s44504/StyleTTS2/Demo/en-AU-Zak/en-AU-Zak_0.wav",
}

synthesize_speech(reference_dicts, "./Demo/en-AU-Zak/zak2althaf-emphasis", "", language="en-gb", alpha=0.7, beta=0.9, embedding_scale=1, phonemes=True)

In [None]:
reference_dicts = {
    """ ðɪs ɪz wɪðˌaʊt ˈɛmfəsɪs. "ðɪs" ɪz "wɪð" "ˈɛm"fəsɪs. """: "/home/s44504/StyleTTS2/Demo/en-AU-Zak/concatenated_audio.wav",
    """aɪ kəm"plˈiːt"li ɐ"ɡɹˈiː" wɪð juː.""": "/home/s44504/StyleTTS2/Demo/en-AU-Zak/Zak2Althaf/Althaf-Zak_1.wav",
}

synthesize_speech(reference_dicts, "emphasis-althaf-zak", "emphasis-althaf-zak", language="en-gb", phonemes=True)

### ID ALTHAF


In [9]:

reference_dicts = {
    """Lalu...""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/id-ID-Althaf/concatenated_audio.wav",
    """'Mungkin... aku bisa bantu!' kata burung.  Burung lalu minta untuk menjaga singa itu. Zebra boleh pergi makan sekarang.  Burung sudah selesai makan dan masih kenyang sekali. """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/id-ID-Althaf/concatenated_audio.wav",
    """"Bintik merah ini menular!" kata kami kepada Aden. """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/id-ID-Althaf/concatenated_audio.wav",
    """Buku ini adalah buku Non-Fiksi.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/id-ID-Althaf/concatenated_audio.wav",
    """Informasi.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/id-ID-Althaf/concatenated_audio.wav",
    """Kalau begitu...""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/id-ID-Althaf/concatenated_audio.wav",
    """Ulat bulu harus mengalami fase-fase berganti kulit atau instar. Lalu, mereka akan bertransformasi menjadi kepompong.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/id-ID-Althaf/concatenated_audio.wav",
}

        
        
synthesize_speech(reference_dicts, "./Demo/id-ID-Althaf/synthesized", "althaf", language="id", alpha=0.1, beta=0.3, embedding_scale=1)

ps: lˈalu... 
RTF = 0.095099
Audio saved to: ./Demo/id-ID-Althaf/synthesized/Lalu....wav
ps: mˈuŋkin... ˈaku bˈisa bˈantu!kˈata bˈuruŋ.  bˈuruŋ lˈalu mˈinta ˌuntuk məndʒˈaɡa sˈiŋa ˈitu. zˈɛbra bˈoləh pˈɛrɡi mˈakan səkˈaraŋ.  bˈuruŋ sˈudah səlˈɛsaɪ mˈakan dan mˈasih kəɲˈaŋ səkˈali. 
RTF = 0.079740
Audio saved to: ./Demo/id-ID-Althaf/synthesized/'Mungkin... aku bisa bantu!' kata burung.  Burung lalu minta untuk menjaga singa itu. Zebra boleh pergi makan sekarang.  Burung sudah selesai makan dan masih kenyang sekali. .wav
ps: "bˈintik mˈɛrah ˈini mənˈular!" kˈata kˈami kəpˈada ˈadən. 
RTF = 0.384992
Audio saved to: ./Demo/id-ID-Althaf/synthesized/"Bintik merah ini menular!" kata kami kepada Aden. .wav
ps: bˈuku ˈini adˌalah bˈuku nˈonfˈiksi. 
RTF = 0.707816
Audio saved to: ./Demo/id-ID-Althaf/synthesized/Buku ini adalah buku Non-Fiksi..wav
ps: ˌinfɔrmˈasi. 
RTF = 1.666144
Audio saved to: ./Demo/id-ID-Althaf/synthesized/Informasi..wav
ps: kˈalaʊ bəɡˈitu... 
RTF = 1.935199
Audio saved to: .

In [None]:
synthesize_speech(reference_dicts)

### SW Victoria


In [5]:
from datasets import load_dataset

dataset = load_dataset("bookbot/OpenBible_Swahili", "JHN_clean", num_proc=20)

In [None]:
# from IPython.display import display, Audio
# import soundfile as sf

# def play_audio(example):
#     audio = example['audio']
#     display(Audio(audio['array'], rate=audio['sampling_rate']))
    
# play_audio(dataset["train"][0])
# output_directory = "/home/s44504/StyleTTS2-clone/Demo/Swahili-Bible-John/original"
# os.makedirs(output_directory, exist_ok=True)


# for i, datum in enumerate(dataset["train"]):
#     if i == 6:
#         break
#     audio_sample = datum["audio"]

#         # Create the full path for the output file
#     output_file_path = os.path.join(output_directory, f"{datum['id']}.wav")

#     # Write the audio data to the file
#     sf.write(output_file_path, audio_sample["array"], audio_sample["sampling_rate"])
    

In [9]:
# # Testing sounds with 2 consonant cluster
# reference_dicts = {
#     """Habari za ukoo wa Yesu Kristo mwana wa Daudi, mwana wa Abrahamu.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/sw-TZ-Victoria/wavs/sw-TZ-Victoria_000026.wav",
#     """Abrahamu akamzaa Isaki, Isaki akamzaa Yakobo, Yakobo akawazaa Yuda na ndugu zake.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/sw-TZ-Victoria/wavs/sw-TZ-Victoria_000026.wav",
#     """Yuda akawazaa Peresi na Zera, ambao mama yao alikuwa Tamari, Peresi akamzaa Hesroni, Hesroni akamzaa Aramu.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/sw-TZ-Victoria/wavs/sw-TZ-Victoria_000026.wav",
#     """Aramu akamzaa Aminadabu, Aminadabu akamzaa Nashoni, Nashoni akamzaa Salmoni.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/sw-TZ-Victoria/wavs/sw-TZ-Victoria_000026.wav",
#     """Salmoni akamzaa Boazi, na mama yake Boazi alikuwa Rahabu, Boazi akamzaa Obedi, ambaye mama yake alikuwa Ruthu, Obedi akamzaa Yese.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/sw-TZ-Victoria/wavs/sw-TZ-Victoria_000026.wav",
# }
reference_dicts = {
    """Habari za ukoo wa Yesu Kristo mwana wa Daudi, mwana wa Abrahamu.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_001_001_smooth.wav",
    """Abrahamu akamzaa Isaki, Isaki akamzaa Yakobo, Yakobo akawazaa Yuda na ndugu zake.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_001_002_smooth.wav",
    """Yuda akawazaa Peresi na Zera, ambao mama yao alikuwa Tamari, Peresi akamzaa Hesroni, Hesroni akamzaa Aramu.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_001_003_smooth.wav",
    """Aramu akamzaa Aminadabu, Aminadabu akamzaa Nashoni, Nashoni akamzaa Salmoni.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_001_004_smooth.wav",
    """Salmoni akamzaa Boazi, na mama yake Boazi alikuwa Rahabu, Boazi akamzaa Obedi, ambaye mama yake alikuwa Ruthu, Obedi akamzaa Yese.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_001_005_smooth.wav",
}

# reference_dicts = {
#     """Ningependa kwenda shuleni leo. """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_004_023_smooth.wav",
#     """Mbwa wangu anapenda kula nyama. """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_004_023_smooth.wav",
#     """Ndizi zangu zimeiva vizuri. """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_004_023_smooth.wav",
#     """Mtoto anapiga ngoma kwa nguvu. """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_004_023_smooth.wav",
#     """Mboga za kijani ni nzuri kwa afya. """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_004_023_smooth.wav",
#     """Tonge la ugali ni tamu. """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_004_023_smooth.wav",
# }

synthesize_speech(reference_dicts, "./Demo/Swahili-Bible/MAT/althaf", "althaf", language="sw", alpha=0.7, beta=0.5, embedding_scale=1)

ps: habˈari za ukˈoo wa jˈesu krˈisto mwˈana wa daˈudi, mwˈana wa ˌabrahˈamu. 
RTF = 0.021256
Audio saved to: ./Demo/Swahili-Bible/MAT/althaf/Habari za ukoo wa Yesu Kristo mwana wa Daudi, mwana wa Abrahamu..wav
ps: ˌabrahˈamu ˌakamzˈaa isˈaki, isˈaki ˌakamzˈaa jakˈobo, jakˈobo ˌakawazˈaa jˈuda na ndˈuɡu zˈake. 
RTF = 0.041515
Audio saved to: ./Demo/Swahili-Bible/MAT/althaf/Abrahamu akamzaa Isaki, Isaki akamzaa Yakobo, Yakobo akawazaa Yuda na ndugu zake..wav
ps: jˈuda ˌakawazˈaa perˈesi na zˈera, ambˈao mˈama jˈao ˌalikˈuwa tamˈari, perˈesi ˌakamzˈaa hesrˈoni, hesrˈoni ˌakamzˈaa arˈamu. 
RTF = 0.063048
Audio saved to: ./Demo/Swahili-Bible/MAT/althaf/Yuda akawazaa Peresi na Zera, ambao mama yao alikuwa Tamari, Peresi akamzaa Hesroni, Hesroni akamzaa Aramu..wav
ps: arˈamu ˌakamzˈaa ˌaminadˈabu, ˌaminadˈabu ˌakamzˈaa naʃˈoni, naʃˈoni ˌakamzˈaa salmˈoni. 
RTF = 0.114912
Audio saved to: ./Demo/Swahili-Bible/MAT/althaf/Aramu akamzaa Aminadabu, Aminadabu akamzaa Nashoni, Nashoni akamzaa Salmon

## S2S


In [5]:
import phonemizer

def s2s(
    text,
    ref_s,
    target_s,
    language,
    alpha=0.8,
    beta=0.1,
    diffusion_steps=10,
    embedding_scale=1,
    phonemes=False,
):
    global_phonemizer = phonemizer.backend.EspeakBackend(
        language=language, preserve_punctuation=True, with_stress=True
    )

    text = text.strip()
    if phonemes:
        ps = text
    else:
        ps = global_phonemizer.phonemize([text])[0]
    print(f"ps: {ps}")
    # ps = word_tokenize(ps[0])
    # ps = " ".join(ps)
    tokens = textcleaner(ps)
    tokens.insert(0, 0)
    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)

    with torch.no_grad():
        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
        text_mask = length_to_mask(input_lengths).to(device)

        t_en = model.text_encoder(tokens, input_lengths, text_mask)
        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

        s_pred = sampler(
            noise=torch.randn((1, 256)).unsqueeze(1).to(device),
            embedding=bert_dur,
            embedding_scale=embedding_scale,
            features=target_s,  # reference from the same speaker as the embedding
            num_steps=diffusion_steps,
        ).squeeze(1)

        ref = s_pred[:, :128]
        s = s_pred[:, 128:]
        
        # Ref depebds on target_styke
        ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
        s = beta * s + (1 - beta) * ref_s[:, 128:]

        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)

        x, _ = model.predictor.lstm(d)
        duration = model.predictor.duration_proj(x)

        duration = torch.sigmoid(duration).sum(axis=-1)
        pred_dur = torch.round(duration.squeeze()).clamp(min=1)

        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
        c_frame = 0
        for i in range(pred_aln_trg.size(0)):
            pred_aln_trg[i, c_frame : c_frame + int(pred_dur[i].data)] = 1
            c_frame += int(pred_dur[i].data)

        # encode prosody
        en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
        if model_params.decoder.type == "hifigan":
            asr_new = torch.zeros_like(en)
            asr_new[:, :, 0] = en[:, :, 0]
            asr_new[:, :, 1:] = en[:, :, 0:-1]
            en = asr_new

        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)

        asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
        if model_params.decoder.type == "hifigan":
            asr_new = torch.zeros_like(asr)
            asr_new[:, :, 0] = asr[:, :, 0]
            asr_new[:, :, 1:] = asr[:, :, 0:-1]
            asr = asr_new

        out = model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))

    return out.squeeze().cpu().numpy()[..., :-50]

In [14]:
### Angry
# reference_dicts = {
#     # "StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis." : "/home/s44504/StyleTTS2-clone/Demo/en-US-Madison/madison2althaf/madison2althaf_0.wav",
#     # "I am very angry at him! He pisses me off" : "/home/s44504/StyleTTS2-clone/Demo/en-US-Madison/madison2althaf/madison2althaf_1.wav",
#     "The field of astronomy is a joke! Its theories are based on flawed observations!" : "/home/s44504/StyleTTS2-clone/Demo/en-US-Madison/Angry-1.wav",
#     "This is absolutely unacceptable, and I won’t tolerate it any longer!" : "/home/s44504/StyleTTS2-clone/Demo/en-US-Madison/Angry-3.wav",
#     "Not at this particular case, Tom, apologized Whittemore." : "/home/s44504/StyleTTS2-clone/Demo/jeanie-Angry/amused_1-28_0002.wav",
# }

# ### Thalia
# reference_dicts = {
#     """ StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. """: "/home/s44504/StyleTTS2-clone/Demo/en-UK-Thalia/thalia/en-thalia_0.wav",
#     """ Then. whoosh! It takes off! """: "/home/s44504/StyleTTS2-clone/Demo/en-UK-Thalia/thalia/en-thalia_1.wav",
#     """ So he is convinced he can fix that. So he goes to work creating a red hut here. """: "/home/s44504/StyleTTS2-clone/Demo/en-UK-Thalia/thalia/en-thalia_2.wav",
#     """ This is Forester. He works out in the forest. """: "/home/s44504/StyleTTS2-clone/Demo/en-UK-Thalia/thalia/en-thalia_3.wav",
#     """ It brings him a lot of happiness to work with animals. """: "/home/s44504/StyleTTS2-clone/Demo/en-UK-Thalia/thalia/en-thalia_4.wav",
#     """ He checks on them every day to make sure that they are doing well. """: "/home/s44504/StyleTTS2-clone/Demo/en-UK-Thalia/thalia/en-thalia_5.wav",
# }

### Zak
# reference_dicts = {
    # """Hi there!""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-AU-Zak/zak/Hi there!.wav",
    # """ Then, whoosh! It takes off! """: "/home/s44504/StyleTTS2-clone/Demo/en-AU-Zak/zak/en-zak_1.wav",
    # """ So he is convinced he can fix that. So he goes to work creating a red hut here. """: "/home/s44504/StyleTTS2-clone/Demo/en-AU-Zak/zak/en-zak_2.wav",
    # """ This is Forester. He works out in the forest. """: "/home/s44504/StyleTTS2-clone/Demo/en-AU-Zak/zak/en-zak_3.wav",
    # """ It brings him a lot of happiness to work with animals. """: "/home/s44504/StyleTTS2-clone/Demo/en-AU-Zak/zak/en-zak_4.wav",
    # """ He checks on them every day to make sure that they are doing well. """: "/home/s44504/StyleTTS2-clone/Demo/en-AU-Zak/zak/en-zak_5.wav",
    # """ Diya never wants to talk in class. """: "/home/s44504/StyleTTS2-clone/Demo/en-AU-Zak/zak/en-zak_6.wav",
    # }

# Madison
# reference_dicts = {
#    "Learning is a journey filled with curiosity, wonder, and discovery. Every child deserves the opportunity to explore and grow at their own pace. By fostering a love for reading and numbers, we empower them to reach their fullest potential." : "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-US-Madison/madison/madison_reference.wav",
# }


### Swahili
# reference_dicts = {
#     """Habari za ukoo wa Yesu Kristo mwana wa Daudi, mwana wa Abrahamu.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_001_001_smooth.wav",
#     """Abrahamu akamzaa Isaki, Isaki akamzaa Yakobo, Yakobo akawazaa Yuda na ndugu zake.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_001_002_smooth.wav",
#     """Yuda akawazaa Peresi na Zera, ambao mama yao alikuwa Tamari, Peresi akamzaa Hesroni, Hesroni akamzaa Aramu.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_001_003_smooth.wav",
#     """"Aramu akamzaa Aminadabu, Aminadabu akamzaa Nashoni, Nashoni akamzaa Salmoni.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/OpenBible_Swahili_Althaf/MAT_clean/wavs/MAT_001_004_smooth.wav",
#     """a. sa. nte.""": "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/sw-TZ-Althaf-Syllables/wavs/sw-TZ-Victoria_syllable_1150_0.wav",
# }

# reference_dicts = {
#     """Ningependa kwenda shuleni leo. """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/Swahili-Bible-John/synthesized/ Ningependa kwenda shuleni leo. .wav",
#     """Mbwa wangu anapenda kula nyama. """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/Swahili-Bible-John/synthesized/ Mbwa wangu anapenda kula nyama. .wav",
#     """Ndizi zangu zimeiva vizuri. """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/Swahili-Bible-John/synthesized/ Ndizi zangu zimeiva vizuri. .wav",
#     """Mtoto anapiga ngoma kwa nguvu. """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/Swahili-Bible-John/synthesized/ Mtoto anapiga ngoma kwa nguvu. .wav",
#     """Mboga za kijani ni nzuri kwa afya. """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/Swahili-Bible-John/synthesized/ Mboga za kijani ni nzuri kwa afya. .wav",
#     """Tonge la ugali ni tamu. """: "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/Swahili-Bible-John/synthesized/ Tonge la ugali ni tamu. .wav",
# }

# Short Sentence
import pandas as pd
short_sentence = pd.read_csv("/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/eleven_labs/audio/zak-althaf-short-sentences/metadata.csv")

reference_dicts = {}
for datum in short_sentence[:30].iterrows():
    text = datum[1]["text"]
    path_to_audio = datum[1]["path_to_audio"]
    path_to_audio = "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/eleven_labs/audio/zak-althaf-short-sentences/" + path_to_audio
    reference_dicts[text] = path_to_audio

ref_althaf = compute_style(
    "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/id-ID-Althaf/concatenated_audio.wav"
)

idx = 0
for text, ref_path in reference_dicts.items():
    try:
        ref_path = compute_style(ref_path)
        wav = s2s(
            text,
            ref_path,
            ref_althaf,
            "en-gb",
            alpha=0.6,
            beta=0.2,
            diffusion_steps=10,
            phonemes=False
        )
    except Exception as e:
        continue
    
    # Apply fadeout (50ms)
    fade_length = int(SAMPLING_RATE * 0.1)  # 50ms
    fade_curve = np.linspace(1.0, 0.0, fade_length)
    wav[-fade_length:] *= fade_curve
    
    # Add 100ms silence
    silence_length = int(SAMPLING_RATE * 0.1)  # 100ms
    silence = np.zeros(silence_length)
    wav = np.concatenate([wav, silence])
    
    target_dir = Path("/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/StyleTTS2/Demo/en-AU-Zak/zak2althaf-short-sentences/speech2speech")
    os.makedirs(target_dir, exist_ok=True)
    target_path = f"{target_dir}/{text}.wav"
    sf.write(target_path, wav, SAMPLING_RATE)
    from audiostretchy.stretch import stretch_audio
    stretch_audio(target_path, target_path, ratio=0.8)
    
    idx += 1

ps: ɪt sˈɛd nˈəʊ. 
ps: ɐ wˈʊlf kˈʌmz. 
ps: ɐn ʌmbɹˈɛlə? 
ps: ðə kˈat ɹˈan. 
ps: dˈaɹən sˈɛd. 
ps: mˈʌm ɪz ɡlˈad. 
ps: ðə mˈan nˈɒdɪd. 
ps: zˈɒmbi ɐtˈak. 
ps: ɐ spˈɛʃəl flˈaʊə. 
ps: ɐ fˈɑːm dˈɒɡ. 
ps: ɪt lˈandz ˈɒn. 
ps: ɪt hˈɪt mˌiː. 
ps: tˈɪk ɪz sˈeɪf! 
ps: ɐ bˈɪɡ mˈɛs! 
ps: flˈaʊə hˈʌnt. 
ps: pˈat ðə kˈat. 
ps: hˈeɪ, pˈat! 
ps: jˈuːv ˈɜːnd ɪt. 
ps: ɪt swˈɪŋz. 
ps: wiː ɑː fˈɪʃ. 
ps: fˈʊl mˈɛtəl nˈɛd. 
ps: pˈʌpɪz bˈɜːθdeɪ. 
ps: maɪ bˈɛd? 
ps: ɪts ˌəʊkˈeɪ, ˈɛlɪfənt. 


In [17]:

audio_path = "/home/s44504/3b01c699-3670-469b-801f-13880b9cac56/en-Multi-Exclamation-24kHz/wavs/en-AU-Zak_line_0001.wav"
stretch_audio(audio_path, "output.wav", ratio=0.8)

In [None]:
reference_dicts = {
    " His name is Archie. He likes to make stuff.  Archie works all of the time. He has made bots to help him in his home. He has made a bot to do art. He must be fit to work well. He rides his bike. Archie made a van that helps.  He can use cogs and tubes to make wild bits and bobs. Will Archie make bots to help or for fun next? " : "/home/s44504/StyleTTS2/Demo/en-AU-Zak/zak/en-zak_0.wav",
    # "StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis." : "/home/s44504/StyleTTS2-clone/Demo/en-US-Madison/madison/madison_template_text_0.wav",
    # "I am very angry at him! He pisses me off" : "/home/s44504/StyleTTS2-clone/Demo/en-US-Madison/madison/madison_0.wav",
    # "The field of astronomy is a joke! Its theories are based on flawed observations!" : "/home/s44504/StyleTTS2-clone/Demo/en-US-Madison/madison/madison_1.wav",
    # "This is absolutely unacceptable, and I won’t tolerate it any longer!" : "/home/s44504/StyleTTS2-clone/Demo/en-US-Madison/madison/madison_2.wav"
}

ref_althaf = compute_style(
    "/home/s44504/StyleTTS2/Demo/id-ID-Althaf/concatenated_audio.wav"
)

idx = 0
for text, ref_path in reference_dicts.items():
    ref_path = compute_style(ref_path)
    wav = s2s(
        text,
        ref_path,
        ref_althaf,
        "en-gb",
        alpha=0.7,
        beta=0.3,
        diffusion_steps=10,
        phonemes=False,
    )
    
    target_dir = Path("Demo/en-AU-Zak/zak2althaf")
    os.makedirs(target_dir, exist_ok=True)
    sf.write(f"{target_dir}/zak2althaf_{idx}.wav", wav, SAMPLING_RATE)
    idx += 1