In [1]:
import torch
torch.manual_seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

import random
random.seed(0)

import numpy as np
np.random.seed(0)

%cd ..

# load packages
import time
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa
from nltk.tokenize import word_tokenize, TweetTokenizer 

import soundfile as sf
from pathlib import Path

from models import *
from utils import *
from text_utils import TextCleaner
textclenaer = TextCleaner()

%matplotlib inline

  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


/root/StyleTTS2
177


In [2]:
to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300
)
mean, std = -4, 4


def length_to_mask(lengths):
    mask = (
        torch.arange(lengths.max())
        .unsqueeze(0)
        .expand(lengths.shape[0], -1)
        .type_as(lengths)
    )
    mask = torch.gt(mask + 1, lengths.unsqueeze(1))
    return mask


def preprocess(wave):
    wave_tensor = torch.from_numpy(wave).float()
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
    return mel_tensor


def compute_style(path):
    wave, sr = librosa.load(path, sr=24000)
    audio, index = librosa.effects.trim(wave, top_db=30)
    if sr != 24000:
        audio = librosa.resample(audio, sr, 24000)
    mel_tensor = preprocess(audio).to(device)

    with torch.no_grad():
        ref_s = model.style_encoder(mel_tensor.unsqueeze(1))
        ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))

    return torch.cat([ref_s, ref_p], dim=1)

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# load phonemizer
import phonemizer

global_phonemizer = phonemizer.backend.EspeakBackend(
    language="en-us", preserve_punctuation=True, with_stress=True
)

config = yaml.safe_load(open("/root/StyleTTS2-en-Multi-id-Althaf/config_ft_en_multi_id_althaf.yml"))
# config = yaml.safe_load(open("/root/StyleTTS2-en-US-Madison/config_ft_madison.yml"))

# load pretrained ASR model
ASR_config = config.get("ASR_config", False)
ASR_path = config.get("ASR_path", False)
text_aligner = load_ASR_models(ASR_path, ASR_config)

# load pretrained F0 model
F0_path = config.get("F0_path", False)
pitch_extractor = load_F0_models(F0_path)

# load BERT model
from Utils.PLBERT.util import load_plbert

BERT_path = config.get("PLBERT_dir", False)
plbert = load_plbert(BERT_path)

model_params = recursive_munch(config["model_params"])
model = build_model(model_params, text_aligner, pitch_extractor, plbert)
_ = [model[key].eval() for key in model]
_ = [model[key].to(device) for key in model]

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
params_whole = torch.load("/root/StyleTTS2-en-Multi-id-Althaf/epoch_2nd_00024.pth", map_location="cpu")
# params_whole = torch.load("/root/StyleTTS2-en-US-Madison/epoch_2nd_00099.pth", map_location="cpu")
params = params_whole["net"]

In [5]:
for key in model:
    if key in params:
        print("%s loaded" % key)
        try:
            model[key].load_state_dict(params[key])
        except:
            from collections import OrderedDict

            state_dict = params[key]
            new_state_dict = OrderedDict()
            for k, v in state_dict.items():
                name = k[7:]  # remove `module.`
                new_state_dict[name] = v
            # load params
            model[key].load_state_dict(new_state_dict, strict=False)
#             except:
#                 _load(params[key], model[key])
_ = [model[key].eval() for key in model]

bert loaded
bert_encoder loaded
predictor loaded
decoder loaded
text_encoder loaded
predictor_encoder loaded
style_encoder loaded
diffusion loaded
text_aligner loaded
pitch_extractor loaded
mpd loaded
msd loaded
wd loaded


In [6]:
from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule

sampler = DiffusionSampler(
    model.diffusion.diffusion,
    sampler=ADPM2Sampler(),
    sigma_schedule=KarrasSchedule(
        sigma_min=0.0001, sigma_max=3.0, rho=9.0
    ),  # empirical parameters
    clamp=False,
)

In [7]:
tk = TweetTokenizer() 
def inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1):
    text = text.strip()
    ps = global_phonemizer.phonemize([text])
    ps = tk.tokenize(ps[0])
    ps = " ".join(ps)
    tokens = textclenaer(ps)
    tokens.insert(0, 0)
    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)

    with torch.no_grad():
        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
        text_mask = length_to_mask(input_lengths).to(device)

        t_en = model.text_encoder(tokens, input_lengths, text_mask)
        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

        s_pred = sampler(
            noise=torch.randn((1, 256)).unsqueeze(1).to(device),
            embedding=bert_dur,
            embedding_scale=embedding_scale,
            features=ref_s,
            num_steps=diffusion_steps,
        ).squeeze(1)

        s = s_pred[:, 128:]
        ref = s_pred[:, :128]

        ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
        s = beta * s + (1 - beta) * ref_s[:, 128:]

        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)

        x, _ = model.predictor.lstm(d)
        duration = model.predictor.duration_proj(x)

        duration = torch.sigmoid(duration).sum(axis=-1)
        pred_dur = torch.round(duration.squeeze()).clamp(min=1)

        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
        c_frame = 0
        for i in range(pred_aln_trg.size(0)):
            pred_aln_trg[i, c_frame : c_frame + int(pred_dur[i].data)] = 1
            c_frame += int(pred_dur[i].data)

        # encode prosody
        en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
        if model_params.decoder.type == "hifigan":
            asr_new = torch.zeros_like(en)
            asr_new[:, :, 0] = en[:, :, 0]
            asr_new[:, :, 1:] = en[:, :, 0:-1]
            en = asr_new

        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)

        asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
        if model_params.decoder.type == "hifigan":
            asr_new = torch.zeros_like(asr)
            asr_new[:, :, 0] = asr[:, :, 0]
            asr_new[:, :, 1:] = asr[:, :, 0:-1]
            asr = asr_new

        out = model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))

    return (
        out.squeeze().cpu().numpy()[..., :-50]
    )  # weird pulse at the end of the model, need to be fixed later

## Synthesize Speech

In [33]:
from pathlib import Path
import os
import soundfile as sf


def synthesize_speech(reference_dicts):
    start = time.time()
    noise = torch.randn(1, 1, 256).to(device)
    for path, text in reference_dicts.items():
        try:
            # Convert string path to Path object for easier manipulation
            ref_s = compute_style(path)
            path = Path(path)
            # Create the output directory based on the reference path
            # output_directory = path.parent / "synthesized_en_madison"
            output_directory = path.parent / "synthesized_multilingual_en_id"
            os.makedirs(output_directory, exist_ok=True)

            wav = inference(
                text, ref_s, alpha=0.0, beta=0.0, diffusion_steps=8, embedding_scale=1
            )

            rtf = (time.time() - start) / (len(wav) / 24000)
            print(f"RTF = {rtf:5f}")
            import IPython.display as ipd

            # print(k + " Synthesized:")
            # display(ipd.Audio(wav, rate=24000, normalize=False))
            sf.write(f"{output_directory}/{path.name}", wav, 24000)
        except Exception as e:
            print(e)

        # print("Reference:")
        # display(ipd.Audio(path, rate=24000, normalize=False))

### US MADISON

#### Angry

In [42]:
reference_dicts = {}
        
# reference_dicts = {
#     "/root/StyleTTS2/Demo/US-Madison/Happy.wav": """“ I’m absolutely over the moon right now! This moment has filled me with overwhelming joy and gratitude!" said he happily. """,
#     "/root/StyleTTS2/Demo/US-Madison/Happy_concatenated_audio.wav": """ The joyous laughter of children echoed through the park, filling the air with a symphony of bliss! said he joyfully """,
#     "/root/StyleTTS2/Demo/US-Madison/Angry.wav": """" The field of astronomy is a joke! Its theories are based on flawed observations!" she said, angrily. """,
#     "/root/StyleTTS2/Demo/US-Madison/Sad.wav": """ "My heart is heavy with profound sadness. This moment has left me feeling deeply sorrowful and lost." said he in a sad tone. """,
#     "/root/StyleTTS2/Demo/US-Madison/Whisper.wav": """ "Did you hear what happened? Keep this a secret, okay.” he whispered softly. """,
#     "/root/StyleTTS2/Demo/US-Madison/Shouting.wav": """ “Watch out! There’s a car coming from the right side, and it’s moving fast!” she shouted. """,
#     "/root/StyleTTS2/Demo/US-Madison/Shouting Angry.wav": """ The field of astronomy is a joke! Its theories are based on flawed observations!" she said, angrily. """,
# }

reference_dicts = {
    # "/root/StyleTTS2/Demo/US-Madison/Happy.wav": """ The joyous laughter of children echoed through the park! filling the air with a symphony of bliss! """,
    # "/root/StyleTTS2/Demo/US-Madison/Happy_concatenated_audio.wav": """ The joyous laughter of children echoed through the park! filling the air with a symphony of bliss! """,
    # "/root/StyleTTS2/Demo/US-Madison/Angry.wav": """ I can’t believe this happened! This is absolutely infuriating! """,
    # "/root/StyleTTS2/Demo/US-Madison/Angry_concatenated_audio.wav": """ I can’t believe this happened! This is absolutely infuriating! """,
    # "/root/StyleTTS2/Demo/US-Madison/Sad.wav": """ I feel utterly heartbroken. This news has left me devastated. """,
    "/root/StyleTTS2/Demo/US-Madison/Adrian_Whisper_cleaned.wav": """ Did you hear that? I think I heard someone's coming. """,
    "/root/StyleTTS2/Demo/US-Madison/Madison_whisper.wav": """ Did you hear that? I think I heard someone's coming. """,
    "/root/StyleTTS2/Demo/US-Madison/David_Eleven_Labs_Whisper.wav": """ Did you hear that? I think I heard someone's coming. """,
    "/root/StyleTTS2/Demo/US-Madison/Whisper_adrian_concat.mp3": """ Don’t let anyone else know about this. Meet me quietly in the hallway. """,
    "/root/StyleTTS2/Demo/US-Madison/Whisper_adrian_concat_2.mp3": """ Don’t let anyone else know about this. Meet me quietly in the hallway. """,
    "/root/StyleTTS2/Demo/US-Madison/Whisper_concatenated_audio.wav": """ Don’t let anyone else know about this. Meet me quietly in the hallway. """,
    # "/root/StyleTTS2/Demo/US-Madison/Shouting.wav": """ Hey! Look out for the incoming tiger! """,
    # "/root/StyleTTS2/Demo/US-Madison/Shouting_concatenated_audio.wav": """ Hey! Look out for the incoming tiger! """,
    # "/root/StyleTTS2/Demo/US-Madison/Shouting-Angry-1.wav": """ I am so angry right now! I don't want to see him anymore! """,
    # "/root/StyleTTS2/Demo/US-Madison/Shouting-Angry_concatenated_audio.wav": """ I am so angry right now! I don't want to see him anymore! """,
}

# reference_dicts = {
#     "/root/StyleTTS2/Demo/US-Madison/Happy.wav": """ I just got amazing news! Today is such a joyful day! """,
#     "/root/StyleTTS2/Demo/US-Madison/Angry.wav": """ I can’t believe this happened! This is absolutely infuriating! """,
#     "/root/StyleTTS2/Demo/US-Madison/Shouting Angry.wav": """ I can’t believe this happened! This is absolutely infuriating! """,
#     "/root/StyleTTS2/Demo/US-Madison/Sad.wav": """ I feel utterly heartbroken. This news has left me devastated. """,
#     "/root/StyleTTS2/Demo/US-Madison/Whisper.wav": """ Don’t let anyone else know about this. Meet me quietly in the hallway. """,
#     "/root/StyleTTS2/Demo/US-Madison/Shouting.wav": """ Get out of the way right now! Everyone, listen up, we need to move quickly! """,
# }

In [43]:
synthesize_speech(reference_dicts)

RTF = 0.042922
RTF = 0.082087
RTF = 0.174904
RTF = 0.122203
RTF = 0.168355
RTF = 0.228435


# SpeechToSpeech

In [78]:
tk = TweetTokenizer() 
def s2s(text, ref_s, target_s, alpha=0.8, beta=0.1, diffusion_steps=10, embedding_scale=1):
    text = text.strip()
    ps = global_phonemizer.phonemize([text])
    ps = tk.tokenize(ps[0])
    ps = " ".join(ps)
    tokens = textclenaer(ps)
    tokens.insert(0, 0)
    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)

    with torch.no_grad():
        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
        text_mask = length_to_mask(input_lengths).to(device)

        t_en = model.text_encoder(tokens, input_lengths, text_mask)
        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

        s_pred = sampler(
            noise=torch.randn((1, 256)).unsqueeze(1).to(device),
            embedding=bert_dur,
            embedding_scale=embedding_scale,
            features=target_s,  # reference from the same speaker as the embedding
            num_steps=diffusion_steps,
        ).squeeze(1)

        ref = s_pred[:, :128]
        s = s_pred[:, 128:]

        ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
        s = beta * s + (1 - beta) * ref_s[:, 128:]

        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)

        x, _ = model.predictor.lstm(d)
        duration = model.predictor.duration_proj(x)

        duration = torch.sigmoid(duration).sum(axis=-1)
        pred_dur = torch.round(duration.squeeze()).clamp(min=1)

        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
        c_frame = 0
        for i in range(pred_aln_trg.size(0)):
            pred_aln_trg[i, c_frame : c_frame + int(pred_dur[i].data)] = 1
            c_frame += int(pred_dur[i].data)

        # encode prosody
        en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
        if model_params.decoder.type == "hifigan":
            asr_new = torch.zeros_like(en)
            asr_new[:, :, 0] = en[:, :, 0]
            asr_new[:, :, 1:] = en[:, :, 0:-1]
            en = asr_new

        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)

        asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
        if model_params.decoder.type == "hifigan":
            asr_new = torch.zeros_like(asr)
            asr_new[:, :, 0] = asr[:, :, 0]
            asr_new[:, :, 1:] = asr[:, :, 0:-1]
            asr = asr_new

        out = model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))

    return out.squeeze().cpu().numpy()[..., :-50]

In [82]:
text = "Did you hear that? I think I heard someone's coming."
# text = "Did you hear that? I think I heard something. Can you keep a secret? I think someone's coming, so be quiet. We have to be very careful from here on out. I need to tell you something important."
ref_david = compute_style("/root/StyleTTS2/Demo/US-Madison/David_Eleven_Labs_Whisper_Short.wav")
ref_madison = compute_style("/root/StyleTTS2/Demo/US-Madison/madison_concatenated_audio.wav")

wav = s2s(text, ref_david, ref_madison, alpha=0.7, beta=0.9, diffusion_steps=10)
sf.write(f'/root/StyleTTS2/Demo/US-Madison/david-madison-s2s.wav', wav, 24000)

### UK THALIA

In [11]:
reference_dicts = {}

dir_path = Path("Demo/en-UK-Thalia")

# Iterate through each file in the directory
for k, file_path in enumerate(sorted(dir_path.iterdir())):
    if file_path.is_file():
        reference_dicts[k] = file_path

FileNotFoundError: [Errno 2] No such file or directory: 'Demo/en-UK-Thalia'

In [None]:
synthesize_speech(reference_dicts)

### AU Zak

In [None]:
reference_dicts = {}

dir_path = Path("Demo/en-AU-Zak")

# Iterate through each file in the directory
for k, file_path in enumerate(sorted(dir_path.iterdir())):
    if file_path.is_file():
        reference_dicts[k] = file_path

In [None]:
synthesize_speech(reference_dicts)

### ID ALTHAF

In [None]:
reference_dicts = {}

dir_path = Path("Demo/id-ID-Althaf")

# Iterate through each file in the directory
for k, file_path in enumerate(sorted(dir_path.iterdir())):
    if file_path.is_file():
        reference_dicts[k] = file_path

In [None]:
synthesize_speech(reference_dicts)

### EN Althaf S2S

In [None]:
reference_dicts = {}

dir_path = Path("Demo/en-Althaf-S2S")

# Iterate through each file in the directory
for k, file_path in enumerate(sorted(dir_path.iterdir())):
    if file_path.is_file():
        reference_dicts[k] = file_path

In [None]:
synthesize_speech(reference_dicts)

### SW Victoria

In [None]:
reference_dicts = {}

dir_path = Path("Demo/sw-TZ-Victoria")

# Iterate through each file in the directory
for k, file_path in enumerate(sorted(dir_path.iterdir())):
    if file_path.is_file():
        reference_dicts[k] = file_path

In [None]:
synthesize_speech(reference_dicts)