In [None]:
import os
import glob
import torch
from tqdm import tqdm
from underthesea import sent_tokenize
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# torch device strings are lowercase; the fallback must be "cpu" or
# XTTS_MODEL_norm.to(device) fails on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
checkpoint_directory = "./your_checkpoint_directory"


def _checkpoint_step(path):
    """Return the integer suffix of a ``checkpoint_<suffix>.pth`` file name.

    Returns -1 when the suffix is not an integer, so such files rank below
    every numbered checkpoint.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    try:
        return int(stem.rsplit("_", 1)[-1])
    except ValueError:
        return -1


# glob.glob() returns matches in arbitrary order, so taking the last element
# does not reliably give the newest checkpoint. Select the highest numeric
# suffix instead (ties broken by path so the choice is deterministic).
# Assumes the suffix is the training step -- TODO confirm for your trainer.
checkpoint_candidates = glob.glob(os.path.join(checkpoint_directory, "checkpoint_*.pth"))
if not checkpoint_candidates:
    raise FileNotFoundError(
        f"No 'checkpoint_*.pth' files found in {checkpoint_directory!r}"
    )
xtts_checkpoint = max(checkpoint_candidates, key=lambda path: (_checkpoint_step(path), path))
xtts_config = os.path.join(checkpoint_directory, "config.json")
xtts_vocab = os.path.join(checkpoint_directory, "vocab.json")

# Load model: read the JSON config, build the model from it, restore the
# selected checkpoint together with its vocabulary, then move it to `device`.
config = XttsConfig()
config.load_json(xtts_config)
XTTS_MODEL_norm = Xtts.init_from_config(config)
XTTS_MODEL_norm.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
XTTS_MODEL_norm.to(device)

print("Model loaded successfully!")

In [None]:
from phonemizer import Transcriptor
from IPython.display import Audio

# Reference recording used to compute the speaker conditioning below.
speaker_audio_file = "path_to_speaker.wav" # replace with your wav file

# Input text, converted to IPA before synthesis.
# NOTE(review): the "+" characters presumably mark stressed vowels for the
# transcriptor -- confirm against the phonemizer module.
tts_text = "прив+іт, +як +у т+ебе спр+ави?"
tts_text = Transcriptor(tts_text).to_ipa()

# Conditioning latents are computed once from the reference audio, using the
# lengths and normalisation flag stored in the loaded model's config.
model_config = XTTS_MODEL_norm.config
gpt_cond_latent, speaker_embedding = XTTS_MODEL_norm.get_conditioning_latents(
    audio_path=speaker_audio_file,
    gpt_cond_len=model_config.gpt_cond_len,
    max_ref_length=model_config.max_ref_len,
    sound_norm_refs=model_config.sound_norm_refs,
)

# Split the IPA string into sentences so each one is synthesised separately.
# NOTE(review): sent_tokenize comes from underthesea; confirm it segments
# IPA-transcribed text the way you expect.
tts_texts = sent_tokenize(tts_text)

# Decoding options shared by every sentence.
sampling_options = {
    "temperature": 1,
    "length_penalty": 1.0,
    "repetition_penalty": 2.0,
    "top_k": 1,
    "top_p": 0.3,
}

wav_chunks = []
for sentence in tqdm(tts_texts):
    synthesis = XTTS_MODEL_norm.inference(
        text=sentence,
        language="uk",
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        **sampling_options,
    )
    wav_chunks.append(torch.tensor(synthesis["wav"]))

# Join the per-sentence waveforms end to end, add a leading axis, and make
# sure the result lives on the CPU for playback.
joined_wav = torch.cat(wav_chunks, dim=0)
out_wav = joined_wav.unsqueeze(0).cpu()

# Last expression of the cell: renders the audio player.
Audio(out_wav, rate=24000)