In [1]:
import os

import librosa
import numpy as np
import pandas as pd
from pydub import AudioSegment
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import (
    GPTArgs,
    GPTTrainer,
    GPTTrainerConfig,
)
from TTS.tts.models.xtts import XttsAudioConfig
from TTS.utils.manage import ModelManager

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
OUTPUT_PATH = "../data/train/"
os.makedirs(OUTPUT_PATH, exist_ok=True)

CHEKPOINTS_PATH = "../data/orig_models"
os.makedirs(CHEKPOINTS_PATH, exist_ok=True)

### Download original models

In [None]:
# DVAE files
DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"

# Set the path to the downloaded files
DVAE_CHECKPOINT = os.path.join(CHEKPOINTS_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
MEL_NORM_FILE = os.path.join(CHEKPOINTS_PATH, os.path.basename(MEL_NORM_LINK))

# download DVAE files if needed
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
    print(" > Downloading DVAE files!")
    ModelManager._download_model_files(
        [MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHEKPOINTS_PATH, progress_bar=True
    )

In [None]:
# Download XTTS v2.0 checkpoint if needed
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"

# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning.
TOKENIZER_FILE = os.path.join(
    CHEKPOINTS_PATH, os.path.basename(TOKENIZER_FILE_LINK)
)  # vocab.json file
XTTS_CHECKPOINT = os.path.join(
    CHEKPOINTS_PATH, os.path.basename(XTTS_CHECKPOINT_LINK)
)  # model.pth file

# download XTTS v2.0 files if needed
if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
    print(" > Downloading XTTS v2.0 files!")
    ModelManager._download_model_files(
        [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHEKPOINTS_PATH, progress_bar=True
    )

In [5]:
SPEAKER_REFERENCE = "../data/speaker_sample.wav"
speaker_sample = AudioSegment.from_file(SPEAKER_REFERENCE)

In [7]:
# max_sample_length
print("max_sample_length: ", librosa.load(SPEAKER_REFERENCE)[0].shape[0])
# max wav length
MAX_LENGTH = int(12.5 * 22050 + 5000)
print("maxt_text_wav: ", MAX_LENGTH)

# max text length

data = pd.read_csv("../data/train_data/train_dataset.csv")
print("maxt_text_length: ", np.max([len(x) for x in data["Transcription"]]))

max_sample_length:  374850
maxt_text_wav:  280625
maxt_text_length:  195


In [None]:
OPTIMIZER_WD_ONLY_ON_WEIGHTS = True
START_WITH_EVAL = True
BATCH_SIZE = 1
GRAD_ACUMM_STEPS = 252
LANGUAGE = "ru"

model_args = GPTArgs(
    max_conditioning_length=375000,  # the audio you will use for conditioning latents should be less than this
    min_conditioning_length=66150,  # and more than this
    debug_loading_failures=True,  # this will print output to console and help you find problems in your ds
    max_wav_length=MAX_LENGTH,  # set this to >= the longest audio in your dataset
    max_text_length=200,
    mel_norm_file=MEL_NORM_FILE,
    dvae_checkpoint=DVAE_CHECKPOINT,
    xtts_checkpoint=XTTS_CHECKPOINT,
    tokenizer_file=TOKENIZER_FILE,
    gpt_num_audio_tokens=1026,
    gpt_start_audio_token=1024,
    gpt_stop_audio_token=1025,
    gpt_use_masking_gt_prompt_approach=True,
    gpt_use_perceiver_resampler=True,
)

audio_config = XttsAudioConfig(dvae_sample_rate=16000, output_sample_rate=24000)


In [9]:
RUN_NAME = "IM-FT-v1"
PROJECT_NAME = "dls-vd"
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None

In [None]:
config = GPTTrainerConfig(
    run_eval=True,
    epochs=1000,  # assuming you want to end training manually w/ keyboard interrupt
    output_path=OUTPUT_PATH,
    model_args=model_args,
    run_name=RUN_NAME,
    project_name=PROJECT_NAME,
    run_description="""
        GPT XTTS training
        """,
    dashboard_logger=DASHBOARD_LOGGER,
    logger_uri=LOGGER_URI,
    audio=audio_config,
    batch_size=BATCH_SIZE,
    batch_group_size=48,
    eval_batch_size=BATCH_SIZE,
    num_loader_workers=8,  # consider decreasing if your jupyter env is crashing or similar
    eval_split_max_size=256,
    print_step=50,
    plot_step=100,
    log_model_step=1000,
    save_step=10000,  # ?
    save_n_checkpoints=1,  # if you want to store multiple checkpoint rather than just 1, increase this
    save_checkpoints=False,
    print_eval=False,
    optimizer="AdamW",
    optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
    optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
    lr=5e-06,
    lr_scheduler="MultiStepLR",
    lr_scheduler_params={
        "milestones": [50000 * 18, 150000 * 18, 300000 * 18],
        "gamma": 0.5,
        "last_epoch": -1,
    },
    test_sentences=[
        {
            "text": "Заставить бояться или уважать. Что ценнее? Я считаю все вместе. С этой мыслью я хочу представить вам новую жемчужину Старкс Индастрис свобода. Это первая ракетная система с репульсорной технологией запуска и наведения",
            "speaker_wav": SPEAKER_REFERENCE,
            "language": LANGUAGE,
        },
        {
            "text": "Никогда не сдавайтесь, идите к своей цели! А если будет сложно – сдавайтесь. А после плотно покушайте и ляжте спать.",
            "speaker_wav": SPEAKER_REFERENCE,
            "language": LANGUAGE,
        },
    ],
)

model = GPTTrainer.init_from_config(config)

In [None]:
TRAINING_DIR = "../data/train_data"
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="train_dataset.csv",
    language=LANGUAGE,
    path=TRAINING_DIR,
)
train_samples, eval_samples = load_tts_samples(
    dataset_config, eval_split=True, eval_split_size=0.05
)  # off eval split because of small dataset size

In [43]:
trainer = Trainer(
    TrainerArgs(
        restore_path=None,
        skip_train_epoch=False,
        start_with_eval=START_WITH_EVAL,
        grad_accum_steps=GRAD_ACUMM_STEPS,
    ),
    config,
    output_path=OUTPUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 32


 | > Num. of Torch Threads: 1
 | > Torch seed: 1
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=../data/train/IM-FT-v1-July-11-2025_07+44PM-9aa8666

 > Model has 518442047 parameters


In [None]:
trainer.fit()

### Inference

In [3]:
checkpoint_path = "../data/checkpoints/best_model_2772.pth"
config_file = "../data/train/IM-FT-v1-July-11-2025_07+44PM-9aa8666/config.json"
vocab_json = "../data/orig_models/vocab.json"
SPEAKER_REFERENCE = "../data/speaker_sample.wav"

In [1]:
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
output_wav = "./xtts-ft.wav"

print("Loading model...")
config = XttsConfig()
config.load_json(config_file)
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config, checkpoint_path=checkpoint_path, vocab_path=vocab_json, use_deepspeed=False
)
model.cuda()


Loading model...


Xtts(
  (gpt): GPT(
    (conditioning_encoder): ConditioningEncoder(
      (init): Conv1d(80, 1024, kernel_size=(1,), stride=(1,))
      (attn): Sequential(
        (0): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttentionLegacy()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
        )
        (1): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttentionLegacy()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
        )
        (2): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttentionLegacy()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(

In [None]:
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
    audio_path=[SPEAKER_REFERENCE]
)

In [None]:
text = "Никогда не сдавайтесь, идите к своей цели! А если будет сложно – сдавайтесь. А после плотно покушайте и ляжте спать."

In [29]:
out = model.inference(
    text,
    "ru",
    gpt_cond_latent,
    speaker_embedding,
    temperature=0.2,  # Add custom parameters here
)


In [16]:
from IPython.display import Audio

In [30]:
Audio(out["wav"], rate=24000)

In [23]:
from TTS.api import TTS

In [24]:
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts_model = TTS(model_name).to("cuda")

In [None]:
audio = tts_model.tts(
    text=text,
    speaker_wav=SPEAKER_REFERENCE,
    language="ru",
    temperature=0.2,
)

In [32]:
Audio(audio, rate=24000)