In [1]:
# Check GPU
!nvidia-smi

Mon Oct 20 15:29:49 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.76.07              Driver Version: 581.08         CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...    On  |   00000000:01:00.0 Off |                  N/A |
| N/A   46C    P8              7W /   37W |     293MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [2]:
import time, os
import soundfile as sf
import torch
from TTS.api import TTS

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


# Quick test (pretrained VITS on VCTK)

In [3]:
model_name = "tts_models/en/vctk/vits"
tts = TTS(model_name)

 > tts_models/en/vctk/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.


In [4]:
speaker_id = "p225"

os.makedirs("../results/vits_foundation_setup/quick_test", exist_ok=True)
wav_path = "../results/vits_foundation_setup/quick_test/hello_vctk_p225.wav"

In [5]:
# Warmup (improves timing stability on GPU)
_ = tts.tts("warmup", speaker=speaker_id)

 > Text splitted to sentences.
['warmup']
 > Processing time: 0.41132450103759766
 > Real-time factor: 0.3404544011966602


In [6]:
# Time a single synthesis; sync for accurate RTF on GPU
text = "This is a quick baseline test using VITS on the VCTK dataset."
if torch.cuda.is_available():
    torch.cuda.synchronize()
t0 = time.time()
tts.tts_to_file(text=text, speaker=speaker_id, file_path=wav_path)
if torch.cuda.is_available():
    torch.cuda.synchronize()
elapsed = time.time() - t0

audio, sr = sf.read(wav_path)
duration = len(audio) / sr
rtf = elapsed / duration
print(f"Saved -> {wav_path}\nSynthesis: {elapsed:.3f}s  Audio: {duration:.3f}s  RTF: {rtf:.3f}")


 > Text splitted to sentences.
['This is a quick baseline test using VITS on the VCTK dataset.']
 > Processing time: 1.576317310333252
 > Real-time factor: 0.34987313469206194
Saved -> ../results/vits_foundation_setup/quick_test/hello_vctk_p225.wav
Synthesis: 1.590s  Audio: 4.505s  RTF: 0.353


# Batch test on real VCTK samples

In [7]:
import os, glob, time
import soundfile as sf
from tqdm import tqdm
from TTS.api import TTS
import torch

In [8]:
import os

# Use relative path to data folder
VCTK_ROOT = "../data/VCTK-Corpus-0.92"
speaker_id = "p225"
WAV_DIR = f"{VCTK_ROOT}/wav48_silence_trimmed/{speaker_id}"
TXT_DIR = f"{VCTK_ROOT}/txt/{speaker_id}"

assert os.path.isdir(WAV_DIR), f"Missing WAV_DIR: {os.path.abspath(WAV_DIR)}"
assert os.path.isdir(TXT_DIR), f"Missing TXT_DIR: {os.path.abspath(TXT_DIR)}"

outs_dir = "../results/vits_foundation_setup/batch_test"
os.makedirs(outs_dir, exist_ok=True)

In [9]:
tts = TTS("tts_models/en/vctk/vits")

 > tts_models/en/vctk/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.


In [10]:
# small subset
wav_files = sorted(glob.glob(os.path.join(WAV_DIR, "*.flac")))[:10]
times, durations = [], []

# Warmup
_ = tts.tts("warmup", speaker=speaker_id)
if torch.cuda.is_available(): torch.cuda.synchronize()

wav_files

 > Text splitted to sentences.
['warmup']
 > Processing time: 0.2806849479675293
 > Real-time factor: 0.24161083317785842


['../data/VCTK-Corpus-0.92/wav48_silence_trimmed/p225/p225_001.flac',
 '../data/VCTK-Corpus-0.92/wav48_silence_trimmed/p225/p225_002.flac',
 '../data/VCTK-Corpus-0.92/wav48_silence_trimmed/p225/p225_003.flac',
 '../data/VCTK-Corpus-0.92/wav48_silence_trimmed/p225/p225_004.flac',
 '../data/VCTK-Corpus-0.92/wav48_silence_trimmed/p225/p225_005.flac',
 '../data/VCTK-Corpus-0.92/wav48_silence_trimmed/p225/p225_006.flac',
 '../data/VCTK-Corpus-0.92/wav48_silence_trimmed/p225/p225_007.flac',
 '../data/VCTK-Corpus-0.92/wav48_silence_trimmed/p225/p225_008.flac',
 '../data/VCTK-Corpus-0.92/wav48_silence_trimmed/p225/p225_009.flac',
 '../data/VCTK-Corpus-0.92/wav48_silence_trimmed/p225/p225_010.flac']

In [11]:
def read_text(p):
    with open(p, "r", encoding="utf-8") as f:
        return f.read().strip()

for wav in tqdm(wav_files, desc="Synth"):
    base = os.path.splitext(os.path.basename(wav))[0]
    txt = os.path.join(TXT_DIR, base + ".txt")
    if not os.path.exists(txt):
        continue
    text = read_text(txt)
    out_wav = os.path.join(outs_dir, f"{base}_gen.wav")

    if torch.cuda.is_available(): torch.cuda.synchronize()
    t0 = time.time()
    tts.tts_to_file(text=text, speaker=speaker_id, file_path=out_wav)
    if torch.cuda.is_available(): torch.cuda.synchronize()
    synth_t = time.time() - t0

    audio, sr = sf.read(out_wav)
    dur = len(audio) / sr

    times.append(synth_t)
    durations.append(dur)

total_t = sum(times)
total_d = sum(durations) if durations else 1e-9
print(f"Files: {len(times)} | Total synth: {total_t:.2f}s | Total audio: {total_d:.2f}s | Mean RTF: {total_t/total_d:.3f}")
print(f"Outputs in: {outs_dir}")

Synth:   0%|                                                                          | 0/10 [00:00<?, ?it/s]

 > Text splitted to sentences.
['Please call Stella.']


Synth:  10%|██████▌                                                           | 1/10 [00:00<00:04,  1.96it/s]

 > Processing time: 0.468916654586792
 > Real-time factor: 0.25715310967068156
 > Text splitted to sentences.
['Ask her to bring these things with her from the store.']


Synth:  20%|█████████████▏                                                    | 2/10 [00:01<00:06,  1.23it/s]

 > Processing time: 0.9838624000549316
 > Real-time factor: 0.2901530858283121
 > Text splitted to sentences.
['Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob.']


Synth:  30%|███████████████████▊                                              | 3/10 [00:04<00:11,  1.63s/it]

 > Processing time: 2.5432965755462646
 > Real-time factor: 0.35215317925999157
 > Text splitted to sentences.
['We also need a small plastic snake and a big toy frog for the kids.']


Synth:  40%|██████████████████████████▍                                       | 4/10 [00:05<00:09,  1.63s/it]

 > Processing time: 1.585362195968628
 > Real-time factor: 0.3131470942123069
 > Text splitted to sentences.
['She can scoop these things into three red bags, and we will go meet her Wednesday at the train station.']


Synth:  50%|█████████████████████████████████                                 | 5/10 [00:07<00:09,  1.84s/it]

 > Processing time: 2.1636359691619873
 > Real-time factor: 0.33635203835322774
 > Text splitted to sentences.
['When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow.']


Synth:  60%|███████████████████████████████████████▌                          | 6/10 [00:09<00:07,  1.86s/it]

 > Processing time: 1.844778060913086
 > Real-time factor: 0.32893450190139045
 > Text splitted to sentences.
['The rainbow is a division of white light into many beautiful colors.']


Synth:  70%|██████████████████████████████████████████████▏                   | 7/10 [00:11<00:05,  1.71s/it]

 > Processing time: 1.360849380493164
 > Real-time factor: 0.3133534757714523
 > Text splitted to sentences.
['These take the shape of a long round arch, with its path high above, and its two ends apparently beyond the horizon.']


Synth:  80%|████████████████████████████████████████████████████▊             | 8/10 [00:13<00:04,  2.01s/it]

 > Processing time: 2.600792407989502
 > Real-time factor: 0.34781339517326854
 > Text splitted to sentences.
['There is , according to legend, a boiling pot of gold at one end.']


Synth:  90%|███████████████████████████████████████████████████████████▍      | 9/10 [00:15<00:01,  1.81s/it]

 > Processing time: 1.3318870067596436
 > Real-time factor: 0.3042444523770319
 > Text splitted to sentences.
['People look, but no one ever finds it.']


Synth: 100%|█████████████████████████████████████████████████████████████████| 10/10 [00:16<00:00,  1.61s/it]

 > Processing time: 0.7823607921600342
 > Real-time factor: 0.29677703459827887
Files: 10 | Total synth: 15.81s | Total audio: 48.37s | Mean RTF: 0.327
Outputs in: ../results/vits_foundation_setup/batch_test





In [12]:
from IPython.display import Audio, display
import os

folder = "../results/vits_foundation_setup/batch_test"
for filename in sorted(os.listdir(folder)):
    if filename.endswith((".wav", ".mp3", ".flac")):
        path = os.path.join(folder, filename)
        print(f"▶️ {filename}")
        display(Audio(path))

▶️ p225_001_gen.wav


▶️ p225_002_gen.wav


▶️ p225_003_gen.wav


▶️ p225_004_gen.wav


▶️ p225_005_gen.wav


▶️ p225_006_gen.wav


▶️ p225_007_gen.wav


▶️ p225_008_gen.wav


▶️ p225_009_gen.wav


▶️ p225_010_gen.wav


# Profiling hotspots (PyTorch Profiler + TensorBoard)

In [13]:
import torch
from torch.profiler import profile, record_function, ProfilerActivity
from TTS.api import TTS

In [14]:
import os, socket, torch
from torch.profiler import (
    profile, record_function, ProfilerActivity,
    schedule, tensorboard_trace_handler
)

text = "Profiling the VITS baseline on VCTK."
speaker = "p225"

def synth_once():
    _ = tts.tts(text=text, speaker=speaker)
    if torch.cuda.is_available():
        torch.cuda.synchronize()

logdir = os.path.abspath("../results/vits_foundation_setup/traces_vits")
os.makedirs(logdir, exist_ok=True)

activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)

# A small schedule so TB gets something meaningful:
sched = schedule(wait=1, warmup=1, active=2, repeat=1)
run_name = f"{socket.gethostname()}_{speaker}_vits"

with profile(
    activities=activities,
    schedule=sched,
    on_trace_ready=tensorboard_trace_handler(logdir, worker_name=run_name),
    record_shapes=True,
    with_modules=True,
    profile_memory=True,
    with_stack=True,
) as prof:
    # total steps = (wait + warmup + active) * repeat = 4
    # add 1–2 extra steps to be safe
    total_steps = 6
    for step in range(total_steps):
        with record_function("VITS_synthesis"):
            synth_once()
        prof.step()


 > Text splitted to sentences.
['Profiling the VITS baseline on VCTK.']
 > Processing time: -0.08645391464233398
 > Real-time factor: -0.026031090477707346
 > Text splitted to sentences.
['Profiling the VITS baseline on VCTK.']
 > Processing time: 1.0034449100494385
 > Real-time factor: 0.2909472999499016
 > Text splitted to sentences.
['Profiling the VITS baseline on VCTK.']


[W1020 15:31:17.419871308 CPUAllocator.cpp:245] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event


 > Processing time: 1.0648162364959717
 > Real-time factor: 0.3097846476506251
 > Text splitted to sentences.
['Profiling the VITS baseline on VCTK.']
 > Processing time: 1.0454468727111816
 > Real-time factor: 0.29810810499795104
 > Text splitted to sentences.
['Profiling the VITS baseline on VCTK.']
 > Processing time: -0.1366443634033203
 > Real-time factor: -0.04002401983319889
 > Text splitted to sentences.
['Profiling the VITS baseline on VCTK.']
 > Processing time: 1.1164164543151855
 > Real-time factor: 0.3141844856244875


In [15]:
%load_ext tensorboard
%tensorboard --logdir ../results/vits_foundation_setup/traces_vits --port 6006