# Evaluate XTTSv2

Download `balacoon/speech_gen_eval_testsets`, run inference with XTTSv2, run the evaluation, and upload the results to the leaderboard.

In [None]:
import os
from huggingface_hub import snapshot_download

# Fetch the evaluation test sets from the Hugging Face Hub only once;
# an already existing local directory is reused as-is.
already_downloaded = os.path.isdir("speech_gen_eval_testsets")
if not already_downloaded:
    snapshot_download(
        repo_id="balacoon/speech_gen_eval_testsets",
        repo_type="dataset",
        local_dir="speech_gen_eval_testsets",
    )

Setting up the XTTS environment:
```bash
conda create -n xtts python=3.10
conda activate xtts
pip install git+https://github.com/coqui-ai/TTS
# had to downgrade torch
pip install torch==2.1 torchaudio==2.1
```

In [None]:
# Load the XTTSv2 model
# https://github.com/coqui-ai/TTS/blob/dev/docs/source/models/xtts.md#single-reference-1
import os

# Restrict the process to a single GPU. The variable only has an effect if it
# is set before CUDA gets initialised, so it is exported ahead of importing
# the torch-backed TTS package rather than after it.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

In [None]:
import os

import tqdm

# Run the synthesis for the vctk fold.
# Output goes to xtts/vctk/wav/<utterance id>.wav; utterances whose output
# file already exists are skipped, so the cell can be re-run after an interruption.
fold = "vctk"
out_dir = os.path.join("xtts", fold, "wav")
ref_dir = os.path.join("speech_gen_eval_testsets", fold, "wav")
os.makedirs(out_dir, exist_ok=True)

# id_mapping holds one whitespace-separated pair per line: the utterance id
# and the name (without ".wav") of the reference recording used as speaker prompt.
mapping = {}
with open(os.path.join("speech_gen_eval_testsets", fold, "id_mapping"), "r") as fp:
    for line in fp:
        if not line.strip():
            continue  # tolerate blank / trailing empty lines
        k, v = line.strip().split()
        mapping[k] = v

# test holds one "<utterance id>\t<text>" entry per line.
with open(os.path.join("speech_gen_eval_testsets", fold, "test"), "r") as fp:
    for line in tqdm.tqdm(fp):
        if not line.strip():
            continue  # tolerate blank / trailing empty lines
        # `utt_id` rather than `id`, so the builtin is not shadowed kernel-wide
        utt_id, txt = line.strip().split("\t", 1)
        out_path = os.path.join(out_dir, utt_id + ".wav")
        if os.path.exists(out_path):
            continue
        tts.tts_to_file(
            text=txt,
            file_path=out_path,
            speaker_wav=[os.path.join(ref_dir, mapping[utt_id] + ".wav")],
            split_sentences=False,
            language="en"
        )


In [None]:
import tqdm
import re
import soundfile as sf
import numpy as np
import os

# Run the synthesis for the daps_celeb fold.
# Unfortunately, the model can only generate texts up to 250 characters and
# daps_celeb contains some texts that are longer than that. For those lines
# the text is split on the closest punctuation mark, audio is generated chunk
# by chunk and the chunks are concatenated into the final file.
MAX_TEXT_LEN = 250


def split_text(text, max_len=MAX_TEXT_LEN):
    """Split `text` into chunks that are each shorter than `max_len` characters.

    Words are packed greedily. When the next word does not fit, the current
    chunk is cut right after its last punctuation mark (one of ``,.!?;``) and
    the remainder is carried over into the next chunk; without any punctuation
    the whole current chunk is emitted. A single word that is itself
    `max_len` characters or longer cannot be split and is returned as its own
    (over-long) chunk.

    Args:
        text: text to split; words are separated on whitespace.
        max_len: exclusive upper bound on the chunk length.

    Returns:
        List of non-empty chunk strings, in reading order.
    """
    chunks = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if len(candidate) < max_len:
            current = candidate
            continue
        if current:
            # Cut after the last punctuation mark, or at the end if there is none.
            punct_matches = list(re.finditer(r'[,.!?;]', current))
            split_idx = punct_matches[-1].end() if punct_matches else len(current)
            chunks.append(current[:split_idx].strip())
            remainder = current[split_idx:].strip()
            candidate = f"{remainder} {word}" if remainder else word
            if remainder and len(candidate) >= max_len:
                # The carried-over tail plus the new word would again exceed
                # the limit, so the tail has to become a chunk of its own.
                chunks.append(remainder)
                candidate = word
        current = candidate
    if current:
        chunks.append(current)
    return chunks


fold = "daps_celeb"
out_dir = os.path.join("xtts", fold, "wav")
ref_dir = os.path.join("speech_gen_eval_testsets", fold, "wav")
os.makedirs(out_dir, exist_ok=True)

# id_mapping holds one whitespace-separated pair per line: the utterance id
# and the name (without ".wav") of the reference recording used as speaker prompt.
mapping = {}
with open(os.path.join("speech_gen_eval_testsets", fold, "id_mapping"), "r") as fp:
    for line in fp:
        if not line.strip():
            continue  # tolerate blank / trailing empty lines
        k, v = line.strip().split()
        mapping[k] = v

with open(os.path.join("speech_gen_eval_testsets", fold, "test"), "r") as fp:
    for line in tqdm.tqdm(fp):
        if not line.strip():
            continue  # tolerate blank / trailing empty lines
        # `utt_id` rather than `id`, so the builtin is not shadowed kernel-wide
        utt_id, txt = line.strip().split("\t", 1)
        out_path = os.path.join(out_dir, utt_id + ".wav")
        if os.path.exists(out_path):
            continue
        speaker_wav = [os.path.join(ref_dir, mapping[utt_id] + ".wav")]

        # Short texts are passed through untouched; only long ones are chunked.
        chunks = [txt] if len(txt) < MAX_TEXT_LEN else split_text(txt)
        if len(chunks) == 1:
            tts.tts_to_file(
                text=chunks[0],
                file_path=out_path,
                speaker_wav=speaker_wav,
                split_sentences=False,
                language="en"
            )
            continue

        # Generate audio for each chunk through a temporary file next to the
        # final output, then read it back for concatenation.
        chunk_audios = []
        sample_rate = None
        for i, chunk in enumerate(chunks):
            chunk_path = f"{out_path}.chunk{i}.wav"
            try:
                tts.tts_to_file(
                    text=chunk,
                    file_path=chunk_path,
                    speaker_wav=speaker_wav,
                    split_sentences=False,
                    language="en"
                )
                audio, sample_rate = sf.read(chunk_path)
            finally:
                # Never leave chunk files behind, even if synthesis fails:
                # they end in ".wav" and would be picked up by later globbing.
                if os.path.exists(chunk_path):
                    os.remove(chunk_path)
            chunk_audios.append(audio)

        # Concatenate and save final audio
        sf.write(out_path, np.concatenate(chunk_audios), sample_rate)

In [None]:
# finally run evaluation with `speech_gen_eval`
import os
from speech_gen_eval.evaluation import speech_gen_eval

# Score both folds. For each one the generated audio is compared against the
# test-set texts and reference recordings, and the metrics are written to
# xtts/<fold>/metrics.yaml.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
for fold in ("vctk", "daps_celeb"):
    print(f"Evaluating {fold}")
    testset_dir = os.path.join("speech_gen_eval_testsets", fold)
    result_dir = os.path.join("xtts", fold)
    speech_gen_eval(
        txt_path=os.path.join(testset_dir, "test"),
        generated_audio=os.path.join(result_dir, "wav"),
        eval_type="zero-tts",
        original_audio=os.path.join(testset_dir, "wav"),
        mapping_path=os.path.join(testset_dir, "id_mapping"),
        out_path=os.path.join(result_dir, "metrics.yaml"),
        ignore_missing=True,
        # the remaining keyword arguments are written into metrics.yaml as meta info
        model_name="XTTSv2",
        dataset=f"balacoon/speech_gen_eval_testsets/{fold}",
        link="https://github.com/coqui-ai/TTS/blob/dev/docs/source/models/xtts.md#single-reference-1",
    )

In [None]:
# dont upload all the files from test (2k),
# but only those (150) that are meant to be kept for listening / subjective evaluation
import os
import glob

for fold in ["vctk", "daps_celeb"]:
    # Read keep file and get IDs (first whitespace-separated token of each
    # line); blank lines are ignored.
    keep_path = os.path.join("speech_gen_eval_testsets", fold, "keep")
    with open(keep_path) as f:
        keep_ids = {line.split()[0] for line in f if line.strip()}

    # An empty keep list would wipe every generated file - refuse to do that.
    if not keep_ids:
        raise ValueError(f"no ids found in {keep_path}, refusing to delete all audio")

    for wav_file in glob.glob(os.path.join("xtts", fold, "wav", "*.wav")):
        # Generated files are named "<id>.wav", so the id is the file name
        # minus that suffix. Cutting at the first "." instead would mangle ids
        # containing dots and would match leftover "<id>.wav.chunkN.wav" files
        # to a kept id.
        utt_id = os.path.basename(wav_file)[: -len(".wav")]
        if utt_id not in keep_ids:
            os.remove(wav_file)

In [None]:
# upload synthetic audio and metrics to `speech_gen_baselines`,
# so it is available on TTSLeaderboard
import os

from huggingface_hub import HfApi

local_dataset = "xtts"
hf_dataset = "balacoon/speech_gen_baselines"
hf_subdir = "zero-tts/xtts"

# Initialize the Hugging Face API
api = HfApi()

# Upload each fold to the appropriate subdirectory
for fold in ["vctk", "daps_celeb"]:
    # Upload the whole fold directory (everything under xtts/<fold>)
    api.upload_folder(
        folder_path=os.path.join(local_dataset, fold),
        repo_id=hf_dataset,
        repo_type="dataset",
        # Paths inside the repo always use "/", independent of the local OS
        # separator, so this is not built with os.path.join.
        path_in_repo=f"{hf_subdir}/{fold}",
    )

In [4]:
# for vctk and daps_celeb in speech_gen_eval_testsets,
# read test and demo, take demo, then 140 more other random lines from test
# and write it all to "keep" next to demo.
import random
import os
N_EXTRA_LINES = 140  # number of test lines kept in addition to the demo lines
KEEP_SEED = 0  # fixed seed, so re-running the cell reproduces the same keep file


def _read_lines(path):
    """Return the non-blank lines of `path`, each ending in exactly one newline."""
    with open(path) as f:
        return [line.rstrip("\n") + "\n" for line in f if line.strip()]


for fold in ["vctk", "daps_celeb"]:
    # Read demo and test files. Lines are newline-normalised so that a missing
    # trailing newline neither breaks the duplicate check nor fuses two lines
    # in the output.
    demo_lines = _read_lines(os.path.join("speech_gen_eval_testsets", fold, "demo"))
    test_lines = _read_lines(os.path.join("speech_gen_eval_testsets", fold, "test"))

    # Remove demo lines from test lines to avoid duplicates
    demo_set = set(demo_lines)
    test_lines_unique = [line for line in test_lines if line not in demo_set]

    # Sample random lines from the remaining test lines with a dedicated,
    # seeded generator; take everything if fewer than requested are available.
    rng = random.Random(KEEP_SEED)
    n_sampled = min(N_EXTRA_LINES, len(test_lines_unique))
    sampled_test = rng.sample(test_lines_unique, n_sampled)

    # Combine demo and sampled test lines
    keep_lines = demo_lines + sampled_test

    # Write to keep file
    keep_path = os.path.join("speech_gen_eval_testsets", fold, "keep")
    with open(keep_path, "w") as f:
        f.writelines(keep_lines)

In [6]:
from huggingface_hub import HfApi
import os

# Upload keep files to HuggingFace dataset
api = HfApi()
for fold in ["vctk", "daps_celeb"]:
    keep_path = os.path.join("speech_gen_eval_testsets", fold, "keep")
    api.upload_file(
        path_or_fileobj=keep_path,
        # Paths inside the repo always use "/", independent of the local OS
        # separator, so this is not built with os.path.join.
        path_in_repo=f"{fold}/keep",
        repo_id="balacoon/speech_gen_eval_testsets",
        repo_type="dataset"
    )

In [13]:
from huggingface_hub import HfApi
import os

api = HfApi()

# Remove from the uploaded baseline every wav whose id is not listed in the
# fold's keep file.
for fold in ["vctk", "daps_celeb"]:
    # Read keep file and get IDs (first whitespace-separated token of each
    # line). A set gives constant-time membership tests; blank lines are ignored.
    keep_path = os.path.join("speech_gen_eval_testsets", fold, "keep")
    with open(keep_path) as f:
        keep_ids = {line.split()[0] for line in f if line.strip()}

    # Get list of files in wav directory from HF
    wav_files = [
        entry.path
        for entry in api.list_repo_tree(
            repo_id="balacoon/speech_gen_baselines",
            repo_type="dataset",
            path_in_repo=f"zero-tts/xtts/{fold}/wav"
        )
    ]

    # Collect files to delete. Files are named "<id>.wav", so the id is the
    # base name minus that suffix (cutting at the first "." would mangle ids
    # that contain dots).
    files_to_delete = [
        file_path
        for file_path in wav_files
        if file_path.endswith('.wav')
        and os.path.basename(file_path)[: -len('.wav')] not in keep_ids
    ]

    # Delete files in bulk
    if files_to_delete:
        api.delete_files(
            delete_patterns=files_to_delete,
            repo_id="balacoon/speech_gen_baselines",
            repo_type="dataset"
        )

HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/balacoon/speech_gen_baselines/commit/main (Request ID: Root=1-67b30d70-3d327b9f658834c17a670180;cd4ffa67-d2a4-40ac-9c5b-a706c1402a27)

You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.