# Evaluate XTTSv2

Download the `balacoon/speech_gen_eval_testsets` dataset, run inference with XTTSv2, run the evaluation, and upload the results to the leaderboard.

In [None]:
import os
from huggingface_hub import snapshot_download

# Fetch the evaluation test sets with `snapshot_download`, unless a local
# copy of the directory is already present (e.g. from an earlier run).
if not os.path.isdir("speech_gen_eval_testsets"):
    snapshot_download(
        repo_type="dataset",
        repo_id="balacoon/speech_gen_eval_testsets",
        local_dir="speech_gen_eval_testsets",
    )

Setting up the XTTS environment:
```bash
conda create -n xtts python=3.10
conda activate xtts
pip install git+https://github.com/coqui-ai/TTS
# had to downgrade torch
pip install torch==2.1 torchaudio==2.1
```

In [None]:
# Load the model
# https://github.com/coqui-ai/TTS/blob/dev/docs/source/models/xtts.md#single-reference-1
import os

# Pick the GPU *before* importing TTS. CUDA_VISIBLE_DEVICES is only reliably
# honoured when it is set before CUDA is first touched in this process, and
# the TTS import may already load torch / query CUDA devices.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

from TTS.api import TTS  # noqa: E402  (must come after the environment setup)

# Load the multilingual XTTS v2 model on the GPU selected above.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

In [None]:
import tqdm

# Run the synthesis for the vctk fold.
# Expects `tts` (the loaded XTTS model) and `os` to be available from the
# previous cells.
fold = "vctk"
out_dir = os.path.join("xtts", fold, "wav")
ref_dir = os.path.join("speech_gen_eval_testsets", fold, "wav")
os.makedirs(out_dir, exist_ok=True)

# id_mapping: two whitespace-separated columns per line, the utterance id and
# the id of the reference recording used as speaker prompt for it.
mapping = {}
with open(
    os.path.join("speech_gen_eval_testsets", fold, "id_mapping"),
    "r",
    encoding="utf-8",
) as fp:
    for line in fp:
        line = line.strip()
        if not line:
            # tolerate blank / trailing empty lines
            continue
        k, v = line.split()
        mapping[k] = v

# test: one "<utterance id>\t<text>" entry per line.
with open(
    os.path.join("speech_gen_eval_testsets", fold, "test"),
    "r",
    encoding="utf-8",
) as fp:
    for line in tqdm.tqdm(fp):
        line = line.strip()
        if not line:
            continue
        # `utt_id` rather than `id`, so the builtin is not shadowed
        utt_id, txt = line.split("\t", 1)
        out_path = os.path.join(out_dir, utt_id + ".wav")
        if os.path.exists(out_path):
            # already synthesized in an earlier run, so the cell can be resumed
            continue
        tts.tts_to_file(
            text=txt,
            file_path=out_path,
            speaker_wav=[os.path.join(ref_dir, mapping[utt_id] + ".wav")],
            split_sentences=False,
            language="en",
        )


In [None]:
import tqdm
import re
import soundfile as sf
import numpy as np
import os

# Run the synthesis for the daps_celeb fold.
# Unfortunately, the model can only generate texts up to 250 characters, and
# daps_celeb contains some texts that are longer than that. Those texts are
# split on the closest punctuation mark, synthesized chunk by chunk, and the
# resulting audio is concatenated into a single file.
# Expects `tts` (the loaded XTTS model) to be available from a previous cell.
MAX_CHARS = 250
_PUNCT_RE = re.compile(r"[,.!?;]")


def _split_text(text, limit=MAX_CHARS):
    """Split *text* into chunks shorter than *limit* characters.

    Words are accumulated greedily. When the next word would make the chunk
    reach the limit, the chunk is cut right after its last punctuation mark
    (one of ``,.!?;``); if it contains none, the whole chunk is emitted.
    A single word that is itself ``limit`` characters or longer cannot be
    split and is emitted as its own (over-long) chunk.

    Returns a list of non-empty, whitespace-stripped strings.
    """
    chunks = []
    current = ""
    for word in text.split():
        while True:
            candidate = f"{current} {word}" if current else word
            if not current or len(candidate) < limit:
                current = candidate
                break
            # The word does not fit: flush up to the last punctuation mark and
            # retry with the remainder, so the remainder plus the new word is
            # checked against the limit as well.
            marks = list(_PUNCT_RE.finditer(current))
            cut = marks[-1].end() if marks else len(current)
            chunks.append(current[:cut].strip())
            current = current[cut:].strip()
    if current:
        chunks.append(current)
    return chunks


fold = "daps_celeb"
out_dir = os.path.join("xtts", fold, "wav")
ref_dir = os.path.join("speech_gen_eval_testsets", fold, "wav")
os.makedirs(out_dir, exist_ok=True)

# id_mapping: two whitespace-separated columns per line, the utterance id and
# the id of the reference recording used as speaker prompt for it.
mapping = {}
with open(
    os.path.join("speech_gen_eval_testsets", fold, "id_mapping"),
    "r",
    encoding="utf-8",
) as fp:
    for line in fp:
        line = line.strip()
        if not line:
            # tolerate blank / trailing empty lines
            continue
        k, v = line.split()
        mapping[k] = v

# test: one "<utterance id>\t<text>" entry per line.
with open(
    os.path.join("speech_gen_eval_testsets", fold, "test"),
    "r",
    encoding="utf-8",
) as fp:
    for line in tqdm.tqdm(fp):
        line = line.strip()
        if not line:
            continue
        # `utt_id` rather than `id`, so the builtin is not shadowed
        utt_id, txt = line.split("\t", 1)
        out_path = os.path.join(out_dir, utt_id + ".wav")
        if os.path.exists(out_path):
            # already synthesized in an earlier run, so the cell can be resumed
            continue
        speaker_wav = [os.path.join(ref_dir, mapping[utt_id] + ".wav")]

        if len(txt) < MAX_CHARS:
            # Short enough to synthesize in one go.
            tts.tts_to_file(
                text=txt,
                file_path=out_path,
                speaker_wav=speaker_wav,
                split_sentences=False,
                language="en",
            )
            continue

        # Generate audio for each chunk through a temporary file.
        chunk_audios = []
        sr = None
        for i, chunk in enumerate(_split_text(txt)):
            chunk_path = f"{out_path}.chunk{i}.wav"
            try:
                tts.tts_to_file(
                    text=chunk,
                    file_path=chunk_path,
                    speaker_wav=speaker_wav,
                    split_sentences=False,
                    language="en",
                )
                audio, sr = sf.read(chunk_path)
            finally:
                # Never leave chunk files behind, even when synthesis fails:
                # they live in the output directory and would otherwise be
                # picked up together with the real utterances.
                if os.path.exists(chunk_path):
                    os.remove(chunk_path)
            chunk_audios.append(audio)

        # Concatenate and save final audio (all chunks come from the same
        # model, so the sample rate of the last chunk is used for the result).
        sf.write(out_path, np.concatenate(chunk_audios), sr)

In [None]:
# Finally, run the evaluation with `speech_gen_eval`.
import os

# Pick the GPU *before* importing the evaluation package.
# CUDA_VISIBLE_DEVICES is only reliably honoured when it is set before CUDA is
# first touched in this process; this matters when the cell is run in a fresh
# kernel (presumably the import loads torch-based models -- TODO confirm).
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

from speech_gen_eval.evaluation import speech_gen_eval  # noqa: E402

for fold in ["vctk", "daps_celeb"]:
    print(f"Evaluating {fold}")
    txt_path = os.path.join("speech_gen_eval_testsets", fold, "test")
    generated_audio = os.path.join("xtts", fold, "wav")
    original_audio = os.path.join("speech_gen_eval_testsets", fold, "wav")
    id_mapping = os.path.join("speech_gen_eval_testsets", fold, "id_mapping")
    speech_gen_eval(
        txt_path=txt_path,
        generated_audio=generated_audio,
        eval_type="zero-tts",
        original_audio=original_audio,
        mapping_path=id_mapping,
        # metrics are written next to the fold's generated wav/ directory
        out_path=os.path.join("xtts", fold, "metrics.yaml"),
        ignore_missing=True,
        # extra arguments to write into the metrics.yaml as meta info
        model_name="XTTSv2",
        dataset=f"balacoon/speech_gen_eval_testsets/{fold}",
        link="https://github.com/coqui-ai/TTS/blob/dev/docs/source/models/xtts.md#single-reference-1",
    )

In [None]:
# Upload synthetic audio and metrics to `speech_gen_baselines`,
# so they are available on TTSLeaderboard.
import os

from huggingface_hub import HfApi

local_dataset = "xtts"
hf_dataset = "balacoon/speech_gen_baselines"
hf_subdir = "zero-tts/xtts"

# Initialize the Hugging Face API
api = HfApi()

# Upload each fold to the appropriate subdirectory
for fold in ["vctk", "daps_celeb"]:
    # Upload the whole fold directory (not only the wav files).
    api.upload_folder(
        folder_path=os.path.join(local_dataset, fold),
        repo_id=hf_dataset,
        repo_type="dataset",
        # Paths inside the repository always use "/" as separator, so build
        # this one explicitly instead of with os.path.join, which would
        # produce backslashes on Windows.
        path_in_repo=f"{hf_subdir}/{fold}",
    )