# Evaluate Zonos

A new TTS model from Zyphra just dropped: https://huggingface.co/Zyphra/Zonos-v0.1-hybrid. Let's evaluate it (the code below runs the `Zyphra/Zonos-v0.1-transformer` checkpoint).
The step of downloading `balacoon/speech_gen_eval_testsets` is skipped here; see `xtts.ipynb` for details.

Setting up environment:
```bash
pip install torch torchaudio zonos
# for some reason had to install from repo too
pip install git+https://github.com/Zyphra/Zonos.git
# had to also install espeak
apt install -y espeak-ng
```

In [None]:
import torchaudio
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict
import tqdm
import os

# Expose only GPU index 3 to this process; this is set before the model is
# placed on "cuda" below.
os.environ.update({"CUDA_VISIBLE_DEVICES": "3"})

# Checkpoint under evaluation. The hybrid variant ("Zyphra/Zonos-v0.1-hybrid")
# is loaded the same way, by swapping the repo id.
zonos_repo_id = "Zyphra/Zonos-v0.1-transformer"
model = Zonos.from_pretrained(zonos_repo_id, device="cuda")

# Synthesize every test utterance of each fold (vctk and daps_celeb), cloning
# the voice from the reference recording assigned to it in `id_mapping`.
# Outputs go to zonos/<fold>/wav/<utterance id>.wav.
for fold in ["vctk", "daps_celeb"]:
    testset_dir = os.path.join("speech_gen_eval_testsets", fold)
    out_dir = os.path.join("zonos", fold, "wav")
    ref_dir = os.path.join(testset_dir, "wav")
    os.makedirs(out_dir, exist_ok=True)

    # id_mapping: two whitespace-separated fields per line, the test utterance
    # id followed by the id of the reference recording to use for it.
    mapping = {}
    with open(os.path.join(testset_dir, "id_mapping"), "r") as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing empty lines
            k, v = line.split()
            mapping[k] = v

    # test: one "<utterance id>\t<text>" record per line.
    with open(os.path.join(testset_dir, "test"), "r") as fp:
        for line in tqdm.tqdm(fp):
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing empty lines
            # Named `utt_id` (not `id`) so the builtin `id()` is not shadowed
            # for the rest of the notebook session.
            utt_id, txt = line.split("\t", 1)
            out_path = os.path.join(out_dir, utt_id + ".wav")
            if os.path.exists(out_path):
                # Already synthesized; lets an interrupted run be resumed.
                continue

            # Speaker embedding is computed from the mapped reference audio.
            ref_path = os.path.join(ref_dir, mapping[utt_id] + ".wav")
            wav, sampling_rate = torchaudio.load(ref_path)
            speaker = model.make_speaker_embedding(wav, sampling_rate)
            cond_dict = make_cond_dict(text=txt, speaker=speaker, language="en-us")
            conditioning = model.prepare_conditioning(cond_dict)
            # Generate codes, decode them with the model's autoencoder, and
            # save the first item at the autoencoder's sampling rate.
            codes = model.generate(conditioning)
            wavs = model.autoencoder.decode(codes).cpu()
            torchaudio.save(out_path, wavs[0], model.autoencoder.sampling_rate)


In [None]:
# finally run evaluation with `speech_gen_eval`
import os
from speech_gen_eval.evaluation import speech_gen_eval

# Expose only GPU index 3 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# Score the generated audio of each fold against its test set and write the
# result to zonos/<fold>/metrics.yaml.
for fold in ("vctk", "daps_celeb"):
    print(f"Evaluating {fold}")
    testset_dir = os.path.join("speech_gen_eval_testsets", fold)
    results_dir = os.path.join("zonos", fold)
    speech_gen_eval(
        txt_path=os.path.join(testset_dir, "test"),
        generated_audio=os.path.join(results_dir, "wav"),
        eval_type="zero-tts",
        original_audio=os.path.join(testset_dir, "wav"),
        mapping_path=os.path.join(testset_dir, "id_mapping"),
        out_path=os.path.join(results_dir, "metrics.yaml"),
        ignore_missing=True,
        # The remaining keyword arguments are meta info that gets written
        # into metrics.yaml.
        model_name="Zonos-v0.1-transformer",
        dataset=f"balacoon/speech_gen_eval_testsets/{fold}",
        link="https://huggingface.co/Zyphra/Zonos-v0.1-transformer",
    )

In [1]:
# After checking that the evaluation succeeded, remove the generated files that
# are not meant to be kept: don't upload all the files from test (2k), only
# those (150) that are meant to be kept for listening / subjective evaluation.
import os
import glob

# Delete every generated wav whose utterance id is not listed in the fold's
# `keep` file. This is destructive: removed files must be re-synthesized.
for fold in ["vctk", "daps_celeb"]:
    # The first whitespace-separated field of each non-blank line in `keep`
    # is an utterance id to retain.
    keep_path = os.path.join("speech_gen_eval_testsets", fold, "keep")
    with open(keep_path) as f:
        keep_ids = {line.split()[0] for line in f if line.strip()}
    if not keep_ids:
        # An empty keep list would wipe the whole fold; refuse instead.
        raise ValueError(f"no ids found in {keep_path}, refusing to delete all wavs")

    for wav_file in glob.glob(os.path.join("zonos", fold, "wav", "*.wav")):
        # Strip only the final ".wav" extension, so an id that itself contains
        # a dot is compared in full rather than truncated at the first dot.
        utt_id = os.path.splitext(os.path.basename(wav_file))[0]
        if utt_id not in keep_ids:
            os.remove(wav_file)

In [None]:
# upload synthetic audio and metrics to `speech_gen_baselines`,
# so it is available on TTSLeaderboard

# Imported here as well so the cell also runs on a fresh kernel.
import os

from huggingface_hub import HfApi

local_dataset = "zonos"
hf_dataset = "balacoon/speech_gen_baselines"
hf_subdir = "zero-tts/zonos"

# Initialize the Hugging Face API
api = HfApi()

# Upload each fold to the appropriate subdirectory
for fold in ["vctk", "daps_celeb"]:
    # Uploads the whole local fold directory, i.e. both the wav/ folder and
    # the metrics.yaml written by the earlier cells.
    api.upload_folder(
        folder_path=os.path.join(local_dataset, fold),
        repo_id=hf_dataset,
        repo_type="dataset",
        # Joined with "/" explicitly: this is a path inside the repo, not a
        # local filesystem path, so it must not pick up the OS separator.
        path_in_repo=f"{hf_subdir}/{fold}",
    )