# Evaluate WavTokenizer

The first discrete audio codec that compresses audio into a single stream of tokens: https://github.com/jishengpeng/WavTokenizer

In [None]:
# Install the WavTokenizer requirements straight from the upstream repository.
# NOTE: had to resolve issues with fairseq manually.
# The repo was downloaded and its `configs`, `encoder` and `decoder` dirs were
# put in the current directory; the cells below import from `encoder` / `decoder`
# and read the model config from `./configs`.
!pip install -r https://raw.githubusercontent.com/jishengpeng/WavTokenizer/main/requirements.txt

In [None]:
from huggingface_hub import hf_hub_download

# Hugging Face Hub repository and the checkpoint file to fetch from it
repo_id, filename = (
    "novateur/WavTokenizer-large-speech-75token",
    "wavtokenizer_large_speech_320_v2.ckpt",
)

# local_dir="." asks for the checkpoint to be stored in the current directory
model_path = hf_hub_download(
    local_dir=".",
    filename=filename,
    repo_id=repo_id,
)

print(f"Model downloaded to: {model_path}")

In [9]:
# load model
import os

# Restrict the visible CUDA devices *before* torch is imported, so the setting
# is guaranteed to be in place before any CUDA initialization happens.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import torch
from decoder.pretrained import WavTokenizer

device = torch.device('cuda')

# Model config from the `configs` dir copied out of the WavTokenizer repo,
# and the checkpoint that was downloaded into the current directory.
config_path = "./configs/wavtokenizer_smalldata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
model_path = "./wavtokenizer_large_speech_320_v2.ckpt"

# Build the model from config + checkpoint, then move it to the selected device.
wavtokenizer = WavTokenizer.from_pretrained0802(config_path, model_path)
wavtokenizer = wavtokenizer.to(device)

In [None]:
# analysis-synthesis demo on a single utterance:
# encode a wav into discrete codes, plot the codes, decode them back to audio

import os

import matplotlib.pylab as plt
import torch
import torchaudio
from IPython.display import Audio, display

from encoder.utils import convert_audio

os.environ["CUDA_VISIBLE_DEVICES"] = "3"
device = torch.device('cuda')

# sample rate passed to `convert_audio` and used for playback of both signals
sample_rate = 24000

wav_path = os.path.join("speech_gen_eval_testsets", "vctk", "wav", "p225_011.wav")
wav, sr = torchaudio.load(wav_path)
wav = convert_audio(wav, sr, sample_rate, 1)
# play the (converted) original for reference
display(Audio(wav, rate=sample_rate))

# created once, on the same device as the model, and shared by encode and decode
bandwidth_id = torch.tensor([0], device=device)
wav = wav.to(device)
_, discrete_code = wavtokenizer.encode_infer(wav, bandwidth_id=bandwidth_id)
plt.plot(discrete_code[0][0].detach().cpu().numpy())
plt.show()

# reconstruct
features = wavtokenizer.codes_to_features(discrete_code)
audio_out = wavtokenizer.decode(features, bandwidth_id=bandwidth_id)
display(Audio(audio_out[0].detach().cpu().numpy(), rate=sample_rate))

In [None]:
import os

import soundfile as sf
import torch
import torchaudio
import tqdm

from encoder.utils import convert_audio

# for testsets do analysis synthesis and save files to a directory for evaluation
# (expects `wavtokenizer` to be loaded by the model-loading cell)

os.environ["CUDA_VISIBLE_DEVICES"] = "3"
out_dir = "wavtokenizer"
device = torch.device('cuda')
sample_rate = 24000

# the same bandwidth id is used for every utterance, so create it only once
bandwidth_id = torch.tensor([0], device=device)

for fold in ["vctk", "daps_celeb"]:
    out_fold_dir = os.path.join(out_dir, fold, "wav")
    os.makedirs(out_fold_dir, exist_ok=True)
    test_list_path = os.path.join("speech_gen_eval_testsets", fold, "test")
    with open(test_list_path, "r", encoding="utf-8") as fp:
        for line in tqdm.tqdm(fp):
            # lines are split on the first tab; only the leading id is needed here
            utt_id = line.strip().split("\t", 1)[0]
            if not utt_id:
                # tolerate blank lines in the list
                continue
            out_path = os.path.join(out_fold_dir, utt_id + ".wav")
            if os.path.exists(out_path):
                # already resynthesized in a previous run
                continue
            in_path = os.path.join("speech_gen_eval_testsets", fold, "wav", utt_id + ".wav")

            wav, sr = torchaudio.load(in_path)
            wav = convert_audio(wav, sr, sample_rate, 1)
            wav = wav.to(device)
            _, discrete_code = wavtokenizer.encode_infer(wav, bandwidth_id=bandwidth_id)
            # reconstruct
            features = wavtokenizer.codes_to_features(discrete_code)
            audio_out = wavtokenizer.decode(features, bandwidth_id=bandwidth_id)
            sf.write(out_path, audio_out[0].detach().cpu().numpy(), sample_rate)


In [None]:
# finally run evaluation with `speech_gen_eval`
import os
from speech_gen_eval.evaluation import speech_gen_eval

# restrict the visible CUDA devices for the evaluation run
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

testsets_root = "speech_gen_eval_testsets"
for fold in ("vctk", "daps_celeb"):
    print(f"Evaluating {fold}")
    # reference data of this fold and the directory holding the resynthesized audio
    reference_dir = os.path.join(testsets_root, fold)
    result_dir = os.path.join("wavtokenizer", fold)
    speech_gen_eval(
        txt_path=os.path.join(reference_dir, "test"),
        generated_audio=os.path.join(result_dir, "wav"),
        eval_type="vocoder",
        original_audio=os.path.join(reference_dir, "wav"),
        out_path=os.path.join(result_dir, "metrics.yaml"),
        ignore_missing=True,
        # extra arguments to write into the metrics.yaml as meta info
        model_name="WavTokenizer(LargeV2)",
        dataset=f"balacoon/speech_gen_eval_testsets/{fold}",
        link="https://github.com/jishengpeng/WavTokenizer",
    )

In [4]:
# dont upload all the files from test (2k),
# but only those (150) that are meant to be kept for listening / subjective evaluation
import os
import glob

for fold in ["vctk", "daps_celeb"]:
    # Read keep file and get IDs (first whitespace-separated token of each
    # non-blank line)
    keep_path = os.path.join("speech_gen_eval_testsets", fold, "keep")
    with open(keep_path, encoding="utf-8") as f:
        keep_ids = {line.split()[0] for line in f if line.strip()}

    for wav_file in glob.glob(os.path.join("wavtokenizer", fold, "wav", "*.wav")):
        # Strip only the ".wav" extension: cutting at the first "." would
        # truncate ids that contain a dot and delete files that should be kept.
        utt_id = os.path.splitext(os.path.basename(wav_file))[0]
        if utt_id not in keep_ids:
            os.remove(wav_file)

In [None]:
# upload synthetic audio and metrics to `speech_gen_baselines`,
# so it is available on TTSLeaderboard

import os

from huggingface_hub import HfApi

local_dataset = "wavtokenizer"
hf_dataset = "balacoon/speech_gen_baselines"
hf_subdir = "vocoder/wavtokenizer"

# Initialize the Hugging Face API
api = HfApi()

# Upload each fold to the appropriate subdirectory
for fold in ["vctk", "daps_celeb"]:
    # Upload the whole fold folder (kept wav files and metrics.yaml)
    api.upload_folder(
        folder_path=os.path.join(local_dataset, fold),
        repo_id=hf_dataset,
        repo_type="dataset",
        # paths inside the repo always use "/", so do not build them with
        # os.path.join (which would use "\\" on Windows)
        path_in_repo=f"{hf_subdir}/{fold}",
    )