# Evaluate EnCodec

Evaluate the original EnCodec codec from Meta: https://huggingface.co/facebook/encodec_24khz

In [None]:
# Install transformers from the main branch of its GitHub repository.
!pip install git+https://github.com/huggingface/transformers.git@main

In [None]:
from transformers import EncodecModel, AutoProcessor

# The codec and its processor are both loaded from the same checkpoint id.
checkpoint = "facebook/encodec_24khz"
processor = AutoProcessor.from_pretrained(checkpoint)
model = EncodecModel.from_pretrained(checkpoint)

In [None]:
import os

import matplotlib.pylab as plt
import resampy
import soundfile as sf
import torch
from IPython.display import Audio, display

os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# sampling rate the audio is resampled to before it is given to the processor
target_sr = 24000

# do analysis synthesis of the single file
wav_path = os.path.join("speech_gen_eval_testsets", "vctk", "wav", "p225_011.wav")
arr, sr = sf.read(wav_path, dtype="float32")
new_arr = resampy.resample(arr, sr, target_sr)
# play the (resampled) input for reference
display(Audio(new_arr, rate=target_sr))
inputs = processor(raw_audio=new_arr, sampling_rate=target_sr, return_tensors="pt")

# This is inference only: disable gradient tracking so that no autograd graph
# is built and the outputs can be converted to numpy without `.detach()`.
with torch.no_grad():
    # use all 8 books
    encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"], bandwidth=6.0)
    audio_values = model.decode(
        encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"]
    )[0]

# visualize the codes produced by the encoder
plt.imshow(encoder_outputs.audio_codes[0][0].numpy(), aspect="auto")
plt.colorbar()
plt.show()

# play the reconstructed audio
display(Audio(audio_values[0][0].numpy(), rate=target_sr))


In [None]:
import os

import resampy
import soundfile as sf
import torch
import tqdm

# for testsets do analysis synthesis and save files to a directory for evaluation

os.environ["CUDA_VISIBLE_DEVICES"] = "3"
out_dir = "encodec"

model.cuda()

for fold in ["vctk", "daps_celeb"]:
    out_fold_dir = os.path.join(out_dir, fold, "wav")
    os.makedirs(out_fold_dir, exist_ok=True)
    with open(os.path.join("speech_gen_eval_testsets", fold, "test"), "r", encoding="utf-8") as fp:
        for line in tqdm.tqdm(fp):
            # Each line is "<id>\t<text>"; only the id is needed here.
            # `partition` also copes with lines that have no text field,
            # where unpacking the result of `split` would raise ValueError.
            utt_id = line.strip().partition("\t")[0]
            if not utt_id:
                # blank line
                continue
            out_path = os.path.join(out_fold_dir, utt_id + ".wav")
            if os.path.exists(out_path):
                # already synthesized on a previous run
                continue
            in_path = os.path.join("speech_gen_eval_testsets", fold, "wav", utt_id + ".wav")

            arr, sr = sf.read(in_path, dtype="float32")
            new_arr = resampy.resample(arr, sr, 24000)
            inputs = processor(raw_audio=new_arr, sampling_rate=24000, return_tensors="pt")
            # keep both tensors on the same device as the model
            input_values = inputs["input_values"].cuda()
            padding_mask = inputs["padding_mask"].cuda()
            # inference only: do not build an autograd graph for every file
            with torch.no_grad():
                encoder_outputs = model.encode(input_values, padding_mask, bandwidth=6.0)
                decoded = model.decode(
                    encoder_outputs.audio_codes, encoder_outputs.audio_scales, padding_mask
                )[0]
            audio_values = decoded[0][0].cpu().numpy()
            sf.write(out_path, audio_values, 24000)


In [None]:
# finally run evaluation with `speech_gen_eval`
import os
from speech_gen_eval.evaluation import speech_gen_eval

# Restrict CUDA to device index 3.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

testsets_root = "speech_gen_eval_testsets"
results_root = "encodec"

for fold in ("vctk", "daps_celeb"):
    print(f"Evaluating {fold}")
    fold_inputs = os.path.join(testsets_root, fold)
    fold_outputs = os.path.join(results_root, fold)
    # extra arguments to write into the metrics.yaml as meta info
    meta_info = {
        "model_name": "EnCodec (8books)",
        "dataset": f"balacoon/speech_gen_eval_testsets/{fold}",
        "link": "https://huggingface.co/facebook/encodec_24khz",
    }
    speech_gen_eval(
        txt_path=os.path.join(fold_inputs, "test"),
        generated_audio=os.path.join(fold_outputs, "wav"),
        eval_type="vocoder",
        original_audio=os.path.join(fold_inputs, "wav"),
        out_path=os.path.join(fold_outputs, "metrics.yaml"),
        ignore_missing=True,
        **meta_info,
    )

In [1]:
# don't upload all the files from the test set (2k),
# only those (150) that are meant to be kept for listening / subjective evaluation
import os
import glob

for fold in ["vctk", "daps_celeb"]:
    # Read keep file and get IDs: the first whitespace-separated token of each line.
    keep_path = os.path.join("speech_gen_eval_testsets", fold, "keep")
    with open(keep_path, encoding="utf-8") as f:
        # skip blank lines, for which `line.split()[0]` would raise IndexError
        keep_ids = {line.split()[0] for line in f if line.strip()}

    # Delete every generated wav whose id is not listed in the keep file.
    for wav_file in glob.glob(os.path.join("encodec", fold, "wav", "*.wav")):
        # strip only the trailing ".wav", so that ids containing dots are compared in full
        file_id = os.path.splitext(os.path.basename(wav_file))[0]
        if file_id not in keep_ids:
            os.remove(wav_file)

In [None]:
# upload synthetic audio and metrics to `speech_gen_baselines`,
# so it is available on TTSLeaderboard
import os

from huggingface_hub import HfApi

local_dataset = "encodec"
hf_dataset = "balacoon/speech_gen_baselines"
hf_subdir = "vocoder/encodec"

# Initialize the Hugging Face API
api = HfApi()

# Upload each fold to the appropriate subdirectory
for fold in ["vctk", "daps_celeb"]:
    # Upload the whole fold directory: the `wav` folder and `metrics.yaml`
    api.upload_folder(
        folder_path=os.path.join(local_dataset, fold),
        repo_id=hf_dataset,
        repo_type="dataset",
        # paths inside the repo are "/"-separated regardless of the local OS,
        # so they are not built with os.path.join
        path_in_repo=f"{hf_subdir}/{fold}",
    )