# Evaluate Mimi

Mimi is the audio codec used by Moshi, a low-latency audio language model: https://huggingface.co/kyutai/mimi

In [None]:
# Load the Mimi codec and its feature extractor.
import os

# Pin the notebook to a single GPU. CUDA only honours this variable if it is
# set before the CUDA runtime is initialised, so set it before importing torch.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import torch
from transformers import MimiModel, AutoFeatureExtractor

device = torch.device("cuda")

# `model` and `feature_extractor` are reused by the cells below.
model = MimiModel.from_pretrained("kyutai/mimi").to(device)
feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/mimi")

In [None]:
# Analysis-synthesis on a single utterance: encode it with Mimi, keep the
# first 8 codebooks, decode, and compare original vs. reconstruction by ear.
# Relies on `model` and `feature_extractor` created in the loading cell.
import os

import torch
import matplotlib.pylab as plt
import librosa
from IPython.display import Audio, display

os.environ["CUDA_VISIBLE_DEVICES"] = "3"
device = torch.device("cuda")

# Use one sampling rate for loading, feature extraction and playback so the
# three can never disagree.
sample_rate = feature_extractor.sampling_rate

wav_path = os.path.join("speech_gen_eval_testsets", "vctk", "wav", "p225_011.wav")
wav, _ = librosa.load(wav_path, sr=sample_rate)
display(Audio(wav, rate=sample_rate))

inputs = feature_extractor(
    raw_audio=wav,
    sampling_rate=sample_rate,
    return_tensors="pt",
)

# Pure inference: skip autograd bookkeeping to save memory.
with torch.no_grad():
    encoder_outputs = model.encode(inputs["input_values"].to(device))
    tokens = encoder_outputs.audio_codes[:, :8, :]  # using first 8 tokens
    # reconstruct; element 0 of the decoder output holds the audio values
    reconstructed = model.decode(tokens)[0]

plt.imshow(tokens[0].cpu().numpy(), aspect="auto")
plt.show()
# first item in the batch, first channel
display(Audio(reconstructed[0][0].cpu().numpy(), rate=sample_rate))

In [None]:
import os

import torch
import tqdm
import librosa
import soundfile as sf

# For each test set, run analysis-synthesis on every listed utterance and save
# the reconstruction under `out_dir/<fold>/wav` for evaluation.
# Relies on `model` and `feature_extractor` created in the loading cell.

os.environ["CUDA_VISIBLE_DEVICES"] = "3"
out_dir = "mimi"
device = torch.device("cuda")
# One sampling rate for loading, feature extraction and writing.
sample_rate = feature_extractor.sampling_rate

for fold in ["vctk", "daps_celeb"]:
    out_fold_dir = os.path.join(out_dir, fold, "wav")
    os.makedirs(out_fold_dir, exist_ok=True)
    test_list = os.path.join("speech_gen_eval_testsets", fold, "test")
    with open(test_list, "r", encoding="utf-8") as fp:
        for line in tqdm.tqdm(fp, desc=fold):
            line = line.strip()
            if not line:
                # tolerate blank / trailing empty lines in the list file
                continue
            # Each line is "<utterance id>\t<text>"; only the id is needed here.
            utt_id = line.split("\t", 1)[0]
            out_path = os.path.join(out_fold_dir, utt_id + ".wav")
            if os.path.exists(out_path):
                # already reconstructed on a previous run
                continue
            in_path = os.path.join("speech_gen_eval_testsets", fold, "wav", utt_id + ".wav")
            wav, _ = librosa.load(in_path, sr=sample_rate)
            inputs = feature_extractor(
                raw_audio=wav,
                sampling_rate=sample_rate,
                return_tensors="pt",
            )
            # Pure inference: skip autograd bookkeeping to save memory.
            with torch.no_grad():
                encoder_outputs = model.encode(inputs["input_values"].to(device))
                tokens = encoder_outputs.audio_codes[:, :8, :]  # using first 8 tokens
                # reconstruct; element 0 of the decoder output holds the audio values
                reconstructed = model.decode(tokens)[0]
            # first item in the batch, first channel
            sf.write(out_path, reconstructed[0][0].cpu().numpy(), sample_rate)


In [None]:
# Score the reconstructions of every test set with `speech_gen_eval`
# and write the metrics next to the generated audio.
import os
from speech_gen_eval.evaluation import speech_gen_eval

os.environ["CUDA_VISIBLE_DEVICES"] = "3"

testsets_root = "speech_gen_eval_testsets"
results_root = "mimi"

for fold in ("vctk", "daps_celeb"):
    print(f"Evaluating {fold}")
    reference_dir = os.path.join(testsets_root, fold)
    result_dir = os.path.join(results_root, fold)
    speech_gen_eval(
        txt_path=os.path.join(reference_dir, "test"),
        generated_audio=os.path.join(result_dir, "wav"),
        eval_type="vocoder",
        original_audio=os.path.join(reference_dir, "wav"),
        out_path=os.path.join(result_dir, "metrics.yaml"),
        ignore_missing=True,
        # the remaining keywords are extra meta info written into metrics.yaml
        model_name="Mimi(8books)",
        dataset=f"balacoon/speech_gen_eval_testsets/{fold}",
        link="https://huggingface.co/kyutai/mimi",
    )

In [9]:
# Don't upload every file from the test sets (2k); keep only those (150)
# listed in each fold's `keep` file, which are meant for listening /
# subjective evaluation. All other generated wavs are deleted from disk.
import os
import glob

for fold in ["vctk", "daps_celeb"]:
    # Read the keep file and collect the ids: the id is the first
    # whitespace-separated field of each line; blank lines are skipped.
    keep_path = os.path.join("speech_gen_eval_testsets", fold, "keep")
    with open(keep_path, encoding="utf-8") as f:
        keep_ids = {fields[0] for fields in map(str.split, f) if fields}

    for wav_file in glob.glob(os.path.join("mimi", fold, "wav", "*.wav")):
        # Strip only the final extension so ids that contain dots still match.
        stem = os.path.splitext(os.path.basename(wav_file))[0]
        if stem not in keep_ids:
            os.remove(wav_file)

In [None]:
# Upload synthetic audio and metrics to `speech_gen_baselines`,
# so they are available on TTSLeaderboard.
import os

from huggingface_hub import HfApi

local_dataset = "mimi"
hf_dataset = "balacoon/speech_gen_baselines"
hf_subdir = "vocoder/mimi"

# Initialize the Hugging Face API
api = HfApi()

# Upload each fold directory (its contents, e.g. wavs and metrics) to the
# matching subdirectory of the dataset repo.
for fold in ["vctk", "daps_celeb"]:
    api.upload_folder(
        folder_path=os.path.join(local_dataset, fold),
        repo_id=hf_dataset,
        repo_type="dataset",
        # Repo paths are always "/"-separated; os.path.join would produce
        # backslashes on Windows.
        path_in_repo=f"{hf_subdir}/{fold}",
    )