# Nano codec

[Family of codecs](https://huggingface.co/nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps) from NVIDIA.
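The codecs operate at 12.5 or 21.5 frames per second and use finite scalar quantization (FSQ), so the codebooks are quasi-independent. This notebook decodes with the 21.5 fps variant.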

In [None]:
# Download the evaluation test sets from the Hugging Face Hub.
# A plain `git clone` is the most straightforward way; it creates
# ./speech_gen_eval_testsets, which the cells below read from.
!git clone https://huggingface.co/datasets/balacoon/speech_gen_eval_testsets

In [None]:
import os

# Root folder for everything this notebook produces; re-running the cell
# is harmless because an already existing directory is accepted.
_nano_codec_out = "outputs/nano_codec"
os.makedirs(name=_nano_codec_out, exist_ok=True)

In [None]:
# Extract acoustic tokens (codebooks) for the VCTK test set.
# Input wavs come from speech_gen_eval_testsets/vctk/wav/ and the results go
# to outputs/nano_codec/vctk_tokens/, where the resynthesis cell below looks
# for `*.npz` files.
# NOTE(review): --max-dur is presumably a duration limit in seconds — confirm.
!nano-acoustic-tokens --batch-size 8 --max-dur 40 --device cuda speech_gen_eval_testsets/vctk/wav/ outputs/nano_codec/vctk_tokens/

In [None]:
# Extract acoustic tokens (codebooks) for the DAPS-celeb test set.
# Input wavs come from speech_gen_eval_testsets/daps_celeb/wav/ and the
# results go to outputs/nano_codec/daps_celeb_tokens/, where the resynthesis
# cell below looks for `*.npz` files.
# NOTE(review): --max-dur is presumably a duration limit in seconds — confirm.
!nano-acoustic-tokens --batch-size 8 --max-dur 40 --device cuda speech_gen_eval_testsets/daps_celeb/wav/ outputs/nano_codec/daps_celeb_tokens/

In [None]:
# Resynthesize audio from the extracted codebooks.
#
# For every fold, each `<name>.npz` in outputs/nano_codec/<fold>_tokens/ is
# decoded with the NeMo codec and written to
# outputs/nano_codec/<fold>/wav/<name>.wav at the model's sample rate.

import os
import tqdm
import glob
import torch
import numpy as np
import soundfile as sf

from nemo.collections.tts.models import AudioCodecModel


# Longest utterance (in codec frames) that gets decoded; longer ones are
# skipped to avoid running out of GPU memory. At the nominal 21.5 fps of the
# checkpoint loaded below, 750 frames is roughly 35 seconds of audio.
MAX_FRAMES = 750

device = torch.device("cuda")
nemo_codec_model = AudioCodecModel.from_pretrained("nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps").eval().to(device)
for fold in ["daps_celeb", "vctk"]:
    os.makedirs(f"outputs/nano_codec/{fold}/wav", exist_ok=True)
    # sorted() makes the processing order reproducible between runs
    for path in tqdm.tqdm(sorted(glob.glob(f"outputs/nano_codec/{fold}_tokens/*.npz"))):
        # np.load on an .npz keeps the archive open until it is closed, so
        # use it as a context manager to avoid leaking one handle per file
        with np.load(path) as npz:
            arr = npz["acoustic_tokens"]  # books x T
        num_frames = arr.shape[1]
        if num_frames > MAX_FRAMES:
            # check the length before touching the GPU so skipped
            # utterances never get copied to the device
            continue
        print(f"Trying to decode {num_frames} frames", flush=True)
        tokens = torch.from_numpy(arr).unsqueeze(dim=0).to(device)  # 1 x books x T
        tokens_len = torch.tensor([num_frames], device=device)
        # pure inference: no autograd bookkeeping needed
        with torch.no_grad():
            reconstructed_audio, _ = nemo_codec_model.decode(tokens=tokens, tokens_len=tokens_len)
        output_audio = reconstructed_audio.detach().cpu().numpy().squeeze()
        name = os.path.splitext(os.path.basename(path))[0]
        out_path = f"outputs/nano_codec/{fold}/wav/{name}.wav"
        sf.write(out_path, output_audio, nemo_codec_model.sample_rate)


In [None]:
# Finally, run the evaluation with `speech_gen_eval`.
#
# For every fold the resynthesized audio is compared against the original
# recordings in "vocoder" mode and the result is written to
# outputs/nano_codec/<fold>/metrics.yaml.
import os
from speech_gen_eval.evaluation import speech_gen_eval

for fold in ["vctk", "daps_celeb"]:
    print(f"Evaluating {fold}")
    txt_path = os.path.join("speech_gen_eval_testsets", fold, "test")
    generated_audio = f"outputs/nano_codec/{fold}/wav"
    original_audio = os.path.join("speech_gen_eval_testsets", fold, "wav")
    speech_gen_eval(
        txt_path=txt_path,
        generated_audio=generated_audio,
        eval_type="vocoder",
        original_audio=original_audio,
        out_path=os.path.join("outputs", "nano_codec", fold, "metrics.yaml"),
        # the decoding cell skips utterances longer than 750 frames, so some
        # test utterances have no generated counterpart
        # NOTE(review): presumably this flag tolerates those gaps — confirm
        ignore_missing=True,
        # extra arguments to write into the metrics.yaml as meta info;
        # they describe the checkpoint that is actually used for decoding
        # (nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps)
        model_name="NanoCodec 21.5Hz",
        dataset=f"balacoon/speech_gen_eval_testsets/{fold}",
        link="https://huggingface.co/nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps",
    )

In [None]:
# Don't upload all the files from the test set (2k), but only those (150)
# that are meant to be kept for listening / subjective evaluation.
# WARNING: this deletes every generated wav whose ID is not in the keep list.
import os
import glob

for fold in ["vctk", "daps_celeb"]:
    # The first whitespace-separated field of each line in the "keep" file
    # is an utterance ID; blank lines are ignored instead of raising IndexError.
    keep_path = os.path.join("speech_gen_eval_testsets", fold, "keep")
    with open(keep_path, encoding="utf-8") as f:
        keep_ids = {line.split()[0] for line in f if line.strip()}

    for wav_file in glob.glob(os.path.join("outputs", "nano_codec", fold, "wav", "*.wav")):
        # Strip only the final extension (the same way the wavs were named
        # when they were written), so IDs that contain dots still match.
        utt_id = os.path.splitext(os.path.basename(wav_file))[0]
        if utt_id not in keep_ids:
            os.remove(wav_file)

In [None]:
# Upload synthetic audio and metrics to `speech_gen_baselines`,
# so they are available on TTSLeaderboard.
import os

from huggingface_hub import HfApi

local_dataset = "outputs/nano_codec"
hf_dataset = "balacoon/speech_gen_baselines"
hf_subdir = "vocoder/nano_codec"

# Initialize the Hugging Face API
api = HfApi()

# Upload each fold to the appropriate subdirectory
for fold in ["vctk", "daps_celeb"]:
    # Uploads the whole fold directory: the remaining wav files as well as
    # the metrics.yaml written by the evaluation step.
    api.upload_folder(
        folder_path=os.path.join(local_dataset, fold),
        repo_id=hf_dataset,
        repo_type="dataset",
        # paths inside a Hub repo always use "/", so do not build them with
        # os.path.join (which would insert backslashes on Windows)
        path_in_repo=f"{hf_subdir}/{fold}",
    )