# Balacoon Vocoder

Our in-house discrete audio codec.
24 kHz, 50 frames per second, 4 codebooks.
A middle ground between the high bitrate of EnCodec and low-bitrate codecs such as Mimi (only 12.5 frames per second) or WavTokenizer (a single codebook).

In [None]:
import torch
import os
import soundfile as sf
import resampy
from IPython.display import Audio, display
import matplotlib.pylab as plt
from huggingface_hub import hf_hub_download

# Restrict this notebook to one GPU. The mask is set before any CUDA work is
# issued below, which is when it needs to be in place.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
device = torch.device('cuda')

# Sample rate the audio is resampled to and played back at.
target_sr = 24000
# Hugging Face repo that hosts both TorchScript halves of the vocoder.
vocoder_repo = "balacoon/vq4_50fps_24khz_vocoder"

# Read one test utterance as int16 samples and resample it to `target_sr`.
# NOTE(review): the name `wav16khz` assumes a 16 kHz source file; the actual
# rate used for resampling is whatever `sf.read` reports in `sr`.
wav_path = os.path.join("speech_gen_eval_testsets", "vctk", "wav", "p225_011.wav")
wav16khz, sr = sf.read(wav_path, dtype="int16")
wav24khz = resampy.resample(wav16khz, sr, target_sr)
print(wav24khz.shape)
display(Audio(wav24khz, rate=target_sr))
# Add a leading dimension of size 1 and move the samples to the GPU.
x = torch.tensor(wav24khz).to(device).unsqueeze(0)

# Fetch (or reuse from the local HF cache) the analysis and synthesis models.
encoder_path = hf_hub_download(repo_id=vocoder_repo, filename="analysis.jit")
decoder_path = hf_hub_download(repo_id=vocoder_repo, filename="synthesis.jit")
encoder = torch.jit.load(encoder_path)
decoder = torch.jit.load(decoder_path)

# Audio -> tokens. This is inference only, so skip autograd bookkeeping
# instead of building a graph and detaching from it afterwards.
with torch.no_grad():
    tokens = encoder(x)
print(tokens.shape)
# Plot the tokens of the first (only) item, transposed for display.
plt.imshow(tokens[0].cpu().numpy().T, aspect="auto")
plt.show()

# Tokens -> audio, again without gradient tracking.
with torch.no_grad():
    res = decoder(tokens)
print(res.shape)
display(Audio(res[0].cpu().numpy(), rate=target_sr))


In [11]:
# Don't upload all of the test files (~2k);
# keep only those (~150) that are meant for listening / subjective evaluation.
import os
import glob

# For each test fold, delete every synthesized wav whose ID is not listed in
# that fold's `keep` file. This is destructive: files are removed from disk.
for fold in ["vctk", "daps_celeb"]:
    # The first whitespace-separated token of each line in the keep file is
    # treated as an ID to retain.
    keep_path = os.path.join("speech_gen_eval_testsets", fold, "keep")
    with open(keep_path, encoding="utf-8") as f:
        # Skip blank / whitespace-only lines (e.g. a trailing newline), which
        # would otherwise raise IndexError on `split()[0]`.
        keep_ids = {line.split()[0] for line in f if line.strip()}

    wav_pattern = os.path.join("vq4_50fps_24khz_vocoder", fold, "wav", "*.wav")
    for wav_file in glob.glob(wav_pattern):
        # Strip only the final extension so IDs containing dots stay intact.
        utt_id = os.path.splitext(os.path.basename(wav_file))[0]
        if utt_id not in keep_ids:
            os.remove(wav_file)

In [None]:
# upload synthetic audio and metrics to `speech_gen_baselines`,
# so it is available on TTSLeaderboard

# Local directory holding one sub-directory per fold, the destination dataset
# repo on the Hugging Face Hub, and the sub-directory inside that repo.
local_dataset = "vq4_50fps_24khz_vocoder"
hf_dataset = "balacoon/speech_gen_baselines"
hf_subdir = "vocoder/balacoon"

from huggingface_hub import HfApi

# Client for the Hugging Face Hub API.
api = HfApi()

# Upload each fold's whole local directory to the matching repo sub-directory.
for fold in ["vctk", "daps_celeb"]:
    api.upload_folder(
        folder_path=os.path.join(local_dataset, fold),
        repo_id=hf_dataset,
        repo_type="dataset",
        # Paths inside the repo are "/"-separated regardless of the local OS,
        # so join explicitly instead of using os.path.join.
        path_in_repo=f"{hf_subdir}/{fold}",
    )