# Lina codec

[Lina codec](https://github.com/ysharma3501/LinaCodec) is a low-bitrate (12.5 Hz) codec. It also has voice conversion capabilities via a global speaker embedding.

In [None]:
# to download the test sets, cloning the dataset repo with git is the most straightforward way
!git clone https://huggingface.co/datasets/balacoon/speech_gen_eval_testsets

In [None]:
import os

# Root directory for everything this notebook writes; creating it is a no-op
# when it already exists.
lina_output_root = "outputs/lina"
os.makedirs(lina_output_root, exist_ok=True)

In [None]:
# first we check performance of Lina as codec, i.e. just try to do reconstruction

import os
import glob
import tqdm
import torch
import soundfile as sf
from linacodec.codec import LinaCodec

# Round-trip every test utterance through the codec (encode -> decode) and
# save the result under the same file name as the input.
lina_tokenizer = LinaCodec()

os.makedirs("outputs/lina/reconstruction", exist_ok=True)
for testset in ("vctk", "daps_celeb"):
    out_dir = f"outputs/lina/reconstruction/{testset}/wav"
    os.makedirs(out_dir, exist_ok=True)
    input_wavs = glob.glob(f"speech_gen_eval_testsets/{testset}/wav/*.wav")
    for src_wav in tqdm.tqdm(input_wavs):
        # encode() yields the token sequence plus the global embedding,
        # and decode() takes both back to produce audio
        tokens, embedding = lina_tokenizer.encode(src_wav)
        decoded = lina_tokenizer.decode(tokens, embedding)
        utt_id, _ = os.path.splitext(os.path.basename(src_wav))
        samples = decoded.cpu().numpy().squeeze()
        # written as 16-bit PCM with a sample rate of 48000
        sf.write(f"{out_dir}/{utt_id}.wav", samples, 48000, subtype="PCM_16")

In [None]:
# run evaluation of reconstruction with lina
import os
from speech_gen_eval.evaluation import speech_gen_eval

# Evaluate the reconstructed audio of each fold against the original
# recordings and write the metrics next to the generated wavs.
for fold in ("vctk", "daps_celeb"):
    print(f"Evaluating {fold}")
    testset_dir = os.path.join("speech_gen_eval_testsets", fold)
    result_dir = os.path.join("outputs", "lina", "reconstruction", fold)
    speech_gen_eval(
        txt_path=os.path.join(testset_dir, "test"),
        generated_audio=f"outputs/lina/reconstruction/{fold}/wav",
        eval_type="vocoder",
        original_audio=os.path.join(testset_dir, "wav"),
        out_path=os.path.join(result_dir, "metrics.yaml"),
        ignore_missing=True,
        # extra arguments to write into the metrics.yaml as meta info
        model_name="Lina codec",
        dataset=f"balacoon/speech_gen_eval_testsets/{fold}",
        link="https://github.com/ysharma3501/LinaCodec"
    )

In [None]:
# don't upload all the files from test (2k),
# but only those (150) that are meant to be kept for listening / subjective evaluation
import os
import glob

# Delete every reconstructed wav whose id is not listed in the fold's `keep`
# file, so only the listening subset remains on disk.
for fold in ["vctk", "daps_celeb"]:
    # Read keep file and get IDs: the id is the first whitespace-separated
    # field of each line; blank lines are skipped instead of raising IndexError
    keep_path = os.path.join("speech_gen_eval_testsets", fold, "keep")
    with open(keep_path, encoding="utf-8") as f:
        keep_ids = {fields[0] for fields in map(str.split, f) if fields}

    wav_pattern = os.path.join("outputs", "lina", "reconstruction", fold, "wav", "*.wav")
    for wav_file in glob.glob(wav_pattern):
        # splitext strips only the ".wav" suffix, so an id that itself
        # contains a dot is compared in full rather than truncated
        file_id = os.path.splitext(os.path.basename(wav_file))[0]
        if file_id not in keep_ids:
            os.remove(wav_file)

In [None]:
# upload synthetic audio and metrics to `speech_gen_baselines`,
# so it is available on TTSLeaderboard

import os

from huggingface_hub import HfApi

local_dataset = "outputs/lina/reconstruction"
hf_dataset = "balacoon/speech_gen_baselines"
hf_subdir = "vocoder/lina"

# Initialize the Hugging Face API
api = HfApi()

# Upload each fold to the appropriate subdirectory
for fold in ["vctk", "daps_celeb"]:
    # Upload the whole fold directory (the remaining wavs and metrics.yaml)
    api.upload_folder(
        folder_path=os.path.join(local_dataset, fold),
        repo_id=hf_dataset,
        repo_type="dataset",
        # paths inside the repo always use "/", so build this one explicitly
        # instead of with os.path.join (which yields "\\" on Windows)
        path_in_repo=f"{hf_subdir}/{fold}",
    )

In [None]:
# now we check performance of Lina as voice conversion system

import os
import glob
import tqdm
import torch
import soundfile as sf
from linacodec.codec import LinaCodec

# Convert every test utterance to the voice of its mapped reference utterance
# and save the result under the name of the source utterance.
lina_tokenizer = LinaCodec()

os.makedirs("outputs/lina/voice_conversion", exist_ok=True)
for testset in ["vctk", "daps_celeb"]:
    # read id mapping: each non-blank line is "<target id> <reference id>"
    target2ref = {}
    with open(f"speech_gen_eval_testsets/{testset}/id_mapping", encoding="utf-8") as fp:
        for line in fp:
            if not line.strip():
                continue
            target, ref = line.split()
            target2ref[target] = ref
    # create output dir
    os.makedirs(f"outputs/lina/voice_conversion/{testset}/wav", exist_ok=True)
    with open(f"speech_gen_eval_testsets/{testset}/test", encoding="utf-8") as fp:
        for line in fp:
            fields = line.split()
            if not fields:
                # blank line, nothing to convert
                continue
            name = fields[0]
            orig_path = f"speech_gen_eval_testsets/{testset}/wav/{name}.wav"
            # .get() so an utterance without a mapping entry is skipped
            # rather than aborting the whole run with a KeyError
            ref_name = target2ref.get(name)
            if not ref_name:
                continue
            ref_path = f"speech_gen_eval_testsets/{testset}/wav/{ref_name}.wav"
            if not os.path.isfile(ref_path) or not os.path.isfile(orig_path):
                continue
            audio = lina_tokenizer.convert_voice(orig_path, ref_path)

            out_path = f"outputs/lina/voice_conversion/{testset}/wav/{name}.wav"
            audio_numpy = audio.cpu().numpy().squeeze()
            # written as 16-bit PCM with a sample rate of 48000
            sf.write(out_path, audio_numpy, 48000, subtype="PCM_16")

In [None]:
# run evaluation of voice conversion with lina
import os
from speech_gen_eval.evaluation import speech_gen_eval

# Evaluate the voice-converted audio of each fold; the id mapping tells the
# evaluation which reference utterance belongs to each converted one.
for fold in ("vctk", "daps_celeb"):
    print(f"Evaluating {fold}")
    testset_dir = os.path.join("speech_gen_eval_testsets", fold)
    result_dir = os.path.join("outputs", "lina", "voice_conversion", fold)
    speech_gen_eval(
        txt_path=os.path.join(testset_dir, "test"),
        generated_audio=f"outputs/lina/voice_conversion/{fold}/wav",
        eval_type="zero-vc",
        original_audio=os.path.join(testset_dir, "wav"),
        mapping_path=os.path.join(testset_dir, "id_mapping"),
        out_path=os.path.join(result_dir, "metrics.yaml"),
        ignore_missing=True,
        # extra arguments to write into the metrics.yaml as meta info
        model_name="Lina codec",
        dataset=f"balacoon/speech_gen_eval_testsets/{fold}",
        link="https://github.com/ysharma3501/LinaCodec"
    )

In [None]:
# don't upload all the files from test (2k),
# but only those (150) that are meant to be kept for listening / subjective evaluation
import os
import glob

# Delete every converted wav whose id is not listed in the fold's `keep`
# file, so only the listening subset remains on disk.
for fold in ["vctk", "daps_celeb"]:
    # Read keep file and get IDs: the id is the first whitespace-separated
    # field of each line; blank lines are skipped instead of raising IndexError
    keep_path = os.path.join("speech_gen_eval_testsets", fold, "keep")
    with open(keep_path, encoding="utf-8") as f:
        keep_ids = {fields[0] for fields in map(str.split, f) if fields}

    wav_pattern = os.path.join("outputs", "lina", "voice_conversion", fold, "wav", "*.wav")
    for wav_file in glob.glob(wav_pattern):
        # splitext strips only the ".wav" suffix, so an id that itself
        # contains a dot is compared in full rather than truncated
        file_id = os.path.splitext(os.path.basename(wav_file))[0]
        if file_id not in keep_ids:
            os.remove(wav_file)

In [None]:
# upload synthetic audio and metrics to `speech_gen_baselines`,
# so it is available on TTSLeaderboard

import os

from huggingface_hub import HfApi

local_dataset = "outputs/lina/voice_conversion"
hf_dataset = "balacoon/speech_gen_baselines"
hf_subdir = "zero-vc/lina"

# Initialize the Hugging Face API
api = HfApi()

# Upload each fold to the appropriate subdirectory
for fold in ["vctk", "daps_celeb"]:
    # Upload the whole fold directory (the remaining wavs and metrics.yaml)
    api.upload_folder(
        folder_path=os.path.join(local_dataset, fold),
        repo_id=hf_dataset,
        repo_type="dataset",
        # paths inside the repo always use "/", so build this one explicitly
        # instead of with os.path.join (which yields "\\" on Windows)
        path_in_repo=f"{hf_subdir}/{fold}",
    )