# High-Quality Text-Free One-Shot Voice Conversion with FreeVC and OpenVINO™
[FreeVC](https://github.com/OlaWod/FreeVC) allows altering the voice of a source speaker to match a target style while keeping the linguistic content unchanged, without text annotation.
FreeVC offers only a command-line interface and works only with CUDA. This notebook shows how to use FreeVC from Python and without CUDA devices. It consists of the following steps:
- Download and prepare models.
- Inference.
- Convert models to OpenVINO Intermediate Representation.
- Inference using only OpenVINO's IR models.

## Pre-requisites
1. Clone this repo: git clone https://github.com/OlaWod/FreeVC.git
2. Download [WavLM-Large](https://github.com/microsoft/unilm/tree/master/wavlm) and put it under directory 'FreeVC/wavlm/'
3. Download the [VCTK](https://datashare.ed.ac.uk/handle/10283/3443) dataset. You can use any of its utterances; for this example, two of them are already included and available under the directory 'dataset': `vctk-16k/p225/p225_001.wav` and `vctk-16k/p226/p226_002.wav`. To use other examples, change `convert.txt` accordingly.
4. Download the [pretrained models](https://1drv.ms/u/s!AnvukVnlQ3ZTx1rjrOZ2abCwuBAh?e=UlhRR5) and put them under the directory 'checkpoints' (for the current example only `freevc.pth` is required).

Install extra requirements

In [None]:
# librosa is used by the cells below for loading and trimming audio (`-q` keeps pip's output quiet).
!pip install -q "librosa>=0.8.1"
# webrtcvad is not imported directly in this notebook; presumably it is required by
# FreeVC's speaker encoder package — TODO confirm.
!pip install webrtcvad==2.0.10

Check whether the FreeVC repository is present (clone it if it is not) and add its path to `sys.path`.

In [None]:
from pathlib import Path
import sys


free_vc_repo = 'FreeVC'
if not Path(free_vc_repo).exists():
    !git clone https://github.com/OlaWod/FreeVC.git

sys.path.append(free_vc_repo)

## Imports and settings

In [None]:
import os
import time

import librosa
import numpy as np
import torch
from scipy.io.wavfile import write
from tqdm import tqdm

from openvino.runtime import Core
from openvino.tools import mo

import utils
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
from wavlm import WavLM, WavLMConfig

Redefine the function `get_cmodel` from `utils` so that it does not use CUDA.

In [None]:
def get_cmodel(checkpoint_path='wavlm/WavLM-Large.pt'):
    """
    Build the WavLM model from a checkpoint and return it in evaluation mode on the CPU.

    :param checkpoint_path: path to the WavLM checkpoint file. The checkpoint is expected to
    contain a 'cfg' entry (used to build the WavLMConfig) and a 'model' entry (the state dict).
    :return: the WavLM model with the checkpoint weights loaded, switched to eval mode.
    """
    # NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
    # map_location='cpu' remaps any tensors that were saved on a CUDA device, which keeps the
    # load working on machines without CUDA.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    cfg = WavLMConfig(checkpoint['cfg'])
    cmodel = WavLM(cfg)
    cmodel.load_state_dict(checkpoint['model'])
    cmodel.eval()

    return cmodel

Models initialization

In [None]:
# Hyper-parameters of the FreeVC synthesizer and the directory for converted audio.
hps = utils.get_hparams_from_file('configs/freevc.json')
os.makedirs('outputs/freevc', exist_ok=True)

# Synthesizer network, sized from the hyper-parameters.
net_g = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model
)

utils.load_checkpoint('checkpoints/freevc.pth', net_g, optimizer=None, strict=True)
# The network is only used for inference and export, so switch off training-time
# behaviour such as dropout.
net_g.eval()

cmodel = get_cmodel()
# Keep the speaker encoder on the CPU explicitly so that it matches the CPU tensors
# used for inference and ONNX export later in the notebook.
smodel = SpeakerEncoder('FreeVC/speaker_encoder/ckpt/pretrained_bak_5805000.pt', device='cpu')

Reading dataset settings

In [None]:
# Parallel lists: output title, source utterance path, target-speaker utterance path.
titles, srcs, tgts = [], [], []

# Each non-empty line of convert.txt has the form "title|source_wav|target_wav".
with open('convert.txt', "r") as f:
    for rawline in f:
        rawline = rawline.strip()
        if not rawline:
            # Skip blank lines (e.g. a trailing newline at the end of the file)
            # instead of failing on the three-way unpack below.
            continue
        title, src, tgt = rawline.split("|")
        titles.append(title)
        srcs.append(src)
        tgts.append(tgt)

Inference

In [None]:
# Run the original PyTorch pipeline for every (title, source, target) triple.
# Gradients are not needed for inference, so autograd is disabled.
with torch.no_grad():
    for line in tqdm(zip(titles, srcs, tgts)):
        title, src, tgt = line
        # Target speaker: load the reference utterance at the model's sampling rate
        # and trim leading/trailing silence (top_db=20).
        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)

        # Speaker embedding of the target utterance, with a leading batch dimension added.
        g_tgt = smodel.embed_utterance(wav_tgt)
        g_tgt = torch.from_numpy(g_tgt).unsqueeze(0)

        # Source utterance: load it and add a leading batch dimension.
        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
        wav_src = torch.from_numpy(wav_src).unsqueeze(0)

        # Features of the source utterance computed with the WavLM model
        # (presumably the linguistic content representation — see utils.get_content).
        c = utils.get_content(cmodel, wav_src)

        # Synthesize audio from the source features conditioned on the target embedding,
        # then take the first item / first channel as a NumPy float array.
        tgt_audio = net_g.infer(c, g=g_tgt)
        tgt_audio = tgt_audio[0][0].data.cpu().float().numpy()

        # The file name has minute resolution: a rerun within the same minute with the
        # same title overwrites the previous file.
        timestamp = time.strftime("%m-%d_%H-%M", time.localtime())
        write(os.path.join('outputs/freevc', "{}.wav".format(timestamp + "_" + title)), hps.data.sampling_rate,
              tgt_audio)

Result audio files should be available in 'outputs/freevc'

# Use Model Optimizer
### Convert cmodel (WavLM).
First we convert the model to the ONNX format, then to OpenVINO's IR format

In [None]:
# The exporter invokes the module's forward(), so alias forward to extract_features
# for compatibility: the exported graph then computes what extract_features returns.
cmodel.forward = cmodel.extract_features

Convert cmodel to ONNX

In [None]:
BASE_MODEL_NAME = "cmodel"
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Destination of the exported graph: <OUTPUT_DIR>/<BASE_MODEL_NAME>_fp32.onnx
onnx_path = (OUTPUT_DIR / (BASE_MODEL_NAME + "_fp32")).with_suffix(".onnx")

# Example input used for tracing: one waveform of `length` samples.
length = 32000
input_shape = (1, length)
dummy_input = torch.randn(*input_shape)

input_names = ['input']
output_names = ['output']
# Axis 1 of both the input and the output is exported as a dynamic dimension,
# so the converted model is not tied to the example length.
dynamic_axes = {
    'input': {1: 'length'},
    'output': {1: 'out_length'},
}

torch.onnx.export(
    cmodel,
    dummy_input,
    onnx_path,
    input_names=input_names,
    output_names=output_names,
    dynamic_axes=dynamic_axes,
)

Convert ONNX model to IR format and compile it.

In [None]:
# Convert the exported ONNX file to an OpenVINO model, requesting FP16 weight compression.
ir_cmodel = mo.convert_model(onnx_path, compress_to_fp16=True)
# A single Core instance is created here and reused by the later cells to compile the other models.
core = Core()
# Compile the model for execution on the CPU device.
compiled_cmodel = core.compile_model(ir_cmodel, 'CPU')

### Convert SpeakerEncoder
Converting to ONNX format.

In [None]:
BASE_MODEL_NAME = "smodel"
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Destination of the exported graph: <OUTPUT_DIR>/<BASE_MODEL_NAME>_fp32.onnx
onnx_path = (OUTPUT_DIR / (BASE_MODEL_NAME + "_fp32")).with_suffix(".onnx")

# Example input used for tracing: a batch of one sequence of `length` steps with 40 values each.
length = 32000
input_shape = (1, length, 40)
dummy_input = torch.randn(*input_shape)

input_names = ['input']
output_names = ['output']
# Axes 0 and 1 of the input and axis 1 of the output are exported as dynamic dimensions.
dynamic_axes = {
    'input': {
        0: 'branch_size',
        1: 'length',
    },
    'output': {1: 'out_length'},
}

torch.onnx.export(
    smodel,
    dummy_input,
    onnx_path,
    input_names=input_names,
    output_names=output_names,
    dynamic_axes=dynamic_axes,
)

Converting to OpenVINO's IR format.

In [None]:
# Convert the exported speaker-encoder ONNX file to an OpenVINO model, requesting FP16
# weight compression. The model is compiled in a later cell.
ir_smodel = mo.convert_model(onnx_path, compress_to_fp16=True)

The converted model has no helper methods, so we need to define helper functions that prepare its input for inference. We take the `compute_partial_slices` and `embed_utterance` methods from the `speaker_encoder.voice_encoder.SpeakerEncoder` class and turn them into standalone functions.

In [None]:
from speaker_encoder.hparams import sampling_rate, mel_window_step, partials_n_frames
from speaker_encoder import audio


def compute_partial_slices(n_samples: int, rate, min_coverage):
    """
    Decides how an utterance is cut into partial utterances of <partials_n_frames> mel frames
    each. Matching slices are produced for the waveform and for its mel spectrogram, so that
    the i-th waveform slice covers the same audio as the i-th mel slice.

    The slices may reach past the end of the waveform; callers are expected to zero-pad the
    waveform up to wav_slices[-1].stop before indexing it.

    :param n_samples: the number of samples in the waveform
    :param rate: how many partial utterances start per second. The partials must cover the
    whole utterance, so the rate may not be lower than the inverse of the duration of one
    partial utterance (according to the upstream documentation, partials are 1.6s long with
    the default hyper-parameters, which makes the minimum rate 0.625).
    :param min_coverage: fraction, in (0, 1], of the last partial utterance that must be
    backed by real audio. If at least <min_coverage> of it is present, the last partial is
    kept (to be zero-padded by the caller); otherwise it is dropped. When only a single
    partial exists it is always kept, so at least one slice is returned.
    :return: a tuple (wav_slices, mel_slices) of two lists of slice objects, to be used for
    indexing the waveform and the mel spectrogram respectively.
    """
    assert 0 < min_coverage <= 1

    # Waveform samples per mel frame, mel frames in the utterance, and the hop (in frames)
    # between the starts of two consecutive partial utterances
    samples_per_frame = int((sampling_rate * mel_window_step / 1000))
    n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
    frame_step = int(np.round((sampling_rate / rate) / samples_per_frame))
    assert 0 < frame_step, "The rate is too high"
    assert frame_step <= partials_n_frames, "The rate is too low, it should be %f at least" % \
        (sampling_rate / (samples_per_frame * partials_n_frames))

    # Lay the partials out on the frame axis and mirror every one onto the sample axis
    first_frame_bound = max(1, n_frames - partials_n_frames + frame_step + 1)
    mel_slices, wav_slices = [], []
    for first_frame in range(0, first_frame_bound, frame_step):
        frame_bounds = np.array([first_frame, first_frame + partials_n_frames])
        sample_bounds = frame_bounds * samples_per_frame
        mel_slices.append(slice(*frame_bounds))
        wav_slices.append(slice(*sample_bounds))

    # Drop the last partial when too little of it is real audio (never the only partial)
    tail = wav_slices[-1]
    coverage = (n_samples - tail.start) / (tail.stop - tail.start)
    if len(mel_slices) > 1 and coverage < min_coverage:
        mel_slices.pop()
        wav_slices.pop()

    return wav_slices, mel_slices


def embed_utterance(wav: np.ndarray, smodel, return_partials=False, rate=1.3, min_coverage=0.75):
    """
    Computes an embedding for a single utterance with a compiled OpenVINO speaker encoder.
    The utterance is divided in partial utterances and an embedding is computed for each.
    The complete utterance embedding is the L2-normed average embedding of the partial
    utterances.

    TODO: independent batched version of this function

    :param wav: a preprocessed utterance waveform as a numpy array of float32
    :param smodel: the compiled speaker encoder. It is called with the batch of partial mel
    spectrograms and its first output (smodel.output(0)) is taken as the partial embeddings.
    :param return_partials: if True, the partial embeddings will also be returned along with
    the wav slices corresponding to each partial utterance.
    :param rate: how many partial utterances should occur per second. Partial utterances must
    cover the span of the entire utterance, thus the rate should not be lower than the inverse
    of the duration of a partial utterance.
    :param min_coverage: when reaching the last partial utterance, it may or may not have
    enough frames. If at least <min_coverage> of <partials_n_frames> are present,
    then the last partial utterance will be considered by zero-padding the audio. Otherwise,
    it will be discarded. If there aren't enough frames for one partial utterance,
    this parameter is ignored so that at least one partial utterance is always used.
    :return: the embedding as a numpy array of shape (model_embedding_size,). If
    <return_partials> is True, the partial embeddings as a numpy array of shape
    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
    returned.
    """
    # Compute where to split the utterance into partials and pad the waveform with zeros if
    # the partial utterances cover a larger range.
    wav_slices, mel_slices = compute_partial_slices(len(wav), rate, min_coverage)
    max_wave_length = wav_slices[-1].stop
    if max_wave_length >= len(wav):
        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

    # Split the utterance into partials and forward them through the model.
    # The compiled OpenVINO model takes NumPy arrays directly, so no torch tensor
    # (and no torch.no_grad context) is needed here.
    mel = audio.wav_to_mel_spectrogram(wav)
    mels = np.array([mel[s] for s in mel_slices])
    output_layer = smodel.output(0)
    partial_embeds = smodel(mels)[output_layer]

    # Compute the utterance embedding from the partial embeddings
    raw_embed = np.mean(partial_embeds, axis=0)
    embed = raw_embed / np.linalg.norm(raw_embed, 2)

    if return_partials:
        return embed, partial_embeds, wav_slices
    return embed

Then compile model and check inference.

In [None]:
# Compile the converted speaker encoder for execution on the CPU device.
compiled_smodel = core.compile_model(ir_smodel, 'CPU')

# Smoke test on the last target utterance listed in convert.txt. It is taken from `tgts`
# explicitly rather than from the loop variable left over by the PyTorch inference cell,
# so this cell also works when that cell has not been run.
wav_tgt, _ = librosa.load(tgts[-1], sr=hps.data.sampling_rate)
wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)

# Speaker embedding from the OpenVINO model, with a leading batch dimension added.
g_tgt = embed_utterance(wav_tgt, compiled_smodel)
g_tgt = torch.from_numpy(g_tgt).unsqueeze(0)
print(g_tgt.shape)

### Convert SynthesizerTrn

It is also possible to convert a model to the IR format without converting it to ONNX explicitly. Just set the parameter `use_legacy_frontend` to `True`, and Model Optimizer will do it for you under the hood.

In [None]:
# Alias forward to infer for compatibility: the conversion goes through the module's
# forward(), and inference in this notebook is done with net_g.infer.
net_g.forward = net_g.infer

# Example inputs passed to the converter, in the positional order of net_g.infer's arguments.
# Presumably these are the content features (1, 1024, frames) and the speaker embedding (1, 256)
# — TODO confirm against SynthesizerTrn.infer.
dummy_input_1 = torch.randn(1, 1024, 81)
dummy_input_2 = torch.randn(1, 256)

# Convert the PyTorch module directly. In `input_shape`, -1 marks a dynamic dimension:
# the first and last axes of the first input and the first axis of the second input.
ir_net_g_model = mo.convert_model(
    net_g,
    example_input=(dummy_input_1, dummy_input_2),
    input_shape=[[-1, 1024, -1], [-1, 256]],
    compress_to_fp16=True,
    progress=True,
    use_legacy_frontend=True
)
# Compile for the CPU device, reusing the Core instance created earlier in the notebook.
compiled_ir_net_g_model = core.compile_model(ir_net_g_model, 'CPU')

And now we can check inference using only IR models.

In [None]:
# Run the whole pipeline with the compiled OpenVINO models only. The compiled models take and
# return NumPy arrays, so no torch tensors (and no torch.no_grad context) are involved.
for line in tqdm(zip(titles, srcs, tgts), total=len(titles)):
    title, src, tgt = line
    # Target speaker: load the reference utterance and trim leading/trailing silence.
    wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
    wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)

    # Speaker embedding of the target utterance, with a leading batch dimension added.
    g_tgt = embed_utterance(wav_tgt, compiled_smodel)
    g_tgt = np.expand_dims(g_tgt, 0)

    # Source utterance: load it, add a leading batch dimension and extract its features
    # with the converted WavLM model.
    wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
    wav_src = np.expand_dims(wav_src, 0)
    output_layer = compiled_cmodel.output(0)
    c = compiled_cmodel(wav_src)[output_layer]
    # Swap the last two axes to get the layout the synthesizer was converted with.
    c = c.transpose((0, 2, 1))

    # Synthesize audio and take the first item / first channel of the result.
    output_layer = compiled_ir_net_g_model.output(0)
    tgt_audio = compiled_ir_net_g_model((c, g_tgt))[output_layer]
    tgt_audio = tgt_audio[0][0]

    # The "_openvino" suffix keeps these files from overwriting the ones written by the
    # PyTorch inference cell when both run within the same minute, so the two results
    # can be compared.
    timestamp = time.strftime("%m-%d_%H-%M", time.localtime())
    write(os.path.join('outputs/freevc', "{}.wav".format(timestamp + "_" + title + "_openvino")),
          hps.data.sampling_rate, tgt_audio)

The resulting audio files should be available in 'outputs/freevc'; you can listen to them and compare them with the ones generated earlier.