# High-Quality Text-Free One-Shot Voice Conversion with FeeVC and OpenVINO™

## Pre-requisites
1. Clone this repo: git clone https://github.com/OlaWod/FreeVC.git
2. Download [WavLM-Large](https://github.com/microsoft/unilm/tree/master/wavlm) and put it under directory 'FreeVC/wavlm/'
3. Download the [VCTK](https://datashare.ed.ac.uk/handle/10283/3443) dataset (will be replaced by our examples). You can use any of them, but for this example you should use `vctk-16k/p225/p225_001.wav` and `vctk-16k/p226/p226_002.wav`. Put them under directory 'dataset'. To use other examples, you should change `convert.txt`.
4. Download [pretrained models](https://1drv.ms/u/s!AnvukVnlQ3ZTx1rjrOZ2abCwuBAh?e=UlhRR5) and put it under directory 'checkpoints' (for current example only `freevc.pth` are required)

Install extra requirements

In [1]:
!pip install -q "librosa>=0.8.1"
!pip install webrtcvad==2.0.10

Check if FreeVC is installed and its path to sys.path

In [1]:
from pathlib import Path
import sys


free_vc_repo = 'FreeVC'
if not Path(free_vc_repo).exists():
    !git clone https://github.com/OlaWod/FreeVC.git

sys.path.append(free_vc_repo)

## Imports and settings

In [2]:
import os
import time

import librosa
import torch
from scipy.io.wavfile import write
from tqdm import tqdm

import utils
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
from wavlm import WavLM, WavLMConfig

Redefine function `get_model`form `utils` to exclude cuda

In [3]:
def get_cmodel():
    checkpoint = torch.load('wavlm/WavLM-Large.pt')
    cfg = WavLMConfig(checkpoint['cfg'])
    cmodel = WavLM(cfg)
    cmodel.load_state_dict(checkpoint['model'])
    cmodel.eval()

    return cmodel

Models initialization

In [4]:
hps = utils.get_hparams_from_file('configs/freevc.json')
os.makedirs('outputs/freevc', exist_ok=True)

net_g = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model
)

utils.load_checkpoint('checkpoints/freevc.pth', net_g, optimizer=None, strict=True)
cmodel = get_cmodel()
smodel = SpeakerEncoder('FreeVC/speaker_encoder/ckpt/pretrained_bak_5805000.pt')

INFO:root:Loaded checkpoint 'checkpoints/freevc.pth' (iteration 1372)
INFO:wavlm.WavLM:WavLM Config: {'extractor_mode': 'layer_norm', 'encoder_layers': 24, 'encoder_embed_dim': 1024, 'encoder_ffn_embed_dim': 4096, 'encoder_attention_heads': 16, 'activation_fn': 'gelu', 'layer_norm_first': True, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'feature_grad_mult': 1.0, 'normalize': True, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.0, 'dropout_input': 0.0, 'dropout_features': 0.0, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': 'static', 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': 'static', 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'relative_position_embedding': True, 'num_buckets': 320, 'max_distance':

Reading dataset settings

In [None]:
titles, srcs, tgts = [], [], []

with open('convert.txt', "r") as f:
    for rawline in f.readlines():
        title, src, tgt = rawline.strip().split("|")
        titles.append(title)
        srcs.append(src)
        tgts.append(tgt)

Inference

In [6]:
with torch.no_grad():
    for line in tqdm(zip(titles, srcs, tgts)):
        title, src, tgt = line
        # tgt
        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)

        g_tgt = smodel.embed_utterance(wav_tgt)
        g_tgt = torch.from_numpy(g_tgt).unsqueeze(0)

        # src
        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
        wav_src = torch.from_numpy(wav_src).unsqueeze(0)
        c = utils.get_content(cmodel, wav_src)

        audio = net_g.infer(c, g=g_tgt)
        audio = audio[0][0].data.cpu().float().numpy()

        timestamp = time.strftime("%m-%d_%H-%M", time.localtime())
        write(os.path.join('outputs/freevc', "{}.wav".format(timestamp + "_" + title)), hps.data.sampling_rate,
              audio)

2it [00:03,  1.99s/it]


Results audio files should be available in 'outputs/freevc'

In [0]:
# Use Model Optimizer

In [8]:
# define forward as extract_features for compatibility
cmodel.forward = cmodel.extract_features

Convert model

In [9]:
from openvino.tools import mo


ir_model = mo.convert_model(cmodel, input_shape=[1, -1], compress_to_fp16=True)

RuntimeError: Calculated padded input size per channel: (1). Kernel size: (10). Kernel size can't be greater than actual input size

Convert cmodel to ONNX

In [10]:
OUTPUT_DIR = Path("output")
BASE_MODEL_NAME = "cmodel"

OUTPUT_DIR.mkdir(exist_ok=True)

onnx_path = Path(OUTPUT_DIR / (BASE_MODEL_NAME + "_fp32")).with_suffix(".onnx")


length = 32000
input_shape = (1, length)

input_names=['input']
output_names = ['output']
dummy_input = torch.randn(1, length)
dynamic_axes= {
    'input':{ 1: 'length'},
    'output': {1: 'out_length'}
}

torch.onnx.export(cmodel, dummy_input, onnx_path, input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes)

  if mask:
  assert embed_dim == self.embed_dim
  assert list(query.size()) == [tgt_len, bsz, embed_dim]
  assert key_bsz == bsz
  assert src_len, bsz == value.shape[:2]
  feature = res["features"] if ret_conv else res["x"]
  if ret_layer_results:


In [0]:
ir_cmodel = mo.convert_model(onnx_path, compress_to_fp16=True)

In [14]:
from openvino.runtime import Core

core = Core()
compiled_cmodel = core.compile_model(ir_cmodel, 'CPU')
c = compiled_cmodel(wav_src)[0]
print(c.shape)

(1, 215, 1024)


# Convert SpeakerEncoder

In [15]:
ir_smodel = mo.convert_model(smodel, input_shape=[1, 160, 40], compress_to_fp16=True)

OpConversionFailure: Check 'unconverted_ops_types.size() == 0' failed at src/frontends/pytorch/src/frontend.cpp:72:
FrontEnd API failed with OpConversionFailure: :
Model wasn't fully converted. Unconverted operation types:
aten::frobenius_norm
aten::lstm
prim::ListConstruct



# Convert SynthesizerTrn

In [16]:
from openvino.tools import mo
from openvino.runtime import Core, serialize

core = Core()


net_g.forward = net_g.infer
ir_model = mo.convert_model(net_g, input_shape=[[1, 1024, -1], [1, 256]], compress_to_fp16=True)
compiled_model = core.compile_model(ir_model, 'CPU')

with torch.no_grad():
    for line in tqdm(zip(titles, srcs, tgts)):
        title, src, tgt = line
        # tgt
        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)

        g_tgt = smodel.embed_utterance(wav_tgt)
        g_tgt = torch.from_numpy(g_tgt).unsqueeze(0)

        # src
        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
        wav_src = torch.from_numpy(wav_src).unsqueeze(0)
        c = utils.get_content(cmodel, wav_src)

        output_layer = compiled_model.output(0)
        audio = compiled_model((c, g_tgt))[output_layer]

        timestamp = time.strftime("%m-%d_%H-%M", time.localtime())
        write(os.path.join('outputs/freevc', "{}.wav".format(timestamp + "_" + title)), hps.data.sampling_rate,
              audio)

Tensor-likes are not close!

Mismatched elements: 320 / 320 (100.0%)
Greatest absolute difference: 0.18093429505825043 at index (0, 0, 97) (up to 1e-05 allowed)
Greatest relative difference: 422.0432504646249 at index (0, 0, 275) (up to 1e-05 allowed)
  _check_trace(


OpConversionFailure: Check 'unconverted_ops_types.size() == 0' failed at src/frontends/pytorch/src/frontend.cpp:72:
FrontEnd API failed with OpConversionFailure: :
Model wasn't fully converted. Unconverted operation types:
aten::flip
aten::randn_like
prim::Constant

