In [1]:
import os
import sys
import argparse
import torch
import librosa
import time
from scipy.io.wavfile import write
from tqdm import tqdm
from pathlib import Path
import IPython.display as ipd

# Locate the project root: two directory levels above the notebook's working
# directory, so that the project-local modules imported below can be found.
cwd = Path().resolve()
prj_dir = os.path.dirname(os.path.dirname(os.path.abspath(cwd)))
print(f'prj_dir:{prj_dir}')
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if prj_dir not in sys.path:
    sys.path.append(prj_dir)

# Restrict CUDA to the first GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import utils
from models import SynthesizerTrn
from mel_processing import mel_spectrogram_torch,spectrogram_torch
from wavlm import WavLM, WavLMConfig
from speaker_encoder.voice_encoder import SpeakerEncoder
import logging
logging.getLogger('numba').setLevel(logging.WARNING)


prj_dir:/data1/hjh/pycharm_projects/vcc/freevc


# Load models: synthesizer, WavLM content encoder, and (optionally) the speaker encoder

In [5]:
# Paths: hyper-parameter file, synthesizer checkpoint, and WavLM weights.
hpfile=f"{prj_dir}/configs/freevc.json"
# Alternative synthesizer checkpoint:
# ptfile="/mnt/cephfs/hjh/train_record/vc/freevc/pretrain_models/freevc.pth"
ptfile="/mnt/cephfs/hjh/train_record/vc/freevc/train_vctk/train_output_freevc_nosr/G_17000.pth"
cmodel_path="/mnt/cephfs/hjh/train_record/vc/freevc/pretrain_models/WavLM-Large.pt"
# NOTE(review): config_path is not used anywhere in this cell.
config_path="/mnt/cephfs/hjh/train_record/vc/freevc/pretrain_models/config.json"

hps = utils.get_hparams_from_file(hpfile)

print("Loading model...")
# Build the synthesizer on the GPU from the loaded hyper-parameters and put it
# in eval mode before restoring the checkpoint weights.
# NOTE(review): the two positional arguments look like the number of
# spectrogram bins and the segment length in frames -- confirm against
# SynthesizerTrn's signature.
net_g = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).cuda()
_ = net_g.eval()
print("Loading checkpoint...")
_ = utils.load_checkpoint(ptfile, net_g, None)

print("Loading WavLM for content...")
# NOTE(review): the leading 0 is presumably a device/rank index -- confirm
# against utils.get_cmodel.
cmodel = utils.get_cmodel(0,model_path=cmodel_path)

# Always bind `smodel`, so that re-running this cell with a configuration that
# has use_spk disabled does not leave a stale encoder from an earlier run in
# the kernel namespace.
smodel = None
if hps.model.use_spk:
    print("Loading speaker encoder...")
    smodel = SpeakerEncoder(f'{prj_dir}/speaker_encoder/ckpt/pretrained_bak_5805000.pt')

print("load model done!")

Loading model...
Loading checkpoint...
INFO:root:Loaded checkpoint '/mnt/cephfs/hjh/train_record/vc/freevc/train_vctk/train_output_freevc_nosr/G_17000.pth' (iteration 26)
Loading WavLM for content...
INFO:wavlm.WavLM:WavLM Config: {'extractor_mode': 'layer_norm', 'encoder_layers': 24, 'encoder_embed_dim': 1024, 'encoder_ffn_embed_dim': 4096, 'encoder_attention_heads': 16, 'activation_fn': 'gelu', 'layer_norm_first': True, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'feature_grad_mult': 1.0, 'normalize': True, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.0, 'dropout_input': 0.0, 'dropout_features': 0.0, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': 'static', 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': 'static', 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_chann

# Infer: convert the source utterance to the target speaker's voice

In [6]:
# Alternative source utterance:
# src="/mnt/cephfs/hjh/common_dataset/tts/english/microsoft/wavs_22050/v8/en-US-CoraNeural_1624615432771.wav"
src="/mnt/cephfs/hjh/train_record/vc/freevc/test_wavs/0O7dZN.ogg.wav"
tgt="/mnt/cephfs/hjh/common_dataset/tts/english/microsoft/wavs_22050/v6/en-US-JacobNeural_1624608643404.wav"

# Inference only: disable autograd so no gradient state is kept while running
# the content model and the synthesizer.
with torch.no_grad():
    # tgt: load at the model's sampling rate and trim with a 20 dB threshold.
    wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
    wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
    # Keep the NumPy waveform for playback; in the branch without a speaker
    # encoder `wav_tgt` is rebound to a CUDA tensor, which ipd.Audio cannot
    # convert to NumPy.
    wav_tgt_np = wav_tgt
    if hps.model.use_spk:
        # Condition on a speaker embedding of the target utterance.
        g_tgt = smodel.embed_utterance(wav_tgt)
        g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).cuda()
    else:
        # Condition on the target utterance's mel spectrogram instead.
        wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).cuda()
        mel_tgt = mel_spectrogram_torch(
            wav_tgt,
            hps.data.filter_length,
            hps.data.n_mel_channels,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            hps.data.mel_fmin,
            hps.data.mel_fmax
        )

    # src
    wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
    if hps.model.use_spk:
        # Source speaker embedding; not used by net_g.infer below, kept for
        # the infer_v2 cell.
        g_src = smodel.embed_utterance(wav_src)
        g_src = torch.from_numpy(g_src).unsqueeze(0).cuda()
    else:
        # The speaker encoder is only loaded when use_spk is set, so there is
        # no source embedding in this configuration.
        g_src = None

    # From here on `wav_src` is a CUDA tensor with a leading batch dimension.
    wav_src = torch.from_numpy(wav_src).unsqueeze(0).cuda()
    c = utils.get_content(cmodel, wav_src)

    if hps.model.use_spk:
        audio = net_g.infer(c, g=g_tgt)
    else:
        audio = net_g.infer(c, mel=mel_tgt)
# First batch item, first channel, as a float NumPy array for playback.
audio = audio[0][0].data.cpu().float().numpy()

print("原音频:")
ipd.display(ipd.Audio(wav_src.cpu(), rate=hps.data.sampling_rate))
print('目标音频:')
ipd.display(ipd.Audio(wav_tgt_np, rate=hps.data.sampling_rate))
print('转换后:')
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate))

原音频:


目标音频:


转换后:


# infer_v2: conversion from the source spectrogram with source and target speaker embeddings

In [7]:
# Relies on state left by the infer cell: `wav_src` (the CUDA waveform tensor)
# and the speaker embeddings `g_src` / `g_tgt`.
# Inference only: disable autograd for the spectrogram and synthesizer calls.
with torch.no_grad():
    spec_src = spectrogram_torch(
        wav_src,
        hps.data.filter_length,
        hps.data.sampling_rate,
        hps.data.hop_length,
        hps.data.win_length
    )
    # Length of the spectrogram's last dimension, as a one-element tensor.
    spec_lens = torch.LongTensor([spec_src.size(-1)]).cuda()

    audio_v2 = net_g.infer_v2(spec_src, spec_lens, g_src, g_tgt)

# Play the infer_v2 result (the cell previously replayed `audio` from the
# infer cell, so this output was never heard).
# NOTE(review): assumes infer_v2's return value can be indexed like
# net_g.infer's (`[0][0]` yields the waveform) -- confirm against the model.
audio_v2_np = audio_v2[0][0].data.cpu().float().numpy()
ipd.display(ipd.Audio(audio_v2_np, rate=hps.data.sampling_rate))