In [4]:
import ffmpeg
import numpy as np
from fairseq import checkpoint_utils
import torch
import faiss
from scipy import signal
from time import time
import torch.nn.functional as F
from models import SynthesizerTrn
import librosa

def load_audio(file, sr):
    """Decode ``file`` through ffmpeg into a flat float32 numpy waveform.

    The ffmpeg pipeline is asked for a single channel (``ac=1``) of raw
    little-endian 32-bit float PCM at ``sr`` Hz, written to stdout.

    Args:
        file: Path (or anything ``ffmpeg.input`` accepts) of the audio to read.
        sr: Sample rate, in Hz, requested from ffmpeg for the output.

    Returns:
        A 1-D ``np.float32`` array holding the decoded samples.
    """
    # Describe the pipeline: input -> raw f32le PCM on stdout ("-").
    pipeline = ffmpeg.input(file, threads=0)
    pipeline = pipeline.output(
        "-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr
    )

    # stderr is captured as well but deliberately unused here.
    raw_pcm, _stderr = pipeline.run(
        cmd=["ffmpeg", "-nostdin"],
        capture_stdout=True,
        capture_stderr=True,
    )

    # frombuffer yields a read-only view of the bytes; flatten() copies it.
    samples = np.frombuffer(raw_pcm, np.float32)
    return samples.flatten()

def change_rms(data1, sr1, data2, sr2, rate):
    """Rescale ``data2`` so its loudness envelope is blended with that of ``data1``.

    The RMS envelope of each waveform is measured with ``librosa.feature.rms``
    using frames of ``sr // 2 * 2`` samples hopped by ``sr // 2`` samples. Both
    envelopes are linearly interpolated to one value per sample of ``data2``
    and combined into the per-sample gain
    ``rms1 ** (1 - rate) * rms2 ** (rate - 1)``. With ``rate == 1`` the gain is
    1 everywhere; with ``rate == 0`` it is ``rms1 / rms2``.

    Args:
        data1: 1-D numpy waveform supplying the reference envelope.
        sr1: Sample rate of ``data1`` in Hz.
        data2: 1-D numpy waveform to be rescaled.
        sr2: Sample rate of ``data2`` in Hz.
        rate: Mixing exponent described above.

    Returns:
        A new array with the gain applied. ``data2`` itself is left untouched
        (the previous implementation multiplied it in place, which silently
        altered the caller's buffer and failed on read-only or integer
        arrays). A floating-point ``data2`` keeps its dtype.
    """
    n_samples = data2.shape[0]

    def _per_sample_envelope(waveform, sr):
        # rms() returns (1, n_frames); interpolate needs (batch, channel, length).
        frames = librosa.feature.rms(
            y=waveform, frame_length=sr // 2 * 2, hop_length=sr // 2
        )
        envelope = torch.from_numpy(frames).unsqueeze(0)
        return F.interpolate(envelope, size=n_samples, mode="linear").squeeze()

    rms1 = _per_sample_envelope(data1, sr1)
    rms2 = _per_sample_envelope(data2, sr2)
    # Floor the target envelope so the negative exponent below cannot divide by zero.
    rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)

    gain = (
        torch.pow(rms1, torch.tensor(1 - rate))
        * torch.pow(rms2, torch.tensor(rate - 1))
    ).numpy()

    scaled = data2 * gain
    if data2.dtype.kind == "f":
        # Match what the old in-place multiply produced: the input's own dtype.
        scaled = scaled.astype(data2.dtype, copy=False)
    return scaled

# ---- Runtime configuration -------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# Use fp16 only when running on the GPU; on the CPU fallback keep everything
# in fp32 so the models do not depend on half-precision CPU kernels.
is_half = device == "cuda"

# ---- Retrieval index -------------------------------------------------------
file_index = 'VCTK-Corpus-0.92/p225/added.index'
index = faiss.read_index(file_index)
# Pull every stored vector out of the index so search hits (row ids) can be
# mapped back to feature vectors later.
big_npy = index.reconstruct_n(0, index.ntotal)

# ---- HuBERT feature extractor ----------------------------------------------
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
    ["pretrained/hubert_base.pt"],
    suffix="",
)
hubert_model = models[0]
hubert_model = hubert_model.to(device)
if is_half:
    hubert_model = hubert_model.half()
else:
    hubert_model = hubert_model.float()
# The loader does not switch the module to inference mode. Without eval() the
# dropout / layerdrop settings shown in the model config stay active and make
# the extracted features noisy and non-deterministic.
hubert_model.eval()

# ---- Voice (synthesizer) checkpoint ----------------------------------------
person = 'VCTK-Corpus-0.92/p225/p225.pth'
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
cpt = torch.load(person, map_location="cpu")
# The last config entry is used below as the sample rate of the generated audio.
tgt_sr = cpt["config"][-1]
# Overwrite config[-3] with the row count of the emb_g embedding table
# (presumably the number of speakers -- confirm against SynthesizerTrn).
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]

net_g = SynthesizerTrn(*cpt["config"], is_half=is_half)
# enc_q is dropped before loading weights (presumably a training-only
# sub-module -- confirm); strict=False tolerates the resulting key mismatch.
del net_g.enc_q
net_g.load_state_dict(cpt["weight"], strict=False)
net_g.eval().to(device)
if is_half:
    net_g = net_g.half()
else:
    net_g = net_g.float()

# ---- Conversion settings ---------------------------------------------------
window = 160  # samples per frame; the padded length is divided by this to get the frame count
f0_up_key = 0  # NOTE(review): not read anywhere in the visible cells
f0_up_key = int(f0_up_key)
sr = 16000  # sample rate the source audio is decoded at
x_pad = 3  # seconds of reflect padding added on each side
t_pad = sr * x_pad  # padding length in source samples
t_pad_tgt = tgt_sr * x_pad  # same padding expressed in output samples
rms_mix_rate = 0.25
index_rate = 0.75
speaker_id = 0
s = 0  # NOTE(review): not read anywhere in the visible cells
audio_opt = []
t = None  # slice start used as audio_pad[t:]; None selects the whole signal
times = [0, 0, 0]  # accumulated seconds: [feature extraction, preparation, synthesis]

# ---- Load and condition the source audio -----------------------------------
audio = load_audio('VCTK-Corpus-0.92/wav48_silence_trimmed/p228/p228_002_mic1.flac', sr)
# Only attenuate: scale down when the peak exceeds 0.95, never amplify.
audio_max = np.abs(audio).max() / 0.95
if audio_max > 1:
    audio /= audio_max

# 5th-order 48 Hz high-pass, applied forward and backward (zero phase).
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=sr)
audio = signal.filtfilt(bh, ah, audio)

# ---- Pad and build the speaker-id tensor (timed into times[1]) -------------
# The earlier short (window // 2) pad and the early p_len were overwritten
# before being read, so only the padding that is actually used is computed.
t1 = time()
audio_pad = np.pad(audio, (t_pad, t_pad), mode="reflect")
sid = torch.tensor(speaker_id, device=device).unsqueeze(0).long()
t2 = time()
times[1] += t2 - t1

# ---- HuBERT feature extraction ---------------------------------------------
feats = torch.from_numpy(audio_pad[t:])
feats = feats.half() if is_half else feats.float()
if feats.dim() == 2:  # double channels
    feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim()
feats = feats.view(1, -1)
# All-False mask: no position of the single batch item is padding.
padding_mask = torch.zeros(feats.shape, dtype=torch.bool, device=device)
inputs = {
    "source": feats.to(device),
    "padding_mask": padding_mask,
    "output_layer": 12
}
t0 = time()
with torch.no_grad():
    logits = hubert_model.extract_features(**inputs)
    feats = logits[0]

# ---- Retrieval: blend each frame with its nearest index entries ------------
npy = feats[0].cpu().numpy()
if is_half:
    # The index is searched with float32 queries.
    npy = npy.astype("float32")

score, ix = index.search(npy, k=8)
# Inverse-square distance weights. The distance is floored so an exact hit
# (distance 0) cannot produce inf weights and NaN features after normalising.
weight = np.square(1 / np.maximum(np.abs(score), 1e-6))
weight /= weight.sum(axis=1, keepdims=True)
npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
if is_half:
    npy = npy.astype("float16")

# index_rate controls how much of the retrieved features replaces the original.
feats = (
    torch.from_numpy(npy).unsqueeze(0).to(device) * index_rate
    + (1 - index_rate) * feats
)
# Double the frame count along the time axis.
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
t1 = time()

# ---- Synthesis -------------------------------------------------------------
# Frame count is the smaller of the audio-derived count and the feature length.
p_len = audio_pad[t:].shape[0] // window
if feats.shape[1] < p_len:
    p_len = feats.shape[1]
p_len = torch.tensor([p_len], device=device).long()
with torch.no_grad():
    arg = (feats, p_len, sid)
    audio1 = (net_g.infer(*arg)[0][0, 0]).data.cpu().float().numpy()
    del arg
del feats, p_len, padding_mask
if torch.cuda.is_available():
    torch.cuda.empty_cache()

t2 = time()
times[0] += t1 - t0
times[2] += t2 - t1

# Strip the padding (expressed in output samples) from both ends.
audio_opt.append(audio1[t_pad_tgt : -t_pad_tgt])
audio_opt = np.concatenate(audio_opt)

# Loudness matching against the (filtered) source; a mix rate of exactly 1
# skips the step.
if rms_mix_rate != 1:
    audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)

# Convert to int16 PCM. The scale factor is only reduced when the peak is
# above 0.99, so quieter signals keep their level.
audio_max = np.max(np.abs(audio_opt)) / 0.99
max_int16 = 32768 / audio_max if audio_max > 1 else 32768
audio_opt = (audio_opt * max_int16).astype("int16")

# Release the speaker-id tensor and any cached GPU memory.
del sid
if torch.cuda.is_available():
    torch.cuda.empty_cache()

2024-03-24 06:33:51 | INFO | fairseq.tasks.hubert_pretraining | current directory is /workspace/rvc_update
2024-03-24 06:33:51 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': 'metadata', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2024-03-24 06:33:51 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.1, 'dropout_features': 0.1, 'final_d

In [5]:
import IPython.display as ipd
# Inline player for the converted waveform at the synthesizer's sample rate.
ipd.Audio(data=audio_opt, rate=tgt_sr)


In [6]:
import soundfile as sf

# Persist the converted int16 waveform so later cells can read it from disk.
audio_file_path = 'output_audio.wav'
sf.write(audio_file_path, audio_opt, tgt_sr)


In [26]:
import torchaudio
from speechbrain.inference.speaker import SpeakerRecognition

# Speaker-verification model, fetched from the speechbrain HuggingFace repo
# and cached under ``tmpdir``.
verification = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir='tmpdir',
)

audio_path_1 = 'output_audio.wav'
audio_path_2 = 'VCTK-Corpus-0.92/wav48_silence_trimmed/p225/p225_009_mic1.flac'
audio_path_3 = 'VCTK-Corpus-0.92/wav48_silence_trimmed/p231/p231_009_mic1.flac'
audio_path_4 = 'VCTK-Corpus-0.92/wav48_silence_trimmed/p231/p231_020_mic1.flac'

# NOTE(review): the pair compared below is audio_path_3 vs audio_path_4;
# audio_path_1 (the converted output) and audio_path_2 are defined but not
# used in this cell -- confirm this is the intended comparison.
signal1, fs1 = torchaudio.load(audio_path_3)
signal2, fs2 = torchaudio.load(audio_path_4)

# Bring both recordings to 16 kHz, resampling only when needed.
resampled = []
for waveform, rate in ((signal1, fs1), (signal2, fs2)):
    if rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=rate, new_freq=16000)
        waveform = resampler(waveform)
    resampled.append(waveform)
signal1, signal2 = resampled

score, prediction = verification.verify_batch(signal1, signal2)

print(f"Score: {score}")


2024-03-24 06:43:03 | INFO | speechbrain.utils.fetching | Fetch hyperparams.yaml: Using existing file/symlink in tmpdir/hyperparams.yaml.
2024-03-24 06:43:03 | INFO | speechbrain.utils.fetching | Fetch custom.py: Delegating to Huggingface hub, source speechbrain/spkrec-ecapa-voxceleb.
2024-03-24 06:43:04 | INFO | speechbrain.utils.fetching | Fetch embedding_model.ckpt: Using existing file/symlink in tmpdir/embedding_model.ckpt.
2024-03-24 06:43:04 | INFO | speechbrain.utils.fetching | Fetch mean_var_norm_emb.ckpt: Using existing file/symlink in tmpdir/mean_var_norm_emb.ckpt.
2024-03-24 06:43:04 | INFO | speechbrain.utils.fetching | Fetch classifier.ckpt: Using existing file/symlink in tmpdir/classifier.ckpt.
2024-03-24 06:43:04 | INFO | speechbrain.utils.fetching | Fetch label_encoder.txt: Using existing file/symlink in tmpdir/label_encoder.ckpt.
2024-03-24 06:43:04 | INFO | speechbrain.utils.parameter_transfer | Loading pretrained files for: embedding_model, mean_var_norm_emb, classif

Score: tensor([[0.7612]])
