In [1]:
import torch
import librosa
from utils.demo_utils import *
from audiotools import transforms as tfm

# Pre-processing chain passed to `normalize(...)` in the cells below:
# audiotools' VolumeNorm followed by RescaleAudio.
transform = tfm.Compose(tfm.VolumeNorm(), tfm.RescaleAudio())

# Load the TorchScript voice-conversion model and put it in eval mode
# (`Module.eval()` returns the module itself, so the call can be chained).
vc_model = torch.jit.load("pretrained/model-nc.ts").eval()

# Target-speaker controls and the value range listed for each of them:
#   GENDER:  -1.72 -> 1.94 (Male -> Female)
#   AGE:     -0.75 -> 3.50 (18 -> 90)
#   TREMBLE:   0.0 -> 12.0 (Tremble Amount)
#   AMBITUS:  0.25 -> 1.75 (Pitch Variance)
#   PITCH:     0.5 -> 2.0  (Pitch Shifting in Octaves)
#              NOTE(review): the value is handed to `set_pitch_mult`, so it is
#              presumably a frequency multiplier (0.5 = one octave down,
#              2.0 = one octave up) -- confirm.
gender, age = -0.1, 2.2
tremble, ambitus, pitch = 1.0, 1.0, 1.0


def _as_control(value):
    """Wrap a Python scalar in a one-element float32 tensor."""
    return torch.tensor([value], dtype=torch.float32)


# One tensor per control, in the form the model setters below are called with.
speaker_gender = _as_control(gender)
speaker_age = _as_control(age)
speaker_tremble = _as_control(tremble)
speaker_ambitus = _as_control(ambitus)
speaker_pitch = _as_control(pitch)

# Load the source utterance as 16 kHz mono; librosa returns a 1-D array, and the
# two unsqueeze(0) calls add two leading singleton dimensions -> (1, 1, num_samples).
x, sr = librosa.load("audio/librispeech2.wav", sr=16000, mono=True)
x = torch.tensor(x, dtype=torch.float32).unsqueeze(0).unsqueeze(0)

# NOTE(review): the other cells feed `normalize(x, transform)` to the model,
# this one feeds the raw signal -- confirm that the difference is intentional.
with torch.no_grad():
    # Configure the target voice from the control tensors defined above.
    vc_model.reset_pitch()
    vc_model.set_new_speaker(speaker_gender, speaker_age)
    vc_model.set_tremble_depth(speaker_tremble)
    vc_model.set_ambitus_scaler(speaker_ambitus)
    vc_model.set_pitch_mult(speaker_pitch)

    # Run the conversion inside no_grad as well: this is pure inference, so
    # there is no reason to record an autograd graph or to hand a
    # grad-tracking tensor to the audio display helper.
    out = vc_model(x)

display_audios([("INPUT", x, sr), ("CONVERTED", out, sr)])

In [4]:
# Load the target-speaker clip as 16 kHz mono.
t, sr = librosa.load("targets/p228_004.wav", sr=16000, mono=True)
# Drop the first 46000 samples (2.875 s at 16 kHz) and add two leading
# singleton dimensions. If the clip is shorter than that, the slice is empty.
t = torch.tensor(t[46000:], dtype=torch.float32).unsqueeze(0).unsqueeze(0)

with torch.no_grad():
    # Derive the target-speaker embedding directly from the target audio.
    vc_model.reset_pitch()
    vc_model.set_embedding_from_audio(t)

    # Keep the forward pass inside no_grad: this is inference only, so no
    # autograd graph should be recorded for the converted signal.
    out = vc_model(normalize(x, transform))

display_audios([("TARGET", t, sr), ("CONVERTED", out, sr)])

In [3]:
# Speaker id(s) to look up in the pre-computed speaker dictionary.
target = ['p231']

json_path = 'utils/speaker_dict.json'
# The helper returns three values; each is indexed with [0] below, so they are
# presumably sequences with one entry per requested speaker -- confirm.
speaker_embedding_avg, speaker_embedding_one, speaker_mean = get_speaker_embeddings_json(target, json_path)

print(f"Stats of {target[0]} -> "
      f"F0 Mean: {speaker_mean[0]:.2f}")

# Keep only the first (and only) requested speaker.
speaker_mean = torch.tensor([speaker_mean[0]], dtype=torch.float32)
speaker_embedding_avg = speaker_embedding_avg[0]

with torch.no_grad():
    vc_model.reset_pitch()
    vc_model.set_new_speaker_from_embedding(speaker_mean, speaker_embedding_avg)

    # Inference only: run the forward pass without recording an autograd graph.
    out = vc_model(normalize(x, transform))

# No target audio is loaded in this cell (the target comes from the JSON
# embedding), so show the source clip next to the conversion. The previous
# version displayed `t`, a clip left over from a different cell and a
# different speaker, labelled as the target.
display_audios([("INPUT", x, sr), ("CONVERTED", out, sr)])

Stats of p231 -> F0 Mean: 167.12


In [None]:
import torch
import librosa
from utils.demo_utils import *
from audiotools import transforms as tfm

# Same setup as the first cell, repeated here (apparently so that this cell can
# be run on its own): VolumeNorm followed by RescaleAudio as the pre-processing
# chain used by `normalize(...)`.
transform = tfm.Compose(tfm.VolumeNorm(), tfm.RescaleAudio())

# Load the TorchScript voice-conversion model and put it in eval mode
# (`Module.eval()` returns the module itself, so the call can be chained).
vc_model = torch.jit.load("pretrained/model-nc.ts").eval()

# Source utterance: 16 kHz mono with two leading singleton dimensions added.
x, sr = librosa.load("audio/librispeech2.wav", sr=16000, mono=True)
x = torch.tensor(x, dtype=torch.float32).unsqueeze(0).unsqueeze(0)

# Target-speaker clip: drop the first 46000 samples (2.875 s at 16 kHz).
# NOTE(review): an earlier cell loads "targets/p228_004.wav" instead of this
# path -- confirm which file is the current one.
t, sr = librosa.load("audio/target_p228.wav", sr=16000, mono=True)
t = torch.tensor(t[46000:], dtype=torch.float32).unsqueeze(0).unsqueeze(0)

with torch.no_grad():
    # Derive the target-speaker embedding directly from the target audio.
    vc_model.reset_pitch()
    vc_model.set_embedding_from_audio(t)

    # Inference only: keep the forward pass inside no_grad so that no autograd
    # graph is recorded for the converted signal.
    out = vc_model(normalize(x, transform))