In [1]:
import pickle
import os
import torch
import numpy as np
import IPython.display as ipd
from util.evaluate import Evaluator
from factory.AutoVC import AutoVC
from factory.MetaConv import MetaConv
from factory.AutoVC_Adjust import AutoVC_Adjust

In [2]:
class Config:
    """Evaluation settings plus the test metadata read from a dataset directory.

    All values are exposed as plain instance attributes. Most of them are not
    used inside this class; they are presumably consumed by ``Evaluator`` —
    confirm against ``util.evaluate``.
    """

    def __init__(self,root):
        """Populate the settings and load ``{root}/test.pkl``.

        Args:
            root: Dataset directory. Its immediate sub-directories are used as
                speaker names, and it must contain a pickled ``test.pkl``.

        Raises:
            StopIteration: if ``root`` cannot be walked (``os.walk`` yields
                nothing for a missing directory).
            FileNotFoundError: if ``{root}/test.pkl`` does not exist.
        """
        self.root = root
        self.num_speaker = 40
        self.batch_size = 2
        # Spelling kept as-is: code outside this cell may read this attribute.
        self.erroment_uttr_idx = 16
        self.max_uttr_idx = 60
        self.len_crop = 176
        self.device = "cpu"
        self.judge = None
        self.x_fid_style = None
        # First tuple yielded by os.walk is (root, dirnames, filenames); the
        # directory names are truncated to num_speaker *before* sorting, so
        # when root holds more directories than that, which ones are kept
        # follows the (platform-dependent) listing order.
        speaker_dirs = next(os.walk(root))[1]
        self.all_speaker = sorted(speaker_dirs[:self.num_speaker])
        # Close the file deterministically instead of leaking the handle.
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(f'{root}/test.pkl', "rb") as handle:
            self.metadata = pickle.load(handle)

In [3]:
# Build the shared settings object; the path is relative to the notebook's
# working directory and must contain the speaker folders and test.pkl.
config = Config('test_data_vctk_ver2')

In [4]:
# Baseline AutoVC model, restored from model/autovc_en.pt onto config.device.
org = AutoVC(44,256,512,22).to(config.device)
org.load_state_dict(torch.load("model/autovc_en.pt", map_location=config.device))

# NOTE(review): `conv` uses the same class (AutoVC), the same constructor
# arguments and the same checkpoint (model/autovc_en.pt) as `org`, although
# later cells label its output "MetaConv" and the imported MetaConv class is
# never instantiated. Confirm whether MetaConv with its own checkpoint was meant.
conv = AutoVC(44,256,512,22).to(config.device)
conv.load_state_dict(torch.load("model/autovc_en.pt", map_location=config.device))

<All keys matched successfully>

In [5]:
# AutoVC_Adjust variant, restored from model/autovc_adjust_en.pt.
org_ad = AutoVC_Adjust(44,256,512,22).to(config.device)
org_ad.load_state_dict(torch.load("model/autovc_adjust_en.pt", map_location=config.device))
# Same architecture, restored from the "gan" checkpoint
# (model/autovc_adjust_gan_en.pt).
org_ad_gan = AutoVC_Adjust(44,256,512,22).to(config.device)
org_ad_gan.load_state_dict(torch.load("model/autovc_adjust_gan_en.pt", map_location=config.device))

<All keys matched successfully>

In [9]:
# Evaluation helper built from the shared config; later cells use its
# get_trans_mel, get_wavs and crop_mel methods.
E = Evaluator(config)

In [19]:
import random
# Draw the source (s) and target (t) speaker indices independently and
# uniformly. The upper bound follows config.num_speaker instead of a
# hard-coded 39, so the draw stays in range if the speaker count changes.
# NOTE(review): no seed is set, so each run picks a different pair, and s may
# equal t.
s = random.randint(0, config.num_speaker - 1)
t = random.randint(0, config.num_speaker - 1)

In [20]:
# Convert speaker `s` -> speaker `t` with each of the four models.
# The last positional flag is False for the AutoVC models and True for the
# AutoVC_Adjust models (presumably selects the adjust path; confirm in
# util.evaluate).
# NOTE(review): every call rebinds mel_source and mel_target, so only the values
# returned by the last call survive. This assumes get_trans_mel returns the same
# source/target mels for identical (s, t, 2) arguments — TODO confirm.
mel_source, mel_target, mel_trans_org = E.get_trans_mel(org,s,t,2,False)
mel_source, mel_target, mel_trans_conv = E.get_trans_mel(conv,s,t,2,False)
mel_source, mel_target, mel_trans_org_ad = E.get_trans_mel(org_ad,s,t,2,True)
mel_source, mel_target, mel_trans_org_ad_gan = E.get_trans_mel(org_ad_gan,s,t,2,True)

In [21]:
import soundfile as sf

In [22]:
def _mel_to_wav(mel):
    """Vocode one mel tensor and return the first waveform as a float32 array."""
    # Axes 1 and 2 are swapped before vocoding, exactly as in the display cells.
    audio = E.get_wavs(mel.transpose(2,1))
    return audio.detach().numpy()[0].astype(np.float32)


# Waveforms for the real target and for each model's conversion.
wav_target = _mel_to_wav(mel_target)
wav_org = _mel_to_wav(mel_trans_org)
wav_conv = _mel_to_wav(mel_trans_conv)
wav_org_ad = _mel_to_wav(mel_trans_org_ad)
wav_org_ad_gan = _mel_to_wav(mel_trans_org_ad_gan)

In [23]:
#sf.write("target.wav",wav_target,samplerate=22050)
#sf.write("org.wav",wav_org,samplerate=22050)
#sf.write("conv.wav",wav_conv,samplerate=22050)
#sf.write("org_ad.wav",wav_org_ad,samplerate=22050)
#sf.write("org_ad_gan.wav",wav_org_ad_gan,samplerate=22050)

# Source

In [24]:
# Playback widget for the source utterance, vocoded from its mel at 22050 Hz.
ipd.Audio(E.get_wavs(mel_source.transpose(2,1)),rate = 22050)

# Target

In [25]:
# Playback widget for the real target-speaker utterance at 22050 Hz.
ipd.Audio(E.get_wavs(mel_target.transpose(2,1)),rate = 22050)

# Trans

In [26]:
# AutoVC (MetaDV): conversion produced by the `org` model.
ipd.Audio(E.get_wavs(mel_trans_org.transpose(2,1)),rate = 22050)

In [27]:
# MetaConv (MetaDV): conversion produced by the `conv` model.
# NOTE(review): `conv` is instantiated as AutoVC with the autovc_en.pt
# checkpoint in the model-loading cell — confirm this label matches the model.
ipd.Audio(E.get_wavs(mel_trans_conv.transpose(2,1)),rate = 22050)

In [28]:
# AutoVC (MetaDV) + Adjust: conversion produced by the `org_ad` model.
ipd.Audio(E.get_wavs(mel_trans_org_ad.transpose(2,1)),rate = 22050)

In [29]:
# AutoVC (MetaDV) + Adjust + GAN: conversion produced by the `org_ad_gan` model.
ipd.Audio(E.get_wavs(mel_trans_org_ad_gan.transpose(2,1)),rate = 22050)

In [18]:
# Load the pickled model object used below to turn cropped mels into
# embeddings (element [1] of its output is what the later cells compare).
# map_location matches the other checkpoint loads in this notebook, so a
# checkpoint saved from a GPU still deserializes on a machine without one.
C = torch.load('model/static/metadv_vctk80.pt', map_location=config.device).to(config.device)

In [19]:
# Reference embedding of the real target utterance: first item returned by
# E.crop_mel is fed to C, and element [1] of C's output is kept.
# (presumably [1] is the speaker embedding — confirm against the model's forward)
target_emb_real = C(E.crop_mel(mel_target.squeeze().detach().numpy())[0])[1]

In [20]:
def _embed(mel):
    """Crop a mel tensor with E.crop_mel and return element [1] of C's output."""
    cropped = E.crop_mel(mel.squeeze().detach().numpy())[0]
    return C(cropped)[1]


# Embeddings of each model's converted utterance, in the same order as before.
emb_org = _embed(mel_trans_org)
emb_conv = _embed(mel_trans_conv)
emb_org_ad = _embed(mel_trans_org_ad)
emb_org_ad_gan = _embed(mel_trans_org_ad_gan)

## Cosine similarity

In [21]:
# Similarity between the real target embedding and the `org` conversion.
torch.nn.functional.cosine_similarity(target_emb_real,emb_org)

tensor([0.7981], grad_fn=<DivBackward0>)

In [22]:
# Similarity between the real target embedding and the `conv` conversion.
torch.nn.functional.cosine_similarity(target_emb_real,emb_conv)

tensor([0.7907], grad_fn=<DivBackward0>)

In [23]:
# Similarity between the real target embedding and the `org_ad` conversion.
torch.nn.functional.cosine_similarity(target_emb_real,emb_org_ad)

tensor([0.7898], grad_fn=<DivBackward0>)

In [24]:
# Similarity between the real target embedding and the `org_ad_gan` conversion.
torch.nn.functional.cosine_similarity(target_emb_real,emb_org_ad_gan)

tensor([0.8799], grad_fn=<DivBackward0>)

## Global variance

In [25]:
from librosa.feature import mfcc
def _mfcc_of(mel):
    """Run librosa's mfcc on a mel tensor; keep item 0 of the result, transposed."""
    # assumes the tensor layout is what librosa expects for S — TODO confirm
    coeffs = mfcc(S=mel.detach().numpy(),sr=22050)
    return coeffs[0].T


# NOTE(review): the "real" reference is computed from mel_source, not
# mel_target — confirm this is the intended baseline.
real_mfcc = _mfcc_of(mel_source)
org_mfcc = _mfcc_of(mel_trans_org)
conv_mfcc = _mfcc_of(mel_trans_conv)
org_ad_mfcc = _mfcc_of(mel_trans_org_ad)
org_ad_gan_mfcc = _mfcc_of(mel_trans_org_ad_gan)

In [26]:
from math import sqrt
def get_t_gv(t,mfcc):
    """Return the average distance of column ``t`` from its own mean.

    Despite the ``gv`` name, the value is a mean absolute deviation: each
    term is ``sqrt(d**2)``, i.e. ``|d|``, not a squared deviation.

    Args:
        t: Column index to read from every row.
        mfcc: 2-D array-like exposing ``.shape``; rows are iterated one by one.

    Returns:
        Sum of ``sqrt((row[t] - mean)**2)`` over all rows, divided by the
        number of rows.

    Raises:
        ZeroDivisionError: if ``mfcc`` has no rows.
    """
    n_rows = mfcc.shape[0]

    # Pass 1: mean of the selected column.
    column_total = 0
    for row in mfcc:
        column_total += row[t]
    column_mean = column_total / n_rows

    # Pass 2: accumulate each row's distance from that mean.
    deviation_total = 0
    for row in mfcc:
        offset = row[t] - column_mean
        deviation_total += sqrt(offset ** 2)

    return deviation_total / n_rows

In [27]:
# One list of per-coefficient scores for each feature matrix.
real_gv, org_gv, conv_gv, org_ad_gv, org_ad_gan_gv = [], [], [], [], []

# Pair every output list with the feature matrix it is computed from.
_gv_targets = (
    (real_gv, real_mfcc),
    (org_gv, org_mfcc),
    (conv_gv, conv_mfcc),
    (org_ad_gv, org_ad_mfcc),
    (org_ad_gan_gv, org_ad_gan_mfcc),
)

# Score coefficient indices 0..79 for every matrix, index by index.
for i in range(80):
    for _scores, _features in _gv_targets:
        _scores.append(get_t_gv(i, _features))

In [28]:
# Average the 80 per-coefficient scores of each system and print them
# between two separator lines.
_separator = "*************"
_report_lines = [
    f"Real Data {sum(real_gv)/80}",
    f"Autovc Convert: {sum(org_gv)/80}",
    f"MetaConv Convert: {sum(conv_gv)/80}",
    f"Autovc Adjust Convert: {sum(org_ad_gv)/80}",
    f"Autovc Adjust GAN Convert: {sum(org_ad_gan_gv)/80}",
]

print(_separator)
for _line in _report_lines:
    print(_line)
print(_separator)

*************
Real Data 0.5922074554479314
Autovc Convert: 0.5406275495844768
MetaConv Convert: 0.5406275495844768
Autovc Adjust Convert: 0.5433177324882854
Autovc Adjust GAN Convert: 0.5466762123810287
*************
