In [1]:
import numpy as np
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
from torchsummary import summary

import os
import pyworld
import librosa
import time
import matplotlib.pyplot as plt

from preprocess import *
from model import *

torch.manual_seed(0)
np.random.seed(0)

In [2]:
model_dir = "./model/model_mc32_fr1024"
model_name = "model_mc32_fr1024"

data_dir = "./data/voice_data"
voice_dir = ["F4", "F5", "F6", "M2"]

_from = voice_dir.index("M2")
_to = voice_dir.index("F4")

output_dir = "./converted_voices/test"
figure_dir = "./figure/model_mc32_fr1024"

In [3]:
sampling_rate = 16000
num_mcep = 36
frame_period = 5.0
n_frames = 1024 

In [4]:
def model_save(model, model_dir, model_name):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    torch.save(model.state_dict(), os.path.join(model_dir, model_name))
    
def model_load(model_dir, model_name):
    model = ACVAE()
    model.load_state_dict(torch.load(os.path.join(model_dir, model_name), map_location='cpu'))
    return model

In [5]:
def test(model, s_label, t_label):
    
    print("Test")
    print("Converted: " + voice_dir[s_label] + " -> " + voice_dir[t_label])
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        
    voice_path_s = os.path.join(data_dir, voice_dir[s_label])
    voice_path_t = os.path.join(data_dir, voice_dir[t_label])
    
    files = os.listdir(voice_path_s)
    
    for i, file in enumerate(files):
        if(file.count("wav") == 0):
            continue   
        if ((i+1) % 20 == 0):
            print(str(((i+1)*100)//len(files)) + " %")

        wav, _ = librosa.load(os.path.join(voice_path_s, file), sr = sampling_rate, mono = True)
        wav = wav_padding(wav = wav, sr = sampling_rate, frame_period = frame_period, multiple = 4)
        f0, timeaxis, sp, ap = world_decompose(wav = wav, fs = sampling_rate, frame_period = frame_period)
        coded_sp = world_encode_spectral_envelop(sp = sp, fs = sampling_rate, dim = num_mcep)
        coded_sp_transposed = coded_sp.T

        mcep_normalization_params_s = np.load(os.path.join(voice_path_s, "mcep_"+voice_dir[s_label]+".npz"))
        mcep_mean_s = mcep_normalization_params_s['mean']
        mcep_std_s = mcep_normalization_params_s['std']    
        mcep_normalization_params_t = np.load(os.path.join(voice_path_t, "mcep_"+voice_dir[t_label]+".npz"))
        mcep_mean_t = mcep_normalization_params_t['mean']
        mcep_std_t = mcep_normalization_params_t['std']

        coded_sp_norm = (coded_sp_transposed - mcep_mean_s) / mcep_std_s

        x = torch.Tensor(coded_sp_norm).view(1, 1, coded_sp_norm.shape[0], coded_sp_norm.shape[1])

        label_s_tensor = torch.Tensor(np.array([s_label])).view(1, 1)
        label_t_tensor = torch.Tensor(np.array([t_label])).view(1, 1)

        mu_enc, logvar_enc = model.encode(x, label_s_tensor)
        z_enc = model.reparameterize(mu_enc, logvar_enc)
        mu_dec, logvar_dec = model.decode(z_enc, label_t_tensor)
        z_dec = model.reparameterize(mu_dec, logvar_dec)
        z_dec = z_dec.data.numpy().reshape((coded_sp_norm.shape[0], coded_sp_norm.shape[1]))

        coded_sp_converted = z_dec * mcep_std_t + mcep_mean_t

        logf0s_normalization_params_s = np.load(os.path.join(voice_path_s, "log_f0_"+voice_dir[s_label]+".npz"))
        logf0s_mean_s = logf0s_normalization_params_s['mean']
        logf0s_std_s = logf0s_normalization_params_s['std']
        logf0s_normalization_params_t = np.load(os.path.join(voice_path_t, "log_f0_"+voice_dir[t_label]+".npz"))
        logf0s_mean_t = logf0s_normalization_params_t['mean']
        logf0s_std_t = logf0s_normalization_params_t['std']

        f0_converted = pitch_conversion(f0 = f0, mean_log_src = logf0s_mean_s, std_log_src = logf0s_std_s, mean_log_target = logf0s_mean_t, std_log_target = logf0s_std_t)

        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = world_decode_spectral_envelop(coded_sp = coded_sp_converted, fs = sampling_rate)
        wav_transformed = world_speech_synthesis(f0 = f0_converted, decoded_sp = decoded_sp_converted, ap = ap, fs = sampling_rate, frame_period = frame_period)
        librosa.output.write_wav(os.path.join(output_dir, os.path.basename(file)), wav_transformed, sampling_rate)

    print("Finish.")

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model = model_load(model_dir, model_name)

test(model, _from, _to)

cpu
Test
Converted: M2 -> F4


  f0_converted = np.exp((np.log(f0) - mean_log_src) / std_log_src * std_log_target + mean_log_target)
  f0_converted = np.exp((np.log(f0) - mean_log_src) / std_log_src * std_log_target + mean_log_target)


37 %
76 %
115 %
154 %
194 %
Finish.
