In [None]:
import numpy as np
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable

import os
import pyworld
import librosa
import time
import matplotlib.pyplot as plt

from preprocess import *
from model import *

In [None]:
# Checkpoint identifier: it names both the sub-directory under ./model and the
# checkpoint file inside it, and also the result folder for converted audio.
model_name = "model_lambda70_f2f3m1m2"
model_dir = "./model/{}".format(model_name)

# Root data folder and the voice sub-directories processed below it.
data_dir = "./data/voice_data"
voice_dir_list = ["F2", "F3", "M1", "M2"]

# Converted wav files are written underneath this folder.
output_dir = "./converted_voices/result/{}".format(model_name)

In [None]:
# Audio analysis settings passed to librosa and the WORLD helper functions.
sampling_rate = 16000
frame_period = 5.0  # presumably milliseconds (WORLD convention) - TODO confirm
num_mcep = 36       # presumably the mel-cepstrum order - TODO confirm against preprocess
n_frames = 1024

# Arguments forwarded to the ACVAE constructor when the model is rebuilt.
lambda_p = lambda_s = 70
nb_label = len(voice_dir_list)  # one label per entry of voice_dir_list

In [None]:
# Use CUDA when PyTorch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
def model_save(model, model_dir, model_name):
    """Write ``model``'s state dict to ``model_dir/model_name``.

    Args:
        model: object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
        model_dir: destination directory; created, including parents, when missing.
        model_name: file name of the checkpoint inside ``model_dir``.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(model_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(model_dir, model_name))
    
def model_load(model_dir, model_name):
    """Rebuild an ACVAE and restore its weights from ``model_dir/model_name``.

    The network is constructed from the notebook-level ``nb_label``, ``lambda_p``
    and ``lambda_s`` values. The checkpoint is mapped onto CUDA when PyTorch
    reports it as available (CPU otherwise), and the model is then moved to the
    notebook-level ``device``.

    Args:
        model_dir: directory containing the checkpoint.
        model_name: checkpoint file name inside ``model_dir``.

    Returns:
        The ACVAE instance with the loaded state dict.
    """
    net = ACVAE(nb_label, lambda_p, lambda_s)

    checkpoint_path = os.path.join(model_dir, model_name)
    location = 'cuda' if torch.cuda.is_available() else 'cpu'
    net.load_state_dict(torch.load(checkpoint_path, map_location=location))

    net.to(device)
    return net

In [None]:
def _load_mean_std(npz_path):
    """Return the ``mean`` and ``std`` arrays stored in the ``.npz`` file at ``npz_path``."""
    # The context manager closes the archive's file handle once both arrays are read.
    with np.load(npz_path) as stats:
        return stats['mean'], stats['std']


def _decode_to_array(model, z_enc, label_tensor, shape):
    """Decode ``z_enc`` for ``label_tensor``, sample from the result, return it as a NumPy array of ``shape``."""
    mu_dec, logvar_dec = model.decode(z_enc, label_tensor)
    sampled = model.reparameterize(mu_dec, logvar_dec)
    # .cpu() is a no-op for tensors already on the CPU, so no CUDA/CPU branching is needed.
    return sampled.detach().cpu().numpy().reshape(shape)


def conv_all(model):
    """Convert every ``.wav`` file of every voice into each of the other voices.

    For each source voice in ``voice_dir_list`` the wav files under
    ``data_dir/<source>`` are analysed once, and for every other voice two files
    are written to ``output_dir/<source>``: the converted utterance and a
    ``_nonconv`` reference synthesised from the unmodified spectral envelope
    (both use the converted f0).

    Relies on notebook-level settings (``data_dir``, ``output_dir``,
    ``voice_dir_list``, ``nb_label``, ``sampling_rate``, ``frame_period``,
    ``device``) and on helper functions that are not defined in this notebook
    (``wav_padding``, ``world_decompose``, ``world_decode_mc``,
    ``pitch_conversion``, ``world_speech_synthesis``) - presumably provided by
    the star imports at the top.

    Args:
        model: network exposing ``encode``, ``reparameterize`` and ``decode``;
            its inputs are moved to ``device`` before use.

    NOTE(review): the model's train/eval mode is left untouched here; if the
    network contains dropout or batch-norm layers, confirm whether
    ``model.eval()`` should be called before conversion.
    """
    print("Conversion Start.")

    # exist_ok makes directory creation idempotent without a separate existence check.
    os.makedirs(output_dir, exist_ok=True)

    # (file-name prefix, label) -> (mean, std); each statistics file is read at most once.
    stats_cache = {}

    def voice_stats(prefix, label):
        key = (prefix, label)
        if key not in stats_cache:
            voice = voice_dir_list[label]
            stats_cache[key] = _load_mean_std(os.path.join(data_dir, voice, prefix + voice + ".npz"))
        return stats_cache[key]

    for s_label in range(nb_label):
        source_name = voice_dir_list[s_label]

        output_label_dir = os.path.join(output_dir, source_name)
        os.makedirs(output_label_dir, exist_ok=True)

        voice_path_s = os.path.join(data_dir, source_name)

        # Select by extension rather than by substring so that names which merely
        # contain "wav" are not picked up; sorting gives a reproducible order.
        wav_files = sorted(
            name for name in os.listdir(voice_path_s)
            if os.path.splitext(name)[1].lower() == ".wav"
        )

        label_s_tensor = torch.Tensor(np.array([s_label])).view(1, 1).to(device)

        for done, file in enumerate(wav_files, start=1):
            # Everything up to the target loop depends only on the source file, so it
            # is computed once per file instead of once per (file, target) pair.
            # NOTE(review): assumes the external helpers do not modify f0/sp/ap in place.
            wav, _ = librosa.load(os.path.join(voice_path_s, file), sr = sampling_rate, mono = True)
            wav = librosa.util.normalize(wav, norm=np.inf, axis=None)
            wav = wav_padding(wav = wav, sr = sampling_rate, frame_period = frame_period, multiple = 4)
            f0, timeaxis, sp, ap, mc = world_decompose(wav = wav, fs = sampling_rate, frame_period = frame_period)

            mcep_mean_s, mcep_std_s = voice_stats("mcep_", s_label)
            mc_norm = (np.array(mc).T - mcep_mean_s) / mcep_std_s
            mcep_shape = (mc_norm.shape[0], mc_norm.shape[1])

            x = torch.Tensor(mc_norm).view(1, 1, mcep_shape[0], mcep_shape[1]).to(device)

            for t_label in range(nb_label):
                if t_label == s_label:
                    continue

                target_name = voice_dir_list[t_label]
                mcep_mean_t, mcep_std_t = voice_stats("mcep_", t_label)
                label_t_tensor = torch.Tensor(np.array([t_label])).view(1, 1).to(device)

                # Inference only: skip autograd bookkeeping.
                with torch.no_grad():
                    mu_enc, logvar_enc = model.encode(x, label_s_tensor)
                    z_enc = model.reparameterize(mu_enc, logvar_enc)
                    # Decoded with the target label (x^) ...
                    z_dec_t = _decode_to_array(model, z_enc, label_t_tensor, mcep_shape)
                    # ... and with the source label (x_).
                    z_dec_s = _decode_to_array(model, z_enc, label_s_tensor, mcep_shape)

                # Undo the normalisation with each voice's own statistics and decode
                # the result to a spectral envelope.
                mc_converted_t = np.ascontiguousarray((z_dec_t * mcep_std_t + mcep_mean_t).T)
                sp_converted_t = world_decode_mc(mc = mc_converted_t, fs = sampling_rate)
                mc_converted_s = np.ascontiguousarray((z_dec_s * mcep_std_s + mcep_mean_s).T)
                sp_converted_s = world_decode_mc(mc = mc_converted_s, fs = sampling_rate)

                # Scale the analysed envelope by the target/source ratio of the two decodings.
                sp_gained = np.multiply(sp, np.divide(sp_converted_t, sp_converted_s))

                logf0s_mean_s, logf0s_std_s = voice_stats("log_f0_", s_label)
                logf0s_mean_t, logf0s_std_t = voice_stats("log_f0_", t_label)

                f0_converted = pitch_conversion(f0 = f0, mean_log_src = logf0s_mean_s, std_log_src = logf0s_std_s, mean_log_target = logf0s_mean_t, std_log_target = logf0s_std_t)

                out_stem = os.path.join(output_label_dir, source_name+"_to_"+target_name+"_["+file+"]")

                # NOTE(review): librosa.output was removed in librosa 0.8.0, so these
                # calls need librosa < 0.8; soundfile.write is the documented replacement.
                wav_transformed = world_speech_synthesis(f0 = f0_converted, sp = sp_gained, ap = ap, fs = sampling_rate, frame_period = frame_period)
                librosa.output.write_wav(out_stem + ".wav", wav_transformed, sampling_rate)
                wav_source = world_speech_synthesis(f0 = f0_converted, sp = sp, ap = ap, fs = sampling_rate, frame_period = frame_period)
                librosa.output.write_wav(out_stem + "_nonconv.wav", wav_source, sampling_rate)

            # Report against the number of wav files actually processed so the
            # percentage is accurate and reaches 100 on the last file.
            if done % 10 == 0 or done == len(wav_files):
                print("{} ({}/{}) : {:.1f} % is done...".format(source_name, str(s_label+1), str(nb_label), done*100/len(wav_files)))
    print("Finish.")

In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Restore the trained network from disk, then run the conversion over all voices.
model = model_load(model_dir = model_dir, model_name = model_name)
conv_all(model = model)