## Import

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable

import os
import pyworld
import librosa
import time
import matplotlib.pyplot as plt

from preprocess import *
from model import *

# Fix the PyTorch and NumPy random seeds so that the random draws made later
# in this notebook (speaker/file/segment selection, sampling) start from a
# known state.
torch.manual_seed(0)
np.random.seed(0)

## Data Settings

In [None]:
# Experiment identifier; every output location below is derived from it.
model_name = "model_lambda70_f2f3m1m2"
# Directory that receives the saved model weights.
model_dir = "./model/{}".format(model_name)

# Root of the corpus: one sub-directory per speaker listed in voice_dir_list.
data_dir = "./data/voice_data"
# Speaker sub-directories; the position in this list is used as the label.
voice_dir_list = ["F2", "F3", "M1", "M2"]

# Destination of the wav files written during the periodic conversion test.
output_dir = "./converted_voices/test/{}_training_progress".format(model_name)
# Destination of the loss-curve images.
figure_dir = "./figure/{}".format(model_name)

## Model Settings

In [None]:
# Both weights are forwarded unchanged to the ACVAE constructor.
lambda_p = lambda_s = 70
# One label per speaker directory.
nb_label = len(voice_dir_list)

## Training Settings 

In [None]:
# --- optimisation ---------------------------------------------------------
num_epochs = 10000   # number of iterations of the training loop
batch_size = 4       # segments drawn per iteration

# Initial learning rate, followed by the successively smaller rates that the
# training loop switches to at fixed epochs.
learning_rate = 1e-3
learning_rate_ = 1e-4
learning_rate__ = 1e-5
learning_rate___ = 1e-6

# --- audio analysis -------------------------------------------------------
sampling_rate = 16000  # rate every wav file is loaded/resampled at
frame_period = 5.0     # forwarded to the WORLD analysis/synthesis helpers
num_envelope = 36      # not referenced elsewhere in this notebook
num_mcep = 36          # number of mel-cepstral coefficients per frame
n_frames = 1024        # length (in frames) of each training segment

## Preprocessing

In [None]:
# Run the preprocessing step once per speaker: a speaker is considered done
# when its "log_f0_<speaker>.npz" file is already present in its directory.
for v in voice_dir_list:
    speaker_dir = os.path.join(data_dir, v)
    stats_name = "log_f0_" + v + ".npz"
    if stats_name not in os.listdir(speaker_dir):
        print("Preprocess: " + v)
        preprocess_voice(speaker_dir, v)

## Functions

In [None]:
def model_save(model, model_dir, model_name):
    """Write the model's ``state_dict`` to ``model_dir/model_name``.

    Args:
        model: Object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
        model_dir: Target directory; created, including parents, if missing.
        model_name: File name of the checkpoint inside ``model_dir``.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(model_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(model_dir, model_name))
    
def model_load(model_dir, model_name):
    """Build an ACVAE with the notebook's settings and load saved weights.

    Args:
        model_dir: Directory containing the checkpoint.
        model_name: File name of the checkpoint inside ``model_dir``.

    Returns:
        The ACVAE instance with the stored ``state_dict`` loaded. The model
        is left on the device its constructor puts it on; callers move it
        with ``.to(device)`` if needed.
    """
    model = ACVAE(nb_label=nb_label, lambda_p=lambda_p, lambda_s=lambda_s)
    # map_location="cpu" lets a checkpoint written on a GPU machine be read on
    # a CPU-only host; load_state_dict copies the values into the parameters.
    state = torch.load(os.path.join(model_dir, model_name), map_location="cpu")
    model.load_state_dict(state)
    # The original built and filled the model but never handed it back.
    return model

In [None]:
def _save_loss_plot(x, losses, columns, labels, suffix, epoch):
    """Plot the given loss columns in one figure and save it twice.

    The figure is written to ``epoch_<epoch><suffix>.png`` and to
    ``result<suffix>.png`` inside ``figure_dir``, then closed.
    """
    fig = plt.figure()
    try:
        for col in columns:
            plt.plot(x, losses[:, col], label=labels[col])
        plt.legend(bbox_to_anchor=(1, 1), loc='upper right', borderaxespad=0)
        plt.savefig(figure_dir + "/" + "epoch_{:05}{}".format(epoch, suffix) + ".png")
        plt.savefig(figure_dir + "/" + "result" + suffix + ".png")
    finally:
        # Without this every call leaves its figures open; over a long
        # training run they pile up in memory.
        plt.close(fig)


def save_figure(losses, epoch):
    """Save loss-curve images for the training history collected so far.

    Four figures are produced in ``figure_dir``: all four losses together,
    the two AC losses, the L1 loss alone and the KLD loss alone. Each one is
    saved under an epoch-stamped name and under a fixed ``result*.png`` name
    that is overwritten on every call.

    Args:
        losses: Sequence that can be reshaped to ``(-1, 4)``; the columns are
            plotted as "L1", "KLD", "AC_p" and "AC_s" in that order.
        epoch: Epoch number used in the epoch-stamped file names.

    Note:
        Figures are closed after saving, so they are written to disk only and
        are not left open for interactive display.
    """
    os.makedirs(figure_dir, exist_ok=True)

    losses = np.array(losses).reshape(-1, 4)
    x = np.linspace(0, len(losses), len(losses))
    losses_label = ("L1", "KLD", "AC_p", "AC_s")

    # (columns to draw, file-name suffix) for each output figure.
    layouts = (
        ((0, 1, 2, 3), ""),
        ((2, 3), "_AC"),
        ((0,), "_L1"),
        ((1,), "_KLD"),
    )
    for columns, suffix in layouts:
        _save_loss_plot(x, losses, columns, losses_label, suffix, epoch)

In [None]:
def data_load(batchsize = 1, s = -1, t = -1):
    """Draw a random batch of normalised mel-cepstrum segments.

    For each batch item a speaker is chosen (randomly when ``s == -1``,
    otherwise speaker index ``s``), a random file whose name contains "wav"
    is analysed, and a random window of ``n_frames`` frames is cut from its
    mel-cepstrum after normalisation with the speaker's stored mean/std.
    Files that yield fewer than ``n_frames`` frames are rejected and another
    file is drawn.

    Args:
        batchsize: Number of segments to return.
        s: Speaker index into ``voice_dir_list``, or -1 for a random speaker
            per item.
        t: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(x, label)``: ``x`` is a float tensor viewed as
        ``(batchsize, 1, num_mcep, n_frames)`` and ``label`` a float tensor
        holding the speaker index of each item.

    Raises:
        FileNotFoundError: If the chosen speaker directory contains no file
            whose name includes "wav" (the original looped forever).

    Note:
        If every candidate file is shorter than ``n_frames`` frames the
        rejection loop still never terminates.
    """
    x = []
    label = []
    for _ in range(batchsize):
        label_num = np.random.randint(nb_label) if s == -1 else s
        speaker = voice_dir_list[label_num]
        voice_path = os.path.join(data_dir, speaker)
        files = os.listdir(voice_path)
        if not any(name.count("wav") for name in files):
            raise FileNotFoundError("No wav file found in " + voice_path)

        frames = 0
        while frames < n_frames:
            # Rejection-sample a file name containing "wav"; kept as a loop so
            # the sequence of random draws matches earlier runs.
            file = ""
            while file.count("wav") == 0:
                file = np.random.choice(files)
            wav, _ = librosa.load(os.path.join(voice_path, file), sr = sampling_rate, mono = True)
            wav = librosa.util.normalize(wav, norm=np.inf, axis=None)
            wav = wav_padding(wav = wav, sr = sampling_rate, frame_period = frame_period, multiple = 4)
            f0, timeaxis, sp, ap, mc = world_decompose(wav = wav, fs = sampling_rate, frame_period = frame_period, num_mcep = num_mcep)

            mc_transposed = np.array(mc).T
            frames = np.shape(mc_transposed)[1]

        # The context manager closes the .npz archive once the arrays are read.
        with np.load(os.path.join(voice_path, "mcep_" + speaker + ".npz")) as mcep_normalization_params:
            mcep_mean = mcep_normalization_params['mean']
            mcep_std = mcep_normalization_params['std']
        mc_norm = (mc_transposed - mcep_mean) / mcep_std

        # Random window of exactly n_frames frames.
        start_ = np.random.randint(frames - n_frames + 1)
        end_ = start_ + n_frames

        x.append(mc_norm[:, start_:end_])
        label.append(label_num)

    # Stack into one ndarray first: building a tensor straight from a list of
    # ndarrays goes through a slow element-wise path.
    batch = torch.Tensor(np.array(x)).view(batchsize, 1, num_mcep, n_frames)
    return batch, torch.Tensor(label)

In [None]:
def test_conv(model, epoch):
    """Convert one random utterance per speaker to every other speaker.

    For each source speaker a random file whose name contains "wav" is
    analysed once. For every other (target) speaker the mel-cepstrum is
    encoded with the source label and decoded with both the target and the
    source label; the ratio of the two decoded spectral envelopes is applied
    to the original envelope, the pitch is converted with the stored log-F0
    statistics, and two files are written to
    ``output_dir/epoch_<epoch>/``: the converted voice and a "_nonconv"
    version synthesised from the original envelope with the converted pitch.

    Args:
        model: Model exposing ``encode``, ``decode`` and ``reparameterize``.
        epoch: Epoch number used for the output sub-directory name.

    Raises:
        FileNotFoundError: If a speaker directory contains no file whose
            name includes "wav" (the original looped forever).
    """
    print("Test")

    output_epoch_dir = os.path.join(output_dir, "epoch_{:05}".format(epoch))
    os.makedirs(output_epoch_dir, exist_ok=True)

    for s_label in range(nb_label):
        name_s = voice_dir_list[s_label]
        voice_path_s = os.path.join(data_dir, name_s)

        files = os.listdir(voice_path_s)
        if not any(name.count("wav") for name in files):
            raise FileNotFoundError("No wav file found in " + voice_path_s)
        file = ""
        while file.count("wav") == 0:
            file = np.random.choice(files)

        print("Source File:" + file)

        # Everything below depends on the source utterance only, so it is done
        # once per source instead of once per target speaker.
        # NOTE(review): assumes the helper functions do not modify f0/sp/ap in
        # place — confirm against preprocess.py.
        wav, _ = librosa.load(os.path.join(voice_path_s, file), sr = sampling_rate, mono = True)
        wav = librosa.util.normalize(wav, norm=np.inf, axis=None)
        wav = wav_padding(wav = wav, sr = sampling_rate, frame_period = frame_period, multiple = 4)
        # num_mcep is passed explicitly so the analysis matches data_load().
        f0, _timeaxis, sp, ap, mc = world_decompose(wav = wav, fs = sampling_rate, frame_period = frame_period, num_mcep = num_mcep)

        mc_transposed = np.array(mc).T

        with np.load(os.path.join(voice_path_s, "mcep_" + name_s + ".npz")) as params:
            mcep_mean_s = params['mean']
            mcep_std_s = params['std']
        with np.load(os.path.join(voice_path_s, "log_f0_" + name_s + ".npz")) as params:
            logf0s_mean_s = params['mean']
            logf0s_std_s = params['std']

        mc_norm = (mc_transposed - mcep_mean_s) / mcep_std_s
        feat_shape = (mc_norm.shape[0], mc_norm.shape[1])

        x = torch.Tensor(mc_norm).view(1, 1, feat_shape[0], feat_shape[1]).to(device)
        label_s_tensor = torch.Tensor(np.array([s_label])).view(1, 1).to(device)

        for t_label in range(nb_label):

            if t_label == s_label:
                continue

            name_t = voice_dir_list[t_label]
            voice_path_t = os.path.join(data_dir, name_t)

            with np.load(os.path.join(voice_path_t, "mcep_" + name_t + ".npz")) as params:
                mcep_mean_t = params['mean']
                mcep_std_t = params['std']
            with np.load(os.path.join(voice_path_t, "log_f0_" + name_t + ".npz")) as params:
                logf0s_mean_t = params['mean']
                logf0s_std_t = params['std']

            label_t_tensor = torch.Tensor(np.array([t_label])).view(1, 1).to(device)

            # No gradients are needed here; skipping graph construction saves
            # memory on long utterances.
            with torch.no_grad():
                mu_enc, logvar_enc = model.encode(x, label_s_tensor)
                z_enc = model.reparameterize(mu_enc, logvar_enc)
                # x^ : decoded with the target label
                mu_dec_t, logvar_dec_t = model.decode(z_enc, label_t_tensor)
                z_dec_t = model.reparameterize(mu_dec_t, logvar_dec_t)
                # x_ : decoded with the source label
                mu_dec_s, logvar_dec_s = model.decode(z_enc, label_s_tensor)
                z_dec_s = model.reparameterize(mu_dec_s, logvar_dec_s)

            # .cpu() is a no-op for CPU tensors, so no device branch is needed.
            z_dec_t = z_dec_t.cpu().numpy().reshape(feat_shape)
            z_dec_s = z_dec_s.cpu().numpy().reshape(feat_shape)

            # Undo the normalisation with the statistics of the speaker each
            # output was decoded for, then rebuild spectral envelopes.
            mc_converted_t = np.ascontiguousarray((z_dec_t * mcep_std_t + mcep_mean_t).T)
            sp_converted_t = world_decode_mc(mc = mc_converted_t, fs = sampling_rate)
            mc_converted_s = np.ascontiguousarray((z_dec_s * mcep_std_s + mcep_mean_s).T)
            sp_converted_s = world_decode_mc(mc = mc_converted_s, fs = sampling_rate)

            # Apply the target/source envelope ratio to the original envelope.
            sp_gained = np.multiply(sp, np.divide(sp_converted_t, sp_converted_s))

            f0_converted = pitch_conversion(f0 = f0, mean_log_src = logf0s_mean_s, std_log_src = logf0s_std_s, mean_log_target = logf0s_mean_t, std_log_target = logf0s_std_t)

            out_stem = name_s + "_to_" + name_t + "_[" + file + "]"

            # NOTE(review): librosa.output was removed in librosa 0.8; these
            # calls require an older librosa — confirm the pinned version.
            wav_transformed = world_speech_synthesis(f0 = f0_converted, sp = sp_gained, ap = ap, fs = sampling_rate, frame_period = frame_period)
            librosa.output.write_wav(os.path.join(output_epoch_dir, out_stem + ".wav"), wav_transformed, sampling_rate)
            wav_source = world_speech_synthesis(f0 = f0_converted, sp = sp, ap = ap, fs = sampling_rate, frame_period = frame_period)
            librosa.output.write_wav(os.path.join(output_epoch_dir, out_stem + "_nonconv.wav"), wav_source, sampling_rate)

            print("Converted: " + name_s + " -> " + name_t)


## Training

In [None]:
# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model = ACVAE(nb_label=nb_label, lambda_p=lambda_p, lambda_s=lambda_s).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

model.train()

# One entry (the loss_list returned by calc_loss) per iteration.
losses = []

# Epoch at which the optimizer switches to the given learning rate.
lr_milestones = {
    3000: learning_rate_,
    6000: learning_rate__,
    8000: learning_rate___,
}

# Epochs are counted from 1 so the periodic checks fire on 100, 200, ...
for epoch in range(1, num_epochs + 1):

    if epoch in lr_milestones:
        learning_rate = lr_milestones[epoch]
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate

    start_time = time.time()

    print('Epoch: %d' % epoch)

    # One "epoch" here is a single optimisation step on one random batch.
    x_, label_ = data_load(batch_size)
    optimizer.zero_grad()
    loss, loss_list = model.calc_loss(x_, label_)
    loss.backward()
    losses.append(loss_list)
    optimizer.step()

    checkpoint_due = epoch % 100 == 0
    if checkpoint_due:
        # Write test conversions, then refresh the rolling checkpoint.
        test_conv(model, epoch)
        model_save(model, model_dir, model_name)
    if epoch % 2000 == 0:
        # Additionally keep an epoch-stamped checkpoint.
        model_save(model, model_dir, model_name + "_" + str(epoch))
    if checkpoint_due:
        save_figure(losses, epoch)

    elapsed_time = time.time() - start_time
    hours = elapsed_time // 3600
    minutes = elapsed_time % 3600 // 60
    seconds = elapsed_time % 60 // 1
    print('Time Elapsed for one epoch: %02d:%02d:%02d' % (hours, minutes, seconds))

# Final checkpoint and loss curves after the last iteration.
model_save(model, model_dir, model_name)

save_figure(losses, epoch)


In [None]:
# Scratch cell: draw a batch of two segments (a) with their speaker labels (b).
a, b= data_load(2)

In [None]:
# Scratch cell: show the shape of the batch tensor drawn above.
np.shape(a)

In [None]:
# Scratch cell: histogram of all values in the first segment of the batch.
plt.hist(np.array(a[0,0].flatten()))