In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import librosa
import numpy as np
import os
import glob
import json
import soundfile as sf

import torch
from text import text_to_sequence, cmudict
from text.symbols import symbols
import commons
import attentions
import modules
import models
import utils
import random

import sys
sys.path.append('./hifi_gan/')

from hifi_gan.env import AttrDict
from hifi_gan.meldataset import MAX_WAV_VALUE
from hifi_gan.models import Generator

def load_checkpoint(filepath, device):
    """Read a serialized checkpoint from disk and return its contents.

    Args:
        filepath: Path of the checkpoint file; must be an existing regular file.
        device: Forwarded to ``torch.load`` as ``map_location``.

    Returns:
        The object that ``torch.load`` deserializes from ``filepath``.

    Raises:
        AssertionError: If ``filepath`` does not point to an existing file.
    """
    checkpoint_exists = os.path.isfile(filepath)
    assert checkpoint_exists
    print(f"Loading '{filepath}'")
    loaded = torch.load(filepath, map_location=device)
    print("Complete.")
    return loaded

In [2]:
# from pyannote.audio import Model
# from pyannote.audio import Inference


# model = Model.from_pretrained("pyannote/embedding",  use_auth_token=os.environ["HF_TOKEN"])  # token redacted: never commit credentials; revoke the one previously stored here
# inference = Inference(model, window="whole")

# spk_embed = torch.Tensor(inference("untitled.wav").reshape(1, -1)).to("cuda")

In [3]:
# Build the HiFi-GAN vocoder: read its JSON config, instantiate the generator
# on the GPU, restore the trained weights and switch it to inference mode.
device = torch.device('cuda')

config_file = os.path.join("/run/media/viblab/Markov2/Pras/Thesis/TryMyOwn/hifi-gan/cp_hifigan_bak", 'config.json')
with open(config_file) as f:
    json_config = json.load(f)
h = AttrDict(json_config)

generator = Generator(h).to(device)

# The checkpoint is indexed with the 'generator' key to get the vocoder weights.
state_dict_g = torch.load(
    "/run/media/viblab/Markov2/Pras/Thesis/TryMyOwn/hifi-gan/cp_hifigan_bak/g_02505000",
    map_location=device,
)
generator.load_state_dict(state_dict_g['generator'])

generator.eval()
generator.remove_weight_norm()

Removing weight norm...


In [4]:
# Restore the acoustic model (FlowGenerator) from the latest checkpoint found
# in the training run directory.

# If you are using your own trained model
model_dir = "./logs/base_blank_emo_lang/"
hps = utils.get_hparams_from_dir(model_dir)
checkpoint_path = utils.latest_checkpoint_path(model_dir)

# If you are using a provided pretrained model
# hps = utils.get_hparams_from_file("configs/base_blank_ms.json")
# checkpoint_path = "G_76.pth"

# The vocabulary grows by one entry when the "add_blank" option is enabled.
n_vocab = len(symbols) + getattr(hps.data, "add_blank", False)
model = models.FlowGenerator(
    n_vocab,
    out_channels=hps.data.n_mel_channels,
    n_lang=hps.data.n_lang,
    **hps.model)
model = model.to("cuda")

utils.load_checkpoint(checkpoint_path, model)
model.decoder.store_inverse()  # do not calculate jacobians for fast decoding
_ = model.eval()

# cmu_dict = cmudict.CMUDict(hps.data.cmudict_path)


./logs/base_blank_emo_lang/G_144.pth
Use Speaker Embed Linear Norm
Use Multilanguage Cathegorical


In [34]:
# Pick a random utterance from the test filelist, load its pre-computed
# speaker/emotion embeddings, and synthesize a mel-spectrogram for `tst_stn`.
with open("filelists/combine_audio_sid_text_test_filelist.txt", "r", encoding="utf-8") as txt_file:
    # Skip blank lines so random.choice can never return an entry without fields.
    lines = [line for line in txt_file if line.strip()]

root_database = "/run/media/viblab/Markov2/Pras/Thesis/Database/dataset_name"
# Entries are "|"-separated: field 0 is the wav path, field 1 is used as the
# language id, field 2 is the transcription.
random_test = random.choice(lines).rstrip().split("|")
wav_path = random_test[0]
transcription = random_test[2]
lid = torch.IntTensor([int(random_test[1])]).to("cuda")
embeds_filename = wav_path.split("/")[-1].split(".")[0]
# NOTE(review): index 8 assumes wav_path has the same directory depth as
# root_database, so that this component is the dataset folder — confirm.
database_name = wav_path.split("/")[8]
database_dir = root_database.replace('dataset_name', database_name)

# Embeddings are stored per utterance as .npy files; reshape to a single row.
spk_emb_src = torch.Tensor(np.load(f"{database_dir}/spk_embeds/{embeds_filename}.npy")).reshape(1,-1).to("cuda")
emo_emb = torch.Tensor(np.load(f"{database_dir}/emo_embeds/{embeds_filename}.npy")).reshape(1,-1).to("cuda")

# Custom sentence to synthesize. This deliberately overrides the language id
# taken from the sampled entry above; comment these two lines out and enable
# the line below to synthesize the sampled transcription instead.
tst_stn = "Alkisah, hiduplah seorang gadis cantik dan baik bernama Cinderella,, Sejak ayahnya meninggal dunia, Cinderella hidup di sebuah rumah besar bersama ibu dan dua saudara tirinya."
lid = torch.IntTensor([int(2)]).to("cuda")
#tst_stn = transcription

if getattr(hps.data, "add_blank", False):
    text_norm = text_to_sequence(tst_stn.strip(), ['english_cleaners'], None)
    # Insert the blank id (len(symbols)) between the symbol ids.
    text_norm = commons.intersperse(text_norm, len(symbols))
else: # If not using "add_blank" option during training, adding spaces at the beginning and the end of utterance improves quality
    tst_stn = " " + tst_stn.strip() + " "
    # Pass the padded text as-is: stripping here would remove the spaces just added.
    text_norm = text_to_sequence(tst_stn, ['english_cleaners'], None)
sequence = np.array(text_norm)[None, :]
#print("".join([symbols[c] if c < len(symbols) else "<BNK>" for c in sequence[0]]))
# torch.autograd.Variable is a deprecated no-op wrapper; use the tensor directly.
x_tst = torch.from_numpy(sequence).cuda().long()
x_tst_lengths = torch.tensor([x_tst.shape[1]]).cuda()

with torch.no_grad():
    noise_scale = .667
    length_scale = 1.0
    # Only the first element of the first and last returned groups is kept.
    (y_gen_tst, *_), *_, (attn_gen, *_) = model(x_tst, x_tst_lengths, g=spk_emb_src, emo=emo_emb, l=lid, gen=True, noise_scale=noise_scale, length_scale=length_scale)

# Show which entry was sampled: its language-id field, its transcription and
# the name of the directory that contains the wav file.
print(random_test[1])
print(transcription)
print(wav_path.split("/")[-2])

0
All smile were real and the happier，the more sincere .
Happy


In [35]:
# Vocode the synthesized mel-spectrogram with HiFi-GAN, save the waveform and
# play it inline.
with torch.no_grad():
    # y_gen_tst is already a tensor: cast/move it directly instead of
    # round-tripping through the CPU and NumPy.
    x = y_gen_tst.detach().to(device=device, dtype=torch.float32)
    y_g_hat = generator(x)
    audio = y_g_hat.squeeze() * MAX_WAV_VALUE
    audio = audio.cpu().numpy()

# Clip to the int16 range before casting; without this, samples that land
# outside the range after scaling wrap around and produce loud clicks.
int16_info = np.iinfo(np.int16)
audio = np.clip(audio, int16_info.min, int16_info.max).astype('int16')

# Make sure the output directory exists so the write cannot fail on a fresh checkout.
os.makedirs("sample_sound", exist_ok=True)
sf.write("sample_sound/generated.wav", audio, hps.data.sampling_rate)
ipd.Audio(audio, rate=hps.data.sampling_rate)

In [36]:
# Load the recording of the sampled utterance, store a copy next to the
# generated audio, and play it for comparison.
# NOTE(review): no `sr` is passed, so librosa's default target rate applies —
# confirm that it matches the dataset's sampling rate.
y, sr = librosa.load(wav_path)
sf.write(file="sample_sound/original.wav", data=y, samplerate=sr)
ipd.Audio(data=y, rate=sr)

In [33]:
# Use the current source speaker embedding as the target passed to
# model.voice_conversion; both names refer to the same tensor object.
# NOTE(review): in the saved notebook this cell has execution count 33, i.e.
# it ran before the cell (count 34) that last assigned spk_emb_src — confirm
# which sample the target is meant to come from when running top to bottom.
spk_emb_tgt = spk_emb_src

In [37]:
from utils import load_wav_to_torch, load_filepaths_and_text

# Compute the mel-spectrogram of the sampled recording and package it, with
# its length, as a batch of one on the GPU for voice conversion.
audio, sampling_rate = load_wav_to_torch(wav_path)
tacotronstft = commons.TacotronSTFT(
            hps.data.filter_length, hps.data.hop_length, hps.data.win_length,
            hps.data.n_mel_channels, hps.data.sampling_rate, hps.data.mel_fmin,
            hps.data.mel_fmax)

# Refuse audio whose rate differs from the rate the STFT was configured with.
# The message has exactly one placeholder per argument and reports the
# configured target rate rather than a hard-coded number.
if sampling_rate != tacotronstft.sampling_rate:
    raise ValueError("{} SR doesn't match target {} SR".format(
        sampling_rate, tacotronstft.sampling_rate))
# Scale by the configured maximum wav value and add a leading batch dimension.
audio_norm = audio / hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
melspec = tacotronstft.mel_spectrogram(audio_norm)
melspec = torch.squeeze(melspec, 0)

# Batch of one: (1, n_mel_channels, n_frames).
mel_padded = torch.FloatTensor(1, hps.data.n_mel_channels, melspec.size(1))
mel_padded[0] = melspec

# Number of frames of the single item in the batch.
y_lengths = torch.LongTensor([melspec.size(1)])

mel_padded = mel_padded.to("cuda")
y_lengths = y_lengths.to("cuda")

In [38]:
# Convert the reference mel-spectrogram from the source to the target speaker
# embedding, vocode the result with HiFi-GAN, then save and play it.
with torch.no_grad():
    y_gen_vc = model.voice_conversion(mel_padded, y_lengths, spk_emb_src, spk_emb_tgt, l=lid)
    # y_gen_vc is already a tensor: cast/move it directly instead of
    # round-tripping through the CPU and NumPy.
    x = y_gen_vc.detach().to(device=device, dtype=torch.float32)
    y_g_hat = generator(x)
    audio = y_g_hat.squeeze() * MAX_WAV_VALUE
    audio = audio.cpu().numpy()

# Clip to the int16 range before casting; without this, samples that land
# outside the range after scaling wrap around and produce loud clicks.
int16_info = np.iinfo(np.int16)
audio = np.clip(audio, int16_info.min, int16_info.max).astype('int16')

# Make sure the output directory exists so the write cannot fail on a fresh checkout.
os.makedirs("sample_sound", exist_ok=True)
sf.write("sample_sound/generated_vc.wav", audio, hps.data.sampling_rate)
ipd.Audio(audio, rate=hps.data.sampling_rate)