In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import sys
sys.path.append('./waveglow/')

import librosa
import numpy as np
import os
import glob
import json
import random

import torch
from text import text_to_sequence, cmudict
from text.symbols import symbols
import commons
import attentions
import modules
import models
import utils

# Load the pretrained WaveGlow vocoder, used later via `waveglow.infer(...)` to
# turn generated mel-spectrograms into audio.
# NOTE(review): the path below is absolute and machine-specific; edit it before
# running elsewhere. `torch.load` unpickles the file, so only load checkpoints
# from a trusted source.
waveglow_path = '/run/media/viblab/Markov2/Pras/Thesis/TryMyOwn/tacotron2/waveglow/waveglow_256channels_universal_v4.pt' # or change to the latest version of the pretrained WaveGlow.
# The checkpoint is indexed like a dict; the model object lives under 'model'.
waveglow = torch.load(waveglow_path)['model']
# Rebind `waveglow` to whatever the model's own `remove_weightnorm` returns.
waveglow = waveglow.remove_weightnorm(waveglow)
# Move to the GPU and switch to eval mode; `_ =` only discards the return value.
_ = waveglow.cuda().eval()
from apex import amp
# Optional speed-up: opt_level "O3" casts the model to torch.float16 (see the
# apex log printed by this cell). The second return value is discarded.
waveglow, _ = amp.initialize(waveglow, [], opt_level="O3") # Try if you want to boost up synthesis speed.



Selected optimization level O3:  Pure FP16 training.
Defaults for this optimization level are:
enabled                : True
opt_level              : O3
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : False
master_weights         : False
loss_scale             : 1.0
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O3
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : False
master_weights         : False
loss_scale             : 1.0




In [2]:
# --- Acoustic model setup ---
# Option A: your own trained model. Hyper-parameters and the latest checkpoint
# are both looked up from the training log directory.
model_dir = "logs/base_blank_emo"
hps = utils.get_hparams_from_dir(model_dir)
checkpoint_path = utils.latest_checkpoint_path(model_dir)

# Option B: a provided pretrained model
# hps = utils.get_hparams_from_file("./configs/any_config_file.json")
# checkpoint_path = "/path/to/pretrained_model"

# The vocabulary grows by one id when the config enables "add_blank"
# (a missing flag counts as False, i.e. +0).
uses_blank_token = getattr(hps.data, "add_blank", False)
n_vocab = len(symbols) + uses_blank_token

model = models.FlowGenerator(
    n_vocab,
    out_channels=hps.data.n_mel_channels,
    n_speakers=hps.data.n_speakers,
    **hps.model)
model = model.to("cuda")

utils.load_checkpoint(checkpoint_path, model)
model.decoder.store_inverse()  # do not calculate jacobians for fast decoding
_ = model.eval()

# cmu_dict = cmudict.CMUDict(hps.data.cmudict_path)

# normalizing & type casting
def normalize_audio(x, max_wav_value=hps.data.max_wav_value):
    """Peak-normalize a waveform and convert it to 16-bit PCM samples.

    Args:
        x: Array-like of audio samples.
        max_wav_value: Amplitude the loudest sample is scaled to before the
            result is clipped to the int16 range. Defaults to
            ``hps.data.max_wav_value`` (captured when this cell is run).

    Returns:
        ``np.ndarray`` of dtype int16 with the same shape as ``x``. Empty or
        all-zero (silent) input yields zeros rather than dividing by a zero
        peak.
    """
    x = np.asarray(x)
    peak = np.abs(x).max() if x.size else 0
    if peak == 0:
        # Nothing to scale: avoids 0/0 -> NaN -> undefined values after the
        # int16 cast (and the ValueError `.max()` raises on empty input).
        return np.zeros(x.shape, dtype="int16")
    return np.clip((x / peak) * max_wav_value, -32768, 32767).astype("int16")

logs/base_blank_emo/G_224.pth
Use Speaker Embed Linear Norm


In [3]:
# Draw one random utterance from the test filelist and load the speaker and
# emotion embeddings stored under the same file stem.
root_database = "/run/media/viblab/Markov2/Pras/Thesis/Database/ESD"

with open("filelists/esd_audio_sid_text_test_filelist.txt", "r") as txt_file:
    lines = list(txt_file)

# Rows are "|"-separated: field 0 is the wav path, field 2 the transcription.
random_test = random.choice(lines).rstrip().split("|")
wav_path = random_test[0]
transcription = random_test[2]

# Embedding files are keyed by the wav file name up to its first ".".
embeds_filename = wav_path.split("/")[-1].split(".")[0]
spk_embed_file = f"{root_database}/spk_embeds/{embeds_filename}.npy"
emo_embed_file = f"{root_database}/emo_embeds/{embeds_filename}.npy"

# Each embedding is reshaped to a single row (1, -1) and moved to the GPU.
spk_emb_src = torch.Tensor(np.load(spk_embed_file)).reshape(1, -1).to("cuda")
emo_emb = torch.Tensor(np.load(emo_embed_file)).reshape(1, -1).to("cuda")

In [4]:
# Emotion embedding of whichever utterance `embeds_filename` currently names,
# reshaped to one row on the GPU.
# NOTE(review): this reads the same path expression as `emo_emb`; presumably it
# only holds a "question" embedding if the sampling cell was re-run until such
# an utterance was drawn -- confirm, since a fresh Run All will not reproduce it.
emo_question = (
    torch.Tensor(np.load(f"{root_database}/emo_embeds/{embeds_filename}.npy"))
    .reshape(1, -1)
    .to("cuda")
)

In [5]:
# Emotion embedding of whichever utterance `embeds_filename` currently names,
# reshaped to one row on the GPU.
# NOTE(review): this reads the same path expression as `emo_emb`; presumably it
# only holds an "angry" embedding if the sampling cell was re-run until such an
# utterance was drawn -- confirm, since a fresh Run All will not reproduce it.
emo_angry = (
    torch.Tensor(np.load(f"{root_database}/emo_embeds/{embeds_filename}.npy"))
    .reshape(1, -1)
    .to("cuda")
)

In [6]:
# Emotion embedding of whichever utterance `embeds_filename` currently names,
# reshaped to one row on the GPU.
# NOTE(review): this reads the same path expression as `emo_emb`; presumably it
# only holds a "sad" embedding if the sampling cell was re-run until such an
# utterance was drawn -- confirm, since a fresh Run All will not reproduce it.
emo_sad = (
    torch.Tensor(np.load(f"{root_database}/emo_embeds/{embeds_filename}.npy"))
    .reshape(1, -1)
    .to("cuda")
)

In [7]:
# Target speaker embedding, read from a local .npy file, reshaped to one row
# and moved to the GPU. It is passed to `model.voice_conversion` as the target.
spk_emb_tgt = (
    torch.Tensor(np.load("./untitled.npy"))
    .reshape(1, -1)
    .to("cuda")
)

In [8]:
# Text to synthesize: defaults to the transcription of the sampled utterance.
#tst_stn = "It is a weakness, and not the only one."
tst_stn = transcription

if getattr(hps.data, "add_blank", False):
    text_norm = text_to_sequence(tst_stn.strip(), ['english_cleaners'])
    # Insert the extra id `len(symbols)` into the sequence; it is printed as
    # <BNK> by the debug line further down.
    text_norm = commons.intersperse(text_norm, len(symbols))
else: # If not using "add_blank" option during training, adding spaces at the beginning and the end of utterance improves quality
    tst_stn = " " + tst_stn.strip() + " "
    # Pass the padded string as-is: stripping it again here (as the code used
    # to) would remove the spaces that were just added.
    text_norm = text_to_sequence(tst_stn, ['english_cleaners'])

# Add a leading batch axis of size 1.
sequence = np.array(text_norm)[None, :]
print("".join([symbols[c] if c < len(symbols) else "<BNK>" for c in sequence[0]]))

# `torch.autograd.Variable` is a deprecated no-op wrapper; use the tensor directly.
x_tst = torch.from_numpy(sequence).cuda().long()
x_tst_lengths = torch.tensor([x_tst.shape[1]]).cuda()

<BNK>t<BNK>h<BNK>a<BNK>t<BNK> <BNK>w<BNK>a<BNK>s<BNK> <BNK>h<BNK>i<BNK>s<BNK> <BNK>c<BNK>h<BNK>i<BNK>e<BNK>f<BNK> <BNK>t<BNK>h<BNK>o<BNK>u<BNK>g<BNK>h<BNK>t<BNK>.<BNK>


In [13]:
# Text-to-speech: generate a mel-spectrogram from the encoded text using the
# source speaker and emotion embeddings, then vocode it with WaveGlow.
with torch.no_grad():
    noise_scale = .667
    length_scale = 1.0
    # NOTE(review): `sid` is not passed to the model call below (the speaker is
    # given through `g=spk_emb_src`); kept only in case other cells read it.
    sid = torch.LongTensor([108]).cuda()
    (y_gen_tst, *_), *_, (attn_gen, *_) = model(x_tst, x_tst_lengths, g=spk_emb_src, emo=emo_emb, gen=True, noise_scale=noise_scale, length_scale=length_scale)
    # Feed the vocoder in the dtype its weights actually use (FP16 after the
    # apex O3 cast, FP32 otherwise) instead of probing with a bare `except:`,
    # which also hid unrelated failures such as OOM or KeyboardInterrupt.
    vocoder_dtype = next(waveglow.parameters()).dtype
    audio = waveglow.infer(y_gen_tst.to(vocoder_dtype), sigma=.666)
# Last expression of the cell: renders an audio player for the first item.
ipd.Audio(normalize_audio(audio[0].clamp(-1,1).detach().cpu().float().numpy()), rate=hps.data.sampling_rate)

In [10]:
# Play back the original recording of the sampled utterance for comparison.
# librosa.load is called with its default arguments; `sr` is the rate it reports.
y, sr = librosa.load(wav_path)
reference_pcm = normalize_audio(y)
ipd.Audio(reference_pcm, rate=sr)

In [14]:
from utils import load_wav_to_torch, load_filepaths_and_text

# Compute the mel-spectrogram of the sampled recording so it can be fed to
# `model.voice_conversion` as a batch of one.
audio, sampling_rate = load_wav_to_torch(wav_path)
tacotronstft = commons.TacotronSTFT(
            hps.data.filter_length, hps.data.hop_length, hps.data.win_length,
            hps.data.n_mel_channels, hps.data.sampling_rate, hps.data.mel_fmin,
            hps.data.mel_fmax)

if sampling_rate != tacotronstft.sampling_rate:
    # The message has three fields: file, its sampling rate, the expected rate.
    # Supplying only two arguments made this raise IndexError instead of the
    # intended ValueError, and the expected rate was hard-coded.
    raise ValueError("{} {} SR doesn't match target {} SR".format(
        wav_path, sampling_rate, tacotronstft.sampling_rate))

# Scale samples by the configured maximum wav value and add a batch axis.
audio_norm = audio / hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
melspec = tacotronstft.mel_spectrogram(audio_norm)
melspec = torch.squeeze(melspec, 0)

# Batch of one: re-add the batch axis directly instead of copying into an
# uninitialised legacy `torch.FloatTensor(sizes)` buffer.
# Assumes melspec is (n_mel_channels, frames) at this point -- TODO confirm.
mel_padded = melspec.unsqueeze(0).float().to("cuda")

# Length taken from axis 1 of `melspec`, as a 1-element int64 tensor.
y_lengths = torch.LongTensor([melspec.size(1)]).to("cuda")

In [15]:
# Voice conversion: map the reference mel-spectrogram from the source speaker
# embedding to the target speaker embedding, then vocode it with WaveGlow.
with torch.no_grad():
    y_gen_vc = model.voice_conversion(mel_padded, y_lengths, spk_emb_src, spk_emb_tgt)
    # Feed the vocoder in the dtype its weights actually use (FP16 after the
    # apex O3 cast, FP32 otherwise) instead of probing with a bare `except:`,
    # which also hid unrelated failures such as OOM or KeyboardInterrupt.
    vocoder_dtype = next(waveglow.parameters()).dtype
    audio = waveglow.infer(y_gen_vc.to(vocoder_dtype), sigma=.666)
# Last expression of the cell: renders an audio player for the first item.
ipd.Audio(normalize_audio(audio[0].clamp(-1,1).detach().cpu().float().numpy()), rate=hps.data.sampling_rate)