# Tacotron2 + WaveNet Demo

In [3]:
from hparams import hparams

In [1]:
import librosa.display
import IPython
from IPython.display import Audio
import numpy as np
import torch
import os
from os.path import exists, join, expanduser

In [2]:
# Directory names of the two project checkouts and the pretrained WaveNet
# preset/checkpoint files (the latter two are opened relative to wavenet_dir).
wavenet_dir = "wavenet_vocoder"
taco2_dir = "Tacotron-2"
wn_preset = "20180510_mixture_lj_checkpoint_step000320000_ema.json"
wn_checkpoint_path = "20180510_mixture_lj_checkpoint_step000320000_ema.pth"

# Work from inside the WaveNet checkout: the preset/checkpoint paths above and
# the project-local imports below (hparams, train, synthesis) are resolved
# against the current working directory.
cwd = os.getcwd()
if os.path.basename(cwd) == wavenet_dir:
    # The cell was already run in this kernel and we are inside wavenet_dir;
    # chdir-ing again would descend into a nested "wavenet_vocoder" directory.
    cwd = os.path.dirname(cwd)
else:
    os.chdir(join(cwd, wavenet_dir))

# Setup WaveNet vocoder hparams
from hparams import hparams
with open(wn_preset) as f:
    hparams.parse_json(f.read())

# Setup WaveNet vocoder
from train import build_model
from synthesis import wavegen
import torch

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = build_model().to(device)

print("Load checkpoint from {}".format(wn_checkpoint_path))
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint that was saved on a GPU can still be loaded on a CPU-only machine.
checkpoint = torch.load(wn_checkpoint_path, map_location=device)
model.load_state_dict(checkpoint["state_dict"])

ModuleNotFoundError: No module named 'nnmnkwii'

In [3]:
from glob import glob
from tqdm import tqdm


# NOTE(review): absolute, machine-specific path; point this at the local
# Tacotron-2 evaluation output before running on another machine.
map_path = "/home/bongsang/projects/fast-forecast/demo/tacotron-2/tacotron_output/eval/map.txt"

# Every line of the map file is split into "|"-separated fields.
with open(map_path) as f:
    # Strip only the line terminator. Slicing off the last character
    # unconditionally would truncate the final entry when the file does not
    # end with a newline.
    maps = [line.rstrip("\n").split("|") for line in f]
# filter out invalid ones
maps = [entry for entry in maps if len(entry) == 2]

print("List of texts to be synthesized")
for idx, (text, _) in enumerate(maps):
    print(idx, text)

List of texts to be synthesized
0 My name is roboking.
1 This is really awesome!
2 Thank you, Jumin So Jang Nim.


In [4]:
import time

# Generated waveforms, in the same order as the entries of `maps`.
waveforms = []

for idx, (text, mel) in enumerate(maps):
    print("\n", idx, text)
    mel_path = join("../tacotron-2", mel)
    c = np.load(mel_path)
    if c.shape[1] != hparams.num_mels:
        # np.swapaxes returns a new view instead of modifying its argument,
        # so the result has to be bound back to `c` for the swap to apply.
        c = np.swapaxes(c, 0, 1)
    # Range [0, 4] was used for training Tacotron2 but WaveNet vocoder assumes [0, 1]
    c = np.interp(c, (0, 4), (0, 1))

    # Generate (and time the generation of) the waveform for this utterance
    tic = time.time()
    waveform = wavegen(model, c=c, fast=True, tqdm=tqdm)
    waveforms.append(waveform)
    toc = time.time()

    print(f'TTS {idx}. WaveNet Generated Time = {round(toc-tic, 2)} seconds')

    # Audio
    IPython.display.display(Audio(waveform, rate=hparams.sample_rate))

  0%|          | 0/27904 [00:00<?, ?it/s]


 0 My name is roboking.


100%|██████████| 27904/27904 [02:35<00:00, 179.99it/s]


TTS 0. WaveNet Generated Time = 155.1 seconds


  0%|          | 1/26112 [00:00<1:21:34,  5.33it/s]


 1 This is really awesome!


100%|██████████| 26112/26112 [02:25<00:00, 179.30it/s]


TTS 1. WaveNet Generated Time = 145.69 seconds


  0%|          | 0/32000 [00:00<?, ?it/s]


 2 Thank you, Jumin So Jang Nim.


100%|██████████| 32000/32000 [02:59<00:00, 177.82it/s]


TTS 2. WaveNet Generated Time = 180.03 seconds
