## Tacotron 2 inference code
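Edit the setup variables (**acoustic_name**, **vocoder_name**, **acoustic_checkpoint**, **vocoder_checkpoint**, **filelist_idx**), the hard-coded model and output paths, and the sentence list **sents** to match yours, then run the entire notebook to generate attention-alignment plots and audio synthesized from the generated mel-spectrograms using WaveGlow.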

#### Import libraries and setup matplotlib

In [10]:
from pathlib import Path
from typing import List
%matplotlib inline
import sys
from test_sentences import stressed_sentences

from scipy.io.wavfile import write
import re
import matplotlib.pylab as plt

sys.path.append('nvidia_waveglow/')
import numpy as np
import torch

from hparams import create_hparams
from train import load_model
from libs.text import text_to_sequence
from nvidia_waveglow.denoiser import Denoiser

def plot_data(attn, figsize=(9, 6), imshow=False, out_path=None):
    """Draw a 2-D array as a heat map with a colorbar.

    The figure is saved when ``out_path`` is given and is closed afterwards
    unless ``imshow`` is true.

    Args:
        attn: 2-D array-like handed straight to ``imshow``. The x axis is
            labelled as decoder timesteps and the y axis as encoder timesteps.
        figsize: Figure size forwarded to ``plt.subplots``.
        imshow: When true, leave the figure open instead of closing it.
        out_path: Optional destination passed to ``savefig``.
    """
    figure, axes = plt.subplots(figsize=figsize)
    heatmap = axes.imshow(attn, aspect='auto', origin='lower', interpolation='none')
    figure.colorbar(heatmap, ax=axes)
    axes.set_xlabel('Decoder timestep')
    axes.set_ylabel('Encoder timestep')

    if out_path:
        figure.savefig(out_path)

    if imshow:
        return
    plt.close(figure)

def prepare_output_filepath(output_dir, sentence):
    """Build the ``.wav`` output path for a sentence.

    The sentence is lower-cased, every character that is not a Latin letter
    (``a``-``z`` or an accented/extended letter up to ``ž``) or whitespace is
    dropped, and the remaining words are joined with underscores.

    Args:
        output_dir: Directory part of the result; interpolated via ``str()``.
        sentence: Text the file name is derived from.

    Returns:
        The string ``"<output_dir>/<file name>.wav"``.
    """
    # The earlier class ``[^a-ž ]`` was a single code-point range, so it also
    # kept everything located between ``z`` and the accented letters:
    # ``{ | } ~``, C1 control characters, Latin-1 symbols and the ``×``/``÷``
    # signs.  Only letters are allowed through now.  U+0085 and U+00A0 are
    # still kept because ``str.split()`` treats them as word separators, which
    # preserves the previous word boundaries.  Combining accents (U+0300 and
    # above) fall outside every range here, so they are removed in this same
    # pass and no separate accent-stripping step is needed.
    filename = re.sub(
        r'[^a-z\u0085\u00a0\u00df-\u00f6\u00f8-\u017e ]', '', sentence.lower()
    )
    filename = '_'.join(filename.split())
    return f"{output_dir}/{filename}.wav"

def synthesize_all(model, checkpoint_name, sentences: List[str], imshow=False):
    """Synthesize each sentence to a WAV file and save its alignment plot.

    For every sentence the acoustic ``model`` produces a mel-spectrogram and
    alignments, the alignment matrix is plotted to a ``.png`` next to the
    audio file, and the vocoder output is denoised and written as ``.wav``.

    Besides its arguments this function reads the module-level names
    ``output_dir``, ``waveglow``, ``denoiser`` and ``hparams``; they must be
    defined before it is called.

    Args:
        model: Acoustic model exposing ``inference(sequence)``.
        checkpoint_name: Currently unused; kept so existing calls keep working.
        sentences: Texts to synthesize.
        imshow: Forwarded to ``plot_data``; when true the figures stay open.
    """
    for sentence in sentences:
        filepath = prepare_output_filepath(output_dir, sentence)
        # Swap only the trailing extension.  ``str.replace`` would also rewrite
        # a ".wav" that happens to occur earlier in the directory path.
        plot_path = filepath[:-len('.wav')] + '.png'

        sequence = np.array(text_to_sequence(sentence, ['basic_cleaners']))[None, :]
        sequence = torch.from_numpy(sequence).cuda().long()

        # Inference only: without no_grad the forward pass would record an
        # autograd graph that is never used and only costs GPU memory.
        with torch.no_grad():
            _, mel_outputs_postnet, _, alignments = model.inference(sequence)
        plot_data(alignments.float().cpu().numpy()[0].T, out_path=plot_path, imshow=imshow)

        with torch.no_grad():
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
            audio_denoised = denoiser(audio, strength=0.01)[:, 0]
        audio_numpy = audio_denoised.cpu().numpy()

        print(f"Writing {filepath}")
        write(filepath, hparams.sampling_rate, audio_numpy[0])

def _load_tacotron_checkpoint(checkpoint_path):
    """Build a model from the global ``hparams`` and load weights from ``checkpoint_path``."""
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    # Move to the GPU, switch to eval mode and cast to half precision.
    model.cuda().eval().half()
    return model


def synthesize_sentences_using_a_list_of_tacotron_checkpoints(tacotron_dir, checkpoints: List[str], sentence_list):
    """Run ``synthesize_all`` once for every listed Tacotron checkpoint.

    Args:
        tacotron_dir: Directory that contains the checkpoint files.
        checkpoints: Checkpoint file names inside ``tacotron_dir``.
        sentence_list: Sentences handed to ``synthesize_all``.

    NOTE(review): ``synthesize_all`` derives its output paths from the global
    ``output_dir`` and the sentence only, so each checkpoint appears to
    overwrite the files written for the previous one — confirm this is
    intended.
    """
    for checkpoint_name in checkpoints:
        model = _load_tacotron_checkpoint(f"{tacotron_dir}/{checkpoint_name}")

        synthesize_all(model, checkpoint_name, sentence_list)

def read_sentences_from_filelist(path: Path, stressed: bool) -> List[str]:
    """Read the text column of a ``|``-separated filelist.

    Each non-blank line is split on ``|`` and its second field is taken as
    the sentence.  Blank lines are skipped, so an empty file yields an empty
    list.

    Args:
        path: UTF-8 encoded filelist to read.
        stressed: When true the sentences are returned as stored; otherwise
            they are passed through ``remove_accents`` first.

    Returns:
        The sentences in file order.

    Raises:
        IndexError: If a non-blank line contains no ``|`` separator.
    """
    with open(path, mode='r', encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    sentences = []
    for line in lines:
        # An empty file produces [''] above, and filelists may contain blank
        # separator lines; neither has a text field to extract.
        if not line.strip():
            continue
        text = line.split('|')[1]
        sentences.append(text if stressed else remove_accents(text))
    return sentences

def remove_accents(text):
    """Return ``text`` without combining grave, acute and tilde marks.

    Only the combining code points U+0300, U+0301 and U+0303 are removed;
    precomposed accented letters are left untouched.
    """
    accent_marks = re.compile('[\u0300\u0301\u0303]')
    return accent_marks.sub('', text)


In [11]:
# Output sub-directory names, two per speaker, in the order that the
# ``filelist_idx`` value of the speaker setup indexes into.
# NOTE(review): the "_ns" / "_s" suffixes presumably stand for non-stressed /
# stressed text — confirm.
filelist_names = [
    f"taco_{speaker}_{variant}"
    for speaker in ("aurimas", "giedrius", "vytautas")
    for variant in ("ns", "s")
]

In [12]:
# Per-speaker presets, each a tuple of
# (acoustic_name, vocoder_name, acoustic_checkpoint, vocoder_checkpoint, filelist_idx).
# Keeping them in one table replaces the comment/uncomment toggling of
# alternative assignment lines: switch speakers by changing ``speaker`` below.
speaker_setups = {
    'aurimas': ('aurimas-2', 'aurimas-2', "checkpoint_30000", 52000, 1),
    'giedrius': ('giedrius_altoriu_sesely_3', 'giedrius_altoriu_sesely', "checkpoint_17500", 50000, 3),
    'vytautas': ('vytautas_stressed', 'vytautas', "checkpoint_24000", 216000, 5),
}

# Key of the preset used by the cells that follow.
speaker = 'aurimas'
acoustic_name, vocoder_name, acoustic_checkpoint, vocoder_checkpoint, filelist_idx = speaker_setups[speaker]

In [13]:
# Tacotron checkpoints live in one sub-directory per acoustic model.
taco_base_dir = Path("/media/arnas/SSD Disk/inovoice/models/text-to-speech/tacotron")
tacotron_dir = taco_base_dir.joinpath(acoustic_name)

# Destination of the synthesized files, selected through ``filelist_idx``;
# created up front (including parents) so later writes do not fail.
output_dir = Path(f"/media/arnas/SSD Disk/uni/semester_4/masters-thesis/mos/filelists/{filelist_names[filelist_idx]}")
output_dir.mkdir(exist_ok=True, parents=True)

#### Setup hparams

In [14]:
# Hyperparameter object from the project's ``hparams`` module; it is passed
# to ``load_model`` when the acoustic model is built.
hparams = create_hparams()
# Override the sampling rate; ``synthesize_all`` passes this value to the WAV
# writer.
hparams.sampling_rate = 22050

#### Load WaveGlow for mel2audio synthesis and denoiser

In [None]:
# Vocoder checkpoint chosen by the speaker setup (name and iteration number).
waveglow_path = f"/media/arnas/SSD Disk/inovoice/models/text-to-speech/waveglow/{vocoder_name}/waveglow_{vocoder_checkpoint}"

# The checkpoint stores the whole model object under the 'model' key.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which rejects pickled model objects like this one —
# confirm against the installed version.
waveglow = torch.load(waveglow_path)['model']
# GPU, eval mode, half precision.  This must run before the loop below,
# which casts the ``convinv`` layers back to float.
waveglow.cuda().eval().half()
for k in waveglow.convinv:
    # NOTE(review): presumably these layers need full precision — confirm.
    k.float()
# The denoiser is built from the prepared vocoder and is used by
# ``synthesize_all`` on the generated audio.
denoiser = Denoiser(waveglow)

In [16]:
# Acoustic (Tacotron) checkpoint chosen by the speaker setup.
acoustic_checkpoint_path = f"{tacotron_dir}/{acoustic_checkpoint}"
# Build the model from hparams, then load the weights stored under the
# checkpoint's 'state_dict' key.
model = load_model(hparams)
model.load_state_dict(torch.load(acoustic_checkpoint_path)['state_dict'])
# GPU, eval mode, half precision.  Assigning to ``_`` keeps the notebook from
# echoing the returned value as the cell output.
_ = model.cuda().eval().half()

#### Synthesize

In [17]:
# Accent-marked sentences to synthesize; passed to ``synthesize_all`` in the
# final cell.  Each sentence also determines its own output file name.
sents = [
    "- Jū̃s klaũsote sintezúoto tèksto.",
    "Vìlniaus universitèto Matemãtikos ir̃ informãtikos fakultètas.",
    "Vãsara paskutìnių kùrsų studeñtams kasmẽt prasìdeda įtemptaĩ – láukia baigiamų̃jų darbų̃ gynìmai. Vìlniaus universitèto Matemãtikos ir̃ informãtikos fakultetè šiaĩs mẽtais jiẽ prasìdeda gegužė̃s trisdešimt pirmąją ir̃ baĩgsis biržẽlio dešimtą diẽną.",
]

In [None]:
# Synthesize every sentence in ``sents`` with the loaded acoustic model; the
# WAV files and alignment plots are written under ``output_dir``.
synthesize_all(model, acoustic_checkpoint, sents)
