In [None]:
# Copyright 2020 NVIDIA. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
"""
You can run this notebook either locally (if you have all the dependencies and a GPU) or on Google Colab.
Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""
# If you're using Google Colab and not running locally, run this cell.
!pip install wget
!pip install nemo_toolkit[tts]

!mkdir configs
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/master/examples/tts/configs/tacotron2.yaml
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/master/examples/tts/configs/waveglow.yaml

In [None]:
import argparse
import math
import os
import copy
import shutil
import librosa
import matplotlib.pyplot as plt
from functools import partial
from scipy.io.wavfile import write
import numpy as np
import IPython.display as ipd

from ruamel.yaml import YAML

import torch
import nemo
import nemo.collections.asr as nemo_asr
import nemo.collections.tts as nemo_tts
import nemo.utils.argparse as nm_argparse

logging = nemo.logging

In [None]:
# Load the Tacotron 2 and WaveGlow config files.
# A local checkout keeps them in ../configs, whereas the Colab setup cell
# downloads them into ./configs, so fall back to the latter when needed.
config_dir = '../configs'
if not os.path.exists(os.path.join(config_dir, 'tacotron2.yaml')):
    config_dir = 'configs'

config_path = os.path.join(config_dir, 'tacotron2.yaml')
waveglow_config_path = os.path.join(config_dir, 'waveglow.yaml')

yaml = YAML(typ="safe")
with open(config_path) as config_file:
    tacotron2_config = yaml.load(config_file)

with open(waveglow_config_path) as config_file:
    waveglow_config = yaml.load(config_file)

# Symbol list from the Tacotron 2 config; later cells use its length to size
# the text embedding and to derive the bos/eos/pad ids.
labels = tacotron2_config["labels"]

# Download pre-trained checkpoints

Note: The checkpoint for WaveGlow is very large (>1GB), so please ensure you have sufficient storage space.

In [None]:
base_checkpoint_path = './checkpoints/'
WAVEGLOW = os.path.join(base_checkpoint_path, 'WaveGlowNM.pt')
TACOTRON_ENCODER = os.path.join(base_checkpoint_path, 'Tacotron2Encoder.pt')
TACOTRON_DECODER = os.path.join(base_checkpoint_path, 'Tacotron2Decoder.pt')
TACOTRON_POSTNET = os.path.join(base_checkpoint_path, 'Tacotron2Postnet.pt')
TEXT_EMBEDDING = os.path.join(base_checkpoint_path, 'TextEmbedding.pt')

if not os.path.exists(base_checkpoint_path):
    os.makedirs(base_checkpoint_path)
    
if not os.path.exists(WAVEGLOW):
    !wget wget https://api.ngc.nvidia.com/v2/models/nvidia/waveglow_ljspeech/versions/2/files/WaveGlowNM.pt -P {base_checkpoint_path};

if not os.path.exists(TACOTRON_ENCODER):
    !wget https://api.ngc.nvidia.com/v2/models/nvidia/tacotron2_ljspeech/versions/2/files/Tacotron2Encoder.pt -P {base_checkpoint_path};
        
if not os.path.exists(TACOTRON_DECODER):
    !wget https://api.ngc.nvidia.com/v2/models/nvidia/tacotron2_ljspeech/versions/2/files/Tacotron2Decoder.pt -P {base_checkpoint_path};

if not os.path.exists(TACOTRON_POSTNET):
    !wget https://api.ngc.nvidia.com/v2/models/nvidia/tacotron2_ljspeech/versions/2/files/Tacotron2Postnet.pt -P {base_checkpoint_path};

if not os.path.exists(TEXT_EMBEDDING):
    !wget https://api.ngc.nvidia.com/v2/models/nvidia/tacotron2_ljspeech/versions/2/files/TextEmbedding.pt -P {base_checkpoint_path};


In [None]:
# Prepare the Neural Factory: PyTorch backend with optimization level "O0".
neural_factory = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
    optimization_level="O0",
)

## Text Line Data Layer

Construct a simple data layer that loads a single line of text (accepted from the user) and passes it to the model.

In [None]:
from nemo.backends.pytorch import DataLayerNM
from nemo.core.neural_types import *
from nemo.utils.misc import pad_to
from nemo.collections.asr.parts.dataset import TranscriptDataset

In [None]:
class SentenceDataLayer(DataLayerNM):
    """A simple Neural Module for loading textual transcript data.

    The ``path``, ``labels``, ``bos_id`` and ``eos_id`` arguments are forwarded
    to ``TranscriptDataset``. They are kept on the instance so that
    ``update_dataset`` can rebuild the dataset after the file at ``path`` has
    been rewritten.

    NOTE(review): ``batch_size``, ``pad_id``, ``drop_last``, ``num_workers`` and
    ``shuffle`` are accepted but not used anywhere in this class; batching
    behaviour presumably comes from ``DataLayerNM`` defaults - confirm.

    Args:
        path (str): Path of the text file handed to ``TranscriptDataset``.
        labels (list): Label set handed to ``TranscriptDataset``.
        batch_size (int): Size of batches to generate in data loader.
            Currently unused by this class.
        bos_id (int): Beginning-of-sequence id handed to ``TranscriptDataset``.
            Defaults to None.
        eos_id (int): End-of-sequence id handed to ``TranscriptDataset``.
            Defaults to None.
        pad_id (int): Label position of padding symbol.
            Currently unused by this class. Defaults to None.
        drop_last (bool): Whether we drop last (possibly) incomplete batch.
            Currently unused by this class. Defaults to False.
        num_workers (int): Number of processes to work on data loading (0 for
            just main process). Currently unused by this class.
            Defaults to 0.
        shuffle (bool): Currently unused by this class. Defaults to True.
    """

    @property
    def output_ports(self):
        """Returns definitions of module output ports.

        texts:
            NeuralType with axes ('B', 'T') and element type LabelsType

        texts_length:
            NeuralType with axis ('B',) and element type LengthsType
        """
        return {
            'texts': NeuralType(('B', 'T'), LabelsType()),
            'texts_length': NeuralType(tuple('B'), LengthsType()),
        }

    def __init__(
        self,
        path,
        labels,
        batch_size,
        bos_id=None,
        eos_id=None,
        pad_id=None,
        drop_last=False,
        num_workers=0,
        shuffle=True,
    ):
        super().__init__()

        # Keep the dataset arguments so update_dataset() can rebuild the
        # dataset later with exactly the same settings.
        self.dataset_params = {
            'path': path,
            'labels': labels,
            'bos_id': bos_id,
            'eos_id': eos_id,
        }

        self._dataset = TranscriptDataset(**self.dataset_params)

    def update_dataset(self):
        """Rebuild the dataset from the stored parameters (re-reads ``path``)."""
        self._dataset = TranscriptDataset(**self.dataset_params)
        logging.info('Dataset updated.')

    def __len__(self):
        """Number of entries in the current dataset."""
        return len(self._dataset)

    @property
    def dataset(self):
        """The current ``TranscriptDataset`` instance."""
        return self._dataset

    @property
    def data_iterator(self):
        """Always None: this layer exposes ``dataset`` instead of an iterator."""
        return None


# Create the Tacotron 2 + WaveGlow Neural Modules

In [None]:
def create_NMs(tacotron2_config, waveglow_config, labels, decoder_infer=False, waveglow_sigma=0.6):
    """Instantiate the Tacotron 2 and WaveGlow modules and restore their checkpoints.

    Args:
        tacotron2_config (dict): Parsed Tacotron 2 config. Constructor arguments
            for each module are read from ``<module name> -> init_params``.
        waveglow_config (dict): Parsed WaveGlow config; ``WaveGlowNM -> init_params``
            is used.
        labels (list): Symbol list; the text embedding is sized to
            ``len(labels) + 3``.
        decoder_infer (bool): Not consulted by this function - the inference
            decoder (``Tacotron2DecoderInfer``) is always built.
        waveglow_sigma (float): Value written into the WaveGlow ``sigma``
            constructor argument.

    Returns:
        tuple: ``(tacotron2_modules, waveglow)`` where ``tacotron2_modules`` is
        ``(data_preprocessor, text_embedding, t2_enc, t2_dec, t2_postnet,
        t2_loss, makegatetarget)``.
    """

    def init_params_of(config, module_name):
        # Constructor arguments live under "<module name>" -> "init_params".
        return config[module_name]["init_params"]

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **init_params_of(tacotron2_config, "AudioToMelSpectrogramPreprocessor")
    )

    # Text embedding: work on a copy so the caller's config is left untouched.
    embedding_kwargs = copy.deepcopy(init_params_of(tacotron2_config, "TextEmbedding"))
    embedding_kwargs['n_symbols'] = len(labels) + 3
    text_embedding = nemo_tts.TextEmbedding(**embedding_kwargs)
    text_embedding.restore_from(TEXT_EMBEDDING)

    # Encoder
    t2_enc = nemo_tts.Tacotron2Encoder(**init_params_of(tacotron2_config, "Tacotron2Encoder"))
    t2_enc.restore_from(TACOTRON_ENCODER)

    # Decoder (inference variant)
    decoder_kwargs = copy.deepcopy(init_params_of(tacotron2_config, "Tacotron2Decoder"))
    t2_dec = nemo_tts.Tacotron2DecoderInfer(**decoder_kwargs)
    t2_dec.restore_from(TACOTRON_DECODER)

    # Postnet
    t2_postnet = nemo_tts.Tacotron2Postnet(**init_params_of(tacotron2_config, "Tacotron2Postnet"))
    t2_postnet.restore_from(TACOTRON_POSTNET)

    # Built without restoring a checkpoint; returned as part of the tuple.
    t2_loss = nemo_tts.Tacotron2Loss(**init_params_of(tacotron2_config, "Tacotron2Loss"))
    makegatetarget = nemo_tts.MakeGate()

    tacotron2_weights = sum(
        module.num_weights for module in (text_embedding, t2_enc, t2_dec, t2_postnet)
    )

    logging.info('================================')
    logging.info(f"Total number of parameters (Tacotron 2): {tacotron2_weights}")
    logging.info('================================')

    # WaveGlow vocoder: override sigma on a copy of the configured arguments.
    waveglow_kwargs = copy.deepcopy(init_params_of(waveglow_config, "WaveGlowNM"))
    waveglow_kwargs['sigma'] = waveglow_sigma
    waveglow = nemo_tts.WaveGlowInferNM(**waveglow_kwargs)
    waveglow.restore_from(WAVEGLOW)

    waveglow_weights = waveglow.num_weights

    logging.info('================================')
    logging.info(f"Total number of parameters (WaveGlow): {waveglow_weights}")
    logging.info('================================')

    tacotron2_modules = (
        data_preprocessor,
        text_embedding,
        t2_enc,
        t2_dec,
        t2_postnet,
        t2_loss,
        makegatetarget,
    )
    return tacotron2_modules, waveglow

In [None]:
neural_modules, waveglow = create_NMs(tacotron2_config, waveglow_config, labels, decoder_infer=True, waveglow_sigma=0.6);

# Utility functions

In [None]:
def update_text(text):
    """Write ``text`` to the cached input file and return the file's path.

    The file ``cache/input.txt`` (relative to the current working directory)
    is created or overwritten and holds ``text`` followed by one newline.

    Args:
        text (str): Text to store.

    Returns:
        str: Path of the written file, ``os.path.join('cache', 'input.txt')``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('cache/', exist_ok=True)

    fp = os.path.join('cache', 'input.txt')
    # Leaving the with-block flushes and closes the file, so no explicit
    # flush is needed.
    with open(fp, 'w', encoding='utf8') as f:
        f.write('{}\n'.format(text))

    logging.info("Updated input file with value : %s", text)
    return fp
        
def cleanup_cachedir():
    """Delete the ``cache/`` directory (if present) and log that cleanup ran."""
    cache_dir = 'cache/'
    cache_present = os.path.exists(cache_dir)
    if cache_present:
        shutil.rmtree(cache_dir)
    # Logged whether or not there was anything to remove.
    logging.info("Cleaned up cache directory !")
    
def plot_and_save_spec(spectrogram, i, save_dir=None):
    """Render ``spectrogram`` as an image and save it as ``spec_<i>.png``.

    Args:
        spectrogram: 2-D array-like passed to ``imshow``.
        i: Index used in the output file name.
        save_dir: Optional directory for the image; when falsy the file is
            written relative to the current working directory.
    """
    out_name = f"spec_{i}.png"
    out_path = os.path.join(save_dir, out_name) if save_dir else out_name

    fig, ax = plt.subplots(figsize=(12, 3))
    image = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation='none')
    fig.colorbar(image, ax=ax)
    ax.set_xlabel("Frames")
    ax.set_ylabel("Channels")
    fig.tight_layout()
    fig.savefig(out_path)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# Initializing the inference DAG

To initialize the graph, we accept some text from the user. Later, we will accept the actual text that we want to convert to speech!

In [None]:
text = input('Please enter some initial text here :')

In [None]:
filepath = update_text(text)

## Create inference DAG

In [None]:
# Tacotron 2 DAG
# Only the modules needed for inference are unpacked; the preprocessor, loss
# and gate-target modules returned by create_NMs are discarded here.
(_, text_embedding, t2_enc, t2_dec, t2_postnet, _, _) = neural_modules

# The three special ids are placed directly after the regular labels.
data_layer = SentenceDataLayer(
    path=filepath,
    labels=labels,
    batch_size=1,
    num_workers=0,
    bos_id=len(labels),
    eos_id=len(labels) + 1,
    pad_id=len(labels) + 2,
    shuffle=False,
)
# Calling a module wires it into the graph and returns its output tensors.
transcript, transcript_len = data_layer()

transcript_embedded = text_embedding(char_phone=transcript)

transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded, embedding_length=transcript_len,)

mel_decoder, gate, alignments, mel_len = t2_dec(
    char_phone_encoded=transcript_encoded, encoded_length=transcript_len,
)

mel_postnet = t2_postnet(mel_input=mel_decoder)

# WaveGlow DAG
# The vocoder consumes the postnet output of the Tacotron 2 graph above.
audio_pred = waveglow(mel_spectrogram=mel_postnet)

In [None]:
# Setup inference tensors
# Order matters: run_tacotron2 reads the last entry (mel_len) as the mel lengths.
infer_tensors = [mel_postnet, gate, alignments, mel_len]

## Run inference DAG

In [None]:
def run_tacotron2():
    """Evaluate the Tacotron 2 tensors and build a mel filterbank.

    Returns:
        tuple: ``(evaluated_tensors, filterbank, mel_len_val)`` where
        ``evaluated_tensors`` is the result of ``neural_factory.infer`` on
        ``infer_tensors``, ``filterbank`` is the ``librosa.filters.mel`` matrix
        built from the Tacotron 2 config, and ``mel_len_val`` is the last entry
        of ``evaluated_tensors``.
    """
    logging.info("Running Tacotron 2")
    tacotron2_outputs = neural_factory.infer(tensors=infer_tensors, offload_to_cpu=False)
    logging.info("Done Running Tacotron 2")

    # Filterbank settings are taken from the top level of the Tacotron 2 config.
    mel_kwargs = {key: tacotron2_config[key] for key in ("n_fft", "n_mels", "fmax")}
    mel_basis = librosa.filters.mel(sr=tacotron2_config["sample_rate"], **mel_kwargs)

    return tacotron2_outputs, mel_basis, tacotron2_outputs[-1]

def run_waveglow(save_dir, waveglow_denoiser_strength=0.0):
    """Synthesize audio for the current input and save it with its spectrogram.

    For every generated sample a ``sample_<n>.wav`` file and a ``spec_<n>.png``
    image are written, where ``n = batch_index * 32 + index_in_batch``.

    Args:
        save_dir (str): Directory for the output files; when falsy the files
            are written relative to the current working directory.
        waveglow_denoiser_strength (float): When greater than 0, the WaveGlow
            denoiser is set up and applied with this strength. Defaults to 0.0.
    """
    # Run Tacotron 2; only the mel filterbank it builds is needed below.
    _, filterbank, _ = run_tacotron2()

    logging.info("Running Waveglow")
    # The mel lengths are requested together with the audio so that the lengths
    # used for trimming belong to the same inference pass as the audio.
    # NOTE(review): no caching is requested here, so this call presumably
    # re-evaluates the Tacotron 2 part of the graph; lengths from the earlier
    # run_tacotron2() pass are not guaranteed to match this audio - confirm.
    audio_val, mel_len_val = neural_factory.infer(
        tensors=[audio_pred, mel_len],
    )
    logging.info("Done Running Waveglow")

    use_denoiser = waveglow_denoiser_strength > 0
    if use_denoiser:
        logging.info("Setup WaveGlow denoiser")
        waveglow.setup_denoiser()

    n_stride = tacotron2_config["n_stride"]
    sample_rate = waveglow_config["sample_rate"]

    logging.info("Saving results to disk")
    for i, batch in enumerate(audio_val):
        audio = batch.cpu().numpy()
        for j, sample in enumerate(audio):
            # Trim to the generated length: mel frames times n_stride.
            # int() yields a plain Python integer for slicing.
            sample_len = int(mel_len_val[i][j]) * n_stride
            sample = sample[:sample_len]

            # NOTE(review): 32 is a hard-coded per-batch offset for the file
            # index; kept unchanged so existing file names stay the same.
            index = i * 32 + j
            save_file = f"sample_{index}.wav"
            if save_dir:
                save_file = os.path.join(save_dir, save_file)

            if use_denoiser:
                sample, spec = waveglow.denoise(sample, strength=waveglow_denoiser_strength)
            else:
                spec, _ = librosa.core.magphase(librosa.core.stft(sample, n_fft=waveglow_config["n_fft"]))
            write(save_file, sample_rate, sample)

            # Project the magnitude spectrogram onto the mel filterbank and
            # take a clipped log before plotting.
            spec = np.dot(filterbank, spec)
            spec = np.log(np.clip(spec, a_min=1e-5, a_max=None))
            plot_and_save_spec(spec, index, save_dir)

# Run Tacotron 2 + WaveGlow on input text

In [None]:
# This is the text that will actually be synthesized (the earlier prompt only
# collected text for initializing the graph).
text = input('Please enter the text to convert to speech :')

In [None]:
filepath = update_text(text)
data_layer.update_dataset()

## Prepare directories to save results

In [None]:
# Output locations for the first generated sample and its spectrogram image.
savedir = 'results/'
saved_audio = os.path.join(savedir, 'sample_0.wav')
saved_spectrogram = os.path.join(savedir, 'spec_0.png')

# exist_ok makes the cell safe to re-run without a separate existence check.
os.makedirs(savedir, exist_ok=True)

## Generate the audio

Let's run the Tacotron 2 model and send the results to WaveGlow to generate the audio!

In [None]:
run_waveglow(savedir, waveglow_denoiser_strength=0.0)

## Let's hear the generated audio!

In [None]:
# Play the saved WAV file. No `rate` is passed: the sample rate is stored in
# the file itself (written with waveglow_config["sample_rate"]), and `rate`
# only applies when raw sample arrays are given.
ipd.Audio(filename=saved_audio)

In [None]:
ipd.Image(saved_spectrogram)

# Cleanup cachedir

In [None]:
cleanup_cachedir()