# In [None]:
import torch
import torchaudio
from scipy.io.wavfile import write
from tacotron2.text import text_to_sequence

# Select the execution device first. The models are moved with .to(device)
# instead of an unconditional .cuda(), which raises on hosts without CUDA and
# made the CPU fallback below unreachable.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained Tacotron 2 model (text -> mel spectrogram) from torch.hub,
# move it to the selected device and switch it to evaluation mode, as the
# NVIDIA hub usage example does before running inference.
tacotron2 = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_tacotron2')
tacotron2 = tacotron2.to(device).eval()

# Load the pretrained WaveGlow vocoder (mel spectrogram -> waveform) the same way.
waveglow = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_waveglow')
waveglow = waveglow.to(device).eval()

# Text to synthesize
text = "I am working on a task given by openinapp"

# Text cleaners applied by text_to_sequence before encoding
cleaner_names = ['english_cleaners']

# Encode the text as a numeric sequence
sequence = text_to_sequence(text, cleaner_names)

# Build the model inputs on the target device: the sequence gets a leading
# batch dimension of 1, and input_lengths holds that single sequence's length.
sequence = torch.tensor(sequence, dtype=torch.long, device=device).unsqueeze(0)
input_lengths = torch.tensor([sequence.size(1)], device=device)

# Convert the sequence to a mel spectrogram. This is inference only, so run it
# under no_grad to avoid tracking autograd state (consistent with the WaveGlow
# call later in the script).
with torch.no_grad():
    mel_outputs, mel_lengths, _ = tacotron2.infer(sequence, input_lengths=input_lengths)

# Synthesize the waveform from the mel spectrogram using WaveGlow
with torch.no_grad():
    audio = waveglow.infer(mel_outputs)

# Move the waveform to host memory as a NumPy array and peak-normalize it.
# Normalizing by the maximum *absolute* sample keeps both positive and negative
# excursions within [-1, 1]; dividing by audio.max() alone could leave negative
# samples out of range, and would produce NaN/inf for all-zero output.
audio = audio.squeeze().cpu().numpy()
peak = abs(audio).max()
if peak > 0:
    audio /= peak

# Save the audio as a WAV file at a 22050 Hz sample rate
output_path = 'output10 (2) (1).wav'
write(output_path, 22050, audio)


