In [1]:
# Setup cell: fetch the Tacotron 2 sources (first run only), make them importable,
# and pull in the plotting/download helpers used by later cells.
import os
from os.path import exists, join, basename, splitext
git_repo_url = 'https://github.com/NVIDIA/tacotron2.git'
# Directory name git creates for the clone: the URL's last path component
# without its ".git" suffix, i.e. 'tacotron2'.
project_name = splitext(basename(git_repo_url))[0]
# The existence check is relative to the current working directory; when the
# directory is already present, both the clone and the checkout below are skipped.
if not exists(project_name):
  # First run only: clone the repository (with --recursive), then check out
  # commit 2fd4e63 inside its waveglow/ subdirectory. Nothing is pip-installed here.
  !git clone -q --recursive {git_repo_url}
  !cd {project_name}/waveglow && git checkout 2fd4e63
  
import sys
# Make the cloned sources importable. These are relative paths, so imports in
# later cells rely on the kernel's working directory staying where the clone was made.
# Note: each re-run of this cell appends the same two entries again.
sys.path.append(join(project_name, 'waveglow/'))
sys.path.append(project_name)
import time
import matplotlib
import matplotlib.pylab as plt
import gdown
# Google Drive direct-download URL prefix; not referenced by the cells visible here.
d = 'https://drive.google.com/uc?id='

In [2]:
# Model configuration (the trailing `#@param` markers are Colab form directives).
# Nothing in the visible cells reads this flag.
force_download_TT2 = True
# Placeholder: replace with the path to the Tacotron 2 checkpoint file; the
# model-loading cell passes it to torch.load and reads its 'state_dict' entry.
tacotron2_pretrained_model = 'path-to-aawaj'#@param {type:"string"}
# Placeholder: replace with the path to the WaveGlow checkpoint file; the
# model-loading cell passes it to torch.load and reads its 'model' entry.
waveglow_pretrained_model = 'path-to-waveglow-model'#@param {type:"string"}

In [3]:
#@title Initialize Tacotron and Waveglow 
%matplotlib inline
import IPython.display as ipd
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT
from audio_processing import griffin_lim
from text import text_to_sequence
from denoiser import Denoiser

# Default figure size in pixels; converted to inches below assuming 100 px per inch.
graph_width = 900
graph_height = 360
def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))):
    """Show each 2-D array in ``data`` as a heat map, side by side in one row.

    Args:
        data: Non-empty sequence of 2-D array-likes accepted by ``imshow``
            (the synthesis cell has a commented-out call passing a mel
            spectrogram and a transposed alignment matrix).
        figsize: ``(width, height)`` of the figure in inches. The default is
            computed once, at definition time, from ``graph_width`` and
            ``graph_height``.

    The figure is drawn and shown; nothing is returned.
    """
    # The inline backend is selected once at the top of the cell, so the
    # per-call `%matplotlib inline` magic that used to sit here is not needed.
    # squeeze=False always yields a 2-D array of Axes, so a single panel is
    # indexable the same way as several (a bare Axes object is not).
    fig, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, panel in zip(axes[0], data):
        # origin='lower' puts row 0 at the bottom. The former value 'bottom'
        # is not a valid origin in current matplotlib ('upper'/'lower' only).
        ax.imshow(panel, aspect='auto', origin='lower',
                  interpolation='none', cmap='inferno')
    fig.canvas.draw()
    plt.show()

!gdown --id '1E12g_sREdcH5vuZb44EZYX8JjGWQ9rRp'
thisdict = {}
for line in reversed((open('merged.dict.txt', "r").read()).splitlines()):
    thisdict[(line.split(" ",1))[0]] = (line.split(" ",1))[1].strip()
def ARPA(text):
    """Replace dictionary words in ``text`` with their ``{pronunciation}`` form.

    The text is split on single spaces. For each token, trailing punctuation
    (any run of ``! ? , . ;``) is peeled off, the remaining word is looked up
    upper-cased in the module-level ``thisdict``, and, when a non-empty entry
    exists, the word is replaced by the entry wrapped in curly braces. The
    punctuation is then re-attached. Tokens without an entry are kept as-is.

    Args:
        text: Input sentence (str).

    Returns:
        The converted sentence, guaranteed to end with ``";"``. Empty input
        yields ``";"``.
    """
    punctuation = "!?,.;"
    out = ''
    for token in text.split(" "):
        word = token
        end_chars = ''
        # Peel trailing punctuation one character at a time, in any order and
        # any combination, but always leave at least one character in `word`
        # so indexing word[-1] can never hit an empty string.
        while len(word) > 1 and word[-1] in punctuation:
            end_chars = word[-1] + end_chars
            word = word[:-1]
        # A missing key is the only expected failure; .get avoids a bare except
        # that would also hide unrelated errors.
        word_arpa = thisdict.get(word.upper(), '')
        if word_arpa:
            word = "{" + str(word_arpa) + "}"
        # strip() drops the leading space on the first token and collapses the
        # empty tokens produced by consecutive spaces.
        out = (out + " " + word + end_chars).strip()
    # endswith is safe on an empty string, unlike out[-1].
    if not out.endswith(";"):
        out = out + ";"
    return out

# Left disabled: gradients stay enabled globally, and the synthesis cell wraps
# inference in torch.no_grad() instead.
#torch.set_grad_enabled(False)

# Create the hyperparameter object. No model is built and no weights are loaded
# here; the model-loading cell adjusts these values and passes them to Tacotron2.
hparams = create_hparams()

Downloading...
From: https://drive.google.com/uc?id=1E12g_sREdcH5vuZb44EZYX8JjGWQ9rRp
To: /home/sampanna/Desktop/MAJOR/synthesis/merged.dict.txt
100%|██████████████████████████████████████| 7.94M/7.94M [00:00<00:00, 20.9MB/s]
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [8]:
#@title Parameters
# Load Tacotron2 (run this cell every time you change the model).
# Both models are moved to CUDA below, so this cell needs a GPU.
hparams.sampling_rate = 22050 # Don't change this; the synthesis cell also uses it as the playback rate
hparams.max_decoder_steps = 1000 # Upper bound on decoder steps before the audio is cut off (1000 is about 11 seconds)
# Stop-token threshold: the higher this number is, the more likely generation
# keeps going until it reaches max_decoder_steps.
# NOTE(review): in NVIDIA's Tacotron2 decoder, generation appears to stop as soon
# as sigmoid(gate) exceeds this value, so 0.1 would mean "stop at ~10% confidence",
# not "90% sure" as previously stated here — confirm against the cloned model.py.
hparams.gate_threshold = 0.1
model = Tacotron2(hparams)
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load(tacotron2_pretrained_model)['state_dict'])
# Move to GPU, switch to eval mode, cast to half precision; assigning to `_`
# keeps the returned module from being echoed as cell output.
_ = model.cuda().eval().half()

# Load WaveGlow: the checkpoint stores the whole model object under 'model'.
waveglow = torch.load(waveglow_pretrained_model)['model']
waveglow.cuda().eval().half()
# After the half-precision cast above, convert each entry of waveglow.convinv
# back to float32.
for k in waveglow.convinv:
    k.float()
# Denoiser built from the loaded WaveGlow model; the visible synthesis cell
# does not call it.
denoiser = Denoiser(waveglow)

In [9]:
import soundfile as sf
#@title Let's Have Fun!
# ---- Synthesis settings -----------------------------------------------------
# One utterance per line of `text`; empty lines are skipped.
text = 'कर्नरमा बल प्रहार गर्ने प्रयासमा रहेका लियान्द्रो पाराडेसलाई मेसीले साउदीका डिफेन्डर अब्दुल हमिदसाउदले धकेलेर लडाएपछि भिडियो असिस्टेन्ट रेफ्रीको सहयोगमा पाएको पेनाल्टीमा मेसी '
sigma = 0.8  # forwarded to waveglow.infer
denoise_strength = 0.324  # defined here, but not consumed anywhere in this cell
# True  -> feed the text as typed (only a trailing ";" is appended if missing);
#          use this for hand-written ARPAbet or for non-English input.
# False -> run the line through ARPA() first.
raw_input = True

# ---- Text -> mel spectrogram -> audio, one line at a time -------------------
for line in text.split("\n"):
    if not line:
        continue
    print(line)

    # Decide what is actually sent to the model.
    if not raw_input:
        prompt = ARPA(line)
    elif line[-1] != ";":
        prompt = line + ";"
    else:
        prompt = line
    print(prompt)

    # No gradients are needed for inference; skipping them saves VRAM.
    with torch.no_grad():
        symbol_ids = text_to_sequence(prompt, ['english_cleaners'])
        sequence = np.array(symbol_ids)[None, :]  # prepend a batch axis
        sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        # Optional visual check of the spectrogram and the attention alignment:
        # plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
        #            alignments.float().data.cpu().numpy()[0].T))

        audio = waveglow.infer(mel_outputs_postnet, sigma=sigma)
        print("")
        waveform = audio[0].data.cpu().numpy()
        ipd.display(ipd.Audio(waveform, rate=hparams.sampling_rate))
        # Optional: write the result to disk.
        # sf.write("speech.wav", audio.to('cpu').detach().numpy()[0], 22050)

कर्नरमा बल प्रहार गर्ने प्रयासमा रहेका लियान्द्रो पाराडेसलाई मेसीले साउदीका डिफेन्डर अब्दुल हमिदसाउदले धकेलेर लडाएपछि भिडियो असिस्टेन्ट रेफ्रीको सहयोगमा पाएको पेनाल्टीमा मेसी 
कर्नरमा बल प्रहार गर्ने प्रयासमा रहेका लियान्द्रो पाराडेसलाई मेसीले साउदीका डिफेन्डर अब्दुल हमिदसाउदले धकेलेर लडाएपछि भिडियो असिस्टेन्ट रेफ्रीको सहयोगमा पाएको पेनाल्टीमा मेसी ;

