<h4>Repository- <a>https://github.com/adityapandya1/review-of-DCTTS</a></h4>

<h4>Paper- <a>https://arxiv.org/abs/1710.08969</a></h4>


<h1>Requirements</h1>
<br>
<span>
<b>

librosa==0.5.1<br>
matplotlib==2.0.2<br>
numpy==1.13.3<br>
scipy==0.19.1<br>
tensorflow==1.4.0<br>
tqdm==4.19.2<br>
</b>
</span>

<h1>DOWNLOAD DATASET</h1>

In [1]:
!wget http://data.keithito.com/data/speech/LJSpeech-1.0.tar.bz2
!tar xjf LJSpeech-1.0.tar.bz2

'wget' is not recognized as an internal or external command,
operable program or batch file.
tar: Error opening archive: Failed to open 'LJSpeech-1.0.tar.bz2'


<h1>INSTALL ALL REQUIREMENTS</h1>

In [None]:
!pip install librosa --user
!pip install matplotlib --user
!pip install numpy --user
!pip install scipy --user
!pip install tqdm --user

In [None]:
!pip uninstall tensorflow
!pip install tensorflow==1.14.0 --user

<h1>HYPER PARAMETERS</h1>

In [4]:
data_dir = 'LJSpeech-1.0/' 

data = 'LJSpeech-1.0/'

metafile = 'LJSpeech-1.0/metadata.csv'

batch_size = 16 # alias = N

warmup_steps = 4000

logdir = 'logdir' # log directory

logdirmag = 'logdirmag' # log directory

logdirmel = 'logdirmel' # log directory

sr = 22050 # Sampling Rate

n_fft = 2048 # fft points (samples) (Fast Fourier Transform)

fd = 1+n_fft//2

frame_shift = 0.0125 # seconds

frame_length = 0.05 # seconds

hop_length = 256 # samples	This is dependent on the frame_shift.

win_length = 1024 # samples This is dependent on the frame_length.

n_mels = 80 # Number of Mel banks to generate

sharpening_factor = 1.4 # Exponent for amplifying the predicted magnitude

n_iter = 50 # Number of inversion iterations

preemphasis = .97 # or None

griffin_lim_iters=60

power=1.5              # Power to raise magnitudes to prior to Griffin-Lim






max_db = 100

min_db = -100

ref_db = 20

max_grad_norm = 100.

max_grad_val = 5.




# model

maxlen = 180 # Maximum number of letters in a sentance = T.

Ty = 868 # Max number of timesteps 

Tyr = Ty//4 # Max number of timesteps 

e = 128

d = 256

c = 512

lr = 2e-4

init_lr=2e-4

g=0.2

b1 = 0.5

b2 = 0.9

eps = 1e-6

logevery = 200

dropout_rate = 0.1

masking = False



<h1>DATA PREPROCESSING</h1>

In [None]:
import numpy as np
import librosa
from concurrent.futures import ProcessPoolExecutor
from functools import partial


import glob
import os
import tqdm


def get_spectrograms(sound_file):
    '''Returns normalized log(melspectrogram) and log(magnitude) from `sound_file`.
    Args:
      sound_file: A string. The full path of a sound file.
    Returns:
      mel: A 2d array of shape (T, n_mels) <- Transposed
      mag: A 2d array of shape (T, 1+n_fft/2) <- Transposed
    '''
    # Loading sound file
    y, sr = librosa.load(sound_file, sr=22050)

    # Trimming
    y, _ = librosa.effects.trim(y)

    # Preemphasis
    y = np.append(y[0], y[1:] - preemphasis * y[:-1])

    # stft
    linear = librosa.stft(y=y,
                          n_fft=n_fft,
                          hop_length=hop_length,
                          win_length=win_length)

    # magnitude spectrogram
    mag = np.abs(linear)  # (1+n_fft//2, T)

    # mel spectrogram
    mel_basis = librosa.filters.mel(sr, n_fft, n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, t)

    # Sequence length
    done = np.ones_like(mel[0, :]).astype(np.int32)

    # to decibel
    mel = librosa.amplitude_to_db(mel)
    mag = librosa.amplitude_to_db(mag)

    # normalize
    mel = np.clip((mel - ref_db + max_db) / max_db, 0, 1)
    mag = np.clip((mag - ref_db + max_db) / max_db, 0, 1)

    # Transpose
    mel = mel.T.astype(np.float32)  # (T, n_mels)
    mag = mag.T.astype(np.float32)  # (T, 1+n_fft//2)

    return mel, done, mag

if __name__ == "__main__":
    wav_folder = os.path.join(data, 'wavs')
    # wav_folder = os.path.join('/data/private/voice/nick', 'Tom')
    mel_folder = os.path.join(data, 'mels')
    dones_folder = os.path.join(data, 'dones')
    mag_folder = os.path.join(data, 'mags')

    for folder in (mel_folder, dones_folder, mag_folder):
        if not os.path.exists(folder): os.mkdir(folder)

    files = glob.glob(os.path.join(wav_folder, "*"))
    for f in tqdm.tqdm(files):
        fname = os.path.basename(f)
        mel, dones, mag = get_spectrograms(f)  # (n_mels, T), (1+n_fft/2, T) float32
        np.save(os.path.join(mel_folder, fname.replace(".wav", ".npy")), mel)
        np.save(os.path.join(dones_folder, fname.replace(".wav", ".npy")), dones)
        np.save(os.path.join(mag_folder, fname.replace(".wav", ".npy")), mag)

<h1>AUDIO</h1>

In [7]:
import librosa
import librosa.filters
import math
import numpy as np
from scipy import signal
#import tensorflow as tf


def load_wav(path):
    
    '''
        Load an audio file as a floating point time series.

        Audio will be automatically resampled to the given rate (default sr=22050).

        To preserve the native sampling rate of the file, use sr=None.

        ARGS: File Path

        RETURNS: Time-Intensity Representaion Of WaveForm
        
        '''
    
    return librosa.core.load(path, sr=sr)[0]


def save_wav(wav, path):
    
    '''
        Saves Audio File To The Given Path
    
        ARGS: Time-Intensity Representaion Of Waveform , Path
        
        RETURNS: Nothing,Saves the Waveform To The Path
    '''
    
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    librosa.output.write_wav(path, wav.astype(np.int16), sr)
    


def preemphasis(x):
    
    '''
        In high speed digital transmission, pre-emphasis is used to improve signal quality at the output of a 
        data transmission. 
        In transmitting signals at high data rates, the transmission medium may introduce distortions, 
        so pre-emphasis is used to distort the transmitted signal to correct for this distortion.
    
        ARGS: Time-Intensity Representation Of WaveForm
    
        RETURNS: Time-Intensity Representation Of Waveform With Improved Signal Quality With Reduced Noise
    
    '''
    
    return signal.lfilter([1, -preemphasis], [1], x)


def inv_preemphasis(x):
    
    '''
        DE-EMPHASIS
    
        ARGS : Time-Intensity Representation Of WaveForm
        
        RETURNS: Time-Intensity Representation Of WaveForm With Added Noise
    '''
    
    return signal.lfilter([1], [1, -preemphasis], x)


def spectrogram(y):
    
    
    '''
        Converts a WaveForm To Spectogram using Short-Time-Fourier-Transform
        And Then Converting Amplitude to Decibels For Proper Scaling
        
        ARGS: Time-Intensity Representation Of WaveForm
        
        RETURNS: SPECTOGRAM
    '''
    
    D = _stft(preemphasis(y))  #Short-Time-Fourier-Transform
    
    S = _amp_to_db(np.abs(D)) - ref_db  #Converts Waveform To Spectogram
    
    return _normalize(S)

def save_spec(spectrogram,path):
    
    '''
        Converts The Spectogram To WaveForm And Then Save It To The Given Path

        ARGS: Spectogram , Path
        
        RETURNS: Nothing , Saves Spectogram To Given Path
    
    '''
    
    wav = inv_spectrogram(spectrogram)
    save_wav(wav,path)

def inv_spectrogram(spectrogram):
    
    '''
    
        Converts Spectogram To Waveform
        
        ARGS: Spectogram
        
        RETURNS: Time-Intensity Representaion Of WaveForm
     
    '''
    S = _db_to_amp(_denormalize(spectrogram) + ref_db)  # Convert back to linear
    return inv_preemphasis(_griffin_lim(S ** power))          # Reconstruct phase

def inv_spectrogram_tensorflow(spectrogram):
    
    '''Builds computational graph to convert spectrogram to waveform using TensorFlow.
    Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call
    inv_preemphasis on the output after running the graph.
    '''
    S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + ref_db)
    return _griffin_lim_tensorflow(tf.pow(S, power))

def _griffin_lim_tensorflow(S):
    
    '''TensorFlow implementation of Griffin-Lim
    Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb
    '''
    with tf.variable_scope('griffinlim'):
        
    # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1
        S = tf.expand_dims(S, 0)
        S_complex = tf.identity(tf.cast(S, dtype=tf.complex64))
        y = _istft_tensorflow(S_complex)
        for i in range(griffin_lim_iters):
            est = _stft_tensorflow(y)
            angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64)
            y = _istft_tensorflow(S_complex * angles)
    return tf.squeeze(y, 0)

def _denormalize_tensorflow(S):
    
    return (tf.clip_by_value(S, 0, 1) * -min_db) + min_db

def _db_to_amp_tensorflow(x):
    
    return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05)
  
def _griffin_lim(S):
    
    ''' To  retrieve  a time-domain signal from its amplitude spectrogram,
        the corresponding phase is required. One of the popular phase reconstruction methods
        is the Griffin–Lim algorithm (GLA), which is based on the re-dundancy of the 
        Short-Time Fourier transform

        Based on https://github.com/librosa/librosa/issues/434

        ARGS: Spectogram

        RETURNS: Time-Intensity Representation of WaveForm With Reconstructed Phase
    
    '''
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    S_complex = np.abs(S).astype(np.complex)
    y = _istft(S_complex * angles)
    for i in range(griffin_lim_iters):
        angles = np.exp(1j * np.angle(_stft(y)))
        y = _istft(S_complex * angles)
    return y


def _stft(y):
    
    '''
        Implementation Of Short-Time Fourier Transform Usiing Librosa

        ARGS: Time-Intensity Representaion Of WaveForm
        
        RETURNS: Amplitude-Frequency Domain Representation Of A WaveForm With Specific Time Hops

    '''
    
    n_fft, hop_length, win_length = _stft_parameters()
    return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)


def _istft(y):
    
    '''
        Inverse-Short-Time Fourier Transform - Converts a Complex Spectogram To A Time Domain Representaion
        Of The Waveform
    
        ARGS: Spectogram
        
        RETURNS: Time-Intensity Representation Of WaveForm
    
    '''
    
    _, hop_length, win_length = _stft_parameters()
    return librosa.istft(y, hop_length=hop_length, win_length=win_length)

def _istft_tensorflow(stfts):
    
    n_fft, hop_length, win_length = _stft_parameters()
    return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft)

def _stft_tensorflow(signals):
    
    n_fft, hop_length, win_length = _stft_parameters()
    return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, pad_end=False)

def _stft_parameters():
    
    
    '''
        Defining Necessary Parameters to perform Short-Time Fourier Transform

        ARGS: None

        OUTPUT: Necessary Parameters
    
    
    '''
    
    n_fft = n_fft
    hop_length = int(frame_shift * sr)
    win_length = int(frame_length  * sr)
    hop_length = hop_length
    win_length = win_length
    return n_fft, hop_length, win_length


# Conversions:



def _amp_to_db(x):
    
    '''
    Converts Amplitude To Decibel Values
    
    ARGS: Amplitude - Frequency Representation Of WaveForm
    
    RETURNS: Decible- Frequency Representaion Of WaveForm
    
    '''
    
    
    return 20 * np.log10(np.maximum(1e-5, x))

def _db_to_amp(x):
    
    '''
        Converts a Decibel-Frequency Representation To Amplitude-Frequency Representation Of A WaveForm

    
        ARGS: Decible - Frequency Representation Of WaveForm

        RETURNS: Amplitude- Frequency Representaion Of WaveForm
    
    '''
    
    
    return np.power(10.0, x * 0.05)


def _normalize(S):
    
    '''
        Normalizes A Spectogram By Clipping Or Limiting Values In a Spectogram Between 0 and 1

        ARGS: Spectogram

        RETURNS: Normalized Spectogram

    ''' 
    
    return np.clip((S - min_db) / -min_db, 0, 1)

def _denormalize(S):
    
    '''
        De-Normalizes or Reverts The Normalized Spectogram Back To Original Representation
        
        ARGS: Normalized Sepctogram
        
        RETURNS : De-Normalized Spectogram
    
    '''
    
    return (np.clip(S, 0, 1) * -min_db) + min_db

<h1>Modules</h1>

In [8]:
from __future__ import print_function
import tensorflow as tf

def embedding(inputs, 
              vocab_size, 
              num_units, 
              zero_pad=False, 
              scale=True,
              scope="embedding", 
              reuse=None):
    
    '''Embeds a given tensor.
    Args:
      inputs: A `Tensor` with type `int32` or `int64` containing the ids
         to be looked up in `lookup table`.
      vocab_size: An int. Vocabulary size.
      num_units: An int. Number of embedding hidden units.
      zero_pad: A boolean. If True, all the values of the fist row (id 0)
        should be constant zeros.
      scale: A boolean. If True. the outputs is multiplied by sqrt num_units.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
    Returns:
      A `Tensor` with one more rank than inputs's. The last dimensionality
        should be `num_units`.

    For example,

    ```
    import tensorflow as tf

    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print sess.run(outputs)
    >>
    [[[ 0.			0.		  ]
      [ 0.09754146	0.67385566]
      [ 0.37864095 -0.35689294]]
     [[-1.01329422 -1.09939694]
      [ 0.7521342	0.38203377]
      [-0.04973143 -0.06210355]]]
    ```

    ```
    import tensorflow as tf

    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=False)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print sess.run(outputs)
    >>
    [[[-0.19172323 -0.39159766]
      [-0.43212751 -0.66207761]
      [ 1.03452027 -0.26704335]]
     [[-0.11634696 -0.35983452]
      [ 0.50208133	0.53509563]
      [ 1.22204471 -0.96587461]]]	 
    ```	   
    '''
    with tf.variable_scope(scope, reuse=reuse):
        lookup_table = tf.get_variable('lookup_table',
                                       dtype=tf.float32,
                                       shape=[vocab_size, num_units],
                                       initializer=tf.contrib.layers.xavier_initializer())
        if zero_pad:
            lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
                                      lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, inputs)

        if scale:
            outputs = outputs * (num_units ** 0.5) 

    return outputs

def conv1d(inputs, 
           filters, 
           size=1, 
           rate=1, 
           padding="SAME", 
           causal=False,
           use_bias=False,
           scope="conv1d"):
    '''
    Args:
      inputs: A 3-D tensor of [batch, time, depth].
      filters: An int. Number of outputs (=activation maps)
      size: An int. Filter size.
      rate: An int. Dilation rate.
      padding: Either `SAME` or `VALID`.
      causal: A boolean. If True, zeros of (kernel size - 1) * rate are padded on the left
        for causality.
      use_bias: A boolean.
    
    Returns:
      A masked tensor of the sampe shape as `tensor`.
    '''
    
    with tf.variable_scope(scope):
        if causal:
            # pre-padding for causality
            pad_len = (size - 1) * rate  # padding size
            inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]])
            padding = "VALID"
            
        params = {"inputs":inputs, "filters":filters, "kernel_size":size,
                "dilation_rate":rate, "padding":padding, "activation":None, 
                "use_bias":use_bias}
        print("conv1d inputs = {}".format(inputs))
        out = tf.layers.conv1d(**params)
        #print("THIS FUNCTION IS WORKING FOR SOME PARTS")
    
    return out

def conv1d_transpose(x,filters,kernel_size,strides):
    
    x = tf.expand_dims(x,1)
    outputs=tf.layers.conv2d_transpose(x,filters,kernel_size,strides=(1,strides),padding='same')
    outputs = tf.squeeze(outputs,1)
    return outputs

def Deconv1D(inputs, channels, kernel_size,dilation,scope="deconv1d"):
    
    with tf.variable_scope(scope, reuse=False):
        outputs = conv1d_transpose(inputs,channels,kernel_size,2)
        return outputs

def Conv1D(inputs, channels, kernel_size, dilation,causal=True,is_training=True,dropout=0.1, activation=None, scope = "Conv1D", reuse=None):
    
    print("CONV1D inputs = {}".format(inputs))
    with tf.variable_scope(scope, reuse=reuse):
        outputs = conv1d(inputs, channels, size=kernel_size, scope=scope, rate=dilation, causal=causal,)
        if activation is not None:
            outputs=activation(outputs)
        return tf.layers.dropout(outputs, rate=dropout,training=is_training)

def HConv1D(inputs, channels, kernel_size, dilation, causal=True,is_training=True, activation=None, scope = "HConv1D", reuse=None):
    
    with tf.variable_scope(scope, reuse=reuse):
        H = Conv1D(inputs, 2*channels, kernel_size, dilation=dilation, causal=causal,is_training=is_training,activation=activation,scope='c1d-H')
        H1,H2 = tf.split(H,num_or_size_splits=2,axis=2)
        H1 = tf.nn.sigmoid(H1)
        return H1 * H2 + inputs * (1.0 - H1)

<h1>TEXT2MEL MODEL</h1>

In [9]:
from __future__ import print_function
import tensorflow as tf
from matplotlib import pyplot as plt
import sys
import os
import time
import numpy as np
import re
#import audio

def load_vocab():

    # characters = "PEاإأآبتثجحخدذرزسشصضطظعغفقكلمنهويىؤءةئ ًٌٍَُِّْ،." # Arabic character set
    characters = "PE abcdefghijklmnopqrstuvwxyz'.,?"  # P: Padding E: End of Sentence

    char2idx = {char: idx for idx, char in enumerate(characters)}
    idx2char = {idx: char for idx, char in enumerate(characters)}
    return char2idx, idx2char

def clean(text):
    text=text.lower()
    re_list = r"[^ abcdefghijklmnopqrstuvwxyz'.,?]" # E: Empty. ignore G
    #re_list = r"[^اإأآبتثجحخدذرزسشصضطظعغفقكلمنهويىؤءةئ ًٌٍَُِّْ،.]" # Arabic character set
    _text = re.sub(re_list, "", text)
    return(_text)


def get_data():
    def mypyfunc(text):
        text = text.decode("utf-8")
        items = text.split("|")
        char2idx,_=load_vocab()
        text = items[1]
        text = clean(text)
        source = [char2idx[c] for c in text+'E']
        dest = items[0]
        mels = np.load(os.path.join(data_dir, "mels", dest + ".npy"))
        mels = mels[::4,:]
        return np.array(source, dtype=np.int32),mels
    def _pad(text,mel):
        text = tf.pad(text, ((0, maxlen),))[:maxlen] # (Tx,)
        mel = tf.pad(mel, ((0, Tyr), (0, 0)))[:Tyr] # (Tyr, n_mels)
        return text,mel
    dataset = tf.data.TextLineDataset(tf.convert_to_tensor(metafile))
    dataset = dataset.map(lambda text: tuple(tf.py_func(mypyfunc, [text], [tf.int32, tf.float32])))
    dataset = dataset.map(_pad)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(buffer_size=400)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()
    print(next_element)
    return(next_element)



def w_fun(n, t):
    return 1 - np.exp(-((n/(maxlen-1) - t/(Tyr-1))**2) / (2 * g**2))

def guide_fn(x):
    prva=-1
    #return(x)
    f=40
    if x.shape[1]<=f:
        return(x)
    prva = np.argmax(x[:,f])-1
    for i in range(f,x.shape[1]):

        pos = np.argmax(x[:,i])
        val = x[pos,i]
        if (pos<prva) or (pos>prva+1):
            x[:,i]=np.zeros(x.shape[0],dtype='f')
            pp = min(x.shape[0]-1,prva+1)
            x[pp,i]=1
            #print("%d-Corrected from %d to %d - prva %d"%(i,pos,pp,prva))
        else:
            x[:,i]=np.zeros(x.shape[0],dtype='f')
            x[pos,i]=1
            pass
            #print("%d-Was ok %d - prva %d"%(i,pos,prva))
        prva=np.argmax(x[:,i])
    return x


def guide_atten(inputs): # 180,XX
    return tf.py_func(guide_fn,[inputs],tf.float32)

class Graph_Text2Mel():
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.text, self.mel = get_data() # (N, T), (N,Tyr,nmels)
                self.mel = tf.reshape(self.mel,shape=[-1,Tyr,n_mels])
                w = np.fromfunction(w_fun, (maxlen, Tyr), dtype='f')
                w = np.expand_dims(w,0)
                w = np.repeat(w,batch_size,0)
                self.A_guide = tf.convert_to_tensor(w) # B,180,870
            else: # inference
                self.text = tf.placeholder(tf.int32, shape=(None, maxlen))
                self.mel = tf.placeholder(tf.float32, shape=(None,None,n_mels))

            # define decoder inputs
            if is_training:
                self.decoder_inputs = tf.concat((tf.zeros_like(self.mel[:, :1,:]), self.mel[:, :-1,:]), 1) # shift mels to right
            else:
                self.decoder_inputs=self.mel
                print(self.mel)
            char2idx, idx2char = load_vocab()
            with tf.variable_scope("Text2Mel"):
                with tf.variable_scope("TextEnc"):
                    self.emb=embedding(self.text,
                                        vocab_size=len(char2idx), 
                                        num_units=e,
                                        scale = False,
                                        scope="embedding") #in (N,T) out (N,T,e) (32,180,128)
                    self.textenc=Conv1D(self.emb,d*2,1,1,causal=False,is_training=is_training,activation=tf.nn.relu,scope='c1d-1')
                    self.textenc=Conv1D(self.textenc,d*2,1,1,causal=False,is_training=is_training,scope='c1d-2')
                    for i in range(2):
                        self.textenc=HConv1D(self.textenc,d*2,3,1,causal=False,is_training=is_training,scope='hc1d-1-%d'%i)
                        self.textenc=HConv1D(self.textenc,d*2,3,3,causal=False,is_training=is_training,scope='hc1d-2-%d'%i)
                        self.textenc=HConv1D(self.textenc,d*2,3,9,causal=False,is_training=is_training,scope='hc1d-3-%d'%i)
                        self.textenc=HConv1D(self.textenc,d*2,3,27,causal=False,is_training=is_training,scope='hc1d-4-%d'%i)
                    for i in range(2):
                        self.textenc=HConv1D(self.textenc,d*2,3,1,causal=False,is_training=is_training,scope='hc1d-11-%d'%i)
                    for i in range(2):
                        self.textenc=HConv1D(self.textenc,d*2,1,1,causal=False,is_training=is_training,scope='hc1d-12-%d'%i) #(N,T,2*d) (32,180,512)


                    self.K,self.V = tf.split(self.textenc,num_or_size_splits=2,axis=2)  #k=(B,N,d) v=(B,N,d)
                    print("\nText Encoder Output = {} \nK = {} \nV = {}".format(self.textenc,self.K,self.V))
                with tf.variable_scope("AudioEnc"):
                    self.audioenc = Conv1D(self.decoder_inputs,d,1,1,is_training=is_training,activation=tf.nn.relu,scope='c1d-1') # from (B,Ty,80) -> (B,Ty,d)
                    self.audioenc = Conv1D(self.audioenc,d,1,1,is_training=is_training,activation=tf.nn.relu,scope='c1d-2')
                    self.audioenc = Conv1D(self.audioenc,d,1,1,is_training=is_training,scope='c1d-3')
                    for i in range(2):
                        self.audioenc=HConv1D(self.audioenc,d,3,1,is_training=is_training,scope='hc1d-1-%d'%i)
                        self.audioenc=HConv1D(self.audioenc,d,3,3,is_training=is_training,scope='hc1d-2-%d'%i)
                        self.audioenc=HConv1D(self.audioenc,d,3,9,is_training=is_training,scope='hc1d-3-%d'%i)
                        self.audioenc=HConv1D(self.audioenc,d,3,27,is_training=is_training,scope='hc1d-4-%d'%i)
                    for i in range(2):
                        self.audioenc=HConv1D(self.audioenc,d,3,3,is_training=is_training,scope='hc1d-11-%d'%i)
                    self.Q = self.audioenc                  # (B,Ty,d)
                    print("\n Audio Encoder Output = {} \nQ = {}".format(self.audioenc,self.Q))

                self.KT = tf.transpose(self.K,perm=[0,2,1]) # B,d,180
                self.VT = tf.transpose(self.V,perm=[0,2,1]) # B,d,180
                self.QT = tf.transpose(self.Q,perm=[0,2,1]) # B,d,870

                self.A = tf.matmul(self.K,self.QT)    # (B,180,d) * (B,d,870) = (B,180,870)
                self.A *= tf.sqrt(1/tf.to_float(d))
                self.A = tf.nn.softmax(self.A,dim=1) #B,180,870
                
                print("\nKT = {} \nVT = {} \nQT = {} \nA = {}".format(self.KT,self.VT,self.QT,self.A))
                if not is_training:
                    self.A = tf.map_fn(guide_atten,self.A,parallel_iterations=1)
                    print("A = ",self.A)
                    
                self.R = tf.matmul(self.VT,self.A)      # B,d,180 * B,180,870 -> B,d,870
                self.RT = tf.transpose(self.R,perm=[0,2,1]) # B,870,d
                self.Rhat = tf.concat((self.RT,self.Q),2)   # (B,Ty,d),(B,Ty,d) --> (B,Ty,2d)
                print("\nR = {} \nRT = {} \nRhat = {}".format(self.R,self.RT,self.Rhat))
                
                with tf.variable_scope("AudioDec"):
                    self.audiodec = Conv1D(self.Rhat,d,1,1,is_training=is_training,scope='c1d-1')
                    self.audiodec=HConv1D(self.audiodec,d,3,1,is_training=is_training,scope='hc1d-1')
                    self.audiodec=HConv1D(self.audiodec,d,3,3,is_training=is_training,scope='hc1d-2')
                    self.audiodec=HConv1D(self.audiodec,d,3,9,is_training=is_training,scope='hc1d-3')
                    self.audiodec=HConv1D(self.audiodec,d,3,27,is_training=is_training,scope='hc1d-4')
                    for i in range(2):
                        self.audiodec=HConv1D(self.audiodec,d,3,1,is_training=is_training,scope='hc1d-5-%d'%i)
                    for i in range(3):
                        self.audiodec=Conv1D(self.audiodec,d,1,1,dropout=0,is_training=is_training,scope='c1d-2-%d'%i,activation=tf.nn.relu)
                    self.mel_logits = Conv1D(self.audiodec,n_mels,1,1,dropout=0,is_training=is_training,scope='c1d-3') # (B,Tyr,nmels)
                    self.mel_output = tf.nn.sigmoid(self.mel_logits)                            #(B,Tyr,nmels)

            if is_training:  
                # Loss
                self.global_step = tf.Variable(0, name='global_step', trainable=False)
                #self.learning_rate = _learning_rate_decay(self.global_step)

                #self.learning_rate = tf.train.exponential_decay(lr,self.global_step,1500,0.9)
                self.learning_rate = lr//4
                if masking:
                    self.is_target = tf.to_float(tf.not_equal(self.mel,0))
                    #self.mel_l1_loss = tf.reduce_mean(tf.abs(self.mel-self.mel_output))
                    self.mel_l1_loss = tf.reduce_sum(tf.abs(self.mel-self.mel_output)*self.is_target)/tf.reduce_sum(self.is_target)

                    #self.mel_bin_div = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.mel_logits,labels=self.mel))
                    self.mel_bin_div = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.mel_logits,labels=self.mel)
                    self.mel_bin_div = tf.reduce_sum(self.mel_bin_div*self.is_target)/tf.reduce_sum(self.is_target)
                else:
                    self.mel_l1_loss = tf.reduce_mean(tf.abs(self.mel-self.mel_output))
                    self.mel_bin_div = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.mel_logits,labels=self.mel))

                self.A_loss = tf.reduce_mean(self.A_guide*self.A)


                self.loss_mels = self.mel_l1_loss + self.mel_bin_div + 10*self.A_loss
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=b1, beta2=b2, epsilon=eps)
                #self.gvs = self.optimizer.compute_gradients(self.loss_mels) 
                #self.clipped = []
                #for grad, var in self.gvs:
                    #if grad is not None:
                        #grad = tf.clip_by_norm(grad, max_grad_norm)
    
                    #self.clipped.append((grad, var))
                self.train_mel = self.optimizer.minimize(self.loss_mels,global_step=self.global_step)
                tf.summary.scalar('loss_mels', self.loss_mels)
                tf.summary.scalar('loss_mel_l1', self.mel_l1_loss)
                tf.summary.scalar('loss_mel_binary', self.mel_bin_div)
                tf.summary.scalar('loss_Attention', self.A_loss)
                tf.summary.scalar('learning_rate', self.learning_rate)
            self.merged = tf.summary.merge_all()

def show(mel1,mel2,name):
    plt.figure(figsize=(8,4))
    plt.subplot(2,1,1)
    plt.imshow(np.transpose(mel1),interpolation='nearest',  cmap=plt.cm.afmhot, origin='lower')
    plt.title("Generated")
    plt.colorbar()
    plt.subplot(2,1,2)
    plt.imshow(np.transpose(mel2),interpolation='nearest',  cmap=plt.cm.afmhot, origin='lower')
    plt.title("Original")
    plt.colorbar()
    plt.savefig(name)
    plt.cla()
    plt.close('all')


def showmels(mel,msg,file):
    fig, ax = plt.subplots(nrows=1,ncols=1, figsize=(8,4))
    cax = ax.matshow(mel, interpolation='nearest',  cmap=plt.cm.afmhot, origin='lower')
    fig.colorbar(cax)
    plt.title(msg+str(len(msg)))
    plt.savefig(file,format='png')
    plt.cla()
    plt.close('all')


def _learning_rate_decay(global_step):
    # Noam scheme from tensor2tensor:
    step = tf.cast(global_step + 1, dtype=tf.float32)
    return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, step**-0.5)

def tdecode(text):
    char2idx,idx2char=load_vocab()
    return("".join(idx2char[i] for i in text).split('P')[0])


if __name__ == '__main__':  
    g = Graph_Text2Mel(); print("Training Graph loaded")
    sv = tf.train.Supervisor(graph=g.graph, 
                             logdir=logdirmel,)
                             #save_model_secs=0)
    with sv.managed_session() as sess:
        while not sv.should_stop():
            gs,l_m,l_m_l1,l_m_b,l_A,ops = sess.run([g.global_step,
                g.loss_mels,g.mel_l1_loss,g.mel_bin_div,g.A_loss,g.train_mel])
            message = "Step %-7d : loss=%.05f,l1=%.05f,bin=%.05f,A_loss=%.05f" % (gs,l_m,l_m_l1,l_m_b,l_A)
            sys.stdout.write('\r'+message)
            sys.stdout.flush()
            if (gs+1) % logevery == 0:
                gs,l_m,l_m_l1,l_m_b,l_A,t_i,m_i,a,m_o,ops = sess.run([g.global_step,
                    g.loss_mels,g.mel_l1_loss,g.mel_bin_div,g.A_loss,g.text,g.mel,g.A,g.mel_output,g.train_mel])
                message = "Step %-7d : loss=%.05f,l1=%.05f,bin=%.05f,A_loss=%.05f" % (gs,l_m,l_m_l1,l_m_b,l_A)
                sys.stdout.write('\r'+message)
                sys.stdout.flush()
                show(m_o[0],m_i[0],"mel0.png")
                show(m_o[1],m_i[1],"mel1.png")
                showmels(a[0],tdecode(t_i[0]),"a0.png")
                showmels(a[1],tdecode(t_i[1]),"a1.png")

    print("Done")


W0517 15:26:44.037171 12536 deprecation.py:323] From <ipython-input-9-a00be408c0e7>:45: py_func (from tensorflow.python.ops.script_ops) is deprecated and will be removed in a future version.
Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    
W0517 15:26:44.245564 12536 deprecation.py:323] From <ipython-input-9-a00be408c0e7>:50: DatasetV1.make_one_shot_iterator (from tensorflo

(<tf.Tensor 'IteratorGetNext:0' shape=(?, ?) dtype=int32>, <tf.Tensor 'IteratorGetNext:1' shape=(?, ?, ?) dtype=float32>)


W0517 15:26:56.001794 12536 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W0517 15:26:56.085747 12536 deprecation.py:323] From <ipython-input-8-3139ff40e5d6>:113: conv1d (from tensorflow.python.layers.convolutional) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.keras.layers.Conv1D` instead.
W0517 15:26:56.096740 12536 deprecation.py:506] From c:\users\abhishek\appdata\local\programs\python\python37\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for upda

CONV1D inputs = Tensor("Text2Mel/TextEnc/embedding/embedding_lookup/Identity:0", shape=(?, ?, 128), dtype=float32)
conv1d inputs = Tensor("Text2Mel/TextEnc/embedding/embedding_lookup/Identity:0", shape=(?, ?, 128), dtype=float32)


W0517 15:26:57.554459 12536 deprecation.py:323] From <ipython-input-8-3139ff40e5d6>:138: dropout (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dropout instead.


CONV1D inputs = Tensor("Text2Mel/TextEnc/c1d-1/dropout/dropout/mul_1:0", shape=(?, ?, 512), dtype=float32)
conv1d inputs = Tensor("Text2Mel/TextEnc/c1d-1/dropout/dropout/mul_1:0", shape=(?, ?, 512), dtype=float32)
CONV1D inputs = Tensor("Text2Mel/TextEnc/c1d-2/dropout/dropout/mul_1:0", shape=(?, ?, 512), dtype=float32)
conv1d inputs = Tensor("Text2Mel/TextEnc/c1d-2/dropout/dropout/mul_1:0", shape=(?, ?, 512), dtype=float32)
CONV1D inputs = Tensor("Text2Mel/TextEnc/hc1d-1-0/add:0", shape=(?, ?, 512), dtype=float32)
conv1d inputs = Tensor("Text2Mel/TextEnc/hc1d-1-0/add:0", shape=(?, ?, 512), dtype=float32)
CONV1D inputs = Tensor("Text2Mel/TextEnc/hc1d-2-0/add:0", shape=(?, ?, 512), dtype=float32)
conv1d inputs = Tensor("Text2Mel/TextEnc/hc1d-2-0/add:0", shape=(?, ?, 512), dtype=float32)
CONV1D inputs = Tensor("Text2Mel/TextEnc/hc1d-3-0/add:0", shape=(?, ?, 512), dtype=float32)
conv1d inputs = Tensor("Text2Mel/TextEnc/hc1d-3-0/add:0", shape=(?, ?, 512), dtype=float32)
CONV1D inputs = Tens

W0517 15:27:06.031591 12536 deprecation.py:323] From <ipython-input-9-a00be408c0e7>:151: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
W0517 15:27:06.052578 12536 deprecation.py:506] From <ipython-input-9-a00be408c0e7>:152: calling softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.
Instructions for updating:
dim is deprecated, use axis instead



 Audio Encoder Output = Tensor("Text2Mel/AudioEnc/hc1d-11-1/add:0", shape=(?, 217, 256), dtype=float32) 
Q = Tensor("Text2Mel/AudioEnc/hc1d-11-1/add:0", shape=(?, 217, 256), dtype=float32)

KT = Tensor("Text2Mel/transpose:0", shape=(?, 256, ?), dtype=float32) 
VT = Tensor("Text2Mel/transpose_1:0", shape=(?, 256, ?), dtype=float32) 
QT = Tensor("Text2Mel/transpose_2:0", shape=(?, 256, 217), dtype=float32) 
A = Tensor("Text2Mel/transpose_4:0", shape=(?, ?, 217), dtype=float32)

R = Tensor("Text2Mel/MatMul_1:0", shape=(?, 256, 217), dtype=float32) 
RT = Tensor("Text2Mel/transpose_5:0", shape=(?, 217, 256), dtype=float32) 
Rhat = Tensor("Text2Mel/concat_2:0", shape=(?, 217, 512), dtype=float32)
CONV1D inputs = Tensor("Text2Mel/concat_2:0", shape=(?, 217, 512), dtype=float32)
conv1d inputs = Tensor("Text2Mel/AudioDec/c1d-1/c1d-1/Pad:0", shape=(?, 217, 512), dtype=float32)
CONV1D inputs = Tensor("Text2Mel/AudioDec/c1d-1/dropout/dropout/mul_1:0", shape=(?, 217, 256), dtype=float32)
conv1d in

W0517 15:27:10.687839 12536 deprecation.py:323] From c:\users\abhishek\appdata\local\programs\python\python37\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0517 15:27:27.591143 12536 deprecation.py:323] From <ipython-input-9-a00be408c0e7>:254: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession


Training Graph loaded


W0517 15:29:29.121789  9324 meta_graph.py:449] Issue encountered when serializing global_step.
'Tensor' object has no attribute 'to_proto'


Step 1       : loss=1.01799,l1=0.29260,bin=0.69315,A_loss=0.00322

KeyboardInterrupt: 

<h1>SPECTOGRAM SUPER RESOLUTION MODEL</h1>

In [10]:
from __future__ import print_function
import tensorflow as tf
from matplotlib import pyplot as plt

import os
import time
import sys

import numpy as np
import re
#import audio


def get_data():
    def mypyfunc(text):
        text = text.decode("utf-8")
        items = text.split("|")
        dest = items[0]
        mels = np.load(os.path.join(data_dir, "mels", dest + ".npy"))
        mels = mels[::4,:]
        mags = np.load(os.path.join(data_dir, "mags", dest + ".npy"))
        return mels,mags
    def _pad(mel,mag):
        mel = tf.pad(mel, ((0, Tyr), (0, 0)))[:Tyr] # (Tyr, n_mels)
        mag = tf.pad(mag, ((0, Ty), (0, 0)))[:Ty] # (Ty, 1+n_fft/2)
        return mel,mag
    dataset = tf.data.TextLineDataset(tf.convert_to_tensor(metafile))
    dataset = dataset.map(lambda text: tuple(tf.py_func(mypyfunc, [text], [tf.float32, tf.float32])))
    dataset = dataset.map(_pad)
    dataset = dataset.shuffle(buffer_size=400)
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()
    return(next_element)


class Graph_SSR():
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.mel, self.mag=get_data() # (N,Tyr,nmels), (N,Ty,1+n_ffts//2)
                self.mel = tf.reshape(self.mel,shape=[-1,Tyr,n_mels])
            else: # inference
                self.mel = tf.placeholder(tf.float32, shape=(None,None,n_mels))
            with tf.variable_scope("SSRN"):
                self.ssrn = Conv1D(self.mel,c,1,1,causal=False,is_training=is_training,scope='c1d-1')
                self.ssrn = HConv1D(self.ssrn,c,3,1,causal=False,is_training=is_training,scope='hc1d-1')
                self.ssrn = HConv1D(self.ssrn,c,3,3,causal=False,is_training=is_training,scope='hc1d-2')
                for i in range(2):
                    self.ssrn = Deconv1D(self.ssrn,c,2,1,scope='deconv-%d'%i)
                    self.ssrn = HConv1D(self.ssrn,c,3,1,causal=False,is_training=is_training,scope='hc1d-31-%d'%i)
                    self.ssrn = HConv1D(self.ssrn,c,3,3,causal=False,is_training=is_training,scope='hc1d-32-%d'%i)
                self.ssrn = Conv1D(self.ssrn,c*2,1,1,causal=False,is_training=is_training,scope='c1d-2')
                for i in range(2):
                    self.ssrn=HConv1D(self.ssrn,c*2,3,1,causal=False,is_training=is_training,scope='hc1d-4-%d'%i)
                self.ssrn = Conv1D(self.ssrn,fd,1,1,causal=False,is_training=is_training,scope='c1d-3')
                for i in range(2):
                    self.ssrn=Conv1D(self.ssrn,fd,1,1,causal=False,is_training=is_training,activation=tf.nn.relu,scope='c1d-4-%d'%i)
                self.mag_logits = Conv1D(self.ssrn,fd,1,1,causal=False,is_training=is_training,scope='c1d-5')
                self.mag_output = tf.nn.sigmoid(self.mag_logits)
            if is_training:  
                # Loss
                self.global_step = tf.Variable(0, name='global_step', trainable=False)
                #self.learning_rate = _learning_rate_decay(self.global_step)

                #self.learning_rate = tf.train.exponential_decay(lr,self.global_step,3000,0.9)
                self.learning_rate = lr

                self.l1 = tf.abs(self.mag - self.mag_output)
                self.n_priority = int(3000/(sr*0.5) * fd)
                self.mag_l1_loss = 0.5*tf.reduce_mean(self.l1) + 0.5 * tf.reduce_mean(self.l1[:,:,0:self.n_priority]) 
                #self.mag_l1_loss = tf.reduce_mean(tf.abs(self.mag-self.mag_output))
                #self.mag_l1_loss = tf.reduce_sum(tf.abs(self.mag-self.mag_output)*tf.to_float(tf.not_equal(self.mag,0)))/tf.reduce_sum(tf.to_float(tf.not_equal(self.mag,0)))
                self.mag_bin_div = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.mag_logits,labels=self.mag))
                #self.mag_bin_div = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.mag_logits,labels=self.mag)
                #self.mag_bin_div = tf.reduce_sum(self.mag_bin_div*tf.to_float(tf.not_equal(self.mag,0)))/tf.reduce_sum(tf.to_float(tf.not_equal(self.mag,0)))

                self.loss_mags = self.mag_l1_loss + self.mag_bin_div
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=b1, beta2=b2, epsilon=eps)
                #self.gvs = self.optimizer.compute_gradients(self.loss_mels) 
                #self.clipped = []
                #for grad, var in self.gvs:
                    #if grad is not None:
                        #grad = tf.clip_by_norm(grad, max_grad_norm)
                        
                    #self.clipped.append((grad, var))
                #self.train_op = self.optimizer.apply_gradients(self.clipped, global_step=self.global_step)
                self.train_mag = self.optimizer.minimize(self.loss_mags,global_step=self.global_step)
                tf.summary.scalar('loss_mags', self.loss_mags)
                tf.summary.scalar('loss_mag_binary', self.mag_bin_div)
                tf.summary.scalar('loss_mag_l1', self.mag_l1_loss)
                tf.summary.scalar('learning_rate', self.learning_rate)
            else:
                self.wav_output = inv_spectrogram_tensorflow(self.mag_output)
            self.merged = tf.summary.merge_all()

def show(mel1,mel2,name):
    plt.figure(figsize=(8,4))
    plt.subplot(2,1,1)
    plt.imshow(np.transpose(mel1),interpolation='nearest', aspect='auto', cmap=plt.cm.afmhot, origin='lower')
    plt.title("Generated")
    plt.colorbar()
    plt.subplot(2,1,2)
    plt.imshow(np.transpose(mel2),interpolation='nearest', aspect='auto', cmap=plt.cm.afmhot, origin='lower')
    plt.title("Original")
    plt.colorbar()
    plt.savefig(name)
    plt.cla()
    plt.close('all')

      
def showmels(mel,msg,file):
    fig, ax = plt.subplots(nrows=1,ncols=1, figsize=(8,4))
    cax = ax.matshow(mel, interpolation='nearest',  cmap=plt.cm.afmhot, origin='lower')
    fig.colorbar(cax)
    plt.title(msg+str(len(msg)))
    plt.savefig(file,format='png')
    plt.cla()
    plt.close('all')

def _learning_rate_decay(global_step):
    # Noam scheme from tensor2tensor:
    step = tf.cast(global_step + 1, dtype=tf.float32)
    return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, step**-0.5)



if __name__ == '__main__':
    g = Graph_SSR(); print("Training Graph loaded")
    sv = tf.train.Supervisor(graph=g.graph, 
                             logdir=logdirmag,)
                             #save_model_secs=0)
    with sv.managed_session() as sess:
        while not sv.should_stop():
            gs,l_M,l_M_l1,l_M_b,ops = sess.run([g.global_step,
                g.loss_mags,g.mag_l1_loss,g.mag_bin_div,g.train_mag])
            message = "Step %d : l=%.05f (Ml1=%.05f,Mb=%.05f)" % (gs,l_M,l_M_l1,l_M_b)
            sys.stdout.write('\r'+message)
            sys.stdout.flush()
            #print(message)
            if (gs+1) % logevery == 0:
                gs,l_M,l_M_l1,l_M_b,M_o,M_i,ops = sess.run([g.global_step,
                    g.loss_mags,g.mag_l1_loss,g.mag_bin_div,
                    g.mag_output, g.mag,g.train_mag])
                message = "Step %d : l=%.05f (Ml1=%.05f,Mb=%.05f)" % (gs,l_M,l_M_l1,l_M_b)
                sys.stdout.write('\r'+message)
                sys.stdout.flush()
                #audio.save_spec(M_o[0].T,"out0.wav")
                #audio.save_spec(M_o[1].T,"out1.wav")
                show(M_o[0],M_i[0],"mag0.png")
                show(M_o[1],M_i[1],"mag1.png")
            


    print("Done")



CONV1D inputs = Tensor("Reshape:0", shape=(?, 217, 80), dtype=float32)
conv1d inputs = Tensor("Reshape:0", shape=(?, 217, 80), dtype=float32)
CONV1D inputs = Tensor("SSRN/c1d-1/dropout/dropout/mul_1:0", shape=(?, 217, 512), dtype=float32)
conv1d inputs = Tensor("SSRN/c1d-1/dropout/dropout/mul_1:0", shape=(?, 217, 512), dtype=float32)
CONV1D inputs = Tensor("SSRN/hc1d-1/add:0", shape=(?, 217, 512), dtype=float32)
conv1d inputs = Tensor("SSRN/hc1d-1/add:0", shape=(?, 217, 512), dtype=float32)


W0517 15:32:33.747102 12536 deprecation.py:323] From <ipython-input-8-3139ff40e5d6>:121: conv2d_transpose (from tensorflow.python.layers.convolutional) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.keras.layers.Conv2DTranspose` instead.


CONV1D inputs = Tensor("SSRN/deconv-0/Squeeze:0", shape=(?, 434, 512), dtype=float32)
conv1d inputs = Tensor("SSRN/deconv-0/Squeeze:0", shape=(?, 434, 512), dtype=float32)
CONV1D inputs = Tensor("SSRN/hc1d-31-0/add:0", shape=(?, 434, 512), dtype=float32)
conv1d inputs = Tensor("SSRN/hc1d-31-0/add:0", shape=(?, 434, 512), dtype=float32)
CONV1D inputs = Tensor("SSRN/deconv-1/Squeeze:0", shape=(?, 868, 512), dtype=float32)
conv1d inputs = Tensor("SSRN/deconv-1/Squeeze:0", shape=(?, 868, 512), dtype=float32)
CONV1D inputs = Tensor("SSRN/hc1d-31-1/add:0", shape=(?, 868, 512), dtype=float32)
conv1d inputs = Tensor("SSRN/hc1d-31-1/add:0", shape=(?, 868, 512), dtype=float32)
CONV1D inputs = Tensor("SSRN/hc1d-32-1/add:0", shape=(?, 868, 512), dtype=float32)
conv1d inputs = Tensor("SSRN/hc1d-32-1/add:0", shape=(?, 868, 512), dtype=float32)
CONV1D inputs = Tensor("SSRN/c1d-2/dropout/dropout/mul_1:0", shape=(?, 868, 1024), dtype=float32)
conv1d inputs = Tensor("SSRN/c1d-2/dropout/dropout/mul_1:0",

W0517 15:33:39.176406 19284 meta_graph.py:449] Issue encountered when serializing global_step.
'Tensor' object has no attribute 'to_proto'


Step 1 : l=0.94497 (Ml1=0.25182,Mb=0.69315)

KeyboardInterrupt: 

<h1>Synthesization</h1>

In [1]:
from gtts import gTTS 
import os

class Synth:
    def __init__(self):
        
        print("Ready To Perform Inferrence ....\n")
        self.text = input("Enter Text")
        print("Synthesization In Progress ...\n")
        self.lang = "en"
        
    def synth(self):
        speech = gTTS(text = self.text, lang = self.lang, slow = False)
        speech.save("output.mp3")
        print("Synthesization Completed!")


if __name__ == '__main__':

    s=Synth()
    s.synth()


Ready To Perform Inferrence ....

Enter TextHi, my name is John. what are you doing?
Synthesization In Progress ...

Synthesization Completed!
