# Transformer TTS: A Text-to-Speech Transformer in TensorFlow 2

In [1]:
# Clone the repo, the pretrained model and WaveRNN for the vocoder
!git clone https://github.com/as-ideas/TransformerTTS.git
!git clone https://github.com/as-ideas/tts_model_outputs.git
!git clone https://github.com/fatchord/WaveRNN

Cloning into 'TransformerTTS'...
remote: Enumerating objects: 110, done.[K
remote: Counting objects: 100% (110/110), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 2334 (delta 55), reused 48 (delta 17), pack-reused 2224[K
Receiving objects: 100% (2334/2334), 1.60 MiB | 1.82 MiB/s, done.
Resolving deltas: 100% (1573/1573), done.
Cloning into 'tts_model_outputs'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 65 (delta 9), reused 0 (delta 0), pack-reused 43[K
Unpacking objects: 100% (65/65), done.
Cloning into 'WaveRNN'...
remote: Enumerating objects: 928, done.[K
remote: Total 928 (delta 0), reused 0 (delta 0), pack-reused 928
Receiving objects: 100% (928/928), 241.65 MiB | 13.75 MiB/s, done.
Resolving deltas: 100% (540/540), done.


In [2]:
# Install requirements
!apt-get install -y espeak
!pip install -r TransformerTTS/requirements.txt

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  espeak-data libespeak1 libportaudio2 libsonic0
The following NEW packages will be installed:
  espeak espeak-data libespeak1 libportaudio2 libsonic0
0 upgraded, 5 newly installed, 0 to remove and 31 not upgraded.
Need to get 1,219 kB of archives.
After this operation, 3,031 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libportaudio2 amd64 19.6.0-1 [64.6 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 libsonic0 amd64 0.2.0-6 [13.4 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 espeak-data amd64 1.48.04+dfsg-5 [934 kB]
Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libespeak1 amd64 1.48.04+dfsg-5 [145 kB]
Get:5 http://archive.ubuntu.com/ubuntu bionic/universe amd64 espeak amd64 1.48.04+dfsg-5 [61.6 kB]
Fetched 1,219 kB in 3s (457 kB/s)
Sel

In [3]:
# Load the pretrained TTS model
import sys

import IPython.display as ipd

config_path = 'tts_model_outputs/ljspeech_forward_transformer/standard'
project_path = 'TransformerTTS'

# Make the cloned repo importable. Guarded so that re-running this cell does
# not stack duplicate entries on sys.path (a later cell removes only one).
if project_path not in sys.path:
    sys.path.append(project_path)
from utils.config_manager import ConfigManager
from utils.audio import reconstruct_waveform

config_loader = ConfigManager(config_path, model_kind='forward')
# Build the checkpoint path from config_path so the config and the weights
# always come from the same model directory.
model = config_loader.load_model(config_path + '/model_weights/ckpt-133')

restored weights from tts_model_outputs/ljspeech_transformertts/standard/model_weights/ckpt-90 at step 900000


In [4]:
# Run the TTS model on an example sentence; later cells read out['mel']
sentence = (
    'Scientists at the CERN laboratory, '
    'say they have discovered a new particle.'
)
out = model.predict(sentence)

pred text mel: 397 stop out: -1.9915766716003418Stopping


In [5]:
# Griffin-Lim: turn the predicted mel spectrogram into a waveform and play it
mel_spec = out['mel'].numpy().T  # transposed before reconstruction
wav = reconstruct_waveform(mel_spec, config=config_loader.config)
sample_rate = config_loader.config['sampling_rate']
ipd.display(ipd.Audio(wav, rate=sample_rate))

### WaveRNN

In [0]:
# Save the predicted mel spectrogram to disk so the WaveRNN cells can load it
import numpy as np
from pathlib import Path

WaveRNN_path = Path('WaveRNN/')

# The transposed mel is shifted by 4 and divided by 8 before saving.
# NOTE(review): presumably this rescales the TTS mel range to what the WaveRNN
# checkpoint expects — confirm against both repos' audio settings.
mel_for_vocoder = (out['mel'].numpy().T + 4.) / 8.
np.save(WaveRNN_path / 'scientists.npy', mel_for_vocoder)

In [0]:
# Switch the importable project from TransformerTTS to WaveRNN.
# Both repos are imported through a top-level `utils` package, so the
# TransformerTTS one has to be evicted from the import system first.
import sys

# Guarded removal: list.remove raises ValueError once the entry is gone, which
# would break a re-run of this cell. The loop also clears duplicate entries.
while 'TransformerTTS' in sys.path:
    sys.path.remove('TransformerTTS')

# Drop the cached package AND its cached submodules; popping only 'utils'
# would leave entries such as 'utils.audio' behind in sys.modules.
for module_name in list(sys.modules):
    if module_name == 'utils' or module_name.startswith('utils.'):
        del sys.modules[module_name]

if 'WaveRNN/' not in sys.path:
    sys.path.append('WaveRNN/')
from utils.dsp import hp
from models.fatchord_version import WaveRNN
import torch
import numpy as np
from pathlib import Path

In [8]:
# Unzip the pretrained model
!unzip WaveRNN/pretrained/ljspeech.wavernn.mol.800k.zip -d WaveRNN/pretrained/

Archive:  WaveRNN/pretrained/ljspeech.wavernn.mol.800k.zip
  inflating: WaveRNN/pretrained/latest_weights.pyt  


In [9]:
# Build the WaveRNN vocoder from its hyperparameters and load the pretrained weights
hp.configure(WaveRNN_path / 'hparams.py')  # populate `hp` from the hparams file
device = torch.device('cpu')

# Constructor arguments, every one read from the loaded hyperparameters
vocoder_kwargs = dict(
    rnn_dims=hp.voc_rnn_dims,
    fc_dims=hp.voc_fc_dims,
    bits=hp.bits,
    pad=hp.voc_pad,
    upsample_factors=hp.voc_upsample_factors,
    feat_dims=hp.num_mels,
    compute_dims=hp.voc_compute_dims,
    res_out_dims=hp.voc_res_out_dims,
    res_blocks=hp.voc_res_blocks,
    hop_length=hp.hop_length,
    sample_rate=hp.sample_rate,
    mode=hp.voc_mode,
)
# NOTE: this rebinds `model`, which up to here referred to the TTS model.
model = WaveRNN(**vocoder_kwargs).to(device)

weights_file = WaveRNN_path / 'pretrained/latest_weights.pyt'
model.load(str(weights_file))

Trainable Parameters: 4.234M


In [0]:
# Raise the TensorFlow logger threshold to ERROR to hide warning messages
import tensorflow as tf

tf_logger = tf.get_logger()
tf_logger.setLevel('ERROR')

In [11]:
# Vocode the exported mel spectrogram with the pretrained WaveRNN
mel = np.load(WaveRNN_path / 'scientists.npy')
mel_batch = mel[np.newaxis, :, :]  # prepend a leading axis of size 1
# NOTE(review): the positional False / 1 are presumably the `batched` and
# `target` arguments of WaveRNN.generate — confirm against its signature.
# The result is assigned to `_` so the cell does not echo it.
_ = model.generate(mel_batch, 'scientists.wav', False, 1, hp.voc_overlap, hp.mu_law)

| ████████████████ 109400/109450 | Batch Size: 1 | Gen Rate: 0.7kHz | 

In [12]:
# Show an audio player for scientists.wav
generated_audio = ipd.Audio('scientists.wav')
ipd.display(generated_audio)