<a href="https://colab.research.google.com/github/bilalabbasi55/TTS/blob/main/Final_TTS_Parallelizable_espnet2_tts_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tts_realtime_demo.ipynb)

## Installation

In [3]:
# NOTE: pip shows imcompatible errors due to preinstalled libraries but you do not need to care
!pip install -q espnet==0.9.5 parallel_wavegan==0.4.8

## Single speaker model demo

In [4]:
###################################
#          ENGLISH MODELS         #
###################################
fs, lang = 22050, "English"
# tag = "kan-bayashi/ljspeech_tacotron2"
# tag = "kan-bayashi/ljspeech_fastspeech"
# tag = "kan-bayashi/ljspeech_fastspeech2"
tag = "kan-bayashi/ljspeech_conformer_fastspeech2"
vocoder_tag = "ljspeech_parallel_wavegan.v1"
# vocoder_tag = "ljspeech_full_band_melgan.v2"
# vocoder_tag = "ljspeech_multi_band_melgan.v2"

###################################
#         JAPANESE MODELS         #
###################################
# fs, lang = 24000, "Japanese"
# tag = "kan-bayashi/jsut_tacotron2"
# tag = "kan-bayashi/jsut_transformer"
# tag = "kan-bayashi/jsut_fastspeech"
# tag = "kan-bayashi/jsut_fastspeech2"
# tag = "kan-bayashi/jsut_conformer_fastspeech2"
# vocoder_tag = "jsut_parallel_wavegan.v1"
# vocoder_tag = "jsut_multi_band_melgan.v2"

###################################
#         MANDARIN MODELS         #
###################################
# fs, lang = 24000, "Mandarin"
# tag = "kan-bayashi/csmsc_tacotron2"
# tag = "kan-bayashi/csmsc_transformer"
# tag = "kan-bayashi/csmsc_fastspeech"
# tag = "kan-bayashi/csmsc_fastspeech2"
# tag = "kan-bayashi/csmsc_conformer_fastspeech2"
# vocoder_tag = "csmsc_parallel_wavegan.v1"
# vocoder_tag = "csmsc_multi_band_melgan.v2"

### Model Setup

In [5]:
import time
import torch
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model
d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(tag),
    device="cuda",
    # Only for Tacotron 2
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim
# NOTE: Sometimes download is failed due to "Permission denied". That is 
#   the limitation of google drive. Please retry after serveral hours.
vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cuda").eval()
vocoder.remove_weight_norm()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.
https://zenodo.org/record/4036268/files/tts_train_conformer_fastspeech2_raw_phn_tacotron_g2p_en_no_space_train.loss.ave.zip?download=1: 100%|██████████| 269M/269M [00:15<00:00, 18.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1PdZv37JhAQH6AwNh31QlqruqrvjTBq7U
To: /root/.cache/parallel_wavegan/ljspeech_parallel_wavegan.v1.tar.gz
15.9MB [00:00, 125MB/s] 


### Synthesis

In [6]:
# decide the input sentence by yourself
print(f"Input your favorite sentence in {lang}.")
x = input()

# synthesis
with torch.no_grad():
    start = time.time()
    wav, c, *_ = text2speech(x)
    wav = vocoder.inference(c)
rtf = (time.time() - start)
print(f"RTF = {rtf:5f}")

# let us listen to generated samples
from IPython.display import display, Audio
display(Audio(wav.view(-1).cpu().numpy(), rate=fs))

Input your favorite sentence in English.
Hi Please Read a Book as that will really help you, please call me back at 6 5 0 5 9 3 5 4 0 0 to get connected
RTF = 1.055778


## Multi-speaker Model Demo

### Model Selection

Please select models by comment out.

Now we provide only English multi-speaker pretrained model.

In [7]:
###################################
#          ENGLISH MODELS         #
###################################
fs, lang = 24000, "English"
tag = "kan-bayashi/vctk_gst_tacotron2"
# tag = "kan-bayashi/vctk_gst_transformer"
vocoder_tag = "vctk_parallel_wavegan.v1"
# vocoder_tag = "vctk_multi_band_melgan.v2"

### Model Setup

In [8]:
import time
import torch
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model
d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(tag),
    device="cuda",
    # Only for Tacotron 2
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2
    speed_control_alpha=0.8,
)
text2speech.spc2wav = None  # Disable griffin-lim
# NOTE: Sometimes download is failed due to "Permission denied". That is 
#   the limitation of google drive. Please retry after serveral hours.
vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cuda").eval()
vocoder.remove_weight_norm()

https://zenodo.org/record/3986237/files/tts_train_gst_tacotron2_raw_phn_tacotron_g2p_en_no_space_train.loss.best.zip?download=1: 100%|██████████| 105M/105M [00:04<00:00, 24.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1bqEFLgAroDcgUy5ZFP4g2O2MwcwWLEca
To: /root/.cache/parallel_wavegan/vctk_parallel_wavegan.v1.tar.gz
15.5MB [00:00, 68.5MB/s]


### Synthesis

For multi-speaker model, we need to provide the reference speech to decide the speaker characteristics.  
You can use any speech but please make sure the sampling rate is matched.

In [9]:
# decide the input sentence by yourself
print(f"Input your favorite sentence in {lang}.")
x = input()

# you can change here to load your own reference speech
# e.g.
import soundfile as sf
# speech, fs = sf.read("/content/1993-147964-0006(1).wav")
# speech = torch.from_numpy(speech).float()
speech = torch.randn(50000,)

# synthesis
with torch.no_grad():
    start = time.time()
    wav, c, *_ = text2speech(x, speech=speech)
    wav = vocoder.inference(c)
rtf = (time.time() - start) / (len(wav) / fs)
print(f"RTF = {rtf:5f}")

# let us listen to generated samples
from IPython.display import display, Audio
display(Audio(wav.view(-1).cpu().numpy(), rate=fs))

Input your favorite sentence in English.
Hi Please read a book as that will really help you to get more understanding please call me back at 6 5 0 5 9 3 5 4 0 0 to get connected


  normalized, onesided, return_complex)
  normalized, onesided, return_complex)


RTF = 0.201426


In [None]:
!pip list -v

Package                       Version         Location                               Installer
----------------------------- --------------- -------------------------------------- ---------
absl-py                       0.10.0          /usr/local/lib/python3.6/dist-packages pip      
alabaster                     0.7.12          /usr/local/lib/python3.6/dist-packages pip      
albumentations                0.1.12          /usr/local/lib/python3.6/dist-packages pip      
altair                        4.1.0           /usr/local/lib/python3.6/dist-packages pip      
appdirs                       1.4.4           /usr/local/lib/python3.6/dist-packages pip      
argcomplete                   1.12.2          /usr/local/lib/python3.6/dist-packages pip      
argon2-cffi                   20.1.0          /usr/local/lib/python3.6/dist-packages pip      
asgiref                       3.3.1           /usr/local/lib/python3.6/dist-packages pip      
astor                         0.8.1           /usr

In [None]:
%cd ~/.cache/parallel_wavegan

/root/.cache/parallel_wavegan


In [None]:
!ls

ljspeech_parallel_wavegan.v1  ljspeech_parallel_wavegan.v1.tar.gz


In [None]:
%cd ljspeech_parallel_wavegan.v1

/root/.cache/parallel_wavegan/ljspeech_parallel_wavegan.v1


In [None]:
!ls

checkpoint-400000steps.pkl  config.yml	stats.h5
