# Natural TTS Voice Generator by WaveNet

In [1]:
import librosa.display
import IPython
from IPython.display import Audio
import numpy as np
import pandas as pd
import torch
import os
from os.path import exists, join, expanduser

In [2]:
# Display the kernel's current working directory; later cells change it with
# os.chdir and open files through relative paths.
os.getcwd()

'/home/bongsang/projects/wavenet_generator/demo'

In [3]:
# Read the speaker metadata table and report how many speakers it lists.
df = pd.read_csv("speaker-info.csv")
print("Number of Speakers = {}".format(len(df)))

# Discard the sixth column (positional index 5), then count the non-null
# entries of every remaining column for each accent.
new_df = df.drop(columns=df.columns[5])
new_df.groupby("ACCENTS").count()

Number of Speakers = 108


Unnamed: 0_level_0,ID,AGE,GENDER,REGION
ACCENTS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
American,22,22,22,22
Australian,2,2,2,2
Canadian,8,8,8,8
English,33,33,33,33
Indian,3,3,3,3
Irish,9,9,9,9
NewZealand,1,1,1,1
NorthernIrish,6,6,6,6
Scottish,19,19,19,19
SouthAfrican,4,4,4,4


In [4]:
# Preview only the first rows instead of dumping the whole speaker table
# (the table has one row per speaker).
new_df.head(10)

Unnamed: 0,ID,AGE,GENDER,ACCENTS,REGION
0,225,23,F,English,Southern England
1,226,22,M,English,Surrey
2,227,38,M,English,Cumbria
3,228,22,F,English,Southern England
4,229,23,F,English,Southern England
5,230,22,F,English,Stockton-on-tees
6,231,23,F,English,Southern England
7,232,23,M,English,Southern England
8,233,23,F,English,Staffordshire
9,234,22,F,Scottish,West Dumfries


In [5]:
wavenet_dir = "wavenet_vocoder"
taco2_dir = "tacotron-2"

# WaveNet vocoder preset and weights saved at training step 320,000
# (the number in the file names is a step count, not an epoch count).
wn_preset = "checkpoint_step_320000.json"
wn_checkpoint_path = "checkpoint_step_320000.pth"

# The preset and checkpoint are opened through relative paths, and the
# hparams / train / synthesis modules below are presumably resolved from the
# working directory as well, so enter the vocoder directory first.
# NOTE(review): the chdir is relative to the current directory, so running this
# cell a second time without restarting the kernel will fail.
os.chdir(join(os.getcwd(), wavenet_dir))

# Setup WaveNet vocoder hparams
from hparams import hparams
with open(wn_preset) as f:
    hparams.parse_json(f.read())

# Setup WaveNet vocoder
from train import build_model
from synthesis import wavegen

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = build_model().to(device)  # runs on the GPU when one is available, else on the CPU

# map_location remaps tensors that were saved from a GPU onto `device`, so the
# checkpoint also loads on a CPU-only machine.
checkpoint = torch.load(wn_checkpoint_path, map_location=device)
model.load_state_dict(checkpoint["state_dict"])

# Report success only after the weights have actually been loaded.
print("The traiend checkpoints are loaded well from {}".format(wn_checkpoint_path))

Using TensorFlow backend.


The traiend checkpoints are loaded well from checkpoint_step_320000.pth


## Input texts to be synthesized

In [6]:
# Confirm the working directory: the next cell writes text_list.txt relative to it.
os.getcwd()

'/home/bongsang/projects/wavenet_generator/demo/wavenet_vocoder'

In [7]:
%%bash
# Write the sentences to synthesize, one per line. The heredoc delimiter is
# quoted so the shell copies the text verbatim instead of expanding
# $variables or `backticks` that a sentence might contain.
cat << 'EOS' > text_list.txt
Nice to meet you, Amazon!
Nice to meet you, Omar!
My name is Bongsang Kim.
These are generated by Wave Net!
I hope see you next time.
EOS

# Echo the file back to verify what was written.
cat text_list.txt

Nice to meet you, Amazon!
Nice to meet you, Omar!
My name is Bongsang Kim.
These are generated by Wave Net!
I hope see you next time.


In [8]:
# Switch to the sibling tacotron-2 directory; the synthesis command that follows
# uses paths relative to it.
os.chdir(join(os.getcwd(), "../tacotron-2"))

## Mel-spectrogram prediction by Tacotron2

In [9]:
# Confirm the working directory before invoking synthesize.py by relative path.
os.getcwd()

'/home/bongsang/projects/wavenet_generator/demo/tacotron-2'

In [10]:
# Start from a clean slate: remove the output directory of any previous run.
! rm -rf tacotron_output
# Run synthesize.py with the Tacotron model in eval mode over the sentences in
# text_list.txt; the hparams string overrides the listed hyperparameters.
# NOTE(review): max_abs_value=4.0 presumably has to agree with the (0, 4) range
# assumed when the mels are rescaled for the vocoder later on - confirm.
! python synthesize.py --model='Tacotron' --mode='eval' \
  --hparams='symmetric_mels=False,max_abs_value=4.0,power=1.1,outputs_per_step=1' \
  --text_list="../wavenet_vocoder/text_list.txt"

W0924 19:40:06.607476 140696892950336 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

loaded model at logs-Tacotron/pretrained/model.ckpt-189500
Hyperparameters:
  allow_clipping_in_normalization: True
  attention_dim: 128
  attention_filters: 32
  attention_kernel: (31,)
  cleaners: english_cleaners
  cumulative_weights: True
  decoder_layers: 2
  decoder_lstm_units: 1024
  embedding_dim: 512
  enc_conv_channels: 512
  enc_conv_kernel_size: (5,)
  enc_conv_num_layers: 3
  encoder_lstm_units: 256
  fft_size: 1024
  fmax: 7600
  fmin: 125
  frame_shift_ms: None
  griffin_lim_iters: 60
  hop_size: 256
  impute_finished: False
  input_type: raw
  log_scale_mi

100%|█████████████████████████████████████████████| 5/5 [00:03<00:00,  1.20it/s]
synthesized mel spectrograms at tacotron_output/eval


In [11]:
# Display the working directory after the synthesis step.
os.getcwd()

'/home/bongsang/projects/wavenet_generator/demo/tacotron-2'

In [12]:
# Return to the WaveNet vocoder directory: the preset and checkpoint below are
# opened through relative paths.
os.chdir(join(os.getcwd(), "../wavenet_vocoder"))

# NOTE(review): everything below repeats the vocoder setup performed earlier in
# this notebook; it rebuilds the model and reloads the same checkpoint.

# Setup WaveNet vocoder hparams
from hparams import hparams
with open(wn_preset) as f:
    hparams.parse_json(f.read())

# Setup WaveNet vocoder
from train import build_model
from synthesis import wavegen

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = build_model().to(device)

print("Load checkpoint from {}".format(wn_checkpoint_path))
# map_location remaps tensors that were saved from a GPU onto `device`, so the
# checkpoint also loads on a CPU-only machine.
checkpoint = torch.load(wn_checkpoint_path, map_location=device)
model.load_state_dict(checkpoint["state_dict"])

Load checkpoint from checkpoint_step_320000.pth


In [13]:
# Confirm the working directory: the cells below read the Tacotron-2 output
# through paths relative to it.
os.getcwd()

'/home/bongsang/projects/wavenet_generator/demo/wavenet_vocoder'

In [14]:
from glob import glob
from tqdm import tqdm

# map.txt sits in the Tacotron-2 eval output directory; each useful line has
# the form "<text>|<path to the mel-spectrogram .npy file>".
with open("../tacotron-2/tacotron_output/eval/map.txt") as f:
    maps = f.readlines()

# Strip only the line terminator. Slicing with [:-1] would chop the last
# character of the mel path when the final line has no trailing newline.
maps = [line.rstrip("\n").split("|") for line in maps]
print(maps)
# filter out invalid ones (anything that is not exactly "text|path")
maps = [entry for entry in maps if len(entry) == 2]

print("List of texts to be synthesized")
for idx, (text, _) in enumerate(maps):
    print(idx, text)

[['Nice to meet you, Amazon!', 'tacotron_output/eval/speech-mel-00001.npy'], ['Nice to meet you, Omar!', 'tacotron_output/eval/speech-mel-00002.npy'], ['My name is Bongsang Kim.', 'tacotron_output/eval/speech-mel-00003.npy'], ['These are generated by Wave Net!', 'tacotron_output/eval/speech-mel-00004.npy'], ['I hope see you next time.', 'tacotron_output/eval/speech-mel-00005.npy']]
List of texts to be synthesized
0 Nice to meet you, Amazon!
1 Nice to meet you, Omar!
2 My name is Bongsang Kim.
3 These are generated by Wave Net!
4 I hope see you next time.


### Waveform generation

In [15]:
# Generated audio, one entry per sentence, in the same order as `maps`.
waveforms = []

for idx, (text, mel) in enumerate(maps):
    print("\n", idx, text)
    # The mel paths in `maps` are relative to the tacotron-2 directory.
    mel_path = join("../tacotron-2", mel)
    c = np.load(mel_path)
    if c.shape[1] != hparams.num_mels:
        # np.swapaxes returns the transposed array rather than modifying `c` in
        # place, so the result has to be assigned back for the swap to apply.
        c = np.swapaxes(c, 0, 1)
    # Linearly rescale the values from [0, 4] to [0, 1]; np.interp clamps
    # anything outside the source range to the end points.
    # NOTE(review): 4 presumably mirrors max_abs_value=4.0 passed to Tacotron-2 - confirm.
    c = np.interp(c, (0, 4), (0, 1))

    # Generate the waveform conditioned on the mel-spectrogram
    waveform = wavegen(model, c=c, fast=True, tqdm=tqdm)
    waveforms.append(waveform)

    # Embed an audio player for the generated waveform
    IPython.display.display(Audio(waveform, rate=hparams.sample_rate))

  0%|          | 0/32256 [00:00<?, ?it/s]


 0 Nice to meet you, Amazon!


100%|██████████| 32256/32256 [03:48<00:00, 141.08it/s]


  0%|          | 1/29952 [00:00<1:34:47,  5.27it/s]


 1 Nice to meet you, Omar!


100%|██████████| 29952/29952 [03:34<00:00, 139.35it/s]


  0%|          | 0/31488 [00:00<?, ?it/s]


 2 My name is Bongsang Kim.


100%|██████████| 31488/31488 [03:47<00:00, 138.42it/s]


  0%|          | 0/39936 [00:00<?, ?it/s]


 3 These are generated by Wave Net!


100%|██████████| 39936/39936 [04:46<00:00, 139.43it/s]


  0%|          | 0/31488 [00:00<?, ?it/s]


 4 I hope see you next time.


100%|██████████| 31488/31488 [03:44<00:00, 140.21it/s]


# Thank you~~
- https://www.linkedin.com/in/bongsang/
- https://github.com/bongsang