In [1]:
from fastai.data.all import *
from modules import *
from data import TTSDataset, collate_fn

In [2]:
# --- Data locations ---------------------------------------------------------
path_ds = Path("../data/LJSpeech-1.1/")
path_vocab = Path("../data/CMUDict/cmudict-0.7b.symbols.txt")

# --- Dataset / loader settings (handed to TTSDataset and the DataLoader) -----
sample_rate = 22050
n_fft = 1024
hop_length = 256
n_bins = 80
ratio = 0.01
preload = False
bs = 4

# --- Encoder (FastSpeech) hyper-parameters -----------------------------------
n_hidden = 384
n_heads = 2
kernal_sz = 3
n_filters = 1536
n_blocks = 6

# --- Variance-predictor hyper-parameters (the `_v` suffix) -------------------
kernal_sz_v = 3
n_filters_v = 256
p_dropout = 0.5
upsample_ratio = 1

# Not referenced by any cell visible in this notebook section.
n_iter = 180

# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# Build the dataset from the config values, then wrap it in a shuffling loader.
ds = TTSDataset(
    path_ds, path_vocab, sample_rate, n_fft, hop_length, n_bins, ratio, preload
)
# The collate function is given the dataset's own padding value via `partial`.
dl = torch.utils.data.DataLoader(
    ds,
    batch_size=bs,
    shuffle=True,
    collate_fn=partial(collate_fn, pad_num=ds.pad_num),
)

In [4]:
# Grab one batch and show the shape of each of its three tensors.
mel, phones, durations = first(dl)
mel.shape, phones.shape, durations.shape

(torch.Size([4, 80, 508]), torch.Size([4, 63]), torch.Size([4, 63]))

In [5]:
class FastSpeech(nn.Module):
    """Phoneme encoder: token embedding + positional embeddings + stacked FFT blocks.

    Only the encoder path is implemented here. The constructor arguments
    ``nout``, ``kernal_sz_v``, ``nfilters_v`` and ``dropout`` are accepted but
    not used yet, and ``forward`` currently ignores ``durations``,
    ``upsample_ratio`` and ``dur_train``.

    Args:
        vocab_sz: Number of entries in the embedding table.
        nhidden: Embedding / hidden feature size.
        nout: Unused for now.
        nheads: Forwarded to each ``FeedForwardTransformer`` block.
        kernal_sz: Forwarded to each ``FeedForwardTransformer`` block.
        nfilters: Forwarded to each ``FeedForwardTransformer`` block.
        nblocks: How many ``FeedForwardTransformer`` blocks to stack.
        kernal_sz_v: Unused for now.
        nfilters_v: Unused for now.
        dropout: Unused for now.
        device: Device handed to ``positional_embeddings``. When ``None`` the
            device of the embedded input is used instead.
    """

    def __init__(self, vocab_sz, nhidden, nout, nheads, kernal_sz, nfilters, nblocks, 
                 kernal_sz_v, nfilters_v, dropout, device=None):
        super().__init__()
        self.device = device
        self.embedding = nn.Embedding(vocab_sz, nhidden)
        self.fft_pho = nn.ModuleList([FeedForwardTransformer(nhidden, nheads, kernal_sz, nfilters)
                                      for _ in range(nblocks)])
    
    def forward(self, inp, durations, upsample_ratio, dur_train=False):
        """Encode ``inp`` and return the hidden features.

        Args:
            inp: Integer token ids; indexed through ``self.embedding``.
            durations: Currently ignored.
            upsample_ratio: Currently ignored.
            dur_train: Currently ignored.

        Returns:
            Tensor of encoded features with ``nhidden`` as its last dimension.
        """
        x = self.embedding(inp)
        # Follow the input's device unless one was pinned at construction, so the
        # addition below does not mix devices after the module has been moved.
        # NOTE(review): assumes `positional_embeddings` treats an explicit
        # torch.device the same way as its default -- confirm in `modules`.
        pos_device = self.device if self.device is not None else x.device
        x = x + positional_embeddings(*x.shape[-2:], device=pos_device)
        for layer in self.fft_pho:
            x = layer(x)
        return x

In [6]:
# Instantiate the encoder from the config values; no device is pinned.
model = FastSpeech(
    vocab_sz=len(ds.vocab),
    nhidden=n_hidden,
    nout=n_bins,
    nheads=n_heads,
    kernal_sz=kernal_sz,
    nfilters=n_filters,
    nblocks=n_blocks,
    kernal_sz_v=kernal_sz_v,
    nfilters_v=n_filters_v,
    dropout=p_dropout,
    device=None,
)

In [7]:
# Encode the sampled phoneme batch and look at the resulting feature shape.
hi = model(phones, durations, upsample_ratio)
hi.shape

torch.Size([4, 63, 384])

In [8]:
# Stand-alone copies of the predictor layers, created individually so each
# step's output shape can be inspected in the following cells.
# The padding must come from the kernel size these convolutions actually use
# (`kernal_sz_v`, not the encoder's `kernal_sz`); for an odd kernel this keeps
# the sequence length unchanged.
padding = (kernal_sz_v - 1) // 2
conv1 = nn.Conv1d(n_hidden, n_filters_v, kernal_sz_v, padding=padding)
norm1 = nn.LayerNorm(n_filters_v)
conv2 = nn.Conv1d(n_filters_v, n_hidden, kernal_sz_v, padding=padding)
norm2 = nn.LayerNorm(n_hidden)
dropout = nn.Dropout(p_dropout)
linear = nn.Linear(n_hidden, 1)

In [19]:
# Conv1d convolves over the last axis, so swap the last two axes of `hi` first.
x = conv1(hi.transpose(1, 2))
x.shape

torch.Size([4, 256, 63])

In [20]:
# Dropout leaves the shape untouched; note that re-running this cell drops again.
x = dropout(x)
x.shape

torch.Size([4, 256, 63])

In [21]:
# LayerNorm normalises the last axis, so bring the channel axis back to the end.
x = norm1(x.transpose(1, 2))
x.shape

torch.Size([4, 63, 256])

In [22]:
# Second convolution: swap the last two axes again before applying it.
x = conv2(x.transpose(1, 2))
x.shape

torch.Size([4, 384, 63])

In [23]:
# Same dropout module as before, applied to the second convolution's output.
x = dropout(x)
x.shape

torch.Size([4, 384, 63])

In [24]:
# Move the channel axis last once more so `norm2` normalises over it.
x = norm2(x.transpose(1, 2))
x.shape

torch.Size([4, 63, 384])

In [25]:
# Project to one value per position, clamp negatives to zero, drop the trailing
# singleton axis and cast to integers.
x = F.relu(linear(x)).squeeze(-1).long()
x.shape

torch.Size([4, 63])

In [27]:
# Show the batch's `durations` tensor, which has the same shape as the
# prediction computed in the previous cell.
durations

tensor([[  2,   8,   6,  10,   3,  12,  34,  24,  11,   1,   3,  12,  15,   6,
           9,   8,   4,   9,   6,   8,   6,   3,   7,   3,   6,   4,   4,   9,
           4,   2,   6,   9,   4,   3,   9,  12,   6,   6,   2,  12,  10,   4,
           5,   1,   2,   7,   2,   4,  11,   6,   4,   6,   9,   6,  12,  10,
          14,   0,   0,   0,   0,   0,  87],
        [  5,   5,  13,  18,  10,  11,  36,   3,   7,   8,   5,   8,   4,   6,
           4,   8,   9,   6,   5,   5,   5,   8,   7,   7,   5,  14,   7,  12,
           7,   8,  15,   7,   2,   6,   6,   2,   4,  11,   2,   3,  10,   3,
          11,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0, 170],
        [  2,   2,   3,   0,   6,   4,   4,   7,   5,  11,   6,  12,  12,   4,
           6,  15,  18,  21,   5,   3,  17,   7,   3,   2,   6,   5,   6,   6,
           3,   4,   5,   2,   9,   6,   8,   9,  11,   8,   1,   3,  12,   3,
          18,  10,  33,   4,   3,   5,   

In [None]:
import torch.nn as nn

class VariencePredictor(nn.Module):
    """Convolutional regressor that produces one scalar per sequence position.

    Two rounds of Conv1d -> ReLU -> Dropout -> LayerNorm are followed by a
    linear projection down to a single value.

    Args:
        ni: Feature size of the input (its last dimension).
        ks: Convolution kernel size. The padding is ``(ks - 1) // 2``, which
            keeps the sequence length unchanged when ``ks`` is odd.
        nf: Number of channels produced by the first convolution.
        p: Dropout probability used after each convolution.
    """

    def __init__(self, ni, ks, nf, p):
        super().__init__()
        padding = (ks - 1) // 2
        self.conv1, self.norm1 = nn.Conv1d(ni, nf, ks, padding=padding), nn.LayerNorm(nf)
        self.conv2, self.norm2 = nn.Conv1d(nf, ni, ks, padding=padding), nn.LayerNorm(ni)
        self.dropout = nn.Dropout(p)
        self.linear = nn.Linear(ni, 1)

    def forward(self, hi):
        """Map features of shape ``(batch, seq, ni)`` to values of shape ``(batch, seq)``.

        The result is a floating-point tensor so it stays differentiable;
        callers that need integer values can round or cast it themselves.
        """
        # Conv1d convolves over the last axis while LayerNorm normalises it,
        # hence the transpose before each convolution and before each norm.
        x = F.relu(self.conv1(hi.transpose(1, 2)))
        x = self.norm1(self.dropout(x).transpose(1, 2))
        x = F.relu(self.conv2(x.transpose(1, 2)))
        x = self.norm2(self.dropout(x).transpose(1, 2))
        # Drop only the trailing singleton axis left by the 1-unit projection;
        # squeezing dim 1 would target the sequence axis instead.
        return self.linear(x).squeeze(-1)

In [None]:
# Build the predictor from the `_v` hyper-parameters defined in the config cell.
predictor = VariencePredictor(
    ni=n_hidden,
    ks=kernal_sz_v,
    nf=n_filters_v,
    p=p_dropout,
)

In [None]:
# Feed the encoder features `hi` through the predictor and inspect the output shape.
predictor(hi).shape