In [1]:
from utils.utils import TacotronPreprocessor, TTSDataset, collate_fn, reconstruct_audio
import pandas as pd
import numpy as np
import re
import torch
import torchaudio
import torchaudio.functional as F
from torch.utils.data import Dataset, DataLoader
from torchaudio import transforms
from torchaudio.functional import preemphasis
import hyperparams as hps
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm
from torch.utils.tensorboard import SummaryWriter
from IPython import display

In [2]:
import torch 
from torch import nn

In [3]:
### Model

In [4]:
class EncoderConvLayer(nn.Module):
    def __init__(self, input_channels, output_channels, kernel_size) -> None:
        super().__init__()
        self.module = nn.Sequential(
            nn.Conv1d(in_channels=input_channels, out_channels=output_channels, kernel_size=kernel_size, bias=False, padding=2, dilation=1),
            nn.BatchNorm1d(output_channels),
            nn.ReLU(),
            nn.Dropout(0.5)
        )
    def forward(self, x):
        return self.module(x)

In [5]:
class Encoder(nn.Module):
    def __init__(self, characters_num, embedding_size, lstm_hidden_size) -> None:
        super().__init__()
        self.char_embedding = nn.Embedding(characters_num, embedding_size)
        self.conv_layers = nn.Sequential(
            EncoderConvLayer(embedding_size, embedding_size, 5),
            EncoderConvLayer(embedding_size, embedding_size, 5),
            EncoderConvLayer(embedding_size, embedding_size, 5),
        )
        self.rnn = nn.LSTM(input_size=embedding_size,
                           hidden_size=lstm_hidden_size,
                           bidirectional=True, batch_first=True)
        self.rnn_dropout = nn.Dropout(0.1)

    
    def forward(self, x: torch.tensor, mask_idx=None):
        """
        На вход подается последовательность символов. Размерность [BATCH_SIZE, NUM_CHARACTERS]
        """
        x = self.char_embedding(x)  #[BATCH_SIZE, NUM_CHARACTERS, EMB_SIZE]
        x = x.transpose(1,2) #[BATCH_SIZE, EMB_SIZE, NUM_CHARACTERS]
        x = self.conv_layers(x)
        x = x.transpose(1,2) #[BATCH_SIZE, NUM_CHARACTERS, CONV_EMB]

        if mask_idx is not None:
            x = nn.utils.rnn.pack_padded_sequence(x, mask_idx, batch_first=True, enforce_sorted=False)   
        x = self.rnn(x)[0]
        if mask_idx is not None:   
            x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        x = self.rnn_dropout(x)
        return x

In [6]:
class PreNet(nn.Module):
    def __init__(self, num_mels, prenet_hidden_dim) -> None:
        super().__init__()
        self.module = nn.Sequential(
            nn.Linear(num_mels, prenet_hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(prenet_hidden_dim, prenet_hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.5)
        )
    def forward(self, x):
        x = self.module(x)
        return x
        

In [7]:
### Самая сложная часть модели
class Tacotron2Attention(nn.Module):
    def __init__(self) -> None:
        super().__init__()

        self.location = nn.Conv1d(in_channels=1, out_channels=hps.ATTENTION_LOCATION_FILTERS, 
                                  kernel_size=hps.ATTENTION_LOCATION_KERNEL_SIZE, dilation=1, padding=((hps.ATTENTION_LOCATION_KERNEL_SIZE-1)//2), bias=False)
        self.location_linear = nn.Linear(hps.ATTENTION_LOCATION_FILTERS, hps.ATTENTION_DIM, bias=False)
        self.rnn_hs_linear = nn.Linear(hps.DECODER_RNN_HIDDEN_DIM * 2, hps.ATTENTION_DIM, bias=False)

        self.alignments_linear = nn.Linear(hps.ATTENTION_DIM, 1, bias=False)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, rnn_hs, encoder_output, processed_encoder_output, location_attention, mask=None):
        """
         На вход подсчета аттеншна передаются:
            - Накопленная информация из РНН 
            - Выход энкодера (BS, seq_len, 2*encoder_lstm_dim)
            - Прогнанный через линейный слой энкодер (чтобы не дублить операцию)
            - commulative-attention (BS, 1, seq_len)
        """

        ### Location attention
        #   Логика:
        #   Входящий вектор на каждую букву прогоняем через свертку таким образом, чтобы у нас на каждую букву было 32 значения, которые будут говорить о том, что данная буква уже встретилась в выданном аудио
        location_attention = location_attention.transpose(1,2) # [BS, seq_len, 1]
        location_attention = self.location(location_attention) # [BS, seq_len, Location_attention_filters]
        location_attention = self.dropout(location_attention)
        location_attention = location_attention.transpose(1,2) # [BS, seq_len, Location_attention_filters]
        location_attention = self.location_linear(location_attention) # [BS, seq_len, attention_dim]

        rnn_hs = torch.cat((rnn_hs[0], rnn_hs[1]), 1) # [BS, 2 * DECODER_RNN_HIDDEN_DIM]
        rnn_hs = rnn_hs.unsqueeze(1) # [BS, 1, 2 * DECODER_RNN_HIDDEN_DIM]
        rnn_hs = self.rnn_hs_linear(rnn_hs) # [BS, 1, Attention_dim]
        alignments = nn.functional.tanh(rnn_hs + location_attention + processed_encoder_output) # (BS, seq_len, Attention_dim)
        alignments = self.alignments_linear(alignments) # (BS, seq_len, 1)


        if mask is not None:
            alignments.data.masked_fill_(~mask.unsqueeze(-1), -torch.inf)
        alignments = self.softmax(alignments) # (BS, seq_len, 1)

        attention_score = (encoder_output.transpose(1, 2) @ alignments).transpose(1, 2)

        

        
        return  attention_score, alignments



In [8]:
class DecoderPostNetConv(nn.Module):
    def __init__(self, in_kernels, out_kernels, last_layer=False) -> None:
        super().__init__()
        self.last_layer = last_layer
        self.post_net = nn.Sequential(
            nn.Conv1d(in_kernels, out_kernels, hps.POSTNET_KERNEL_SIZE, padding=2, bias=False, dilation=1),
            nn.BatchNorm1d(hps.POSTNET_NUM_FILTERS),
            nn.Identity() if last_layer else nn.Tanh(),
            nn.Dropout(0.5),
        )
    def forward(self, x):
        return self.post_net(x)

In [9]:
class Decoder(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.prenet = PreNet(hps.N_MEL_FILTERBANKS, hps.PRENET_HIDDEN_SIZE)
        self.encoder_linear = nn.Linear(hps.LSTM_HIDDEN_SIZE * 2, hps.ATTENTION_DIM)
        self.decoder_rnn = nn.LSTM(input_size=hps.PRENET_HIDDEN_SIZE + hps.CHARACTER_EMB_SIZE, 
                                   hidden_size=hps.DECODER_RNN_HIDDEN_DIM, batch_first=True, num_layers=2)
        self.attention = Tacotron2Attention()
        self.linear_projection = nn.Linear(hps.DECODER_RNN_HIDDEN_DIM + hps.CHARACTER_EMB_SIZE, hps.N_MEL_FILTERBANKS)
        self.stop_projection = nn.Linear(hps.DECODER_RNN_HIDDEN_DIM + hps.CHARACTER_EMB_SIZE, 1)

        self.post_net = nn.Sequential(
            DecoderPostNetConv(hps.N_MEL_FILTERBANKS, hps.POSTNET_NUM_FILTERS, hps.POSTNET_NUM_FILTERS),
            DecoderPostNetConv(hps.POSTNET_NUM_FILTERS, hps.POSTNET_NUM_FILTERS),
            DecoderPostNetConv(hps.POSTNET_NUM_FILTERS, hps.POSTNET_NUM_FILTERS),
            DecoderPostNetConv(hps.POSTNET_NUM_FILTERS, hps.POSTNET_NUM_FILTERS),
            DecoderPostNetConv(hps.POSTNET_NUM_FILTERS, hps.POSTNET_NUM_FILTERS, last_layer=True)
        )
        self.post_linear = nn.Linear(hps.POSTNET_NUM_FILTERS, hps.N_MEL_FILTERBANKS)
        self.dropout = nn.Dropout(0.1)

        
    
    def forward(self, mels, encoder_output, mask):
        mels = mels.transpose(1,2)
        mels = self.prenet(mels)
        processed_encoder = self.encoder_linear(encoder_output)


        next_h = torch.zeros(2, mels.shape[0], hps.DECODER_RNN_HIDDEN_DIM, device=mels.device)
        next_c = torch.zeros(2, mels.shape[0], hps.DECODER_RNN_HIDDEN_DIM, device=mels.device)

        mel_predictions = []
        stop_tokens = []
        curr_attention_context = torch.zeros(encoder_output.shape[0], 1, hps.CHARACTER_EMB_SIZE, device=mels.device)
        cummulated_attention = torch.zeros(encoder_output.shape[0], encoder_output.shape[1], 1, device=mels.device)

        next_h.requires_grad = True
        next_c.requires_grad = True
        curr_attention_context.requires_grad = True
        cummulated_attention.requires_grad = True




        for i in range(mels.shape[1]):
            curr_mel = mels[:, i, :].unsqueeze(1)
            curr_rnn_input = torch.cat((curr_mel, curr_attention_context), dim=-1)
            next_mel, (next_h, next_c) = self.decoder_rnn(curr_rnn_input, (next_h, next_c))
            next_mel = self.dropout(next_mel)
            curr_attention_context, alignments = self.attention(next_h, encoder_output, processed_encoder, cummulated_attention, mask)
            cummulated_attention = cummulated_attention + alignments
            next_mel_inp = torch.cat((next_mel, curr_attention_context), dim=2)
            next_mel = self.linear_projection(next_mel_inp)
            next_stop = self.stop_projection(next_mel_inp).squeeze(1)
            mel_predictions.append(next_mel)
            stop_tokens.append(next_stop)

        result_mel = torch.cat(mel_predictions, dim=1)
        result_stops = torch.cat(stop_tokens, dim=1)
        result_mel = result_mel.transpose(1, 2)
        result_mel_post = self.post_net(result_mel)
        result_mel_post = result_mel_post.transpose(1,2)
        result_mel_post = self.post_linear(result_mel_post)
        result_mel_post = result_mel_post.transpose(1,2)
        result_mel_post = result_mel + result_mel_post
        
        
        return result_mel, result_mel_post, result_stops
    
    def predict(self, encoder_output):
        mels = torch.log(torch.clamp(torch.zeros(encoder_output.shape[0], 1, hps.N_MEL_FILTERBANKS, device=encoder_output.device), hps.CLIPMIN))
        mels = self.prenet(mels)
        processed_encoder = self.encoder_linear(encoder_output)


        next_h = torch.zeros(2, mels.shape[0], hps.DECODER_RNN_HIDDEN_DIM, device=mels.device)
        next_c = torch.zeros(2, mels.shape[0], hps.DECODER_RNN_HIDDEN_DIM, device=mels.device)

        mel_predictions = []
        mel_predictions_post = []
        stop_tokens = []

        curr_attention_context = torch.zeros(encoder_output.shape[0], 1, hps.CHARACTER_EMB_SIZE, device=mels.device)
        cummulated_attention = torch.zeros(encoder_output.shape[0], encoder_output.shape[1], 1, device=mels.device)

        next_h.requires_grad = True
        next_c.requires_grad = True
        curr_attention_context.requires_grad = True
        cummulated_attention.requires_grad = True


        for i in range(1500):
            curr_mel = mels
            curr_rnn_input = torch.cat((curr_mel, curr_attention_context), dim=-1)
            next_mel, (next_h, next_c) = self.decoder_rnn(curr_rnn_input, (next_h, next_c))
            curr_attention_context, alignments = self.attention(next_h, encoder_output, processed_encoder, cummulated_attention)
            cummulated_attention = cummulated_attention + alignments
            next_mel_inp = torch.cat((next_mel, curr_attention_context), dim=2)
            next_mel = self.linear_projection(next_mel_inp)

            next_mel_post = self.post_net(next_mel.transpose(1,2)).transpose(1,2)
            next_mel_post = self.post_linear(next_mel_post)
            next_mel_post = next_mel_post + next_mel
            mels = self.prenet(next_mel_post)

            next_stop = self.stop_projection(next_mel_inp).squeeze(1)
            mel_predictions.append(next_mel)
            mel_predictions_post.append(next_mel_post)
            stop_tokens.append(next_stop)

        
        result_mel = torch.cat(mel_predictions, dim=1)
        result_mel_post = torch.cat(mel_predictions_post, dim=1)
        result_stops = torch.cat(stop_tokens, dim=1)

        return result_mel_post.transpose(1,2), result_stops

In [10]:
class Tacotron2(nn.Module):
    def __init__(self, characters_num: int = 0) -> None:
        super().__init__()
        self.characters_num = characters_num
        self.encoder = Encoder(characters_num, hps.CHARACTER_EMB_SIZE, hps.LSTM_HIDDEN_SIZE)
        self.decoder = Decoder()

    def get_mask(self, mask_idx):
        mask = torch.zeros(mask_idx.shape[0], max(mask_idx))
        mask = ((torch.arange(0, max(mask_idx)).unsqueeze(1)<torch.tensor(mask_idx))).transpose(0,1)
        return mask

    def forward(self, text, mels, mask_idx):
        encoder_output = self.encoder(text, torch.tensor(mask_idx))
        mask = self.get_mask(torch.tensor(mask_idx))
        mask = mask.to(encoder_output.device)
        decoder_output = self.decoder(mels, encoder_output, mask)
        return decoder_output
    
    def predict(self, text):
        encoder_output = self.encoder(text)
        decoder_output = self.decoder.predict(encoder_output)
        return decoder_output

In [11]:
class Tacotron2Loss(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.mse_loss = nn.MSELoss()
        self.bce_loss = nn.BCEWithLogitsLoss()
    def forward(self, mel_true, mel_pred, mel_pred_post, stops, stops_pred):
        mel_loss = self.mse_loss(mel_pred, mel_true) + self.mse_loss(mel_pred_post, mel_true)
        stop_loss = self.bce_loss(stops_pred, stops)
        return mel_loss + stop_loss

In [12]:
BATCH_SIZE = 32
dataset = TTSDataset()
dataloader = DataLoader(dataset, BATCH_SIZE, collate_fn=collate_fn, shuffle=True,
                         num_workers=8
                         )
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
NUM_EPOCHS = 200
vocab_size = dataset.preprocessor.vocab.shape[0] + 1
loss = Tacotron2Loss()
model = Tacotron2(vocab_size).to(DEVICE)
# model.load_state_dict(torch.load('model_saves\epoch_40_train_result'))
optimizer = torch.optim.Adam(model.parameters(), lr = 4e-4, weight_decay=1e-6)
# scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.5)
writer = SummaryWriter()


RuntimeError: Error(s) in loading state_dict for Tacotron2:
	size mismatch for decoder.prenet.module.0.weight: copying a param with shape torch.Size([256, 80]) from checkpoint, the shape in current model is torch.Size([256, 40]).
	size mismatch for decoder.linear_projection.weight: copying a param with shape torch.Size([80, 1536]) from checkpoint, the shape in current model is torch.Size([40, 1536]).
	size mismatch for decoder.linear_projection.bias: copying a param with shape torch.Size([80]) from checkpoint, the shape in current model is torch.Size([40]).
	size mismatch for decoder.post_net.0.post_net.0.weight: copying a param with shape torch.Size([512, 80, 5]) from checkpoint, the shape in current model is torch.Size([512, 40, 5]).
	size mismatch for decoder.post_linear.weight: copying a param with shape torch.Size([80, 512]) from checkpoint, the shape in current model is torch.Size([40, 512]).
	size mismatch for decoder.post_linear.bias: copying a param with shape torch.Size([80]) from checkpoint, the shape in current model is torch.Size([40]).

In [13]:
def code_text(text):
    text = dataset.preprocessor.transform_single_text(text)
    text = torch.tensor(text).unsqueeze(0)
    return text

In [14]:
eval_step = 0
for epoch in range(40, NUM_EPOCHS):
    model.train(True)
    epoch_train_losses = [0]
    for text, mel, stop, mask_idx in tqdm(dataloader):
        text = text.to(DEVICE)
        mel = mel.to(DEVICE)
        stop = stop.to(DEVICE)
        result_mel, result_mel_post, result_stops = model(text, mel, mask_idx)
        loss_val = loss(mel[:, :, 1:], result_mel[:, :, :-1], result_mel_post[:, :, :-1], stop[:, 1:], result_stops[:, :-1])
        loss_val.backward()
        nn.utils.clip_grad_norm(model.parameters(), 1.)
        optimizer.step()
        epoch_train_losses.append(loss_val.cpu().detach().numpy())
        writer.add_scalar('Loss/train', loss_val, eval_step)
        writer.add_scalar('LR/train', optimizer.param_groups[0]['lr'], eval_step)
        eval_step += 1

    model.train(False)
    if epoch%5==0:
        sample_text = code_text("Привет Это проверка генерации речи из текста!").to(DEVICE)
        audio, stops = model.predict(sample_text)
        stops = nn.functional.sigmoid(stops)
        stops = (stops>1).int()
        stops[0] = 0
        aud = reconstruct_audio(audio[0].detach(), stops[0].detach())
        writer.add_audio("Model_results", aud, sample_rate=hps.SAMPLE_RATE, global_step=epoch)
        torch.save(model.state_dict(), f"model_saves/epoch_{epoch}_train_result")
        


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, mel, stop, mask_idx in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  mask = ((torch.arange(0, max(mask_idx)).unsqueeze(1)<torch.tensor(mask_idx))).transpose(0,1)
  nn.utils.clip_grad_norm(model.parameters(), 1.)


NameError: name 'optimizer' is not defined

In [None]:
nn.Sigmoid()(result_stops)[0].sum()

tensor(408.8496, device='cuda:0', grad_fn=<SumBackward0>)

In [None]:
stop[0].sum()

tensor(409., device='cuda:0')

In [None]:
model.train(False)
sample_text = code_text("Привет Это проверка генерации речи из текста!").to(DEVICE)
audio, stops = model.predict(sample_text)

In [None]:
nn.Sigmoid()(stops).list()

AttributeError: 'Tensor' object has no attribute 'list'

In [None]:
optimizer.param_groups[0]['lr']

0.001

In [15]:
epoch

50

In [None]:
for i, k in enumerate([[1,2], [1,2]]):
    print(i, k)

0 [1, 2]
1 [1, 2]


In [None]:
next(iter(dataloader))[2].shape

torch.Size([32, 558])

In [16]:
100000/509

196.46365422396858

'cuda'

In [306]:
vocab_size = dataset.preprocessor.vocab.shape[0] + 1
model = Tacotron2(vocab_size).cuda()

In [318]:
temp_input_encoder = data[0].cuda()
temp_input_decoder = data[1].cuda()
with torch.no_grad():
    decoder_output = model(temp_input_encoder, temp_input_decoder)

In [320]:
loss = Tacotron2Loss()

In [322]:
BATCH_SIZE = 64


(tensor([[[-5.1191e-02, -5.5188e-02, -5.7950e-02,  ..., -5.7682e-02,
           -5.3528e-02, -5.6456e-02],
          [-1.6762e-02, -1.5057e-02, -1.3166e-02,  ..., -1.1898e-03,
           -2.8852e-03, -1.2697e-03],
          [ 2.6739e-03,  5.5839e-03,  7.8182e-03,  ...,  1.3650e-02,
            9.1022e-03,  1.0750e-02],
          ...,
          [-5.3367e-02, -5.4048e-02, -5.3908e-02,  ..., -5.6514e-02,
           -5.2911e-02, -5.1862e-02],
          [ 4.7301e-02,  4.7091e-02,  4.6879e-02,  ...,  4.9430e-02,
            4.8796e-02,  4.7187e-02],
          [ 3.5727e-02,  3.7901e-02,  4.0362e-02,  ...,  3.4831e-02,
            3.1521e-02,  3.3556e-02]],
 
         [[-5.1383e-02, -5.6293e-02, -5.9644e-02,  ..., -5.7011e-02,
           -5.6926e-02, -5.9111e-02],
          [-1.0403e-02, -1.0031e-02, -9.2605e-03,  ...,  1.8100e-03,
            5.2113e-03, -2.6451e-03],
          [ 3.0505e-02,  2.9349e-02,  2.6603e-02,  ...,  3.5301e-02,
            3.8561e-02,  3.3970e-02],
          ...,
    

In [324]:
loss(data[1].cuda(), decoder_output[0], decoder_output[1], data[2].cuda(), decoder_output[2])

tensor(33.4727, device='cuda:0')

In [316]:
data[2]

tensor([[0., 0., 0.,  ..., 1., 1., 1.],
        [0., 0., 0.,  ..., 1., 1., 1.],
        [0., 0., 0.,  ..., 1., 1., 1.],
        ...,
        [0., 0., 0.,  ..., 1., 1., 1.],
        [0., 0., 0.,  ..., 1., 1., 1.],
        [0., 0., 0.,  ..., 1., 1., 1.]])

In [314]:
decoder_output[1]

tensor([[[ 1.6583,  0.1691, -1.4776,  ..., -0.5090,  1.3761, -0.1395],
         [ 0.7005,  0.3326,  1.5623,  ...,  0.8281,  0.1491,  0.4642],
         [-0.1606,  0.7656,  0.1319,  ...,  0.5466,  0.0921, -1.0205],
         ...,
         [ 0.1107,  0.0927, -0.6015,  ...,  0.3849, -0.0065, -0.2685],
         [ 0.6088,  0.9316, -0.6893,  ...,  0.7076,  0.8728,  0.5165],
         [ 0.3180, -0.6657,  0.1599,  ..., -0.1405, -0.8220, -0.7299]],

        [[ 0.5274,  0.1623,  0.5567,  ...,  0.5036, -0.9857, -0.7307],
         [-0.4895, -1.0142, -0.0366,  ...,  0.2099, -0.3934,  0.0809],
         [-0.2767,  0.8930, -0.9992,  ...,  0.4480,  0.1019, -0.8237],
         ...,
         [-0.4323, -0.0780,  0.5408,  ..., -0.4314, -0.9461, -0.1175],
         [ 0.0771, -0.9793,  0.8653,  ..., -0.1375,  0.9183, -0.7513],
         [ 0.1413,  0.0856,  1.5972,  ...,  0.3891, -1.1657, -0.0515]],

        [[ 0.1181, -0.2188, -0.8945,  ..., -1.2362,  1.1403, -0.0732],
         [ 0.8851,  0.0421, -0.0420,  ...,  0

In [312]:
decoder_output[0]

tensor([[[-3.5660e-02, -3.9718e-02, -4.2668e-02,  ..., -4.0272e-02,
          -3.9391e-02, -4.2186e-02],
         [-2.5500e-02, -2.3928e-02, -2.2079e-02,  ..., -1.1441e-02,
          -9.4777e-03, -9.4681e-03],
         [ 1.2907e-02,  1.5863e-02,  1.8014e-02,  ...,  2.2873e-02,
           2.8016e-02,  2.1616e-02],
         ...,
         [-6.1557e-02, -6.2468e-02, -6.2598e-02,  ..., -6.2267e-02,
          -6.1395e-02, -6.1663e-02],
         [ 4.2395e-02,  4.2034e-02,  4.1709e-02,  ...,  4.5554e-02,
           4.6440e-02,  4.7393e-02],
         [ 2.6135e-02,  2.8165e-02,  3.0416e-02,  ...,  2.4501e-02,
           2.3975e-02,  2.2832e-02]],

        [[-7.2453e-02, -7.7478e-02, -8.0861e-02,  ..., -7.7089e-02,
          -7.7956e-02, -7.7716e-02],
         [-2.3804e-02, -2.3456e-02, -2.2719e-02,  ..., -1.2019e-02,
          -9.2107e-03, -7.5158e-03],
         [ 2.5730e-02,  2.5117e-02,  2.3034e-02,  ...,  3.4300e-02,
           3.3576e-02,  3.5578e-02],
         ...,
         [-7.4056e-02, -7

In [309]:
decoder_output[1].shape

torch.Size([32, 1500])

In [268]:
decoder_output[0].shape, decoder_output[1].shape, decoder_output[2].shape

(torch.Size([32, 80, 558]), torch.Size([32, 80, 558]), torch.Size([32, 558]))

In [19]:
decoder_output.shape

torch.Size([32, 80, 558])

In [295]:
decoder_output[0].shape, decoder_output[1].shape, decoder_output[2].shape

(torch.Size([32, 1, 80]), torch.Size([32, 1, 512]), torch.Size([32, 84, 1]))

In [264]:
temp_input_decoder.shape

torch.Size([32, 80, 558])

In [246]:
encoder_output.shape

torch.Size([32, 84, 512])

In [17]:
encoder_output.shape

torch.Size([32, 84, 512])

In [117]:
attention(mel_unit[0], temp_input_encoder)[2].shape

torch.Size([558, 32, 256])

In [77]:
temp_input_decoder[0].shape

torch.Size([80, 558])

In [75]:
encoder_output[0].shape

torch.Size([84, 512])

In [85]:
temp_input_decoder.transpose(1,2).shape

torch.Size([32, 558, 80])

In [28]:
nn.Identity()(torch.tensor([1,2,3]))

tensor([1, 2, 3])

In [52]:
ttt = decoder(temp_input_decoder, encoder_output)

In [53]:
ttt[0].shape

torch.Size([32, 558, 128])

In [46]:
ttt[1].shape

torch.Size([32, 84, 128])

In [35]:
encoder_output.shape

torch.Size([32, 84, 512])

In [20]:
decoder(data[1]).shape

torch.Size([32, 558, 256])

In [14]:
data[1].shape

torch.Size([32, 80, 558])

In [23]:
data[1].shape

torch.Size([32, 80, 558])

In [10]:
temp_input = data[0]

In [11]:
vocab_size = dataset.preprocessor.vocab.shape[0]+1
model = Tacotron2(vocab_size)

In [12]:
model(temp_input).shape

torch.Size([32, 84, 512])

In [14]:
data[1].shape

torch.Size([32, 80, 558])

In [16]:
data[2].shape

torch.Size([32, 558])

In [41]:
model.characters_num

39

In [13]:
temp_input.unique().shape

torch.Size([39])