In [1]:
from model import Tacotron
from utils import TTSDataset, collate_fn
import pandas as pd
from torchaudio import transforms
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn
from torch.nn.functional import relu
import numpy as np

In [2]:
from tqdm import tqdm_notebook as tqdm

In [3]:
from IPython import display
from IPython.display import clear_output

In [4]:
dataset = TTSDataset(resample_rate=12000, num_elements=None)
dataloader = DataLoader(dataset, 32, collate_fn=collate_fn)

In [5]:
def reconstruct_audio(spectrogram):
    transform = transforms.GriffinLim(n_fft=2048, win_length=int(12000*0.05), hop_length=int(12000*0.0125))
    waveform = transform(spectrogram).detach()
    return waveform

In [6]:
def code_text(text):
    text = dataset.preprocessor.transform_single_text(text)
    text = torch.tensor(text).unsqueeze(0)
    return text

In [7]:
def loss(true, pred):
    criterion = nn.L1Loss()
    output = criterion(pred, true)
    return output

In [8]:
spec = next(iter(dataloader))[3]
spec.shape

torch.Size([32, 1025, 639])

In [9]:
display.Audio(reconstruct_audio(spec)[0].numpy(), rate=12000)

In [8]:
device = torch.device('cpu') if torch.has_mps else torch.device('cpu')

In [9]:
model = Tacotron(38)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)

In [10]:
NUM_EPOCHS = 10
train_loss = []
test_loss = []

In [None]:
model.to(device)
for epoch in range(NUM_EPOCHS):
    model.train()
    epoch_train_losses = []
    for text, audio, mel, spectrogram in tqdm(dataloader):
        text = text.to(device)
        mel = mel.to(device)
        spectrogram = spectrogram.to(device)
        model.zero_grad()
        pred_mel, pred_spec = model(text, mel)
        pred_spec = pred_spec.transpose(1,2)
        loss_mel = loss(mel, pred_mel)
        loss_spec = loss(spectrogram, pred_spec)
        final_loss = loss_spec + loss_spec
        final_loss.backward()
        nn.utils.clip_grad_norm(model.parameters(), 1.)
        optimizer.step()
        epoch_train_losses.append(final_loss.cpu().detach().numpy())
        sample_text = code_text("Привет! Это проверка генерации речи из текста!")
        # with torch.no_grad():
        #     test_sample = model.predict(sample_text.to(device))
        #     test_spec = reconstruct_audio(relu(model.predict(sample_text)))
        #     display.Audio(test_spec[0].cpu().numpy(), rate=24000)
    torch.save(model.state_dict(), f"epoch_{epoch}_train_result")
    print(epoch, np.mean(epoch_train_losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/694 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


In [26]:
loss(spectrogram, pred_spec)

tensor(nan, device='mps:0', grad_fn=<MeanBackward0>)

In [19]:
pred_mel.shape

torch.Size([32, 80, 639])

In [21]:
pred_mel

tensor([[[-0.1837,  0.1647, -0.0232,  ...,  0.4475, -0.0352, -0.3313],
         [-0.1807, -0.2076, -0.1271,  ..., -0.6714, -0.2655, -0.7089],
         [-0.3381,  0.0343, -0.1301,  ...,  0.3218, -0.4129,  1.2553],
         ...,
         [-0.0153, -0.3196, -0.0170,  ..., -0.8734,  0.1366, -0.2401],
         [-0.0070, -0.3456,  0.3253,  ..., -0.9944,  0.6447, -0.0279],
         [-0.1882,  0.2738, -0.0601,  ...,  0.7569, -0.1516,  0.1795]],

        [[-0.1451,  0.2347,  0.0482,  ...,  0.4463, -0.0393, -0.3286],
         [-0.1796, -0.0107, -0.0431,  ..., -0.6719, -0.2589, -0.7160],
         [-0.3159,  0.1480, -0.0749,  ...,  0.3212, -0.4124,  1.2552],
         ...,
         [ 0.0097, -0.4118, -0.0115,  ..., -0.8714,  0.1326, -0.2427],
         [ 0.0834, -0.3256,  0.1809,  ..., -0.9920,  0.6434, -0.0222],
         [-0.1847,  0.1938, -0.1970,  ...,  0.7615, -0.1525,  0.1812]],

        [[-0.1810,  0.2049, -0.1158,  ...,  0.4477, -0.0390, -0.3264],
         [ 0.0209, -0.3395, -0.0979,  ..., -0

In [14]:
import time

In [15]:
%%time
next(iter(dataloader))

CPU times: user 130 ms, sys: 144 ms, total: 275 ms
Wall time: 338 ms


(tensor([[23,  0, 24,  ...,  0,  0,  0],
         [16, 20,  9,  ...,  0,  0,  0],
         [29, 24, 20,  ...,  0,  0,  0],
         ...,
         [ 9, 17,  6,  ...,  0,  0,  0],
         [24, 22, 14,  ...,  0,  0,  0],
         [19,  6, 21,  ...,  0,  0,  0]]),
 tensor([[ 0.0000,  0.0042, -0.0124,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0221,  0.0610,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0216, -0.0033,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000, -0.0214, -0.0364,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0299,  0.0494,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000, -0.0217, -0.0397,  ...,  0.0000,  0.0000,  0.0000]]),
 tensor([[[4.3511e-05, 1.0559e-05, 1.7280e-08,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [4.2784e-05, 1.4735e-05, 1.4073e-06,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [1.0683e-05, 2.7048e-05, 4.4954e-05,  ..., 0.0000e+00,
           0.0000e+00, 0.0000

In [1]:
sample_text = code_text("как дела!")
test_sample = model.predict(sample_text.to(device))
test_spec = reconstruct_audio(relu(model.predict(sample_text)))
display.Audio(test_spec[0].cpu().numpy(), rate=12000)

NameError: name 'code_text' is not defined

In [28]:
test_sample.shape

torch.Size([1, 1025, 2495])

In [None]:
pred_spec.shape

In [None]:
next(iter(dataloader))[2].shape

In [None]:
spectrogram.shape

In [None]:
pred_spec.shape

In [None]:
ttt = next(iter(dataloader))

In [None]:
ttt[3].min()

In [None]:
display.Audio(ttt[1][0].numpy(), rate=24000)

In [None]:
def reconstruct_audio(spectrogram):
    transform = transforms.GriffinLim(n_fft=2048, win_length=int(24000*0.05), hop_length=int(24000*0.0125))
    waveform = transform(spectrogram).detach()
    return waveform

In [None]:
def code_text(text):
    text = dataset.preprocessor.transform_single_text(text)
    text = torch.tensor(text).unsqueeze(0)
    return text

In [None]:
sample_text = code_text("Привет! Это проверка генерации речи из текста!")

In [None]:
model = Tacotron(38)

In [None]:
model.predict(sample_text)

In [None]:
from torch.nn.functional import relu

In [None]:
model.predict(sample_text).shape

In [None]:
ttt[3].shape

In [None]:
relu = torch.nn.ReLU()

In [None]:
specs = reconstruct_audio(relu(model.predict(sample_text)))


In [None]:
specs

In [None]:
display.Audio(specs[0].numpy(), rate=24000)
display.Audio(specs[0].numpy(), rate=24000)

In [None]:
specs[0]