In [1]:
from model import Tacotron
from utils import TTSDataset, collate_fn
import pandas as pd
from torchaudio import transforms
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn
from torch.nn.functional import relu
import numpy as np
from torchaudio.functional import deemphasis
import tempfile
import torchaudio

In [2]:
from tqdm import tqdm_notebook as tqdm

In [3]:
from IPython import display
from IPython.display import clear_output

In [4]:
dataset = TTSDataset(resample_rate=24000, num_elements=None)
dataloader = DataLoader(dataset, 32, collate_fn=collate_fn, num_workers=8, shuffle=True)

In [5]:
def reconstruct_audio(spectrogram):
    spectrogram = (torch.clip(spectrogram, 0, 1)*100) - 80
    spectrogram = torch.pow(10, spectrogram*0.05)
    transform = transforms.GriffinLim(n_fft=2048, win_length=int(24000*0.05), hop_length=int(24000*0.0125), power=1.5)
    waveform = transform(spectrogram).detach()
    waveform = deemphasis(waveform)
    return waveform

In [6]:
def code_text(text):
    text = dataset.preprocessor.transform_single_text(text)
    text = torch.tensor(text).unsqueeze(0)
    return text

In [7]:
def loss(true, pred):
    criterion = nn.L1Loss()
    output = criterion(pred, true)
    return output

In [8]:
spec = next(iter(dataloader))[3]
spec.shape

torch.Size([32, 1025, 568])

In [15]:
display.Audio(reconstruct_audio(spec)[0].numpy(), rate=24000)

In [9]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [10]:
model = Tacotron(38)
model.load_state_dict(torch.load('../tacotron_vw/epoch_680_train_result'))

<All keys matched successfully>

In [11]:
import os
if 'metrics.csv' not in os.listdir():
    metric_frame = pd.DataFrame({
        'epoch':[],
        'loss_value':[]
    }).to_csv('metrics.csv', index=False)
metrics_frame = pd.read_csv('metrics.csv', index_col=False)

In [12]:
metrics_frame.shape

(686, 2)

In [13]:
NUM_EPOCHS = 800
train_loss = []
test_loss = []
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-4)

In [50]:
model.to(device)
for epoch in range(700, NUM_EPOCHS):
    model.train()
    epoch_train_losses = []
    for text, audio, mel, spectrogram in tqdm(dataloader):
        text = text.to(device)
        mel = mel.to(device)
        spectrogram = spectrogram.to(device)
        model.zero_grad()
        pred_mel, pred_spec = model(text, mel)
        pred_spec = pred_spec.transpose(1,2)
        loss_mel = loss(mel, pred_mel)
        loss_spec = loss(spectrogram, pred_spec)
        final_loss = 0.5 * loss_spec + 0.5 * loss_mel
        final_loss.backward()
        nn.utils.clip_grad_norm(model.parameters(), 1.)
        optimizer.step()
        epoch_train_losses.append(final_loss.cpu().detach().numpy())
    if epoch%10==0:
        with torch.no_grad():
            sample_text = code_text("Привет Это проверка генерации речи из текста!")
            model.eval()
            model.to('cpu')
            test_spec = reconstruct_audio(model.predict(sample_text))
            with tempfile.TemporaryDirectory() as tempdir:
                path = f"save_example_{epoch}.wav"
                torchaudio.save(path, test_spec, 24000)
            model.to(device)
        torch.save(model.state_dict(), f"epoch_{epoch}_train_result")
    metrics_frame.loc[epoch] = np.mean(epoch_train_losses)
    metrics_frame.to_csv('metrics.csv', index=False)
    print(epoch, np.mean(epoch_train_losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


700 0.032638874


  0%|          | 0/509 [00:00<?, ?it/s]

701 0.032448266


  0%|          | 0/509 [00:00<?, ?it/s]

702 0.032665007


  0%|          | 0/509 [00:00<?, ?it/s]

703 0.032531183


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


704 0.032630667


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


705 0.032568123


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


706 0.032583892


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


707 0.032556225


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


708 0.032598168


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


709 0.032544013


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


710 0.032562286


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


711 0.03256278


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


712 0.032627817


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


713 0.032601804


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


714 0.032561135


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


715 0.03254334


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


716 0.032584287


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


717 0.032524582


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


718 0.03262678


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


719 0.032435622


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


720 0.03256542


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


721 0.032590833


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


722 0.032516457


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


723 0.032450918


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


724 0.032665737


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


725 0.03254514


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


726 0.032520033


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


727 0.032565035


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


728 0.032564595


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


729 0.032597374


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


730 0.032584436


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


731 0.032555237


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


732 0.03252244


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


733 0.03247609


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


734 0.03252359


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


735 0.03250792


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


736 0.032565065


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


737 0.03244971


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


738 0.032551963


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


739 0.03264408


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


740 0.03254853


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


741 0.03261943


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


742 0.03257344


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


743 0.03263672


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


744 0.032532666


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


745 0.032530777


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


746 0.0326006


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


747 0.032521784


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


748 0.032469273


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


749 0.03249919


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


750 0.032462962


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


751 0.03253416


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


752 0.03258241


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


753 0.032526996


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


754 0.03244523


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


755 0.032480665


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


756 0.03245868


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


757 0.032453414


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


758 0.03259204


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


759 0.032559067


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


760 0.032538105


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


761 0.03247009


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


762 0.032555107


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


763 0.03250936


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


764 0.032387625


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


765 0.03256586


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


766 0.032573953


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


767 0.032506093


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


768 0.03262976


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


769 0.03242737


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


770 0.032543514


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


771 0.032504197


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


772 0.032426197


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


773 0.032449387


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


774 0.032463532


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


775 0.032482076


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


776 0.032405585


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


777 0.0324155


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


778 0.032415807


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


779 0.032521043


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


780 0.032469124


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


781 0.032458846


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


782 0.032444246


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


783 0.03257607


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


784 0.032557175


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


785 0.03247167


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


786 0.032452676


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


787 0.032566715


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


788 0.032500386


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


789 0.032519814


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


790 0.032485005


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


791 0.032421585


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/509 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), 1.)


In [49]:
sample_text = code_text("Ровно три года назад у вокзала стояла голая девушка и позволяла трогать себа за левую грудь.")
model.to('cpu')
model.eval()
test_spec = reconstruct_audio(model.predict(sample_text))
with tempfile.TemporaryDirectory() as tempdir:
    path = f"teeeest.wav"
    torchaudio.save(path, test_spec, 24000)

In [13]:
model.train()
sample_text = code_text("Как незнайка оказался на луне?")
model.to('cpu')
test_spec = reconstruct_audio(model.predict(sample_text))
# with tempfile.TemporaryDirectory() as tempdir:
#     path = f"save_example_{epoch}.wav"
#     torchaudio.save(path, test_spec, 24000)

In [56]:
display.Audio(test_spec.numpy(), rate=24000)

In [19]:
loss_mel

tensor(1.8905, device='cuda:0', grad_fn=<MeanBackward0>)

In [20]:
loss_spec

tensor(0.0354, device='cuda:0', grad_fn=<MeanBackward0>)

In [45]:
test_spec.abs()

tensor([[5.8682e-04, 8.7431e-04, 7.9268e-04,  ..., 1.6969e-06, 2.0844e-06,
         1.5385e-06]])

In [None]:
С тревожным чувством берусь я за перо.

In [40]:
sample_text = code_text("Привет! Это проверка генерации речи из текста!")
model.to('cpu')
model.eval()

Tacotron(
  (encoder): TacotronEncoder(
    (char_embedding): Embedding(38, 256)
    (encoder_prenet): Sequential(
      (0): Linear(in_features=256, out_features=256, bias=True)
      (1): Dropout(p=0.5, inplace=False)
      (2): Linear(in_features=256, out_features=128, bias=True)
      (3): Dropout(p=0.5, inplace=False)
    )
    (CBHD): EncoderCBHD(
      (convolve_sets): ModuleList(
        (0): BatchNormConvolution(
          (batchnorm): Sequential(
            (0): ConstantPad1d(padding=(0, 0), value=0)
            (1): Conv1d(128, 128, kernel_size=(1,), stride=(1,), bias=False)
            (2): ReLU()
            (3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          )
        )
        (1): BatchNormConvolution(
          (batchnorm): Sequential(
            (0): ConstantPad1d(padding=(0, 1), value=0)
            (1): Conv1d(128, 128, kernel_size=(2,), stride=(1,), bias=False)
            (2): ReLU()
            (3): BatchNorm1d(128, e

In [41]:
test_audio = reconstruct_audio(model.predict(sample_text))

In [42]:
test_audio

tensor([[ 5.0051e-04, -9.9959e-04, -2.2230e-03,  ..., -1.6136e-05,
         -1.1925e-05, -9.9754e-06]])

In [46]:
display.Audio(test_audio.abs()[0].numpy(), rate=24000)

In [33]:
loss_mel

tensor(874.6371, device='cuda:0', grad_fn=<MeanBackward0>)

In [34]:
loss_spec

tensor(10.9902, device='cuda:0', grad_fn=<MeanBackward0>)

In [17]:
spectrogram

tensor([[[ -66.2601,  -81.4281,  -74.2407,  ..., -100.0000, -100.0000,
          -100.0000],
         [ -66.4388,  -70.5101,  -69.3915,  ..., -100.0000, -100.0000,
          -100.0000],
         [ -66.8526,  -70.0612,  -67.6829,  ..., -100.0000, -100.0000,
          -100.0000],
         ...,
         [ -52.0097,  -51.1252,  -59.6431,  ..., -100.0000, -100.0000,
          -100.0000],
         [ -66.3263,  -54.3086,  -57.9520,  ..., -100.0000, -100.0000,
          -100.0000],
         [ -76.9271,  -57.7109,  -57.5889,  ..., -100.0000, -100.0000,
          -100.0000]],

        [[ -82.9806,  -89.9874, -100.0000,  ..., -100.0000, -100.0000,
          -100.0000],
         [ -83.0703,  -90.1837, -100.0000,  ..., -100.0000, -100.0000,
          -100.0000],
         [ -83.0809,  -90.7176, -100.0000,  ..., -100.0000, -100.0000,
          -100.0000],
         ...,
         [ -49.0506,  -56.2065,  -53.1666,  ..., -100.0000, -100.0000,
          -100.0000],
         [ -48.8502,  -55.8873,  -52.919

In [18]:
pred_spec

tensor([[[ -62.2363,  -77.3947,  -85.8708,  ..., -100.0043, -100.0032,
          -100.0043],
         [ -61.3311,  -76.2063,  -83.7461,  ..., -100.0084, -100.0081,
          -100.0084],
         [ -60.5917,  -75.0856,  -82.0439,  ..., -100.0244, -100.0204,
          -100.0244],
         ...,
         [ -49.8313,  -63.9831,  -70.8464,  ...,  -99.9781,  -99.9824,
           -99.9781],
         [ -49.7358,  -64.2959,  -70.9207,  ...,  -99.9793,  -99.9750,
           -99.9793],
         [ -50.6218,  -64.9588,  -71.9419,  ...,  -99.9991,  -99.9964,
           -99.9991]],

        [[ -61.3475,  -76.8347,  -87.2627,  ..., -100.0020, -100.0021,
          -100.0024],
         [ -60.4987,  -75.6955,  -85.6668,  ..., -100.0058, -100.0068,
          -100.0062],
         [ -59.7536,  -74.4733,  -83.7902,  ..., -100.0225, -100.0199,
          -100.0228],
         ...,
         [ -50.0605,  -64.1179,  -74.7696,  ...,  -99.9754,  -99.9811,
           -99.9763],
         [ -49.8314,  -64.3668,  -74.890

In [24]:
test_spec.max()

tensor(0.0514)

In [23]:
audio.max()

tensor(0.4296)

In [26]:
np.mean(epoch_train_losses)

In [27]:
print(epoch_train_losses)

In [15]:
ttt = model.to('cpu').predict(sample_text)

In [17]:
ttt.shape

torch.Size([1, 1025, 2495])

In [20]:
reconstruct_audio(ttt).max()

tensor(0.0359)

In [25]:
display.Audio(reconstruct_audio(ttt)[0].numpy(), rate=24000)

NameError: name 'ttt' is not defined

In [None]:
with tempfile.TemporaryDirectory() as tempdir:
                path = f"save_example_{epoch}.wav"
                torchaudio.save(path, test_spec, 24000)

In [15]:
sample_text = code_text("как дела!")
test_sample = model.to('cpu').predict(sample_text)
# test_spec = reconstruct_audio(relu(model.predict(sample_text)))
# display.Audio(test_spec[0].cpu().numpy(), rate=12000)

In [19]:
test_sample.shape

torch.Size([1, 1025, 2495])

In [19]:
model

Tacotron(
  (encoder): TacotronEncoder(
    (char_embedding): Embedding(38, 256)
    (encoder_prenet): Sequential(
      (0): Linear(in_features=256, out_features=256, bias=True)
      (1): Dropout(p=0.5, inplace=False)
      (2): Linear(in_features=256, out_features=128, bias=True)
      (3): Dropout(p=0.5, inplace=False)
    )
    (CBHD): EncoderCBHD(
      (convolve_sets): ModuleList(
        (0): BatchNormConvolution(
          (batchnorm): Sequential(
            (0): ConstantPad1d(padding=(0, 0), value=0)
            (1): Conv1d(128, 128, kernel_size=(1,), stride=(1,), bias=False)
            (2): ReLU()
            (3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          )
        )
        (1): BatchNormConvolution(
          (batchnorm): Sequential(
            (0): ConstantPad1d(padding=(0, 1), value=0)
            (1): Conv1d(128, 128, kernel_size=(2,), stride=(1,), bias=False)
            (2): ReLU()
            (3): BatchNorm1d(128, e

In [28]:
test_sample.shape

torch.Size([1, 1025, 2495])

In [None]:
pred_spec.shape

In [15]:
np.mean(epoch_train_losses)

67.20557

In [21]:
code_text("С тревожным чувсвом")

tensor([[23,  0, 24, 22, 11,  8, 20, 12, 19, 33, 18,  0, 29, 25,  8, 23,  8, 20,
         18]])

In [None]:
next(iter(dataloader))[2].shape

In [None]:
spectrogram.shape

In [None]:
pred_spec.shape

In [23]:
model.to(device)

Tacotron(
  (encoder): TacotronEncoder(
    (char_embedding): Embedding(38, 256)
    (encoder_prenet): Sequential(
      (0): Linear(in_features=256, out_features=256, bias=True)
      (1): Dropout(p=0.5, inplace=False)
      (2): Linear(in_features=256, out_features=128, bias=True)
      (3): Dropout(p=0.5, inplace=False)
    )
    (CBHD): EncoderCBHD(
      (convolve_sets): ModuleList(
        (0): BatchNormConvolution(
          (batchnorm): Sequential(
            (0): ConstantPad1d(padding=(0, 0), value=0)
            (1): Conv1d(128, 128, kernel_size=(1,), stride=(1,), bias=False)
            (2): ReLU()
            (3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          )
        )
        (1): BatchNormConvolution(
          (batchnorm): Sequential(
            (0): ConstantPad1d(padding=(0, 1), value=0)
            (1): Conv1d(128, 128, kernel_size=(2,), stride=(1,), bias=False)
            (2): ReLU()
            (3): BatchNorm1d(128, e

In [None]:
with torch.no_grad():
            sample_text = code_text("Привет! Это проверка генерации речи из текста!")
            model.to('cpu')
            test_spec = reconstruct_audio(torch.clamp(model.predict(sample_text), min=1e-5))
            with tempfile.TemporaryDirectory() as tempdir:
                path = f"save_example_{epoch}.wav"
                torchaudio.save(path, test_spec, 24000)
            model.to(device)
        torch.save(model.state_dict(), f"epoch_{epoch}_train_result")

In [None]:
ttt = next(iter(dataloader))

In [None]:
ttt[3].min()

In [None]:
display.Audio(ttt[1][0].numpy(), rate=24000)

In [None]:
def reconstruct_audio(spectrogram):
    transform = transforms.GriffinLim(n_fft=2048, win_length=int(24000*0.05), hop_length=int(24000*0.0125))
    waveform = transform(spectrogram).detach()
    return waveform

In [None]:
def code_text(text):
    text = dataset.preprocessor.transform_single_text(text)
    text = torch.tensor(text).unsqueeze(0)
    return text

In [None]:
sample_text = code_text("Привет! Это проверка генерации речи из текста!")

In [None]:
model = Tacotron(38)

In [None]:
model.predict(sample_text)

In [None]:
from torch.nn.functional import relu

In [None]:
model.predict(sample_text).shape

In [None]:
ttt[3].shape

In [None]:
relu = torch.nn.ReLU()

In [None]:
specs = reconstruct_audio(relu(model.predict(sample_text)))


In [None]:
specs

In [None]:
display.Audio(specs[0].numpy(), rate=24000)
display.Audio(specs[0].numpy(), rate=24000)

In [None]:
specs[0]