In [4]:
from model import Tacotron
from utils import TTSDataset, collate_fn
import pandas as pd
from torchaudio import transforms
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn
from torch.nn.functional import relu
import numpy as np

In [5]:
from tqdm import tqdm_notebook as tqdm

In [6]:
from IPython import display
from IPython.display import clear_output

In [7]:
# Load the RUSLAN metadata table: a pipe-separated file without a header row,
# so the two columns are named explicitly after reading.
texts = pd.read_csv(
    '../RUSLAN_text/metadata_RUSLAN_22200.csv',
    sep='|',
    header=None,
)
texts.columns = ['path', 'text']

In [8]:
# Dataset resampled to 24 kHz; batches of 32 items are assembled by the
# project's collate_fn.
# NOTE(review): no `shuffle` argument is passed, so batches are drawn in
# dataset order — confirm this is intended for training.
dataset = TTSDataset(resample_rate=24000)
dataloader = DataLoader(dataset=dataset, batch_size=32, collate_fn=collate_fn)

In [9]:
def reconstruct_audio(spectrogram, sample_rate=24000, n_fft=2048):
    """Invert a spectrogram to a waveform with the Griffin-Lim algorithm.

    Args:
        spectrogram: Tensor accepted by ``torchaudio.transforms.GriffinLim``
            (its frequency axis must hold ``n_fft // 2 + 1`` bins).
        sample_rate: Sampling rate in Hz; the window is 50 ms and the hop
            12.5 ms of audio at this rate. Defaults to 24000.
        n_fft: FFT size passed to Griffin-Lim. Defaults to 2048.

    Returns:
        The reconstructed waveform, detached from the autograd graph and
        located on the same device as ``spectrogram``.
    """
    transform = transforms.GriffinLim(
        n_fft=n_fft,
        win_length=int(sample_rate * 0.05),
        hop_length=int(sample_rate * 0.0125),
    )
    # The transform is an nn.Module created on the CPU; move it next to the
    # input so spectrograms living on an accelerator do not hit a
    # device-mismatch error.
    transform = transform.to(spectrogram.device)
    return transform(spectrogram).detach()

In [10]:
def code_text(text):
    """Encode a raw string into a model-ready tensor with a leading batch axis.

    The string is passed through the dataset's text preprocessor
    (``dataset.preprocessor.transform_single_text``), wrapped in a tensor and
    given an extra dimension at position 0.

    Args:
        text: The sentence to encode.

    Returns:
        A tensor whose first dimension has size 1.
    """
    encoded = dataset.preprocessor.transform_single_text(text)
    return torch.tensor(encoded).unsqueeze(0)

In [11]:
def loss(true, pred):
    """Return the mean absolute error (L1 loss) between prediction and target.

    Args:
        true: Target tensor.
        pred: Predicted tensor, same shape as ``true``.

    Returns:
        A scalar tensor holding the element-wise mean of ``|pred - true|``.
    """
    return nn.L1Loss()(pred, true)

In [12]:
# Prefer Apple's Metal backend when it is actually usable at runtime, else CPU.
# `torch.backends.mps.is_available()` replaces the deprecated `torch.has_mps`,
# which only reports whether PyTorch was built with MPS support.
device = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')

In [13]:
# Build the network and an Adam optimiser over all of its parameters.
# NOTE(review): 38 is presumably the size of the symbol vocabulary — confirm
# against the Tacotron constructor.
model = Tacotron(38)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [14]:
# Training schedule and per-epoch loss histories (two independent lists).
NUM_EPOCHS = 1
train_loss, test_loss = [], []

In [15]:
# Train the model for NUM_EPOCHS epochs; after every epoch, report the mean
# loss and synthesise a fixed probe sentence so progress can be heard.
model.to(device)

# The probe sentence never changes, so encode it once instead of per batch.
sample_text = code_text("Привет! Это проверка генерации речи из текста!")

for epoch in range(NUM_EPOCHS):
    model.train()
    epoch_train_losses = []
    for text, _audio, mel, spectrogram in tqdm(dataloader):
        text = text.to(device)
        mel = mel.to(device)
        spectrogram = spectrogram.to(device)

        model.zero_grad()
        pred_mel, pred_spec = model(text, mel)
        # NOTE(review): only the linear-spectrogram prediction is transposed
        # before the loss — confirm pred_mel already matches mel's layout.
        pred_spec = pred_spec.transpose(1, 2)

        loss_mel = loss(mel, pred_mel)
        loss_spec = loss(spectrogram, pred_spec)
        # Optimise both outputs; previously loss_spec was added twice and the
        # mel loss was computed but never used.
        final_loss = loss_mel + loss_spec
        final_loss.backward()
        optimizer.step()
        epoch_train_losses.append(final_loss.item())

    epoch_mean_loss = np.mean(epoch_train_losses)
    train_loss.append(epoch_mean_loss)
    print(epoch, epoch_mean_loss)

    # Audible preview, once per epoch (Griffin-Lim is too costly per batch).
    # The input is moved to the model's device and predict() runs only once.
    with torch.no_grad():
        test_sample = model.predict(sample_text.to(device))
    # Vocode on the CPU so the result can be converted to NumPy directly.
    test_spec = reconstruct_audio(relu(test_sample).cpu())
    # A bare expression inside a loop is not rendered by the notebook;
    # display() must be called explicitly.
    display.display(display.Audio(test_spec[0].numpy(), rate=24000))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text, audio, mel, spectrogram in tqdm(dataloader):


  0%|          | 0/694 [00:00<?, ?it/s]

RuntimeError: Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same

In [29]:
# Show the layer layout (module repr) of the encoder's CBHD submodule.
model.encoder.CBHD

EncoderCBHD(
  (pre_pool_padding): ConstantPad1d(padding=(1, 0), value=0)
  (pooling): MaxPool1d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
  (highway_net): Sequential(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=128, bias=True)
    (7): ReLU()
  )
  (gru): GRU(128, 128, batch_first=True, bidirectional=True)
)

In [None]:
# Synthesise a short test phrase with the current model and play it back.
sample_text = code_text("как дела!")
# Run inference once, with the input on the same device as the model
# (previously predict() was called a second time with a CPU tensor and
# `test_sample` was never used).
with torch.no_grad():
    test_sample = model.predict(sample_text.to(device))
# Vocode on the CPU so the waveform can be converted to NumPy directly.
test_spec = reconstruct_audio(relu(test_sample).cpu())
display.Audio(test_spec[0].numpy(), rate=24000)

In [28]:
# Shape of the spectrogram predicted for the test phrase.
# NOTE(review): the 1025 in the recorded output presumably corresponds to
# n_fft // 2 + 1 frequency bins for n_fft=2048 — confirm.
test_sample.shape

torch.Size([1, 1025, 2495])

In [None]:
# Shape of the last spectrogram prediction; `pred_spec` is only assigned inside
# the training loop, so that loop must have completed at least one batch.
pred_spec.shape

In [None]:
# Pull the first batch from a fresh iterator and inspect element 2, which the
# training loop unpacks as `mel`.
next(iter(dataloader))[2].shape

In [None]:
# Shape of the target spectrogram from the last batch seen by the training loop
# (`spectrogram` is the loop variable left over from that cell).
spectrogram.shape

In [None]:
# Shape of the transposed prediction, for comparison with the target
# spectrogram's shape; requires the training loop to have run.
pred_spec.shape

In [None]:
# Keep the first batch around for inspection. The training loop unpacks a batch
# as (text, audio, mel, spectrogram), i.e. indices 0..3.
ttt = next(iter(dataloader))

In [None]:
# Smallest value in batch element 3 (the element the training loop unpacks as
# `spectrogram`).
ttt[3].min()

In [None]:
# Play the first item of batch element 1 (unpacked as `audio` by the training
# loop) at 24 kHz, the rate the dataset was resampled to.
display.Audio(ttt[1][0].numpy(), rate=24000)

In [None]:
def reconstruct_audio(spectrogram, sample_rate=24000, n_fft=2048):
    """Invert a spectrogram to a waveform with the Griffin-Lim algorithm.

    Args:
        spectrogram: Tensor accepted by ``torchaudio.transforms.GriffinLim``
            (its frequency axis must hold ``n_fft // 2 + 1`` bins).
        sample_rate: Sampling rate in Hz; the window is 50 ms and the hop
            12.5 ms of audio at this rate. Defaults to 24000.
        n_fft: FFT size passed to Griffin-Lim. Defaults to 2048.

    Returns:
        The reconstructed waveform, detached from the autograd graph and
        located on the same device as ``spectrogram``.
    """
    transform = transforms.GriffinLim(
        n_fft=n_fft,
        win_length=int(sample_rate * 0.05),
        hop_length=int(sample_rate * 0.0125),
    )
    # The transform is an nn.Module created on the CPU; move it next to the
    # input so spectrograms living on an accelerator do not hit a
    # device-mismatch error.
    transform = transform.to(spectrogram.device)
    return transform(spectrogram).detach()

In [None]:
def code_text(text):
    """Encode a raw string into a model-ready tensor with a leading batch axis.

    The string is passed through the dataset's text preprocessor
    (``dataset.preprocessor.transform_single_text``), wrapped in a tensor and
    given an extra dimension at position 0.

    Args:
        text: The sentence to encode.

    Returns:
        A tensor whose first dimension has size 1.
    """
    encoded = dataset.preprocessor.transform_single_text(text)
    return torch.tensor(encoded).unsqueeze(0)

In [None]:
# Encode the fixed probe sentence; the resulting tensor is not moved to any
# device here.
sample_text = code_text("Привет! Это проверка генерации речи из текста!")

In [None]:
# Rebind `model` to a freshly constructed Tacotron. Any previously trained
# instance is no longer reachable through this name, and the new one is not
# moved to `device`.
# NOTE(review): confirm this reset is intended; 38 is presumably the symbol
# vocabulary size.
model = Tacotron(38)

In [None]:
# Display the raw output of predict() for the probe sentence.
# NOTE(review): this dumps the whole tensor; `.shape` is usually enough.
model.predict(sample_text)

In [None]:
from torch.nn.functional import relu

In [None]:
# Shape of the prediction for the probe sentence (runs inference again).
model.predict(sample_text).shape

In [None]:
# Shape of batch element 3 (unpacked as `spectrogram` by the training loop),
# for comparison with the prediction's shape.
ttt[3].shape

In [None]:
# Rebinds `relu` — previously the function imported from torch.nn.functional —
# to an nn.ReLU module instance. Calls of the form relu(x) keep working because
# the module is callable.
relu = torch.nn.ReLU()

In [None]:
# Predict a spectrogram for the probe sentence, clamp negative values to zero
# with ReLU, and vocode the result with Griffin-Lim.
# NOTE(review): despite its name, `specs` holds what reconstruct_audio returns,
# i.e. waveforms.
specs = reconstruct_audio(relu(model.predict(sample_text)))


In [None]:
# Display the reconstructed waveform tensor.
specs

In [None]:
# Play the first reconstructed waveform at 24 kHz. The statement used to be
# duplicated; only a cell's last expression is rendered, so the first copy
# built an Audio object that was immediately discarded.
display.Audio(specs[0].numpy(), rate=24000)

In [None]:
# Display the first entry of the reconstructed waveform tensor.
specs[0]