In [2]:
import torch
import torchaudio.datasets as datasets

In [3]:
# LJSpeech corpus, read from the data directory next to this notebook's folder.
dataset = datasets.LJSPEECH(root="../data")

In [4]:
# Each dataset item is a 4-tuple; unpack the first utterance to inspect it.
first_item = dataset[0]
waveform, sample_rate, transcript, normalized_transcript = first_item

(waveform, sample_rate, normalized_transcript)

(tensor([[-7.3242e-04, -7.6294e-04, -6.4087e-04,  ...,  7.3242e-04,
           2.1362e-04,  6.1035e-05]]),
 22050,
 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition')

In [5]:
# First three utterances, each reshaped to (1, 1, T_i).
waveforms = [dataset[i][0].view(1, 1, -1) for i in range(3)]

# The utterances differ in length, so truncate all of them to the shortest one
# before batching.
smallest_size = min(w.size(-1) for w in waveforms)
clipped_inputs = [w[:, :, :smallest_size] for w in waveforms]

# Concatenating the (1, 1, T) tensors along dim 0 gives the (batch, 1, T) batch.
inputs = torch.cat(clipped_inputs, dim=0)
inputs.shape

torch.Size([3, 1, 41885])

In [6]:
import torch.nn.functional as f

# Global conditioning: one integer speaker id per batch element (speakers 0, 1, 0),
# laid out as (batch, channels=1, time=1).
global_inputs = torch.tensor([0, 1, 0]).view(3, 1, 1)
global_inputs.shape

torch.Size([3, 1, 1])

Since the global input is constant throughout the entire speech generation, we just have to expand it to the number of time steps.

In [7]:
# Batch size and number of time steps come from the waveform batch; the channel
# count comes from the global conditioning tensor.
batch_size, _, time_steps = inputs.shape
categories_channels = global_inputs.shape[1]
(batch_size, categories_channels, time_steps)

(3, 1, 41885)

In [8]:
# The size-1 time axis of global_inputs broadcasts against the waveform's time axis.
torch.add(inputs, global_inputs).shape

torch.Size([3, 1, 41885])

It seems like we can just use broadcasting to add them together. Now let's upsample the local inputs to match the number of time steps.

In [19]:
# Dummy local conditioning: three independent random draws of shape (1, 5, 6),
# joined along the batch axis into a single (3, 5, 6) tensor.
local_inputs = torch.cat([torch.randn(1, 5, 6) for _ in range(3)], dim=0)
categories_channels = local_inputs.shape[1]
local_inputs.shape

torch.Size([3, 5, 6])

In [20]:
import torch.nn as nn

In [21]:
# Length of the last (time) axis of the local conditioning tensor.
local_inputs.shape[-1]

6

In [24]:
from math import ceil

# Number of waveform samples each local frame has to cover, rounded up so the
# upsampled sequence is never shorter than the waveform.
scaling_factor = ceil(time_steps / local_inputs.shape[-1])

# With kernel_size == stride the transposed convolution produces exactly
# `scaling_factor` output samples per input frame.
upsampler = nn.ConvTranspose1d(
    in_channels=categories_channels,
    out_channels=categories_channels,
    kernel_size=scaling_factor,
    stride=scaling_factor,
)
upsampled_data = upsampler(local_inputs)

upsampled_data.shape

torch.Size([3, 5, 41886])

In [26]:
# Clip the upsampled local conditioning to its FIRST `time_steps` samples so it
# lines up one-to-one with the waveform. Slicing with `time_steps:` instead keeps
# only the trailing leftover sample(s); a length-1 remainder then silently
# broadcasts across the whole time axis and still yields the expected-looking
# shape, hiding the mistake.
upsampled_clipped = upsampled_data[:, :, :time_steps]
assert upsampled_clipped.size(-1) == time_steps

(inputs + upsampled_clipped).shape

torch.Size([3, 5, 41885])

The open question now is how to get the scaling factor for each time step: each dilated convolution makes its input smaller, so what must we do to get the correct size after each dilated convolution?

Since I don't know how to make the upsampling convolution work for different sizes, I will just repeat the local inputs along the time axis until they cover at least the required number of time steps, and then clip.

In [42]:
# Learning-free upsampling of the local conditioning to the waveform length.
# The repeat count is derived from the tensors (ceiling division) rather than
# hard-coded, so this keeps working when the clip length or the number of local
# frames changes.
samples_per_frame = -(-time_steps // local_inputs.size(-1))

# repeat_interleave holds each local frame for `samples_per_frame` consecutive
# samples, keeping frame k aligned with the k-th stretch of audio. Tiling the
# whole sequence with .repeat() would instead cycle through all frames over and
# over. The result can overshoot `time_steps`, so clip it.
local_inputs.repeat_interleave(samples_per_frame, dim=-1)[:, :, :time_steps].shape

torch.Size([3, 5, 41885])