MULTI-LANG TOKENIZER 

MEL-LINEAR

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LinearNorm(torch.nn.Module):
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(LinearNorm, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        torch.nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)

class MelLinear(nn.Module):
    def __init__(self, in_dim, out_dim, dropout=0.1):
        super(MelLinear, self).__init__()
    
        self.layer1 = LinearNorm(in_dim,int(in_dim//2))
        self.layer2 = LinearNorm(int(in_dim//2),out_dim)

        self.drop1 = nn.Dropout(dropout)
        self.drop2 = nn.Dropout(dropout)

    def forward(self, x):
        x = self.drop1(F.relu(self.layer1(x)))
        x = self.drop2(self.layer2(x))

        return x


In [2]:
#713 // 2

TACOTRON POST-NET

In [3]:
from utils import load_configs
import utils
import torch
import torch.nn as nn

configs = load_configs('hparams.yaml')

class ConvNorm(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
        super(ConvNorm, self).__init__()
        if padding is None:
            assert(kernel_size % 2 == 1)
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.Conv1d(in_channels, out_channels,
                                    kernel_size=kernel_size, stride=stride,
                                    padding=padding, dilation=dilation,
                                    bias=bias)

        torch.nn.init.xavier_uniform_(
            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, signal):
        conv_signal = self.conv(signal)
        return conv_signal

class Postnet(nn.Module):
    """Postnet
        - Five 1-d convolution with 512 channels and kernel size 5
    """
    def __init__(self, configs, dropout):
        super(Postnet, self).__init__()
        self.convolutions = nn.ModuleList()
        self.dropout = nn.Dropout(dropout)

        self.convolutions.append(
            nn.Sequential(
                ConvNorm(configs['Audio_Configs']['num_mels'], configs['Postnet_Configs']['postnet_embedding_dim'],
                         kernel_size=configs['Postnet_Configs']['postnet_kernel_size'], stride=1,
                         padding=int((configs['Postnet_Configs']['postnet_kernel_size'] - 1) / 2),
                         dilation=1, w_init_gain='tanh'),
                nn.BatchNorm1d(configs['Postnet_Configs']['postnet_embedding_dim']))
        )

        for i in range(1, configs['Postnet_Configs']['postnet_n_convolutions'] - 1):
            self.convolutions.append(
                nn.Sequential(
                    ConvNorm(configs['Postnet_Configs']['postnet_embedding_dim'],
                             configs['Postnet_Configs']['postnet_embedding_dim'],
                             kernel_size=configs['Postnet_Configs']['postnet_kernel_size'], stride=1,
                             padding=int((configs['Postnet_Configs']['postnet_kernel_size'] - 1) / 2),
                             dilation=1, w_init_gain='tanh'),
                    nn.BatchNorm1d(configs['Postnet_Configs']['postnet_embedding_dim']))
            )

        self.convolutions.append(
            nn.Sequential(
                ConvNorm(configs['Postnet_Configs']['postnet_embedding_dim'], 
                         configs['Audio_Configs']['num_mels'],
                         kernel_size=configs['Postnet_Configs']['postnet_kernel_size'], stride=1,
                         padding=int((configs['Postnet_Configs']['postnet_kernel_size'] - 1) / 2),
                         dilation=1, w_init_gain='linear'),
                nn.BatchNorm1d(configs['Audio_Configs']['num_mels']))
            )

    def forward(self, x):
        x = x.transpose(1,2)
        for i in range(len(self.convolutions) - 1):
            x = self.dropout(torch.tanh(self.convolutions[i](x)))
        x = self.dropout(self.convolutions[-1](x))

        return x.transpose(1,2)



In [4]:
in_dim = 768
out_dim = 80
# Creating the Prenet model
prenet_model = MelLinear(in_dim, out_dim)
batch_size = 30
random_input = torch.rand(batch_size, 18, in_dim)
output = prenet_model(random_input)


In [5]:
#torch.rand(batch_size, in_dim).shape

In [6]:
output.shape

torch.Size([30, 18, 80])

In [7]:
postnet_model = Postnet(configs, 0.1)
#MelLinear(in_dim, out_dim)

In [8]:
postnet_model(output).shape

torch.Size([30, 18, 80])

Combined Both

In [31]:
import torch.nn as nn
import torch.nn.functional as F

class CombinedModel(nn.Module):
    def __init__(self, configs, dropout=0.1):
        super(CombinedModel, self).__init__()
        
        self.layer1 = LinearNorm(configs['EncDec_Configs']['embed_dim'],int(configs['EncDec_Configs']['embed_dim'] // 2))
        self.layer2 = LinearNorm(int(configs['EncDec_Configs']['embed_dim'] // 2),configs['Audio_Configs']['num_mels'])
        self.drop1 = nn.Dropout(dropout)
        self.drop2 = nn.Dropout(dropout)

        #postnet starts
        self.convolutions = nn.ModuleList()
        self.dropout = nn.Dropout(dropout)
        self.convolutions.append(
            nn.Sequential(
                ConvNorm(configs['Audio_Configs']['num_mels'], configs['Postnet_Configs']['postnet_embedding_dim'],
                         kernel_size=configs['Postnet_Configs']['postnet_kernel_size'], stride=1,
                         padding=int((configs['Postnet_Configs']['postnet_kernel_size'] - 1) / 2),
                         dilation=1, w_init_gain='tanh'),
                nn.BatchNorm1d(configs['Postnet_Configs']['postnet_embedding_dim']))
        )

        for i in range(1, configs['Postnet_Configs']['postnet_n_convolutions'] - 1):
            self.convolutions.append(
                nn.Sequential(
                    ConvNorm(configs['Postnet_Configs']['postnet_embedding_dim'],
                             configs['Postnet_Configs']['postnet_embedding_dim'],
                             kernel_size=configs['Postnet_Configs']['postnet_kernel_size'], stride=1,
                             padding=int((configs['Postnet_Configs']['postnet_kernel_size'] - 1) / 2),
                             dilation=1, w_init_gain='tanh'),
                    nn.BatchNorm1d(configs['Postnet_Configs']['postnet_embedding_dim']))
            )

        self.convolutions.append(
            nn.Sequential(
                ConvNorm(configs['Postnet_Configs']['postnet_embedding_dim'], 
                         configs['Audio_Configs']['num_mels'],
                         kernel_size=configs['Postnet_Configs']['postnet_kernel_size'], stride=1,
                         padding=int((configs['Postnet_Configs']['postnet_kernel_size'] - 1) / 2),
                         dilation=1, w_init_gain='linear'),
                nn.BatchNorm1d(configs['Audio_Configs']['num_mels']))
            )

        self.stop_linear = nn.Linear(configs['EncDec_Configs']['embed_dim'], 1)

    def forward(self, x):
        # all_mel_linear
        x_mel_linear = self.drop1(F.relu(self.layer1(x)))
        x_mel_linear = self.drop2(self.layer2(x_mel_linear))
        
        #stop_linear
        stoplinear_output = self.stop_linear(x)

        #all postnet
        x_postnet = x_mel_linear.transpose(1,2)
        for i in range(len(self.convolutions) - 1):
            x_postnet = self.dropout(torch.tanh(self.convolutions[i](x_postnet)))
        x_postnet = self.dropout(self.convolutions[-1](x_postnet))
        x_postnet = x_postnet.transpose(1,2)

        mel_out = x_mel_linear + x_postnet


        return x_mel_linear, stoplinear_output, mel_out

# The classes LinearNorm and ConvNorm are not provided in the code snippet, 
# so make sure to include their definitions when using the CombinedModel class.


In [32]:
all_ = CombinedModel(configs)

In [33]:
all_

CombinedModel(
  (layer1): LinearNorm(
    (linear_layer): Linear(in_features=768, out_features=384, bias=True)
  )
  (layer2): LinearNorm(
    (linear_layer): Linear(in_features=384, out_features=80, bias=True)
  )
  (drop1): Dropout(p=0.1, inplace=False)
  (drop2): Dropout(p=0.1, inplace=False)
  (convolutions): ModuleList(
    (0): Sequential(
      (0): ConvNorm(
        (conv): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,))
      )
      (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1-3): 3 x Sequential(
      (0): ConvNorm(
        (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
      )
      (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (4): Sequential(
      (0): ConvNorm(
        (conv): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,))
      )
      (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=Tru

In [34]:
random_input = torch.rand(30, 13, 768)
linear_output, stop_linear_output, mel_spectogram_output = all_(random_input)

In [36]:
stop_linear_output.shape

torch.Size([30, 13, 1])

In [None]:
postnet_output.shape

torch.Size([30, 13, 80])

In [None]:
class OutLinear(nn.Module):
    def __init__(self, in_dim, out_dim, dropout=0.1):
        super(OutLinear, self).__init__()
    
        #self.layer1 = LinearNorm(in_dim,int(in_dim//2))
        self.layer2 = nn.Linear(int(in_dim//2),1)
        self.drop1 = nn.Dropout(dropout)

    def forward(self, x):
        #x = self.drop1(F.relu(self.layer1(x)))
        x = self.drop1(self.layer2(x))

        return x


In [None]:
prenet_model

OutLinear(
  (layer2): Linear(in_features=384, out_features=1, bias=True)
  (drop1): Dropout(p=0.1, inplace=False)
)

In [None]:
in_dim = 768
out_dim = 80
# Creating the Prenet model
prenet_model = OutLinear(in_dim, out_dim)
batch_size = 30
random_input = torch.rand(batch_size, 18, in_dim)
output = prenet_model(random_input)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (540x768 and 384x1)

In [20]:
import torch
import torch.nn as nn

class StopLinear(nn.Module):
    def __init__(self, input_size, output_size):
        super(StopLinear, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        output = self.linear(x)
        return output


In [22]:
input_data.shape

torch.Size([4, 10, 256])

In [21]:
import torch

hl = 256
output_size = 1
linear_layer = StopLinear(hl, output_size)

bs = 4
T = 10
input_data = torch.randn(bs, T, hl)

output_data = linear_layer(input_data)

print(output_data)
print(output_data.shape)


tensor([[[ 0.0091],
         [ 0.1513],
         [-0.3235],
         [-0.0387],
         [ 0.8807],
         [-0.3860],
         [-0.3836],
         [-1.6093],
         [ 0.0095],
         [ 0.0115]],

        [[ 0.3198],
         [ 0.3496],
         [ 0.4344],
         [-0.5858],
         [ 0.6487],
         [-0.4532],
         [ 0.1446],
         [-0.6371],
         [ 0.6340],
         [ 1.0134]],

        [[-0.8025],
         [-0.4314],
         [-0.0752],
         [ 1.1128],
         [ 1.0425],
         [ 0.3279],
         [ 0.3437],
         [ 0.0180],
         [ 0.6263],
         [ 0.3281]],

        [[-0.0099],
         [ 1.4135],
         [-0.4209],
         [ 0.7863],
         [ 0.2518],
         [ 0.3779],
         [-1.3510],
         [-1.0199],
         [-0.1495],
         [-0.0173]]], grad_fn=<ViewBackward0>)
torch.Size([4, 10, 1])
