In [1]:
### Scripts to analyze audio

In [2]:
BASEDIR="/usr2/asetlur/GraphNeuralTTS/Tacotron-pytorch/training-accentdb-char-baseline-with-additional-info/"
LOGDIR="log"

### Mel-spectogram classifier

In [3]:
from torch.utils.data import Dataset
import torch
import json
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import glob
import os
from collections import Counter
from tensorboardX import SummaryWriter
from tqdm import tqdm
import subprocess

#### Dataloader/Dataset

In [4]:
def _pad_2d(x, max_len):
    x = np.pad(x, [(0, max_len - len(x)), (0, 0)],
               mode="constant", constant_values=0)
    return x

class MelDataset(Dataset):
    
    def __init__(self, pth):
        self.mel_files = glob.glob(f"{BASEDIR}/*mel*")
        print(f"{len(self.mel_files)} mel-files found")
        # the files are supposed to be named accent_speaker_*.wav, 
        # e.g. australian_s02_362.wav
        self.labels = [os.path.basename(mel_file_pth).split("_")[:2] for mel_file_pth in self.mel_files]
        labels = [os.path.basename(mel_file_pth).split("_")[:2] for mel_file_pth in self.mel_files]
        
        # get accent labels, make a dict
        self.accent_labels = [l[0] for l in self.labels]
        self.accent_label_dict = {k: i for i, k in enumerate(sorted(Counter(self.accent_labels).keys()))}
        print(self.accent_label_dict)
        
        # same processing for the speakers
        self.speaker_labels = [" ".join(l) for l in self.labels]
        self.speaker_label_dict = {k: i for i, k in enumerate(sorted(Counter(self.speaker_labels).keys()))}
        print(self.speaker_label_dict)

    def __getitem__(self, i):
        return np.load(self.mel_files[i])
    
    def __len__(self):
        return len(self.mel_files)

    @staticmethod
    def batchify(dataset, bsz, shuffle=True):
        idx = list(range(len(dataset)))
        if shuffle:
            np.random.shuffle(idx)

        for begin in range(0, len(dataset), bsz):
            end = min(begin + bsz, len(dataset))
            num_elems = end - begin
    
            
            # read all the mels for this batch, find the max length
            mels = [dataset[idx[i]] for i in range(begin, end)]
            seq_lengths = torch.LongTensor([len(mel) for mel in mels])
            max_target_len = seq_lengths.max().item()
            
            
            b = np.array([_pad_2d(mel, max_target_len) for mel in mels],
                 dtype=np.float32)
            mel_batch = torch.FloatTensor(b)
            speaker_labels = torch.LongTensor([dataset.speaker_label_dict[dataset.speaker_labels[idx[i]]]\
                                               for i in range(begin, end)])
            accent_labels = torch.LongTensor([dataset.accent_label_dict[dataset.accent_labels[idx[i]]]\
                                              for i in range(begin, end)])
            
            
            seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
            
            
            yield mel_batch[perm_idx], speaker_labels[perm_idx], accent_labels[perm_idx], seq_lengths
            
            

In [5]:
# dataset sanity checks
dataset = MelDataset(BASEDIR)

# check shape of one mel file
print(dataset[0].shape)

# dataloader check
dataloader = MelDataset.batchify(dataset, 32)

# check two batches for correct batchification:
for _ in range(2):
    mel, speaker, accent, input_lengths = next(dataloader)
    print(mel.shape, speaker.shape, accent.shape)

14999 mel-files found
{'american': 0, 'australian': 1, 'bangla': 2, 'british': 3, 'indian': 4, 'malayalam': 5, 'odiya': 6, 'telugu': 7, 'welsh': 8}
{'american s01': 0, 'american s02': 1, 'american s03': 2, 'american s04': 3, 'american s05': 4, 'american s06': 5, 'american s07': 6, 'american s08': 7, 'australian s01': 8, 'australian s02': 9, 'bangla s01': 10, 'bangla s02': 11, 'british s01': 12, 'british s02': 13, 'indian s01': 14, 'indian s02': 15, 'malayalam s01': 16, 'malayalam s02': 17, 'malayalam s03': 18, 'odiya s01': 19, 'telugu s01': 20, 'telugu s02': 21, 'welsh s01': 22}
(154, 80)
torch.Size([32, 293, 80]) torch.Size([32]) torch.Size([32])
torch.Size([32, 276, 80]) torch.Size([32]) torch.Size([32])


#### Model

In [6]:
def get_1dconv(in_channels, out_channels, max_pool=False):
    return nn.Sequential(nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1),
                      nn.ELU(),
                      nn.BatchNorm1d(out_channels),
                      nn.MaxPool1d(3, stride=2) if max_pool else nn.Identity(),
                      nn.Dropout(p=0.1))

In [7]:
class MelClassifier(nn.Module):
    def __init__(self, 
                 num_class,
                 mel_spectogram_dim: int = 80,
                 gru_hidden_size=32,
                 gru_num_layers=2):
        super(MelClassifier, self).__init__()
        self.num_class = num_class
        self.conv_blocks = nn.Sequential(
                        get_1dconv(in_channels=mel_spectogram_dim, out_channels=64),
                        get_1dconv(in_channels=64, out_channels=128),
                        get_1dconv(in_channels=128, out_channels=128, max_pool=True),
                        get_1dconv(in_channels=128, out_channels=128, max_pool=True),
                        get_1dconv(in_channels=128, out_channels=128, max_pool=True))
            
        self.gru = nn.GRU(input_size=128, hidden_size=gru_hidden_size, num_layers=gru_num_layers,\
                          bidirectional=True, batch_first=True, dropout=0.3)
        num_directions = 2
        self.mlp = nn.Linear(gru_hidden_size * gru_num_layers * num_directions, self.num_class)

    def forward(self, mel_batch, input_lengths):
        batch_size = len(mel_batch)
        # mel_batch -> (batch_size, max_time_step, 80)
        conv_output = self.conv_blocks(mel_batch.permute(0, 2, 1)).permute(0, 2, 1)
        # conv_output -> (batch_size, max_time_step, 32)

        output, h_n = self.gru(conv_output)
        # h_n -> (4, batch_size, 32)
        
        h_n = h_n.permute(1, 0, 2).reshape(batch_size, -1)
        return h_n, self.mlp(h_n)
        

#### Training loop

In [8]:
device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

In [9]:
model = MelClassifier(len(dataset.speaker_label_dict)).to(device)

In [10]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3,
                                    betas=(0.9, 0.99),
                                    eps=1e-6,
                                    weight_decay=0.01)

In [11]:
writer = SummaryWriter(LOGDIR)

In [12]:
args = ("rm", "-rf", f"{LOGDIR}/*")
subprocess.call("%s %s %s" % args, shell=True)

0

In [13]:
num_epochs = 10
dataset = MelDataset(BASEDIR)
loss_func = nn.CrossEntropyLoss()

losses = []
accuracy = []
for epoch in range(num_epochs):
    
    dataloader = MelDataset.batchify(dataset, 32)
    
    # training
    for i, (mels, speakers, accents, input_lengths) in enumerate(dataloader):
        mels = mels.to(device)
        speakers = speakers.to(device)
        accents = accents.to(device)
        optimizer.zero_grad()
        
        h_n, logits = model(mels, input_lengths)
        loss = loss_func(logits, speakers).mean()
        accuracy.append(sum(torch.argmax(logits, dim=1) == speakers).item() * 100. / len(accents))
    
        
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        if i % 50 == 0:
            print(f"Epoch = {epoch} iter = {i} Loss = {round(np.array(losses).mean(), 2)} Acc = {round(np.array(accuracy).mean(), 2)}")
            losses = []
            
    print("Extracting Embedding")
    dataloader = MelDataset.batchify(dataset, 32)
    metadata_speaker = []
    metadata_accent = []   
    embeddings = []

    for i, (mels, speakers, accents, input_lengths) in tqdm(enumerate(dataloader), total=len(dataset) // 32):

        metadata_speaker += speakers.numpy().tolist()
        metadata_accent += accents.numpy().tolist()
        mels = mels.to(device)
        speakers = speakers.to(device)
        accents = accents.to(device)

        with torch.no_grad():
            h_n, logits = model(mels, input_lengths)
            embeddings.append(h_n.cpu())

    embeddings = torch.cat(embeddings, dim=0)
    writer.add_embedding(tag="accent",
                        mat=embeddings,
                        global_step=epoch,
                        metadata=metadata_accent)
    writer.add_embedding(tag="speaker",
            mat=embeddings,
            global_step=epoch,
            metadata=metadata_speaker)

14999 mel-files found
{'american': 0, 'australian': 1, 'bangla': 2, 'british': 3, 'indian': 4, 'malayalam': 5, 'odiya': 6, 'telugu': 7, 'welsh': 8}
{'american s01': 0, 'american s02': 1, 'american s03': 2, 'american s04': 3, 'american s05': 4, 'american s06': 5, 'american s07': 6, 'american s08': 7, 'australian s01': 8, 'australian s02': 9, 'bangla s01': 10, 'bangla s02': 11, 'british s01': 12, 'british s02': 13, 'indian s01': 14, 'indian s02': 15, 'malayalam s01': 16, 'malayalam s02': 17, 'malayalam s03': 18, 'odiya s01': 19, 'telugu s01': 20, 'telugu s02': 21, 'welsh s01': 22}
Epoch = 0 iter = 0 Loss = 3.1 Acc = 9.38
Epoch = 0 iter = 50 Loss = 2.64 Acc = 25.0
Epoch = 0 iter = 100 Loss = 1.86 Acc = 36.14
Epoch = 0 iter = 150 Loss = 1.36 Acc = 45.9
Epoch = 0 iter = 200 Loss = 0.97 Acc = 53.89
Epoch = 0 iter = 250 Loss = 0.67 Acc = 60.27
Epoch = 0 iter = 300 Loss = 0.55 Acc = 64.87
Epoch = 0 iter = 350 Loss = 0.41 Acc = 68.52
Epoch = 0 iter = 400 Loss = 0.33 Acc = 71.59
Epoch = 0 iter =

  1%|          | 4/468 [00:00<00:12, 38.11it/s]

Extracting Embedding


469it [00:16, 25.39it/s]                         


Epoch = 1 iter = 0 Loss = 0.31 Acc = 74.71
Epoch = 1 iter = 50 Loss = 0.22 Acc = 76.63
Epoch = 1 iter = 100 Loss = 0.24 Acc = 78.07
Epoch = 1 iter = 150 Loss = 0.2 Acc = 79.37
Epoch = 1 iter = 200 Loss = 0.18 Acc = 80.56
Epoch = 1 iter = 250 Loss = 0.17 Acc = 81.6
Epoch = 1 iter = 300 Loss = 0.12 Acc = 82.6
Epoch = 1 iter = 350 Loss = 0.13 Acc = 83.42
Epoch = 1 iter = 400 Loss = 0.14 Acc = 84.13
Epoch = 1 iter = 450 Loss = 0.15 Acc = 84.76


  1%|          | 4/468 [00:00<00:12, 36.10it/s]

Extracting Embedding


469it [00:16, 28.08it/s]                         


Epoch = 2 iter = 0 Loss = 0.12 Acc = 85.01
Epoch = 2 iter = 50 Loss = 0.1 Acc = 85.63
Epoch = 2 iter = 100 Loss = 0.11 Acc = 86.18
Epoch = 2 iter = 150 Loss = 0.08 Acc = 86.73
Epoch = 2 iter = 200 Loss = 0.07 Acc = 87.23
Epoch = 2 iter = 250 Loss = 0.08 Acc = 87.67
Epoch = 2 iter = 300 Loss = 0.07 Acc = 88.07
Epoch = 2 iter = 350 Loss = 0.05 Acc = 88.49
Epoch = 2 iter = 400 Loss = 0.11 Acc = 88.77
Epoch = 2 iter = 450 Loss = 0.07 Acc = 89.1


  1%|          | 3/468 [00:00<00:20, 22.77it/s]

Extracting Embedding


469it [00:18, 25.65it/s]                         


Epoch = 3 iter = 0 Loss = 0.08 Acc = 89.22
Epoch = 3 iter = 50 Loss = 0.07 Acc = 89.52
Epoch = 3 iter = 100 Loss = 0.08 Acc = 89.78
Epoch = 3 iter = 150 Loss = 0.07 Acc = 90.04
Epoch = 3 iter = 200 Loss = 0.05 Acc = 90.31
Epoch = 3 iter = 250 Loss = 0.05 Acc = 90.57
Epoch = 3 iter = 300 Loss = 0.04 Acc = 90.81
Epoch = 3 iter = 350 Loss = 0.05 Acc = 91.02
Epoch = 3 iter = 400 Loss = 0.07 Acc = 91.22
Epoch = 3 iter = 450 Loss = 0.05 Acc = 91.42


  1%|          | 3/468 [00:00<00:18, 24.97it/s]

Extracting Embedding


469it [00:18, 25.00it/s]                         


Epoch = 4 iter = 0 Loss = 0.03 Acc = 91.51
Epoch = 4 iter = 50 Loss = 0.05 Acc = 91.68
Epoch = 4 iter = 100 Loss = 0.06 Acc = 91.84
Epoch = 4 iter = 150 Loss = 0.06 Acc = 91.99
Epoch = 4 iter = 200 Loss = 0.07 Acc = 92.13
Epoch = 4 iter = 250 Loss = 0.06 Acc = 92.27
Epoch = 4 iter = 300 Loss = 0.03 Acc = 92.42
Epoch = 4 iter = 350 Loss = 0.04 Acc = 92.56
Epoch = 4 iter = 400 Loss = 0.07 Acc = 92.68
Epoch = 4 iter = 450 Loss = 0.06 Acc = 92.79


  1%|          | 3/468 [00:00<00:17, 26.41it/s]

Extracting Embedding


469it [00:17, 27.43it/s]                         


Epoch = 5 iter = 0 Loss = 0.05 Acc = 92.84
Epoch = 5 iter = 50 Loss = 0.05 Acc = 92.96
Epoch = 5 iter = 100 Loss = 0.05 Acc = 93.07
Epoch = 5 iter = 150 Loss = 0.04 Acc = 93.19
Epoch = 5 iter = 200 Loss = 0.02 Acc = 93.31
Epoch = 5 iter = 250 Loss = 0.04 Acc = 93.42
Epoch = 5 iter = 300 Loss = 0.05 Acc = 93.52
Epoch = 5 iter = 350 Loss = 0.05 Acc = 93.61
Epoch = 5 iter = 400 Loss = 0.05 Acc = 93.69
Epoch = 5 iter = 450 Loss = 0.03 Acc = 93.78


  1%|          | 3/468 [00:00<00:19, 23.39it/s]

Extracting Embedding


469it [00:18, 25.45it/s]                         


Epoch = 6 iter = 0 Loss = 0.04 Acc = 93.81
Epoch = 6 iter = 50 Loss = 0.03 Acc = 93.9
Epoch = 6 iter = 100 Loss = 0.06 Acc = 93.98
Epoch = 6 iter = 150 Loss = 0.03 Acc = 94.07
Epoch = 6 iter = 200 Loss = 0.02 Acc = 94.16
Epoch = 6 iter = 250 Loss = 0.02 Acc = 94.24
Epoch = 6 iter = 300 Loss = 0.05 Acc = 94.31
Epoch = 6 iter = 350 Loss = 0.04 Acc = 94.38
Epoch = 6 iter = 400 Loss = 0.03 Acc = 94.45
Epoch = 6 iter = 450 Loss = 0.03 Acc = 94.52


  0%|          | 2/468 [00:00<00:23, 19.59it/s]

Extracting Embedding


469it [00:18, 26.16it/s]                         


Epoch = 7 iter = 0 Loss = 0.02 Acc = 94.55
Epoch = 7 iter = 50 Loss = 0.02 Acc = 94.63
Epoch = 7 iter = 100 Loss = 0.02 Acc = 94.7
Epoch = 7 iter = 150 Loss = 0.03 Acc = 94.76
Epoch = 7 iter = 200 Loss = 0.03 Acc = 94.82
Epoch = 7 iter = 250 Loss = 0.04 Acc = 94.88
Epoch = 7 iter = 300 Loss = 0.04 Acc = 94.93
Epoch = 7 iter = 350 Loss = 0.05 Acc = 94.97
Epoch = 7 iter = 400 Loss = 0.04 Acc = 95.03
Epoch = 7 iter = 450 Loss = 0.02 Acc = 95.08


  1%|          | 3/468 [00:00<00:18, 25.43it/s]

Extracting Embedding


469it [00:18, 25.76it/s]                         


Epoch = 8 iter = 0 Loss = 0.02 Acc = 95.11
Epoch = 8 iter = 50 Loss = 0.03 Acc = 95.16
Epoch = 8 iter = 100 Loss = 0.03 Acc = 95.21
Epoch = 8 iter = 150 Loss = 0.02 Acc = 95.26
Epoch = 8 iter = 200 Loss = 0.02 Acc = 95.32
Epoch = 8 iter = 250 Loss = 0.02 Acc = 95.36
Epoch = 8 iter = 300 Loss = 0.02 Acc = 95.41
Epoch = 8 iter = 350 Loss = 0.02 Acc = 95.46
Epoch = 8 iter = 400 Loss = 0.03 Acc = 95.51
Epoch = 8 iter = 450 Loss = 0.02 Acc = 95.55


  1%|          | 3/468 [00:00<00:20, 22.96it/s]

Extracting Embedding


469it [00:19, 24.31it/s]                         


Epoch = 9 iter = 0 Loss = 0.01 Acc = 95.57
Epoch = 9 iter = 50 Loss = 0.05 Acc = 95.6
Epoch = 9 iter = 100 Loss = 0.03 Acc = 95.65
Epoch = 9 iter = 150 Loss = 0.01 Acc = 95.69
Epoch = 9 iter = 200 Loss = 0.04 Acc = 95.72
Epoch = 9 iter = 250 Loss = 0.02 Acc = 95.77
Epoch = 9 iter = 300 Loss = 0.02 Acc = 95.8
Epoch = 9 iter = 350 Loss = 0.03 Acc = 95.84
Epoch = 9 iter = 400 Loss = 0.03 Acc = 95.87
Epoch = 9 iter = 450 Loss = 0.02 Acc = 95.91


  1%|          | 4/468 [00:00<00:15, 30.09it/s]

Extracting Embedding


469it [00:18, 26.04it/s]                         
