In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchaudio
import pandas as pd
import librosa
import numpy as np
from sklearn.pipeline import make_pipeline
import tqdm
from torch import nn
from torch.nn import functional as F
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import random
import torchaudio.transforms as transforms

In [3]:
#load data
train_dataset = torchaudio.datasets.LIBRITTS('./', url="train-clean-100", download=True)
test_dataset = torchaudio.datasets.LIBRITTS('./', url="test-clean", download=True)

In [4]:
speakers = pd.read_csv("/content/LibriTTS/speakers.tsv", sep='\t')
speakers = speakers[['READER', 'GENDER']]
speakers = speakers.to_dict().get('READER')

In [5]:
#do some checks
df = pd.read_csv("/content/LibriTTS/speakers.tsv", sep='\t')
train_speakers = set()
test_speakers = set()

for row in df.iloc:
    if row['SUBSET NAME'] == 'train-clean-100':
        train_speakers.add(row['READER'])
    if row['SUBSET NAME'] == 'test-clean':
        test_speakers.add(row['READER'])
        
train_speakers.intersection(test_speakers)  # ensure test dataset does not contain train speakers to avoid overtraining

for i in range(len(train_dataset)):
    if speakers.get(train_dataset[i][4]) is None:
        print("Warning: Not every speaker has information about gender in train dataset")

for i in range(len(test_dataset)):
    if speakers.get(test_dataset[i][4]) is None:
        print("Warning: Not every speaker has information about gender in test dataset")

In [6]:
def set_random_seed(seed):
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    
set_random_seed(42)

In [7]:
crop_size = 50000
resampled_freq = 4000

def pad_sequence(batch):
    batch = [item.t() for item in batch]
    batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.)
    return batch.permute(0, 2, 1)


def collate_fn(batch, train=False):
    tensors = []
    targets = []

    for waveform, f, _, _, s, _, _ in batch:
        w = torchaudio.transforms.Resample(f, resampled_freq)(waveform)
        if w.shape[1] > crop_size:
            st = np.random.randint(0, w.shape[1] - crop_size)
            w = w[:, st:]

        if train:
            w += np.random.normal(0, torch.std(w).item() / 10, size=w.shape)  # simple augmentation
        
        tensors.append(waveform)
        g = int(speakers[s] == 'M')
        t = torch.LongTensor([0, 0])
        t[g] = 1
        targets.append(g)
        
    tensors = pad_sequence(tensors)
    targets = torch.LongTensor(targets)
    # targets = torch.stack(targets)
    # print(targets)

    return tensors, targets

In [8]:
batch_size = 128

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda batch: collate_fn(batch, train=True))
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [9]:
# https://pytorch.org/tutorials/intermediate/speech_command_classification_with_torchaudio_tutorial.html

class Model(nn.Module):
    def __init__(self, n_output=35, stride=16, n_channel=32, reception_field=80):
        super().__init__()
        self.conv1 = nn.Conv1d(1, n_channel, kernel_size=reception_field, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(2 * n_channel)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(2 * n_channel)
        self.pool4 = nn.MaxPool1d(4)
        self.fc1 = nn.Linear(2 * n_channel, n_output)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = F.avg_pool1d(x, x.shape[-1])
        x = x.permute(0, 2, 1)
        x = self.fc1(x)
        return F.softmax(x, dim=2)

In [10]:
from IPython.display import clear_output

def train_one_epoch(model, train_dataloader, criterion, optimizer, device="cuda:0"):
    model.to(device).train()
    cum_loss = 0
    n_objects = 0
    for features, y in tqdm.notebook.tqdm(train_dataloader):
        preds = model(features.to(device)).squeeze()
        loss = criterion(preds, y.to(device))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        cum_loss += loss.detach().cpu().numpy() * features.shape[0]
        n_objects += features.shape[0]
        
    return cum_loss / n_objects


def predict(model, test_dataloder, criterion, device="cuda:0"):
    model.to(device).eval()
    with torch.no_grad():        
        predicts = torch.tensor([])
        true_values = torch.tensor([])
        cum_loss = 0
        n_objects = 0
        for features, y in tqdm.notebook.tqdm(test_dataloder):
            cur = model(features.to(device)).cpu().squeeze()
            predicts = torch.cat([predicts, np.argmax(cur, axis=1)])
            true_values = torch.cat([true_values, y])
            n_objects += features.shape[0]
            cum_loss += criterion(cur, y).item() * features.shape[0]
        
        return cum_loss / n_objects, predicts, true_values
    

def train(model, train_dataloader, test_dataloader, criterion, optimizer, device="cuda:0", n_epochs=10, scheduler=None):
    model.to(device)
    for epoch in range(n_epochs):
        print('Train')
        train_loss = train_one_epoch(model, train_dataloader, criterion, optimizer, device)
        print('Evaluate')
        val_loss, predicted, true = predict(model, test_dataloader, criterion, device)
        if scheduler is not None:
            scheduler.step(val_loss)
            
        accuracy = accuracy_score(predicted, true)
        print('Epoch {}, val loss {:.3f}, train loss {:.3f}, accuracy {:.3f}'\
              .format(epoch + 1, val_loss, train_loss, accuracy))

In [11]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda:0


In [12]:
model = Model(n_output=2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.7, verbose=True)

In [13]:
train(model, train_dataloader, test_dataloader, criterion, optimizer, device, 1, scheduler)

Train


  0%|          | 0/260 [00:00<?, ?it/s]

Evaluate


  0%|          | 0/38 [00:00<?, ?it/s]

Epoch 1, val loss 0.386, train loss 0.438, accuracy 0.946


In [14]:
train(model, train_dataloader, test_dataloader, criterion, optimizer, device, 1, scheduler)

Train


  0%|          | 0/260 [00:00<?, ?it/s]

Evaluate


  0%|          | 0/38 [00:00<?, ?it/s]

Epoch 1, val loss 0.404, train loss 0.372, accuracy 0.898


In [15]:
train(model, train_dataloader, test_dataloader, criterion, optimizer, device, 1, scheduler)

Train


  0%|          | 0/260 [00:00<?, ?it/s]

Evaluate


  0%|          | 0/38 [00:00<?, ?it/s]

Epoch 1, val loss 0.390, train loss 0.360, accuracy 0.924


In [16]:
train(model, train_dataloader, test_dataloader, criterion, optimizer, device, 1, scheduler)

Train


  0%|          | 0/260 [00:00<?, ?it/s]

Evaluate


  0%|          | 0/38 [00:00<?, ?it/s]

Epoch 1, val loss 0.369, train loss 0.353, accuracy 0.958


In [17]:
train(model, train_dataloader, test_dataloader, criterion, optimizer, device, 1, scheduler)

Train


  0%|          | 0/260 [00:00<?, ?it/s]

Evaluate


  0%|          | 0/38 [00:00<?, ?it/s]

Epoch 1, val loss 0.362, train loss 0.348, accuracy 0.964


In [18]:
train(model, train_dataloader, test_dataloader, criterion, optimizer, device, 1, scheduler)

Train


  0%|          | 0/260 [00:00<?, ?it/s]

Evaluate


  0%|          | 0/38 [00:00<?, ?it/s]

Epoch 1, val loss 0.342, train loss 0.344, accuracy 0.976


In [19]:
val_loss, predicted, true = predict(model, test_dataloader, criterion, device)
print('Accuracy:', accuracy_score(predicted, true))

  0%|          | 0/38 [00:00<?, ?it/s]

Accuracy: 0.9745710150919992
