In [1]:
import glob
import os
import random

import numpy as np
import torch

In [2]:
THRESHOLD = 0.5
TRAIN_DIR = 'data/train'
VAL_DIR = 'data/val'

In [3]:
def set_seed(seed=22):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

Поскольку последовательности в батче для обучения могут быть разного размера, добавим padding до максимальной длины спектрограммы.

In [4]:
def prepare_data(records):
    max_length = max(record.shape[0] for record in records)
    embedding_dim = records[0].shape[1]
    X = np.empty(shape=(len(records), max_length, embedding_dim))
    for i, record in enumerate(records):
        if len(record) < max_length:
            max_offset = max_length - len(record)
            offset = np.random.randint(max_offset) if max_offset else max_offset
            record = np.pad(record, ((max_offset, 0), (0, 0)), "constant")
        X[i,] = record
    return X

На вход сети поочередно подаем батч с чистой речью, затем зашумленный батч

In [5]:
def get_batch(directory):
    clean_path = os.path.join(directory, 'clean')
    noisy_path = os.path.join(directory, 'noisy')
    for subdir in os.listdir(clean_path):
        clean_samples = [np.load(f) for f in glob.glob(os.path.join(clean_path, '{}/*.npy'.format(subdir)))]
        clean_lables = np.zeros(len(clean_samples))
        yield clean_samples, clean_lables
        noisy_samples = [np.load(f) for f in glob.glob(os.path.join(noisy_path, '{}/*.npy'.format(subdir)))]
        noisy_lables = np.ones(len(noisy_samples))
        yield noisy_samples, noisy_lables

Поскольку mel-спектрограмма является последовательностью данных во времени, для решения поставленной задачи применим рекуррентную архитектуру сети, а именно LSTM, с fc-слоем в конце для классификации. Чтобы избежать переобучения добавим dropout-слой. Также рекуррентная архитектура упрощает работу с последовательностями различной длины.

In [6]:
class LSTMTagger(torch.nn.Module):
    def __init__(self, embedding_dim, hidden_dim, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim)
        self.dropout_layer = torch.nn.Dropout(p=0.2)
        self.hidden2out = torch.nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        outputs, (ht, ct) = self.lstm(sentence.view(sentence.shape[1], sentence.shape[0], self.embedding_dim))
        output = self.dropout_layer(ht[-1])
        output = self.hidden2out(output)
        return torch.sigmoid(output)

In [7]:
def train_model(model, loss, optimizer, num_epochs, scheduler=None):
    loss_history = []
    train_history = []
    val_history = []
    for epoch in range(num_epochs):
        model.train()
        
        loss_accum = 0
        correct_samples = 0
        total_samples = 0
        for batch, lables in get_batch(TRAIN_DIR):
            optimizer.zero_grad()
            x_gpu = torch.tensor(prepare_data(batch), dtype=torch.float32).cuda()
            y_gpu = torch.tensor(lables, dtype=torch.float32).view(len(lables), 1).cuda()
            prediction = model(x_gpu)
            loss_value = loss(prediction, y_gpu)
            loss_value.backward()

            optimizer.step()

            indices = (prediction > THRESHOLD).float()
            correct_samples += torch.sum(indices == y_gpu)
            loss_accum += loss_value
                
            total_samples += len(batch)
            
        ave_loss = loss_accum / total_samples
        loss_history.append(float(ave_loss))
        
        train_accuracy = float(correct_samples) / total_samples
        train_history.append(train_accuracy)
        
        val_accuracy = compute_accuracy(model)
        val_history.append(val_accuracy)
        
        print("Average loss: {:.5}, Train accuracy: {:.5}, Val accuracy: {:.5}".format(
            ave_loss, train_accuracy, val_accuracy))
        
        if scheduler:
            scheduler.step()
    return loss_history, train_history, val_history

In [8]:
def compute_accuracy(model, ):
    model.eval()
    
    accuracy = 0
    correct_samples = 0
    total_samples = 0
    for batch, lables in get_batch(VAL_DIR):
        x_gpu = torch.tensor(prepare_data(batch), dtype=torch.float32).cuda()
        y_gpu = torch.tensor(lables, dtype=torch.float32).view(len(lables), 1).cuda()
        prediction = model(x_gpu)

        indices = (prediction > THRESHOLD).float()
        correct_samples += torch.sum(indices == y_gpu)
            
        total_samples += len(batch)
    if total_samples:
        accuracy = correct_samples.data.float() / total_samples
    return accuracy

В функцию тестирования нужно передать модель и mel-спектрограмму. В ответ возвращается 0 или 1, 1 - значит соответствующая спектрограмма содержит шум. 

In [9]:
def test_model(model, mel):
    x_gpu = torch.tensor(mel, dtype=torch.float32).cuda()
    prediction = model(x_gpu.unsqueeze(0))
    return (prediction > THRESHOLD).item()

Гиперпараметры модели и оптимизации подбирались по сетке до достижения приемлемого качества классификации, с ошибкой ~0.5%

In [10]:
NUM_EPOCHS = 20
EMBEDDING_DIM = 80
HIDDEN_DIM = 14

In [11]:
set_seed()
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, 1).cuda()
loss_function = torch.nn.BCELoss().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
loss_history, train_history, val_history = train_model(model, loss_function, optimizer, NUM_EPOCHS, scheduler)

Average loss: 0.017475, Train accuracy: 0.90317, Val accuracy: 0.92925
Average loss: 0.013535, Train accuracy: 0.92067, Val accuracy: 0.94925
Average loss: 0.011492, Train accuracy: 0.94012, Val accuracy: 0.969
Average loss: 0.0082945, Train accuracy: 0.95388, Val accuracy: 0.964
Average loss: 0.0082619, Train accuracy: 0.9555, Val accuracy: 0.97375
Average loss: 0.0061697, Train accuracy: 0.97537, Val accuracy: 0.98525
Average loss: 0.0044598, Train accuracy: 0.98279, Val accuracy: 0.9855
Average loss: 0.004288, Train accuracy: 0.98746, Val accuracy: 0.99175
Average loss: 0.0035776, Train accuracy: 0.99042, Val accuracy: 0.98775
Average loss: 0.0045591, Train accuracy: 0.98337, Val accuracy: 0.99025
Average loss: 0.0029836, Train accuracy: 0.99179, Val accuracy: 0.9965
Average loss: 0.0020312, Train accuracy: 0.99462, Val accuracy: 0.9895
Average loss: 0.0027822, Train accuracy: 0.98913, Val accuracy: 0.99325
Average loss: 0.0016414, Train accuracy: 0.99596, Val accuracy: 0.9935
Avera

In [24]:
clean_test_mel = np.load(os.path.join(VAL_DIR, 'clean/8897/8897_294717_8897-294717-0001.npy'))

In [25]:
noisy_test_mel = np.load(os.path.join(VAL_DIR, 'noisy/8897/8897_294717_8897-294717-0001.npy'))

In [26]:
test_model(model, clean_test_mel)

0

In [27]:
test_model(model, noisy_test_mel)

1