In [1]:
import glob
import os
import random

import librosa # for mel-spectrogram estimation
import soundfile as sf # for opening .flac audio
from matplotlib import pyplot as plt
import numpy as np
import torch

from pylab import rcParams
rcParams['figure.figsize'] = 20, 40

In [43]:
def seed_torch(seed=22):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [32]:
THRESHOLD = 0.5

In [4]:
TRAIN_DIR = 'data/train'
VAL_DIR = 'data/val'

In [None]:
clean_audio, framerate = sf.read('data/audio_samples/' + '20-205-0000.flac')
noisy_audio, framerate = sf.read('data/audio_samples/' + '20-205-0000_noisy.wav')

In [None]:
print(len(clean_audio) / float(framerate)) # length in seconds
print(len(noisy_audio) / float(framerate))

In [None]:
# normalized log-mel-spectrogram of clean and noisy audios
clean_mel = 1 + np.log(1.e-12 + librosa.feature.melspectrogram(clean_audio, sr=16000, n_fft=1024, hop_length=256, fmin=20, fmax=8000, n_mels=80)).T / 10.
noisy_mel = 1 + np.log(1.e-12 + librosa.feature.melspectrogram(noisy_audio, sr=16000, n_fft=1024, hop_length=256, fmin=20, fmax=8000, n_mels=80)).T / 10.

In [None]:
plt.imshow(clean_mel.T);

In [None]:
plt.imshow(noisy_mel.T);

In [5]:
def prepare_data(records):
    max_length = max(record.shape[0] for record in records)
    X = np.empty(shape=(len(records), input_length, 80))
    for i, record in enumerate(records):
        if len(record) < max_length:
            max_offset = max_length - len(record)
            offset = np.random.randint(max_offset) if max_offset else max_offset
            record = np.pad(record, ((max_offset, 0), (0, 0)), "constant")
        X[i,] = record
    return X

In [6]:
def get_batch(directory):
    clean_path = os.path.join(directory, 'clean')
    noisy_path = os.path.join(directory, 'noisy')
    for subdir in os.listdir(clean_path):
        clean_samples = [np.load(f) for f in glob.glob(os.path.join(clean_path, '{}/*.npy'.format(subdir)))]
        clean_lables = np.zeros(len(batch))
        yield clean_samples, clean_lables
        noisy_samples = [np.load(f) for f in glob.glob(os.path.join(noisy_path, '{}/*.npy'.format(subdir)))]
        noisy_lables = np.ones(len(noisy_samples))
        yield noisy_samples, noisy_lables

In [28]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim)
        self.dropout_layer = torch.nn.Dropout(p=0.2)
        self.hidden2out = torch.nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        outputs, (ht, ct) = self.lstm(sentence.view(sentence.shape[1], sentence.shape[0], self.embedding_dim))
        output = self.dropout_layer(ht[-1])
        output = self.hidden2out(output)
        return torch.sigmoid(output)

In [8]:
def train_model(model, loss, optimizer, num_epochs, scheduler=None):
    loss_history = []
    train_history = []
    val_history = []
    for epoch in range(num_epochs):
        model.train()
        
        loss_accum = 0
        correct_samples = 0
        total_samples = 0
        for batch, lables in get_batch(TRAIN_DIR):
            optimizer.zero_grad()
            x_gpu = torch.tensor(prepare_data(batch), dtype=torch.float32).cuda()
            y_gpu = torch.tensor(lables, dtype=torch.float32).view(len(lables), 1).cuda()
            prediction = model(x_gpu)
            loss_value = loss(prediction, y_gpu)
            loss_value.backward()

            optimizer.step()

            indices = (prediction > THRESHOLD).float()
            correct_samples += torch.sum(indices == y_gpu)
            loss_accum += loss_value
                
            total_samples += len(batch)
            
        ave_loss = loss_accum / total_samples
        loss_history.append(float(ave_loss))
        
        train_accuracy = float(correct_samples) / total_samples
        train_history.append(train_accuracy)
        
        val_accuracy = compute_accuracy(model)
        val_history.append(val_accuracy)
        
        print("Average loss: {}, Train accuracy: {}, Val accuracy: {}".format(
            ave_loss, train_accuracy, val_accuracy))
        
        if scheduler:
            scheduler.step()
    return loss_history, train_history, val_history

In [9]:
def compute_accuracy(model):
    model.eval()
    
    accuracy = 0
    correct_samples = 0
    total_samples = 0
    for batch, lables in get_batch(VAL_DIR):
        x_gpu = torch.tensor(prepare_data(batch), dtype=torch.float32).cuda()
        y_gpu = torch.tensor(lables, dtype=torch.float32).view(len(lables), 1).cuda()
        prediction = model(x_gpu)

        indices = (prediction > THRESHOLD).float()
        correct_samples += torch.sum(indices == y_gpu)
            
        total_samples += len(batch)
    if total_samples:
        accuracy = correct_samples.data.float() / total_samples
    return accuracy

In [50]:
seed_torch()
model = LSTMTagger(80, 14, 1).cuda()
loss_function = nn.BCELoss().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
loss_history, train_history, val_history = train_model(model, loss_function, optimizer, 20, scheduler)

Average loss: 0.017475, Train accuracy: 0.903167, Val accuracy: 0.929250
Average loss: 0.013535, Train accuracy: 0.920667, Val accuracy: 0.949250
Average loss: 0.011492, Train accuracy: 0.940125, Val accuracy: 0.969000
Average loss: 0.008295, Train accuracy: 0.953875, Val accuracy: 0.964000
Average loss: 0.008262, Train accuracy: 0.955500, Val accuracy: 0.973750
Average loss: 0.006170, Train accuracy: 0.975375, Val accuracy: 0.985250
Average loss: 0.004460, Train accuracy: 0.982792, Val accuracy: 0.985500
Average loss: 0.004288, Train accuracy: 0.987458, Val accuracy: 0.991750
Average loss: 0.003578, Train accuracy: 0.990417, Val accuracy: 0.987750
Average loss: 0.004559, Train accuracy: 0.983375, Val accuracy: 0.990250
Average loss: 0.002984, Train accuracy: 0.991792, Val accuracy: 0.996500
Average loss: 0.002031, Train accuracy: 0.994625, Val accuracy: 0.989500
Average loss: 0.002782, Train accuracy: 0.989125, Val accuracy: 0.993250
Average loss: 0.001641, Train accuracy: 0.995958, V

In [92]:
def test_model(model, mel):
    x_gpu = torch.tensor(mel, dtype=torch.float32).cuda()
    prediction = model(x_gpu.unsqueeze(0))
    return (prediction > THRESHOLD).item()

In [93]:
test_model(model, test_mel)

0