## Preprocessing (run once unless data already preprocessed)

### Utility functions

In [80]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#import matplotlib.pyplot as plt
#import torchaudio
import torch
import numpy as np
import pandas as pd
import os
import pickle
import re
#import torchaudio.transforms as T
import math
#import librosa
#import librosa.display
#import matplotlib.patches as patches
from glob import glob

# Seed PyTorch's global random number generator with a fixed value.
torch.manual_seed(1)

def convert_to_ms(times_list):
    """Scale every (start, end) pair by 1000 and truncate to integers.

    Going by the function name, the inputs are presumably seconds and the
    outputs milliseconds.  ``int`` truncates toward zero, it does not round.

    Args:
        times_list: iterable of indexable pairs ``(start, end)``.

    Returns:
        A new list of two-element lists ``[int(start*1000), int(end*1000)]``.
    """
    return [[int(span[0] * 1000), int(span[1] * 1000)] for span in times_list]

def load_data(pkl_path = '/project/graziul/ra/ajays/whitelisted_vad_dict.pkl'):
    """Build MFCC inputs and frame labels for the files listed in a VAD pickle.

    Args:
        pkl_path: path of a pickled dict keyed by audio file name; each entry
            must be usable by ``audio_file.get_slices``.

    Returns:
        ``(input_list, labels_list)``: the per-file ``get_split_mfcc()``
        tensors concatenated along dim 0 with dims 1 and 2 swapped, and the
        per-file ``get_split_labels()`` arrays concatenated along axis 0 as a
        float tensor.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # pkl_path at files from a trusted source.
    # The context manager closes the file even if unpickling fails.
    with open(pkl_path, 'rb') as file:
        vad_dict = pickle.load(file)
    input_list = []
    labels_list = []

    for idx, key in enumerate(vad_dict):
        print(idx)
        # Safety cap; in practice every key is processed.
        if idx > 100000000:
            break
        a = audio_file(key)
        a.get_slices(vad_dict)
        # Order matters: the MFCCs must exist before the frame counts are
        # sized from them, and the labels are derived from the frame counts.
        input_list.append(a.get_split_mfcc())
        a.get_split_frames()
        labels_list.append(a.get_split_labels())
    input_list = torch.cat(input_list)
    input_list = torch.transpose(input_list, 1, 2)
    labels_list = torch.from_numpy(np.concatenate(labels_list, axis = 0)).float()
    return input_list, labels_list

def load_data_limit(k=10000000,pkl_path = '/project/graziul/ra/ajays/whitelisted_vad_dict.pkl'):
    """Like ``load_data`` but stop after the first ``k`` keys of the pickle.

    Args:
        k: number of dictionary entries to process (in iteration order).
        pkl_path: path of a pickled dict keyed by audio file name; each entry
            must be usable by ``audio_file.get_slices``.

    Returns:
        ``(input_list, labels_list)``: the per-file ``get_split_mfcc()``
        tensors concatenated along dim 0 with dims 1 and 2 swapped, and the
        per-file ``get_split_labels()`` arrays concatenated along axis 0 as a
        float tensor.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # pkl_path at files from a trusted source.
    # The context manager closes the file even if unpickling fails.
    with open(pkl_path, 'rb') as file:
        vad_dict = pickle.load(file)
    input_list = []
    labels_list = []

    for idx, key in enumerate(vad_dict):
        print(idx)
        if idx == k:
            break
        a = audio_file(key)
        a.get_slices(vad_dict)
        # Order matters: the MFCCs must exist before the frame counts are
        # sized from them, and the labels are derived from the frame counts.
        input_list.append(a.get_split_mfcc())
        a.get_split_frames()
        labels_list.append(a.get_split_labels())
    input_list = torch.cat(input_list)
    input_list = torch.transpose(input_list, 1, 2)
    labels_list = torch.from_numpy(np.concatenate(labels_list, axis = 0)).float()
    return input_list, labels_list

def divide_audio(datafile, div_size = 30):
    """Split a (1, feature_length, time) array into consecutive time chunks.

    With the default ``div_size`` a 30-minute recording yields one-minute
    pieces.  Chunk ``c`` of the result holds time steps
    ``[c * chunk_len, (c + 1) * chunk_len)`` of every feature row.

    A plain ``reshape`` to ``(div_size, feature_length, chunk_len)`` does not
    do this: it walks memory in row-major order and therefore mixes feature
    rows into the chunks.  The time axis has to be split first and the chunk
    axis moved to the front afterwards.

    Args:
        datafile: array-like of shape ``(1, feature_length, time)``.
        div_size: number of chunks; must divide ``time`` exactly.

    Returns:
        ``numpy.ndarray`` of shape ``(div_size, feature_length, time // div_size)``.

    Raises:
        ValueError: if ``datafile`` does not have shape ``(1, F, T)`` or ``T``
            is not a multiple of ``div_size``.
    """
    array = np.asarray(datafile)
    if array.ndim != 3 or array.shape[0] != 1:
        raise ValueError(
            'expected an array of shape (1, feature_length, time), got {}'.format(array.shape)
        )
    _, n_features, n_steps = array.shape
    if n_steps % div_size != 0:
        raise ValueError(
            'time axis of length {} is not divisible by div_size={}'.format(n_steps, div_size)
        )
    chunk_len = n_steps // div_size
    # (F, T) -> (F, div, chunk) keeps each feature row's samples in time order;
    # moving the chunk axis to the front then yields (div, F, chunk).
    chunks = array.reshape(n_features, div_size, chunk_len)
    return np.ascontiguousarray(chunks.transpose(1, 0, 2))

class audio_file():
    """One recording with its voice-activity slices, MFCCs and frame labels.

    Call order used by ``load_data``: ``get_slices`` -> ``get_split_mfcc`` ->
    ``get_split_frames`` -> ``get_split_labels``.  ``get_mfcc`` /
    ``get_frames`` / ``get_labels`` are the counterparts for the unsplit
    recording.
    """

    # Directory prepended to ``name`` for old-format entries (flag != 1).
    _LEGACY_AUDIO_DIR = '/project/graziul/data/Zone1/2018_08_04/'
    # Every recording is zero-padded or clipped to this many seconds.
    _MAX_SECONDS = 1800
    # MFCC settings.  The hop length is also the number of samples that the
    # frame-labelling code attributes to one frame.
    _HOP_LENGTH = 220
    _N_FFT = 2048
    _N_MELS = 40
    _N_MFCC = 40

    def __init__(self, name,new_flag = 1):
        """Store the file name and format flag; nothing is loaded yet.

        Args:
            name: key of this recording in the VAD dictionary; for
                ``new_flag == 1`` it is also used as the audio path.
            new_flag: 1 for the new dictionary layout, anything else for the
                old ``'pydub'`` layout.
        """
        self.name = name
        self.vad_slices = None
        self.frames = None
        self.frames_labels = None
        self.mfcc = None
        self.waveform = None
        self.clip_size = None
        self.n_clips = 300
        self.sample_rate = 22050  # replaced by the file's own rate on load
        self.flag = new_flag

    def get_slices(self, vad_dict):
        """Look up this file's non-silent slices in ``vad_dict`` and cache them.

        The slices are later read as ``(start, end)`` pairs and multiplied by
        ``sample_rate / 1000``, i.e. they are treated as milliseconds.
        """
        if self.flag == 1:
            self.vad_slices = vad_dict[self.name]['nonsilent_slices']
        else:
            self.vad_slices = vad_dict[self.name]['pydub'][-24]['nonsilent_slices']
        return self.vad_slices

    def _count_frame_hits(self, n_frames):
        """Count, per frame, how many non-silent samples are attributed to it.

        A sample ``i`` lies in hop ``n = i // hop`` at offset ``j = i % hop``.
        It is credited to frames ``n`` and ``n - 1``, and also to ``n - 2``
        when ``j <= hop // 2``.  Hops past the last frame are credited to the
        last frame; frames before index 0 do not exist and are skipped, so
        speech at the very start no longer wraps around to the final frames.

        The work is done per hop rather than per sample: for each hop the
        number of covered samples (and of those with ``j <= hop // 2``) is
        computed arithmetically, which gives the same counts as visiting
        every sample individually.

        NOTE(review): an earlier per-sample version also carried branches for
        offsets of 221 and above.  They could never run because the offset is
        taken modulo the hop length, so they are not reproduced here.
        Confirm whether a different frame-overlap rule was intended.
        """
        hop = self._HOP_LENGTH
        ms_2_sample = self.sample_rate / 1000
        counts = np.zeros(n_frames)
        last_frame = n_frames - 1

        for v in self.vad_slices:
            start = math.floor(v[0] * ms_2_sample)
            end = math.ceil(v[1] * ms_2_sample)
            if end <= start:
                continue
            hops = np.arange(start // hop, (end - 1) // hop + 1)
            hop_start = hops * hop
            first_sample = np.maximum(hop_start, start)
            # Samples of [start, end) that fall inside each hop ...
            n_all = np.minimum(hop_start + hop, end) - first_sample
            # ... and those among them whose offset is <= hop // 2.
            n_near = np.clip(
                np.minimum(hop_start + hop // 2 + 1, end) - first_sample, 0, None
            )
            frame = np.minimum(hops, last_frame)
            for back, hits in ((0, n_all), (1, n_all), (2, n_near)):
                target = frame - back
                valid = target >= 0
                # add.at accumulates correctly when clamping maps several
                # hops onto the same frame.
                np.add.at(counts, target[valid], hits[valid])
        return counts

    def get_frames(self):
        """Per-frame non-silent sample counts for the unsplit MFCC.

        Requires ``get_mfcc`` and ``get_slices`` to have run.  Returns a 1-D
        array with one entry per step of ``self.mfcc``'s last axis.
        """
        self.frames = self._count_frame_hits(self.mfcc.shape[2])
        return self.frames

    def get_split_frames(self):
        """Per-frame non-silent sample counts arranged as one row per clip.

        Requires ``get_split_mfcc`` and ``get_slices`` to have run.  Returns
        an array of shape ``(n_clips, clip_size)`` where ``clip_size`` is the
        length of ``self.mfcc``'s last axis.
        """
        self.clip_size = self.mfcc.shape[2]
        frames_array = self._count_frame_hits(self.clip_size * self.n_clips)
        print(frames_array.shape)
        self.frames = frames_array.reshape(self.n_clips, self.clip_size)
        return self.frames

    def get_labels(self):
        """Binary labels (1.0 where the frame count is positive) for ``self.frames``."""
        self.frames_labels = (self.frames > 0).astype(np.float64)
        return self.frames_labels

    def get_split_labels(self):
        """Binary labels with the same shape and dtype as ``self.frames``."""
        self.frames_labels = (self.frames > 0).astype(self.frames.dtype)
        return self.frames_labels

    def _audio_path(self):
        """Path of the audio file: ``name`` itself for flag 1, else under the legacy directory."""
        if self.flag == 1:
            return self.name
        return self._LEGACY_AUDIO_DIR + self.name

    def _load_waveform(self):
        """Load the audio and zero-pad or clip it to ``_MAX_SECONDS`` seconds.

        Updates ``self.sample_rate`` with the rate reported by the file and
        stores the ``(1, _MAX_SECONDS * sample_rate)`` result in
        ``self.waveform``.  Assumes single-channel audio; TODO confirm.
        """
        # Imported here because the module-level torchaudio import is
        # commented out, so the rest of the notebook runs without it.
        import torchaudio

        waveform, self.sample_rate = torchaudio.load(self._audio_path())
        target_len = self._MAX_SECONDS * self.sample_rate
        # Allocate only what is kept instead of padding to a much longer
        # buffer and slicing it afterwards.
        padded = torch.zeros((1, target_len))
        n_copy = min(waveform.shape[1], target_len)
        padded[:, :n_copy] = waveform[:, :n_copy]
        self.waveform = padded
        return self.waveform

    def _mfcc_transform(self):
        """Build the MFCC transform used by both ``get_mfcc`` and ``get_split_mfcc``."""
        import torchaudio.transforms as T

        # No window length is passed, so torchaudio's default applies
        # (presumably n_fft; verify against the torchaudio docs).
        return T.MFCC(
            sample_rate=self.sample_rate,
            n_mfcc=self._N_MFCC,
            melkwargs={
                'n_fft': self._N_FFT,
                'n_mels': self._N_MELS,
                'hop_length': self._HOP_LENGTH,
                'mel_scale': 'htk',
            }
        )

    def get_mfcc(self):
        """Compute MFCCs of the whole padded/clipped recording and cache them."""
        waveform = self._load_waveform()
        self.mfcc = self._mfcc_transform()(waveform)
        return self.mfcc

    def get_split_mfcc(self):
        """Compute MFCCs separately for ``n_clips`` equal consecutive pieces.

        The per-clip results are concatenated along dim 0, so the cached
        tensor has one leading entry per clip.
        """
        waveform = self._load_waveform()
        clip_size = waveform.shape[1] // self.n_clips
        transform = self._mfcc_transform()
        mfcc_list = [
            transform(waveform[:, i * clip_size:(i + 1) * clip_size])
            for i in range(self.n_clips)
        ]
        self.mfcc = torch.cat(mfcc_list)
        return self.mfcc

### Preprocess the BPC data

In [None]:
# Build the feature and label tensors with load_data_limit()'s default
# arguments and write them to 'bpc_input' / 'bpc_labels' in the working
# directory, where the later cells read them back with torch.load.
input_list, labels_list = load_data_limit()
torch.save(input_list, 'bpc_input')
torch.save(labels_list, 'bpc_labels')

## Simple LSTM

### Load the BPC data and select the training set

In [38]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

# Data-loader settings and the layout of the saved tensors.
shuffle = False
batch_size = 100
sequence_length = 602  # frames per truncated segment
input_size = 40        # MFCC coefficients per frame
train_end = 80         # segments [0, train_end) form the training set
data_size = sequence_length*train_end  # total number of training frames

# Tensors written by the preprocessing cell.
input_list = torch.load('bpc_input')
labels_list = torch.load('bpc_labels')

# Flatten the selected segments so that every frame is one dataset item.
train_data = TensorDataset(
    input_list[:train_end].reshape(train_end*sequence_length, input_size),
    labels_list[:train_end].reshape(train_end*sequence_length),
)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)

### Define the model

In [39]:
import torch 
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Network size and optimisation settings for the simple LSTM.
hidden_size, num_layers, num_classes = 128, 2, 2
num_epochs, learning_rate = 2, 0.001

class RNN(nn.Module):
    """Stacked LSTM followed by a linear layer applied at every time step."""

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        """
        Args:
            input_size: number of features per time step.
            hidden_size: width of each LSTM layer.
            num_layers: number of stacked LSTM layers.
            num_classes: size of the per-step output vector.
        """
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class scores for every time step of ``x``.

        Args:
            x: ``(seq_length, input_size)`` for a single unbatched sequence or
                ``(batch, seq_length, input_size)`` for a batch.

        Returns:
            Tensor shaped like ``x`` with the last dimension replaced by
            ``num_classes``.
        """
        # No explicit (h0, c0): nn.LSTM then starts from zero states of the
        # right shape for batched or unbatched input, on the input's own
        # device and dtype.
        out, _ = self.lstm(x)

        # The linear layer is applied to every time step, not only the last.
        return self.fc(out)

# Build the simple LSTM and place it on the selected device.
model = RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
)
model = model.to(device)

# Cross-entropy over the class scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

### Train the model

In [40]:
total_step = len(train_loader)
model.train()  # undo a previous model.eval() if this cell is re-run
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (mfccs, labels) in enumerate(train_loader):
        # Each batch is a 2-D (frames, features) tensor.  Inputs and labels
        # must sit on the same device as the model.
        mfccs = mfccs.to(device)
        labels = labels.long().to(device)
        # Forward pass
        outputs = model(mfccs)
        loss = criterion(outputs, labels)
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean, so weight it by the number of
        # items actually in this batch; the last batch can be shorter than
        # batch_size.
        running_loss += loss.item() * labels.size(0)
    # Mean loss per training frame for this epoch.
    epoch_loss = running_loss / data_size
    print(epoch_loss)

0.11719018083443905
0.09855696779314739


### Select the test set

In [41]:
# Segments [test_start, test_end) of the saved tensors form the test set.
batch_size = 100
test_start, test_end = 80, 100
test_length = test_end - test_start  # number of test segments

# Flatten the selected segments so that every frame is one dataset item.
test_data = TensorDataset(
    input_list[test_start:test_end].reshape(test_length*sequence_length, input_size),
    labels_list[test_start:test_end].reshape(test_length*sequence_length),
)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=shuffle)

### Test the model

In [42]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    # Per-batch results are collected in lists and joined once at the end;
    # appending to a NumPy array inside the loop copies it on every batch.
    label_chunks = []
    prediction_chunks = []
    for mfccs, labels in test_loader:
        # Inputs and labels must sit on the same device as the model.
        mfccs = mfccs.to(device)
        labels = labels.long().to(device)
        outputs = model(mfccs)
        _, predicted = torch.max(outputs, 1)
        # .cpu() is required before .numpy() when running on the GPU.
        label_chunks.append(labels.cpu().numpy())
        prediction_chunks.append(predicted.cpu().numpy())
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    # Float arrays over the whole test set, read by the metrics cell below.
    if label_chunks:
        labels_array = np.concatenate(label_chunks).astype(np.float64)
        predictions_array = np.concatenate(prediction_chunks).astype(np.float64)
    else:
        labels_array = np.empty([0])
        predictions_array = np.empty([0])
    print('Test Accuracy of the model: {} %'.format(100 * correct / total))
# Save the model checkpoint
torch.save(model.state_dict(), 'lstm.ckpt')

Test Accuracy of the model: 100.0 %


In [43]:
from sklearn.metrics import precision_recall_fscore_support
# Support-weighted precision, recall and F1 over the labels and predictions
# that the test cell collected in labels_array / predictions_array.
precision_recall_fscore_support(labels_array, predictions_array, average='weighted')

(1.0, 1.0, 1.0, None)

### Results

2 layers, 128 units, 481,600 frames (~200 mins), not shuffled, alpha=0.001, batch size 100, 2 epochs
    
    BPC precision: 0.9081431961609046, recall: 0.9077325581395349, f1: 0.9079344068144921
    ATC0 precision: 0.9488357406336964, recall: 0.9489027431421446, f1: 0.9488683496558303

2 layers, 128 units, 481,600 frames (~200 mins), shuffled, alpha=0.001, batch size 100, 2 epochs

    precision: 0.8985289906640821, recall: 0.9031395348837209, f1: 0.9002618553001477

2 layers, 128 units, 481,600 frames (~200 mins), not shuffled, alpha=0.01, batch size 100, 4 epochs
    
    precision: 0.891586545627616, recall: 0.8993023255813953, f1: 0.8934031460609343
    
2 layers, 128 units, 481,600 frames (~200 mins), shuffled, alpha=0.01, batch size 100, 4 epochs
    
    precision: 0.8988646210040573, recall: 0.9013704318936877, f1: 0.8999850823134596

## Bi-directional LSTM

### Define the model

In [75]:
# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Network size and optimisation settings for the bidirectional LSTM.
hidden_size, num_layers, num_classes = 128, 2, 2
num_epochs, learning_rate = 2, 0.001

class BiRNN(nn.Module):
    """Stacked bidirectional LSTM whose last time step feeds a linear layer."""

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        """
        Args:
            input_size: number of features per time step.
            hidden_size: width of each LSTM direction.
            num_layers: number of stacked LSTM layers.
            num_classes: size of the output vector.
        """
        super(BiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)  # 2 for bidirection

    def forward(self, x):
        """Return one score vector per sequence.

        Args:
            x: batched input of shape ``(batch, seq_length, input_size)``.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        # No explicit (h0, c0): nn.LSTM then starts from zero states on the
        # input's own device and dtype, so the module does not depend on a
        # global ``device`` variable.
        out, _ = self.lstm(x)  # (batch, seq_length, hidden_size*2)
        # Decode the output of the last time step
        return self.fc(out[:, -1, :])

# Build the bidirectional LSTM and place it on the selected device.
model = BiRNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
)
model = model.to(device)

# Cross-entropy over the class scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

### Train the model

In [76]:
total_step = len(train_loader)
model.train()  # undo a previous model.eval() if this cell is re-run
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (mfccs, labels) in enumerate(train_loader):
        # Give every frame a sequence axis of length 1: (batch, 1, features).
        # Inputs and labels must sit on the same device as the model.
        mfccs = mfccs.reshape(mfccs.shape[0],1,mfccs.shape[1]).to(device)
        labels = labels.long().to(device)
        # Forward pass
        outputs = model(mfccs)
        loss = criterion(outputs, labels)
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean, so weight it by the number of
        # items actually in this batch; the last batch can be shorter than
        # batch_size.
        running_loss += loss.item() * labels.size(0)
    # Mean loss per training frame for this epoch.
    epoch_loss = running_loss / data_size
    print(epoch_loss)

0.10596395130038865
0.08907226200261543


### Test the model

In [78]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    label_batches = []
    prediction_batches = []
    for mfccs, labels in test_loader:
        # Give every frame a sequence axis of length 1: (batch, 1, features).
        # Inputs and labels must sit on the same device as the model.
        mfccs = mfccs.reshape(mfccs.shape[0],1,mfccs.shape[1]).to(device)
        labels = labels.long().to(device)
        outputs = model(mfccs)
        _, predicted = torch.max(outputs, 1)
        label_batches.append(labels.cpu())
        prediction_batches.append(predicted.cpu())
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Test Accuracy of the model : {} %'.format(100 * correct / total))

    # The metrics cell that follows reads `labels` and `predicted`.  Inside
    # the loop they only ever hold one batch, so rebind them to the whole
    # test set (on the CPU) before leaving this cell.
    labels = torch.cat(label_batches)
    predicted = torch.cat(prediction_batches)

# Save the model checkpoint
torch.save(model.state_dict(), 'bilstm.ckpt')

Test Accuracy of the model : 100.0 %


In [79]:
from sklearn.metrics import precision_recall_fscore_support
# Support-weighted precision, recall and F1, computed on whatever the test
# cell above left in `labels` and `predicted`.
precision_recall_fscore_support(labels, predicted, average='weighted')

(1.0, 1.0, 1.0, None)