### Preprocessing

In [None]:
from sklearn.utils import shuffle
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import os
import pickle
import re
import math
from glob import glob

In [None]:
def convert_to_ms(times_list):
    '''Convert (start, end) pairs from seconds to whole milliseconds.

    Only the first two entries of every element are used; each is multiplied
    by 1000 and truncated with ``int``. Returns a new list of
    ``[start_ms, end_ms]`` lists in the same order as the input.
    '''
    return [[int(pair[0]*1000), int(pair[1]*1000)] for pair in times_list]

def process_atc_label_line(line):
    '''Extract the time stamps from the text of a transcription file.

    The text is split on newlines and only rows beginning with " (TI" are
    kept. From each such row every character other than a digit, a space or
    a dot is dropped, and the remaining whitespace-separated tokens are
    parsed as floats. The collected rows are handed to ``convert_to_ms``,
    whose result (integer-millisecond pairs) is returned.
    '''
    numeric_chars = set('1234567890 .')
    parsed_rows = []
    for row in line.split("\n"):
        if not row.startswith(" (TI"):
            continue
        digits_only = ''.join(ch for ch in row if ch in numeric_chars)
        parsed_rows.append([float(token) for token in digits_only.split()])
    return convert_to_ms(parsed_rows)

class atc_audio_file():
    '''One recording together with its transcript-derived speech labels.

    Intended call order: ``get_slices`` (reads the speech intervals),
    ``get_split_mfcc`` (loads the audio and computes per-clip MFCCs), then
    ``get_split_frames`` / ``get_split_labels`` (per-frame 0/1 labels).
    ``get_split_frames`` reads ``sample_rate`` and ``mfcc``, which are only
    set by ``get_split_mfcc``.

    NOTE(review): ``torchaudio`` and ``T`` (presumably
    ``torchaudio.transforms``) are used below but are not imported in the
    visible import cell - confirm they are imported before this class runs.
    '''

    # Hop between consecutive MFCC frames, in samples. Shared by the feature
    # extraction and the labelling so the two cannot drift apart.
    HOP_LENGTH = 220
    # Every recording is zero-padded or truncated to this many seconds.
    MAX_SECONDS = 4000

    def __init__(self, name,audio_path, labels_path,new_flag = 0):
        '''Store the paths; nothing is read from disk here.

        name        -- identifier of the recording (also used as the audio
                       file name when ``new_flag`` is 1)
        audio_path  -- path of the audio file (used when ``new_flag`` is not 1)
        labels_path -- path of the transcription file
        new_flag    -- 1 to load the audio from ``name`` instead of ``audio_path``
        '''
        self.name = name
        self.vad_slices = None      # [[start_ms, end_ms], ...] from get_slices
        self.frames = None          # (n_clips, frames_per_clip) 0/1 array
        self.frames_labels = None
        self.mfcc = None            # per-clip MFCC tensor from get_split_mfcc
        self.waveform = None        # padded/truncated waveform
        self.sample_rate = None     # rate reported after resampling
        self.clip_size = None       # frames per clip, set by get_split_frames
        self.n_clips = 100
        self.flag = new_flag
        self.labels_path = labels_path
        self.audio_path = audio_path

    def get_slices(self):
        '''Read the transcription file and return its speech intervals in ms.'''
        with open(self.labels_path, 'r') as f:
            transcript = f.read()
        self.vad_slices = process_atc_label_line(transcript)
        return self.vad_slices

    def get_split_frames(self):
        '''Return an (n_clips, frames_per_clip) array marking speech frames.

        Every speech sample marks the frame ``sample // HOP_LENGTH`` and the
        frame before it; a sample in the first half of its hop (offset
        <= HOP_LENGTH // 2) also marks the frame two back. Frame indices are
        clamped to the valid range: indices past the end map to the last
        frame and indices before the start are dropped, so speech at the very
        beginning of a recording no longer wraps around and marks the final
        frames.

        NOTE(review): frame indices are computed on the whole recording,
        whereas the MFCCs are computed clip by clip; if a clip yields more
        frames than clip_samples / HOP_LENGTH the labels may drift relative
        to the features in later clips - confirm.
        '''
        hop = self.HOP_LENGTH
        half_hop = hop // 2
        ms_2_sample = self.sample_rate/1000

        self.clip_size = self.mfcc.shape[2]
        n_frames = self.clip_size*self.n_clips
        last_frame = n_frames - 1
        frames_array = np.zeros(n_frames)

        for v in self.vad_slices:
            start = max(math.floor(v[0]*ms_2_sample), 0)
            end = math.ceil(v[1]*ms_2_sample)
            if end <= start:
                continue  # empty interval, nothing to mark
            first = min(start // hop, last_frame)
            last = min((end - 1) // hop, last_frame)
            # Only the first touched hop can lack a sample in its first half;
            # every later hop starts inside the slice, so the run of marked
            # frames is contiguous from (first - lookback) to last.
            lookback = 2 if start % hop <= half_hop else 1
            frames_array[max(first - lookback, 0):last + 1] = 1

        self.frames = frames_array.reshape(self.n_clips, self.clip_size)
        return self.frames

    def get_split_labels(self):
        '''Return a 0/1 array with the same shape and dtype as ``self.frames``.'''
        self.frames_labels = (self.frames > 0).astype(self.frames.dtype)
        return self.frames_labels

    def get_split_mfcc(self):
        '''Load the audio, resample it, and return the MFCCs of each clip.

        The waveform is resampled with the sox "rate 22050" effect, then
        zero-padded or truncated to MAX_SECONDS seconds, cut into
        ``n_clips`` equal pieces, and each piece is passed through the MFCC
        transform. The per-clip results are concatenated along dim 0.
        '''
        file_name = self.name if self.flag == 1 else self.audio_path
        waveform, sample_rate = torchaudio.load(file_name)
        effects = [['rate', '22050']]
        waveform, self.sample_rate = torchaudio.sox_effects.apply_effects_tensor(waveform, sample_rate, effects)

        # Pad/truncate straight to the target length instead of building a
        # much larger intermediate buffer first.
        target_len = self.MAX_SECONDS*self.sample_rate
        kept = min(waveform.shape[1], target_len)
        padded = torch.zeros((1, target_len))
        padded[:, :kept] = waveform[:, :kept]
        self.waveform = padded

        clip_size = self.waveform.shape[1] // self.n_clips
        # NOTE(review): no win_length is passed, so the transform uses its own
        # default window length - confirm that is intended.
        mfcc_transform = T.MFCC(
                sample_rate=self.sample_rate,
                n_mfcc=40,
                melkwargs={
                  'n_fft': 2048,
                  'n_mels': 40,
                  'hop_length': self.HOP_LENGTH,
                  'mel_scale': 'htk',
                }
            )
        mfcc_list = [
            mfcc_transform(self.waveform[:, i*clip_size:(i+1)*clip_size])
            for i in range(self.n_clips)
        ]
        self.mfcc = torch.cat(mfcc_list)
        return self.mfcc

def process_file_atc0(filename,audio_path, labels_path):
    '''Build the features and frame labels for a single recording.

    Creates an ``atc_audio_file`` for the given paths, reads its speech
    slices, computes the per-clip MFCCs and then the per-frame labels,
    prints ``filename`` as a progress marker, and returns
    ``(mfcc, frames)``.
    '''
    recording = atc_audio_file(filename, audio_path, labels_path)
    recording.get_slices()
    features = recording.get_split_mfcc()
    frame_labels = recording.get_split_frames()
    print(filename)
    return features, frame_labels
    
def process_atc0_files(k=100, paths=None):
    '''Process the ``.sph`` recordings of one or more corpus directories.

    k     -- file limit; processing stops once more than ``k`` files have
             been handled, counted across all directories.
             NOTE(review): the ``> k`` check means up to k + 1 files are
             processed - confirm whether exactly k was intended.
    paths -- audio directories to scan. Each must end with ``'audio/'``,
             because the transcript directory is derived by replacing that
             suffix with ``'transcripts/'``. Defaults to the BOS corpus
             directory.

    Files are visited in sorted order so the clip order is reproducible
    between runs. Returns ``(input_list, labels_list)``: the concatenated
    MFCC tensor with dims 1 and 2 swapped, and the concatenated frame labels
    as a float tensor.
    '''
    if paths is None:
        # Other corpora that could be added:
        #   /project/graziul/data/corpora/atc0_comp/atc0_dca/data/audio/
        #   /project/graziul/data/corpora/atc0_comp/atc0_dfw/data/audio/
        paths = ['/project/graziul/data/corpora/atc0_comp/atc0_bos/data/audio/']

    input_list = []
    labels_list = []
    n_processed = 0  # counted separately from the directory index
    for path in paths:
        for fpath in sorted(glob(path + '*.sph')):
            if n_processed > k:
                break
            # Basename without extension, whatever its length.
            filename = os.path.splitext(os.path.basename(fpath))[0]
            # path[:-6] strips the trailing 'audio/'.
            label_file = path[:-6] + 'transcripts/' + filename + '.txt'
            x, y = process_file_atc0(filename, fpath, label_file)
            input_list.append(x)
            labels_list.append(y)
            n_processed += 1
        if n_processed > k:
            break
    input_list = torch.cat(input_list)
    input_list = torch.transpose(input_list, 1, 2)
    labels_list = torch.from_numpy(np.concatenate(labels_list, axis=0)).float()
    return input_list, labels_list

In [None]:
# Generate the MFCC data and the "ground truth" labels from the transcriptions.
# This runs the whole preprocessing pipeline with its default arguments.
input_list, labels_list = process_atc0_files()
# Save the input data and labels to the working directory so the modelling
# cells below can reload them without re-running the preprocessing.
torch.save(input_list, 'atc0_data')
torch.save(labels_list, 'atc0_labels')

### Ensemble SVM

In [None]:
# Load the features and labels written by the preprocessing cell.
input_list = torch.load('atc0_data')
labels_list = torch.load('atc0_labels')

# Shuffle the data and labels in unison (fixed seed, so the order is
# reproducible). x / y are the shuffled copies; input_list / labels_list
# keep the original order.
x, y = shuffle(input_list, labels_list, random_state=0)

In [None]:
# Perform a grid search on the first shuffled clip (x[:1]) to determine the
# hyperparameters; the reshape treats that clip as 4010 frames of 40 features.
# NOTE(review): this was described as the "first" 1% of the data - confirm
# that a single clip is the intended fraction.
x_model_select = x[:1]
y_model_select = y[:1]

# NOTE(review): gamma is presumably ignored by the linear kernel, so the
# linear candidates are refit once per gamma value - confirm.
parameters = {'kernel':('linear', 'rbf'), 'C':[0.1, 1, 10], 'gamma':[0.001, 0.01, 0.1, 1]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=2)
clf.fit(x_model_select.reshape([4010,40]), y_model_select.reshape(4010))

In [None]:
# Print, for every parameter combination tried by the grid search, the mean
# cross-validation score and twice its standard deviation.
means = clf.cv_results_["mean_test_score"]
stds = clf.cv_results_["std_test_score"]
for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [None]:
# The training data is split into 5 segments and a separate SVM is trained on each.
# Every segment is 10 shuffled clips, flattened to 40100 frames of 40 features,
# and all five models share the same RBF hyperparameters.
segment_bounds = [(10, 20), (20, 30), (30, 40), (40, 50), (50, 60)]
svm1, svm2, svm3, svm4, svm5 = [
    svm.SVC(kernel='rbf', C=0.1, gamma=0.001).fit(
        x[lo:hi].reshape([40100,40]), y[lo:hi].reshape(40100)
    )
    for lo, hi in segment_bounds
]
# Show the last fitted estimator as the cell output.
svm5

In [None]:
# Each SVM makes predictions for the validation data and these predictions are
# used as the feature set for the 6th SVM.
# NOTE(review): the validation clips are taken from the unshuffled input_list,
# while the base SVMs were trained on slices of the shuffled x; the two
# selections may overlap - confirm this is intended.
validation_features = input_list[60:70].reshape([40100,40])
(svm1_predictions,
 svm2_predictions,
 svm3_predictions,
 svm4_predictions,
 svm5_predictions) = [
    base_model.predict(validation_features)
    for base_model in (svm1, svm2, svm3, svm4, svm5)
]

In [None]:
# Stack the five prediction vectors row-wise: one row per base SVM, one column
# per validation frame. The bare name on the last line displays the array.
stacked_predictions = np.vstack([svm1_predictions, svm2_predictions, svm3_predictions, svm4_predictions, svm5_predictions])
stacked_predictions

In [None]:
# Persist the stacked base-SVM predictions to the working directory.
np.save('ensemble_svm_stacked', stacked_predictions)

In [None]:
# Meta-classifier: its features are the five base-SVM predictions for each
# frame (the transpose gives one row per frame).
# NOTE(review): svm6 predicts on the same rows it was just fitted on, so any
# score computed from `predicted` measures training fit, not held-out
# performance.
svm6 = svm.SVC(kernel='rbf', C=0.1, gamma=0.001)
svm6.fit(stacked_predictions.T, labels_list[60:70].reshape(40100))
predicted = svm6.predict(stacked_predictions.T)

In [None]:
# Weighted precision / recall / F-score of the ensemble on the same clips
# (input_list[60:70]) whose predictions were stacked above.
precision_recall_fscore_support(labels_list[60:70].reshape(40100), predicted, average='weighted')

### Neural Network

In [None]:
import torch
import tensorflow as tf

# Convert the shuffled torch tensors to TensorFlow tensors via NumPy so the
# Keras model below can consume them.
x_np = x.numpy()
x_tensorflow = tf.convert_to_tensor(x_np)

y_np = y.numpy()
y_tensorflow = tf.convert_to_tensor(y_np)

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Small fully connected binary classifier applied to each frame on its own:
# 40 input features -> 12 -> 8 -> 1 sigmoid output.
model = Sequential([
    Dense(12, input_dim=40, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the first 80 shuffled clips, flattened to 320800 individual frames.
train_features = tf.reshape(x_tensorflow[:80], [320800,40])
train_targets = tf.reshape(y_tensorflow[:80], [320800])
model.fit(train_features, train_targets, batch_size=100, epochs=100)

In [None]:
# Save the network trained for 100 epochs to the working directory.
model.save('nn_100epochs')

In [None]:
# Predict a speech probability for every frame of the shuffled clips that were
# not used for training (x_tensorflow[80:], flattened to frames).
predictions = model.predict(tf.reshape(x_tensorflow[80:], [3689200,40], name=None))

In [None]:
# Score against the shuffled labels `y`: the predictions were made on
# x_tensorflow[80:], which was built from the shuffled `x`, so the unshuffled
# labels_list would pair each prediction with a different clip's labels.
# Both sides are flattened to one value per frame.
precision_recall_fscore_support(y[80:].reshape(-1), (predictions > .5).astype(int).ravel(), average='weighted')

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Same frame-level architecture as before (40 -> 12 -> 8 -> 1 sigmoid), rebuilt
# from scratch for a second experiment with more data and fewer epochs.
model = Sequential([
    Dense(12, input_dim=40, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the first 800 shuffled clips, flattened to 3208000 individual frames.
train_features = tf.reshape(x_tensorflow[:800], [3208000,40])
train_targets = tf.reshape(y_tensorflow[:800], [3208000])
model.fit(train_features, train_targets, batch_size=100, epochs=10)

In [None]:
# Save the network trained for 10 epochs to the working directory.
model.save('nn_10epochs')

In [None]:
# Predict on the shuffled clips left out of training and score them against
# the matching shuffled labels `y`. The unshuffled labels_list would pair each
# prediction with a different clip's labels, because x_tensorflow was built
# from the shuffled `x`. Both sides are flattened to one value per frame.
predictions = model.predict(tf.reshape(x_tensorflow[800:], [-1, 40]))
precision_recall_fscore_support(y[800:].reshape(-1), (predictions > .5).astype(int).ravel(), average='weighted')

### BiLSTM

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

# Number of clips per mini-batch.
batch_size = 100

# Split by position in the original (unshuffled) order: the first 800 clips
# train the model, clips 800-999 are held out for testing.
train_data = TensorDataset(input_list[:800], labels_list[:800])
test_data = TensorDataset(input_list[800:1000], labels_list[800:1000])

# Both loaders reshuffle their clips on every pass.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [None]:
import torch 
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# Device configuration: use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters. The trailing numbers that used to annotate these lines
# (28, 28, 128, 2, 10) are presumably earlier settings - confirm before reuse.
sequence_length = 4010 # time steps per input sequence
input_size = 40 # features per time step
hidden_size = 1024 # LSTM hidden units per direction
num_layers = 4 # stacked LSTM layers
num_classes = 2 # output classes
num_epochs = 2
learning_rate = 0.003

# Bidirectional recurrent neural network
class BiRNN(nn.Module):
    """Stacked bidirectional LSTM followed by a linear classification layer.

    input_size  -- features per time step
    hidden_size -- LSTM hidden units per direction
    num_layers  -- number of stacked LSTM layers
    num_classes -- size of the output logit vector
    per_frame   -- when False (default) ``forward`` returns one logit vector
                   per sequence, taken from the last time step:
                   (batch, num_classes). When True it returns one logit
                   vector per time step: (batch, seq_length, num_classes).
    """
    def __init__(self, input_size, hidden_size, num_layers, num_classes, per_frame=False):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.per_frame = per_frame
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)  # 2 for bidirection

    def forward(self, x):
        # Zero initial states, created on the input's own device so the module
        # does not depend on a global `device` variable.
        state_shape = (self.num_layers*2, x.size(0), self.hidden_size)  # 2 for bidirection
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        # out: tensor of shape (batch_size, seq_length, hidden_size*2)
        out, _ = self.lstm(x, (h0, c0))
        if self.per_frame:
            return self.fc(out)
        # Decode the hidden state of the last time step only
        return self.fc(out[:, -1, :])

# The labels hold one value per frame of every clip, so the model must emit
# one prediction per frame; last-step logits cannot be paired with them.
model = BiRNN(input_size, hidden_size, num_layers, num_classes, per_frame=True).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (mfccs, labels) in enumerate(train_loader):
        mfccs = mfccs.reshape(-1, sequence_length, input_size).to(device)
        # One integer class index per frame, flattened across the batch.
        labels = labels.reshape(-1).long().to(device)

        # Forward pass: flatten the logits the same way as the labels.
        outputs = model(mfccs)
        loss = criterion(outputs.reshape(-1, num_classes), labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # NOTE(review): progress is printed every 100 steps; with fewer than
        # 100 batches per epoch nothing is printed - confirm the interval.
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

# Test the model: accuracy is counted per frame, not per clip.
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for mfccs, labels in test_loader:
        mfccs = mfccs.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.reshape(-1).long().to(device)
        outputs = model(mfccs)
        predicted = outputs.argmax(dim=-1).reshape(-1)
        total += labels.numel()
        correct += (predicted == labels).sum().item()

    print('Test frame accuracy of the model: {} %'.format(100 * correct / total))

# Save the model checkpoint
torch.save(model.state_dict(), 'model.ckpt')