# Voice Activity Detection Data Generation
This Jupyter notebook takes audio files and their corresponding transcripts from the AMI corpus and prepares Mel spectrogram features for the parts in which a speaker is speaking and those in which there is silence. These two categories of Mel spectrogram features are then used as training and testing data for voice activity detection with a deep neural network.

In [1]:
import os        #Importing essential libraries
import sys
import time
import glob
import librosa
import numpy as np
import xml.etree.ElementTree as ET

In [2]:
def print_progress(done,total):
    """Draw a 50-character text progress bar on stdout.

    The line ends with a carriage return (no newline), so the next call
    overwrites the bar in place.

    Parameters
    ----------
    done : int
        Number of items finished so far.
    total : int
        Number of items overall. A value of zero or less is drawn as a
        completed bar instead of raising ZeroDivisionError.
    """
    bar_width = 50
    if total > 0:
        filled = int(done*50.0/total)
        # Clamp so that the bar keeps a constant width even if done is out of range.
        filled = max(0, min(bar_width, filled))
    else:
        # Nothing to process: there is no meaningful ratio, show a full bar.
        filled = bar_width
    sys.stdout.write('['+'='*filled+'>'+'-'*(bar_width-filled)+']  '+str(done)+'/'+str(total)+'\r')
    sys.stdout.flush()

def progress(entity):
    """Yield the items of ``entity`` one by one while redrawing a progress bar.

    ``entity`` must support ``len()``. The bar is drawn once before the first
    item is handed out and again each time the consumer asks for the next
    item; a newline is written when the iteration is exhausted.
    """
    print_progress(0,len(entity))
    for finished, item in enumerate(entity, start=1):
        yield item
        # Execution resumes here once the consumer is done with `item`.
        print_progress(finished,len(entity))
    # Move past the bar, which was written without a trailing newline.
    sys.stdout.write("\n")
    sys.stdout.flush()

In [3]:
def GetAudioFiles():
    """Map every meeting folder under ./amicorpus to one of its audio files.

    Returns
    -------
    dict
        ``{meeting_name: audio_path}`` where ``meeting_name`` is the name of a
        folder inside ./amicorpus and ``audio_path`` is the first entry (in
        sorted order) of that folder's ``audio`` sub-folder. Folders without
        any audio entry are left out instead of raising IndexError.
    """
    files = {}
    for meeting_dir in glob.glob('./amicorpus/*'):
        # basename copes with both '/' and '\\' separators returned by glob.
        name = os.path.basename(meeting_dir)
        # Sorted so that the chosen file does not depend on directory listing order.
        candidates = sorted(glob.glob(os.path.join(meeting_dir,'audio','*')))
        if not candidates:
            continue
        files[name] = candidates[0]
    return files

In [4]:
#Returns a dictionary that maps the segment-transcript XML file name of each speaker to that speaker's ID, for every meeting that has an audio file.

def MatchSpeakers(files, meetings_xml='./ami_public_manual_1.6.2/corpusResources/meetings.xml'):
    """Build a lookup from segment-transcript file name to speaker ID.

    Parameters
    ----------
    files : dict or other container
        Meeting names to keep; only ``in`` membership is used.
    meetings_xml : str, optional
        Path of the meetings XML file. Defaults to the location used
        throughout this notebook.

    Returns
    -------
    dict
        ``{'<meeting>.<nxt_agent>.segments.xml': <global_name>}`` for every
        ``speaker`` element of every meeting whose ``observation`` attribute
        is contained in ``files``.
    """
    speakerids = {}
    root = ET.parse(meetings_xml).getroot()
    for meeting in root:
        meetingID = meeting.get('observation')
        if meetingID not in files:
            continue
        for speaker_tag in meeting.findall('speaker'):
            speaker_code = speaker_tag.get('nxt_agent')
            if speaker_code is None:
                # Without an agent code no transcript file name can be built.
                continue
            meeting_file = meetingID+'.'+speaker_code+'.segments.xml'
            speakerids[meeting_file] = speaker_tag.get('global_name')
    return speakerids

In [5]:
#Generates Mel spectrogram features for consecutive windows of 'audiolen' seconds between the 'start' and 'end'
#times and appends the given label to each window (1 = someone is speaking, 0 = nobody is speaking).

def mel_constructor(mel_collection,labels,speach,start,end,audiolen,file_loc):
    """Append one transposed Mel spectrogram and one label per audio window.

    Parameters
    ----------
    mel_collection : list
        Output list; one transposed Mel spectrogram is appended per window.
    labels : list
        Output list; ``speach`` is appended once per window.
    speach : int
        Label stored for every window of this span.
    start, end : float
        Bounds of the span, in seconds, inside the audio file.
    audiolen : float
        Window length in seconds.
    file_loc : str
        Path of the audio file handed to ``librosa.load``.

    Nothing is returned; the two lists are extended in place.
    """
    # NOTE(review): this yields one window fewer than floor((end-start)/audiolen);
    # confirm that dropping the last full window of each span is intended.
    window_count = int((end-start-audiolen)/audiolen)
    for window_idx in range(window_count):
        window_offset = start+window_idx*audiolen
        samples, sample_rate = librosa.load(file_loc,sr = 22050,offset = window_offset, duration = audiolen)
        spectrogram = librosa.feature.melspectrogram(y=samples,sr=sample_rate)
        mel_collection.append(spectrogram.T)
        labels.append(speach)

In [6]:
# Discover which meetings have audio on disk and list their names.
files = GetAudioFiles()
print('Processing the following audio files:')
for file in files.keys():
    print(file)

# Lookup from transcript file name to speaker ID, limited to the meetings found above.
speakerids = MatchSpeakers(files)

Processing the following audio files:
TS3004c
IS1004c
IB4001
TS3008c
IS1007c
TS3007c
IB4003
TS3003c
IB4004
IS1002c
TS3012c
IB4005
IS1003c
IS1001c
TS3011c
IS1006c
IS1008c
IB4002
IS1005c
TS3005c
TS3009c
IS1009c
TS3006c
TS3010c
IB4010
IS1000c
IB4011


In [7]:
#For each accepted audio file, uses the functions defined above together with the segment
#timestamps of every speaker's transcript to distinguish speech from non-speech segments
#and collects the correspondingly labeled Mel features.

def _collect_segments(filename, time_limit):
    """Return the sorted [start, end] segments of all speaker transcripts of one meeting."""
    segments = []
    for fileaddr in speakerids:
        # Transcript names are '<meeting>.<agent>.segments.xml'; strip the last
        # three dot-separated parts rather than assuming a fixed suffix length.
        if fileaddr.rsplit('.', 3)[0] != filename:
            continue
        root = ET.parse('./ami_public_manual_1.6.2/segments/'+fileaddr).getroot()
        for segment_tag in root.findall('segment'):
            start = float(segment_tag.get('transcriber_start'))
            end = float(segment_tag.get('transcriber_end'))
            if start >= time_limit:
                continue
            # Cut segments that run past the analysed part of the recording.
            segments.append([start, min(end, time_limit)])
    segments.sort()
    return segments


def mel_spectrum_generator(files,accept,audiolen=1,time_limit=300):
    """Collect labeled Mel features for the accepted meetings.

    Relies on the module-level ``speakerids`` dictionary (transcript file
    name -> speaker ID) and on ``mel_constructor`` / ``progress``.

    Parameters
    ----------
    files : dict
        ``{meeting_name: audio_path}``.
    accept : container
        Meeting names to process; every other entry of ``files`` is skipped.
    audiolen : float, optional
        Window length in seconds (default 1).
    time_limit : float, optional
        Only the first ``time_limit`` seconds of each recording are used
        (default 300).

    Returns
    -------
    (list, list)
        The Mel features and the matching labels (1 = speech, 0 = no speech).

    Notes
    -----
    Segments of different speakers may overlap; each segment still produces
    its own speech windows. The span after the last segment is not emitted.
    """
    mel_collection = []
    labels = []

    for filename in files:

        if filename not in accept:
            continue

        print('Proccessing file: '+filename)
        segments = _collect_segments(filename, time_limit)
        if not segments:
            # Nothing to label; also avoids drawing a progress bar over zero items.
            print('No speech segments found for file: '+filename)
            continue

        last_end = 0
        for start,end in progress(segments):
            if start>last_end:
                # Gap in which no speaker has a segment -> non-speech windows.
                mel_constructor(mel_collection,labels,0,last_end,start,audiolen,files[filename])
            mel_constructor(mel_collection,labels,1,start,end,audiolen,files[filename])
            # A segment nested inside an earlier, longer one must not pull the
            # marker backwards, otherwise the remaining speech of the longer
            # segment would later be labeled as silence.
            last_end = max(last_end, end)

    return mel_collection,labels

In [9]:
# Meetings used for training; the remaining recordings are kept for testing.
training_data = [
    'TS3004c', 'IS1004c', 'IB4001', 'TS3008c', 'IS1007c', 'TS3007c',
    'IB4003', 'TS3003c', 'IB4004', 'IS1002c', 'TS3012c', 'IB4005',
    'IS1003c', 'IS1001c', 'TS3011c', 'IS1006c', 'IS1008c', 'IB4002',
    'IS1005c', 'TS3005c', 'TS3009c', 'IS1009c', 'TS3006c', 'IB4010',
]

In [12]:
#Get training data
#np.save does not create missing folders, so make sure the output folder exists
#before the long feature extraction starts.
os.makedirs('./vad_dataset',exist_ok=True)
vad_mel_collection_train,vad_labels_train = mel_spectrum_generator(files,training_data)
np.save('./vad_dataset/vad_mel_collection_train',vad_mel_collection_train)
np.save('./vad_dataset/vad_labels_train',vad_labels_train)

Proccessing file: TS3004c
Proccessing file: IS1004c
Proccessing file: IB4001
Proccessing file: TS3008c
Proccessing file: IS1007c
Proccessing file: TS3007c
Proccessing file: IB4003
Proccessing file: TS3003c
Proccessing file: IB4004
Proccessing file: IS1002c
Proccessing file: TS3012c
Proccessing file: IB4005
Proccessing file: IS1003c
Proccessing file: IS1001c
Proccessing file: TS3011c
Proccessing file: IS1006c
Proccessing file: IS1008c
Proccessing file: IB4002
Proccessing file: IS1005c
Proccessing file: TS3005c
Proccessing file: TS3009c
Proccessing file: IS1009c
Proccessing file: TS3006c
Proccessing file: IB4010


In [11]:
#Get unseen testing data
#Each test meeting is stored in its own pair of files, named after the meeting.
test_data = ['TS3010c','IS1000c','IB4011']
#np.save does not create missing folders, so make sure the output folder exists.
os.makedirs('./vad_dataset',exist_ok=True)
for a in test_data:
    vad_mel_collection_test,vad_labels_test = mel_spectrum_generator(files,[a])
    np.save('./vad_dataset/vad_mel_collection_test'+a,vad_mel_collection_test)
    np.save('./vad_dataset/vad_labels_test'+a,vad_labels_test)

Proccessing file: TS3010c
Proccessing file: IS1000c
Proccessing file: IB4011
