# Data Preperation
This jupyter notebook takes audio files and there corresponding transcripts of the AMI corpus to prepare the MFCC features and corresponding speaker labels for the train and test data. For training data we combine speakers of 24 different audio files of average length 20-30 mins which gives us a total of 96 different speakers. While for the test data we use 3 unseen audio files.

In [1]:
import os        #Importing essential libraries
import sys
import time
import glob
import librosa
import numpy as np
import xml.etree.ElementTree as ET

In [2]:
def print_progress(done,total):          #For displaying the progress bar while preprocessing audio files
    x = int(done*50.0/total)
    sys.stdout.write('['+str('='*x)+'>'+str('-'*(50-x))+']  '+str(done)+'/'+str(total)+'\r')
    sys.stdout.flush()

def progress(entity):
    print_progress(0,len(entity))
    i = 0
    for ent in entity:
        yield ent
        i+=1
        print_progress(i,len(entity))
    sys.stdout.write("\n")
    sys.stdout.flush()

In [3]:
def GetAudioFiles():                    #Returns a dictionary of all the audio file names available with
    files = {}                          #their corresponding address
    filenames = glob.glob('./amicorpus/*')
    for f in filenames:
        name = f.split('/')[-1]
        f2 = glob.glob(f+'/audio/*')[0]
        files[name] = f2
    return files

In [4]:
#Returns a dictionary of Speaker IDs with corresponding xml transcript for each audio. Useful for 
#recognising and accomodating same speaker accross diffent audio files.

def MatchSpeakers(files):               
    speakerids = {}                     
    root = ET.parse('./ami_public_manual_1.6.2/corpusResources/meetings.xml').getroot()
    for child in root:
        meetingID = child.get('observation')
        if meetingID in files:
            for type_tag in child.findall('speaker'):
                speakerID = type_tag.get('global_name')
                speaker_code = type_tag.get('nxt_agent')
                meeting_file = meetingID+'.'+speaker_code+'.segments.xml'
                speakerids[meeting_file] = speakerID
    return speakerids

In [5]:
#Generates MFCC fectures of 'audiolen' timelength within the 'start' and 'end' time along with the corresponding
#timestamps for the MFCC feature.

def mel_constructor(mel_collection,timestamp_collection,speakerid,start,end,audiolen,file_loc):
    partitions = int((end-start-audiolen)/audiolen)
    for i in range(partitions):
        y, sr = librosa.load(file_loc,sr = 22050,offset = start+i*audiolen, duration = audiolen)
        mel = librosa.feature.mfcc(y=y, sr=sr)
        mel = mel.T
        if speakerid not in mel_collection:
            mel_collection[speakerid] = [mel]
            timestamp_collection[speakerid] = [[start,end]]
        else:
            mel_collection[speakerid].append(mel)
            timestamp_collection[speakerid].append([start+i*audiolen,start+(i+1)*audiolen])

In [6]:
files = GetAudioFiles()
print('Processing the following audio files:')   #list of audio files available
for file in files:
    print(file)

speakerids = MatchSpeakers(files)

Processing the following audio files:
TS3004c
IS1004c
IB4001
TS3008c
IS1007c
TS3007c
IB4003
TS3003c
IB4004
IS1002c
TS3012c
IB4005
IS1003c
IS1001c
TS3011c
IS1006c
IS1008c
IB4002
IS1005c
TS3005c
TS3009c
IS1009c
TS3006c
TS3010c
IB4010
IS1000c
IB4011


In [7]:
#For the given list of files, it uses the above defined functions to get the MFCC features and the
#corresponding speaker labels and timestamps for all audio files combined.

def mel_spectrum_generator(files,speakerids,accept = None):
    mel_collection = {}
    timestamp_collection = {}
    audiolen = 1     ##seconds

    for filename in files:

        if accept is not None and filename not in accept:
            continue

        print('Proccessing file: '+filename)
        speaker = 0

        for fileaddr in speakerids:
            if fileaddr[:-15]==filename:
                speakerid = speakerids[fileaddr]
                speaker = speaker+1
                print('Speaker : '+str(speaker))
                root = ET.parse('./ami_public_manual_1.6.2/segments/'+fileaddr).getroot()
                for type_tag in progress(root.findall('segment')):
                    start = float(type_tag.get('transcriber_start'))
                    end = float(type_tag.get('transcriber_end'))
                    mel_constructor(mel_collection,timestamp_collection,speakerid,start,end,audiolen,files[filename])
                    
    print('Folloing is the list of speaker IDs and their corresponding no. of Mel spectrums:')
    for u in mel_collection:
        print(u,len(mel_collection[u]))
                    
    return mel_collection,timestamp_collection

In [8]:
#Takes the MFCC features, labels and corresponding timestamps and saves these in .npy files that
#will be used for training and testing

def save_mel_spectrums(mel_collection,timestamp_collection,max_mels,name1,name2,name3):

    mel_save = []
    mel_labels = []
    timestamps_save = []

    speaker_no = 0
    for speakerid in mel_collection:
        speaker_no = speaker_no+1
        for i in range(min(len(mel_collection[speakerid]),max_mels)):
            mel_save.append(mel_collection[speakerid][i])
            mel_labels.append(speaker_no)
            if timestamp_collection is not None:
                timestamps_save.append(timestamp_collection[speakerid][i])

    np.save(name1,np.array(mel_save))
    np.save(name2,np.array(mel_labels))
    if timestamp_collection is not None:
        np.save(name3,np.array(timestamps_save))

In [9]:
training_data = ['TS3004c',       # Files to be used for model training
                 'IS1004c',
                 'IB4001',
                 'TS3008c',
                 'IS1007c',
                 'TS3007c',
                 'IB4003',
                 'TS3003c',
                 'IB4004',
                 'IS1002c',
                 'TS3012c',
                 'IB4005',
                 'IS1003c',
                 'IS1001c',
                 'TS3011c',
                 'IS1006c',
                 'IS1008c',
                 'IB4002',
                 'IS1005c',
                 'TS3005c',
                 'TS3009c',
                 'IS1009c',
                 'TS3006c',
                 'IB4010',]

In [12]:
#Get training data
mel_collection_train,_ = mel_spectrum_generator(files,speakerids,accept = training_data)
save_mel_spectrums(mel_collection_train,None,200,'mel_save_train','mel_labels_train',None)

Proccessing file: TS3004c
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Proccessing file: IS1004c
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Proccessing file: IB4001
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Proccessing file: TS3008c
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Proccessing file: IS1007c
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Proccessing file: TS3007c
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Proccessing file: IB4003
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Proccessing file: TS3003c
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Proccessing file: IB4004
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Proccessing file: IS1002c
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Proccessing file: TS3012c
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Proccessing file: IB4005
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Proccessing file: IS1003c
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Proccessing file: IS1001c
Speaker : 1
Spea

In [13]:
#Get unseen testing data
test_data = ['TS3010c','IS1000c','IB4011']
for a in test_data:
    mel_collection_test,timestamp_collection = mel_spectrum_generator(files,speakerids,accept = [a])
    save_mel_spectrums(mel_collection_test,timestamp_collection,1000000000,'mel_save_test_'+a,'mel_labels_test_'+a,'timestamps_save_'+a)

Proccessing file: TS3010c
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Folloing is the list of speaker IDs and their corresponding no. of Mel spectrums:
MTD037PM 295
MTD038ID 504
MTD039UID 154
MTD040ME 183
Proccessing file: IS1000c
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Folloing is the list of speaker IDs and their corresponding no. of Mel spectrums:
MIO016 447
MIO082 308
FIE081 319
MIO050 555
Proccessing file: IB4011
Speaker : 1
Speaker : 2
Speaker : 3
Speaker : 4
Folloing is the list of speaker IDs and their corresponding no. of Mel spectrums:
MIO036 554
MIO095 425
MIO046 519
FIE038 378
