# Feature Extraction From Audio Files
In this file we will be extracting essential features from the VoxConverse 2020 dataset. We will parse the transcripts to obtain the timestamps. At those timestamps we will extract data which will include the MFCC features of 1-second audio segments, the true speaker labels, and the starting and ending times of the segments.

In [10]:
import os
import sys
import librosa
import numpy as np

In [2]:
# Length, in seconds, of each audio segment from which MFCC features are extracted.
audio_len = 1.0
# Folder containing the training audio files.
training_folder = '../Data/Audio_Dataset/Training/'
# Folder containing the .rttm transcript that matches each audio file.
transcripts_folder = '../Data/Audio_Dataset/Transcripts/'

In [3]:
def print_progress(done, total):
    """Draw a text progress bar on the current stdout line.

    The bar is 50 characters wide and is followed by a ``done/total``
    counter. The line ends with a carriage return (no newline) so that the
    next call overwrites it in place.

    Args:
        done: Number of items processed so far.
        total: Total number of items. A value of zero or less is allowed and
            renders an empty bar instead of raising ``ZeroDivisionError``.
    """
    bar_width = 50
    if total > 0:
        filled = int(done * 50.0 / total)
        # Clamp so the bar never grows past its width when done > total
        # (or goes negative for a negative `done`).
        filled = max(0, min(bar_width, filled))
    else:
        # Nothing to process (e.g. an empty transcript): avoid dividing by zero.
        filled = 0
    bar = '[' + '=' * filled + '>' + '-' * (bar_width - filled) + ']  '
    sys.stdout.write(bar + str(done) + '/' + str(total) + '\r')
    sys.stdout.flush()

def progress(entity):
    """Yield the items of ``entity`` while drawing a progress bar.

    An empty bar is drawn before the first item. After the consumer finishes
    with each item, the bar is redrawn with the updated count. Once the
    sequence is exhausted a newline is written so later output starts on a
    fresh line.

    Args:
        entity: A sized iterable (it must support ``len``).

    Yields:
        Each element of ``entity`` in order.
    """
    total = len(entity)
    print_progress(0, total)
    for count, item in enumerate(entity, start=1):
        yield item
        # Runs only when the consumer asks for the next item, i.e. after it
        # has finished processing `item`.
        print_progress(count, total)
    sys.stdout.write("\n")
    sys.stdout.flush()

In [4]:
def get_mfcc(audio_folder, transcript_folder, audio_len, max_segments_per_speaker=300):
    """Extract MFCC features for fixed-length speaker segments.

    For every file in ``audio_folder`` the transcript with the same base name
    and an ``.rttm`` extension is read from ``transcript_folder``. Each
    transcript line is split on whitespace; field index 3 is used as the
    start time, index 4 as the duration and index 7 as the speaker name.
    Every annotated turn is cut into consecutive, non-overlapping windows of
    ``audio_len`` seconds (a trailing remainder shorter than ``audio_len`` is
    dropped) and 40 MFCC coefficients are computed for each window.

    Speaker labels are integers starting at 1. They increase across files, so
    speakers from different files never share a label, even if their names in
    the transcripts coincide.

    Args:
        audio_folder: Directory containing the audio files.
        transcript_folder: Directory containing the ``.rttm`` transcripts.
        audio_len: Window length in seconds. Must be positive.
        max_segments_per_speaker: Maximum number of windows kept per speaker
            within a file (the earliest ones are kept). ``None`` disables
            the cap.

    Returns:
        A tuple ``(mfcc_store, labels, timestamps, verify)`` where
        ``mfcc_store`` is a list of transposed MFCC matrices (one per
        window), ``labels`` the matching integer speaker ids, ``timestamps``
        the matching ``[start, end]`` pairs in seconds, and ``verify`` the
        sum of the integer-truncated durations of all transcript lines.

    Raises:
        ValueError: If ``audio_len`` is not positive (the windowing loop
            would otherwise never terminate).
    """
    if audio_len <= 0:
        raise ValueError("audio_len must be positive, got " + repr(audio_len))

    speaker_id = 0
    mfcc_store = []
    timestamps = []
    labels = []
    verify = 0
    files = os.listdir(audio_folder)
    for file_number, file_name in enumerate(files, start=1):
        print("Processing File: "+str(file_number)+"/"+str(len(files))+"  ("+file_name+")")
        audio_path = os.path.join(audio_folder, file_name)
        # splitext strips only the final extension, so names containing
        # extra dots still map to the right transcript.
        transcript_file = os.path.join(
            transcript_folder, os.path.splitext(file_name)[0] + '.rttm')
        # Context manager guarantees the transcript handle is closed.
        with open(transcript_file, 'r') as transcript:
            lines = transcript.readlines()

        store = {}  # speaker name -> list of [mfcc, start]; dict keeps first-seen order
        for line in progress(lines):     # Parsing the transcript of the audio file
            words = line.split()
            if not words:
                # Blank line (e.g. trailing newline at end of file).
                continue
            segments = store.setdefault(words[7], [])
            start = float(words[3])
            duration = float(words[4])
            verify += int(duration)
            end = start + duration
            while start + audio_len <= end:
                # Stop extracting once the speaker's quota is full: later
                # windows would be discarded anyway, so this only saves work.
                if (max_segments_per_speaker is not None
                        and len(segments) >= max_segments_per_speaker):
                    break
                audio, sr = librosa.load(audio_path, sr=16000, offset=start, duration=audio_len)
                mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)  # Extracting MFCC features
                segments.append([mfcc.T, start])
                start += audio_len

        for segments in store.values():
            # Every speaker seen in this file gets a fresh id, even one with
            # no window long enough to be kept.
            speaker_id += 1
            for mfcc, start in segments:    # Collecting the MFCC features corresponding to the labels
                mfcc_store.append(mfcc)
                labels.append(speaker_id)
                timestamps.append([start, start + audio_len])

    return mfcc_store, labels, timestamps, verify

In [5]:
# Run the extraction over the whole training set; prints one progress line per audio file.
training_mfcc,training_labels,training_timestamps,verify = get_mfcc(training_folder,transcripts_folder,audio_len)

Processing File: 1/180  (gwtwd.wav)
Processing File: 2/180  (jnivh.wav)
Processing File: 3/180  (rxgun.wav)
Processing File: 4/180  (dhorc.wav)
Processing File: 5/180  (ehpau.wav)
Processing File: 6/180  (gqdxy.wav)
Processing File: 7/180  (kkwkn.wav)
Processing File: 8/180  (qppll.wav)
Processing File: 9/180  (bkwns.wav)
Processing File: 10/180  (cmfyw.wav)
Processing File: 11/180  (cyyxp.wav)
Processing File: 12/180  (tucrg.wav)
Processing File: 13/180  (ndkwv.wav)
Processing File: 14/180  (jcako.wav)
Processing File: 15/180  (mdbod.wav)
Processing File: 16/180  (ioasm.wav)
Processing File: 17/180  (szsyz.wav)
Processing File: 18/180  (mekog.wav)
Processing File: 19/180  (cobal.wav)
Processing File: 20/180  (sikkm.wav)
Processing File: 21/180  (gpjne.wav)
Processing File: 22/180  (pnook.wav)
Processing File: 23/180  (oenox.wav)
Processing File: 24/180  (qouur.wav)
Processing File: 25/180  (rcxzg.wav)
Processing File: 26/180  (tjkfn.wav)
Processing File: 27/180  (dvngl.wav)
Processing

Processing File: 168/180  (mqxsf.wav)
Processing File: 169/180  (dscgs.wav)
Processing File: 170/180  (kuduk.wav)
Processing File: 171/180  (bxpwa.wav)
Processing File: 172/180  (oklol.wav)
Processing File: 173/180  (oxxwk.wav)
Processing File: 174/180  (eziem.wav)
Processing File: 175/180  (oekmc.wav)
Processing File: 176/180  (bravd.wav)
Processing File: 177/180  (tplwz.wav)
Processing File: 178/180  (hgeec.wav)
Processing File: 179/180  (esrit.wav)
Processing File: 180/180  (exymw.wav)


In [6]:
# Sanity check: the three result lists should have equal length. `verify` is the
# sum of the integer-truncated durations of all transcript lines, as returned by get_mfcc.
print(len(training_mfcc),len(training_labels),len(training_timestamps),verify)

52153 52153 52153 56013


In [7]:
# Save the extracted features, labels and timestamps as NumPy arrays
# (np.save appends the '.npy' extension to each file name).
np.save('../Data/MFCC_Features/Training/training_mfcc',training_mfcc)       # MFCC matrices, one per segment
np.save('../Data/MFCC_Features/Training/training_labels',training_labels)   # integer speaker id per segment
np.save('../Data/MFCC_Features/Training/training_timestamps',training_timestamps)   # [start, end] per segment