Loading the MFCCs for the training set of audio recordings. The first 25 coefficients are extracted for each frame. Next, the per-frame values of each coefficient are summarized across time using the following summary statistics: minimum, maximum, median, mean, variance, skewness, and kurtosis. The resulting training vector has 175 features (25 coefficients × 7 statistics).

In [1]:
import numpy as np
import pandas as pd
from xml.dom import minidom
import scipy.stats.stats as st

# Index of the first MFCC coefficient (column) that the summary-statistic
# functions below include; 0 keeps every coefficient.
startMFCC = 0

#summary statistics
def minMFCC(mfccCoeffs):
    """Return the minimum of every coefficient column from `startMFCC` onward.

    `mfccCoeffs` is indexed as a 2-D array (`mfccCoeffs[:, i]`); the number
    of columns is taken from its first row. The result is a list with one
    value per column, in column order.

    `np.min` is used instead of the builtin `min` so that a NaN in a column
    always yields NaN (the builtin's answer depends on where the NaN sits),
    matching the NumPy reducers used by the other summary statistics.
    """
    column_count = len(mfccCoeffs[0])
    return [np.min(mfccCoeffs[:, col]) for col in range(startMFCC, column_count)]

def maxMFCC(mfccCoeffs):
    """Return the maximum of every coefficient column from `startMFCC` onward.

    `mfccCoeffs` is indexed as a 2-D array (`mfccCoeffs[:, i]`); the number
    of columns is taken from its first row. The result is a list with one
    value per column, in column order.

    `np.max` is used instead of the builtin `max` so that a NaN in a column
    always yields NaN (the builtin's answer depends on where the NaN sits),
    matching the NumPy reducers used by the other summary statistics.
    """
    column_count = len(mfccCoeffs[0])
    return [np.max(mfccCoeffs[:, col]) for col in range(startMFCC, column_count)]

def medianMFCC(mfccCoeffs):
    """Return the median of every coefficient column from `startMFCC` onward.

    `mfccCoeffs` is indexed as a 2-D array (`mfccCoeffs[:, i]`); the number
    of columns is taken from its first row. The result is a list with one
    `np.median` value per column, in column order.
    """
    column_count = len(mfccCoeffs[0])
    return [np.median(mfccCoeffs[:, col]) for col in range(startMFCC, column_count)]

def meanMFCC(mfccCoeffs):
    """Return the arithmetic mean of every coefficient column from `startMFCC` onward.

    `mfccCoeffs` is indexed as a 2-D array (`mfccCoeffs[:, i]`); the number
    of columns is taken from its first row. The result is a list with one
    `np.mean` value per column, in column order.
    """
    column_count = len(mfccCoeffs[0])
    return [np.mean(mfccCoeffs[:, col]) for col in range(startMFCC, column_count)]

def varianceMFCC(mfccCoeffs):
    """Return the variance of every coefficient column from `startMFCC` onward.

    `mfccCoeffs` is indexed as a 2-D array (`mfccCoeffs[:, i]`); the number
    of columns is taken from its first row. Each value comes from `np.var`
    called with its default arguments. The result is a list with one value
    per column, in column order.
    """
    column_count = len(mfccCoeffs[0])
    return [np.var(mfccCoeffs[:, col]) for col in range(startMFCC, column_count)]

def skewnessMFCC(mfccCoeffs):
    """Return the skewness of every coefficient column from `startMFCC` onward.

    `mfccCoeffs` is indexed as a 2-D array (`mfccCoeffs[:, i]`); the number
    of columns is taken from its first row. Each value comes from SciPy's
    `skew` called with its default arguments. The result is a list with one
    value per column, in column order.
    """
    column_count = len(mfccCoeffs[0])
    return [st.skew(mfccCoeffs[:, col]) for col in range(startMFCC, column_count)]

def kurtosisMFCC(mfccCoeffs):
    """Return the kurtosis of every coefficient column from `startMFCC` onward.

    `mfccCoeffs` is indexed as a 2-D array (`mfccCoeffs[:, i]`); the number
    of columns is taken from its first row. Each value comes from SciPy's
    `kurtosis` called with its default arguments. The result is a list with
    one value per column, in column order.
    """
    column_count = len(mfccCoeffs[0])
    return [st.kurtosis(mfccCoeffs[:, col]) for col in range(startMFCC, column_count)]

In [None]:
# Constants used to turn annotation times (seconds) into MFCC frame indices.
freq_rate = 32000
frame_size = 1024

trainData = []
SNRnum = 1

# Statistic functions applied, in this order, to the frames of every segment;
# each row is therefore laid out as
# [mins, maxs, medians, means, variances, skewnesses, kurtoses, class id].
_SUMMARY_STATS = (minMFCC, maxMFCC, medianMFCC, meanMFCC,
                  varianceMFCC, skewnessMFCC, kurtosisMFCC)


def _to_frame(second):
    """Convert a time in seconds to a (truncated) MFCC frame index."""
    # Frame positions advance by frame_size / 2 samples.
    return int(second * freq_rate / (frame_size / 2))


def _summary_row(frames, class_id):
    """Return the summary statistics of `frames` followed by `class_id`."""
    row = []
    for stat in _SUMMARY_STATS:
        row.extend(stat(frames))
    row.append(class_id)
    return row


for xmlnum in range(1, 67):
    file_id = "%05d" % xmlnum
    mfccs = np.loadtxt("./MFCC/training/{0}_{1}.txt".format(file_id, SNRnum))

    # Read the annotated events (start, end, class id) of this recording.
    x = minidom.parse("./XML/training/{0}.xml".format(file_id))
    events = x.getElementsByTagName('events')[0]
    itemlist = events.getElementsByTagName('item')

    time_ranges = []
    for item in itemlist:
        startsecond = float(item.getElementsByTagName("STARTSECOND")[0].firstChild.nodeValue)
        endsecond = float(item.getElementsByTagName("ENDSECOND")[0].firstChild.nodeValue)
        classid = int(item.getElementsByTagName("CLASS_ID")[0].firstChild.nodeValue)
        time_ranges.append((startsecond, endsecond, classid))

    #append event row
    for event in time_ranges:
        event_frames = mfccs[_to_frame(event[0]):_to_frame(event[1])]
        trainData.append(_summary_row(event_frames, event[2]))

    #append neutral row
    # Neutral rows (class id 1) come from the gap that precedes each of the
    # first third of the events; start_n tracks where the current gap begins.
    start_n = 0
    for event in time_ranges[:len(time_ranges) // 3]:
        end_n = _to_frame(event[0]) - 1
        # Clamp the end index: a negative end_n (event starting at frame 0)
        # would otherwise wrap around and select almost the whole recording.
        neutral_frames = mfccs[start_n:max(start_n, end_n)]
        start_n = _to_frame(event[1]) + 1

        # Adjacent or overlapping events leave no gap; the statistic
        # functions cannot summarize zero frames, so skip such gaps.
        if len(neutral_frames) == 0:
            continue

        trainData.append(_summary_row(neutral_frames, 1))

# NOTE: the ./ml/trainset/ directory must already exist.
np.savetxt("./ml/trainset/trainset_{0}.txt".format(str(SNRnum)), trainData, fmt='%f')

Building the training set using a sliding window (32 frames with 50% overlap). A window's event label is determined by the event class that prevails among the small frames it contains.

In [8]:
# Per-window annotations; the columns read below are neutral, audio_num,
# frame_num, glass, gunshot and scream.
validation = pd.read_csv('./ml/trainset_w.csv', sep='\t', index_col=0)

# Statistic functions applied, in this order, to the frames of every window;
# each row is therefore laid out as
# [mins, maxs, medians, means, variances, skewnesses, kurtoses, class id].
_WINDOW_STATS = (minMFCC, maxMFCC, medianMFCC, meanMFCC,
                 varianceMFCC, skewnessMFCC, kurtosisMFCC)


def _window_row(mfccs, frame_num, ev_class):
    """Return the summary statistics of one 32-frame window plus its class.

    Window `frame_num` starts at small frame `frame_num * 16`, so
    consecutive windows overlap by half.
    """
    st_small_frame = int(frame_num * 16)
    window_frames = mfccs[st_small_frame:st_small_frame + 32]
    row = []
    for stat in _WINDOW_STATS:
        row.extend(stat(window_frames))
    row.append(ev_class)
    return row


def _event_class(event):
    """Map the glass / gunshot / scream flags of a row to class 2 / 3 / 4."""
    if event.glass:
        return 2
    if event.gunshot:
        return 3
    if event.scream:
        return 4
    # NOTE(review): rows with none of the flags set are written with class -1
    # (unchanged behavior) — confirm whether they should be kept.
    return -1


for SNRnum in range(1, 7):
    trainData = []
    for audio_num in range(1, 67):
        mfccs = np.loadtxt("./MFCC/training/{0}_{1}.txt".format("%05d" % audio_num, SNRnum))
        in_audio = validation.audio_num == audio_num

        # Every event window of this recording becomes a training row.
        events = validation[(validation.neutral == False) & in_audio]
        for i, event in events.iterrows():
            trainData.append(_window_row(mfccs, event.frame_num, _event_class(event)))

        # Only the first third of the neutral windows is kept (class 1).
        neutral = validation[(validation.neutral == True) & in_audio]
        for i, event in neutral[:len(neutral) // 3].iterrows():
            trainData.append(_window_row(mfccs, event.frame_num, 1))
        print('audio ' + str(audio_num) + ' is done')
    # NOTE: the ./ml/trainset_w/ directory must already exist.
    np.savetxt("./ml/trainset_w/trainset_{0}.txt".format(str(SNRnum)), trainData, fmt='%f')

audio 1 is done
audio 2 is done
audio 3 is done
audio 4 is done
audio 5 is done
audio 6 is done
audio 7 is done
audio 8 is done
audio 9 is done
audio 10 is done
audio 11 is done
audio 12 is done
audio 13 is done
audio 14 is done
audio 15 is done
audio 16 is done
audio 17 is done
audio 18 is done
audio 19 is done
audio 20 is done
audio 21 is done
audio 22 is done
audio 23 is done
audio 24 is done
audio 25 is done
audio 26 is done
audio 27 is done
audio 28 is done
audio 29 is done
audio 30 is done
audio 31 is done
audio 32 is done
audio 33 is done
audio 34 is done
audio 35 is done
audio 36 is done
audio 37 is done
audio 38 is done
audio 39 is done
audio 40 is done
audio 41 is done
audio 42 is done
audio 43 is done
audio 44 is done
audio 45 is done
audio 46 is done
audio 47 is done
audio 48 is done
audio 49 is done
audio 50 is done
audio 51 is done
audio 52 is done
audio 53 is done
audio 54 is done
audio 55 is done
audio 56 is done
audio 57 is done
audio 58 is done
audio 59 is done
audio 