Load the MFCCs for the audio training set. The first 25 coefficients are extracted for each frame. Next, the per-frame values of each coefficient are summarized across time using the following summary statistics: minimum, maximum, median, mean, variance, skewness, and kurtosis. The resulting feature vector for training has 175 features.

In [None]:
import numpy as np
import pandas as pd
from xml.dom import minidom
import scipy.stats.stats as st

# Index of the first coefficient column included by the summary-statistic
# functions below; with 0 every column is summarized.
startMFCC = 0

#summary statistics
def minMFCC(mfccCoeffs):
    """Return the per-coefficient minimum across all frames.

    ``mfccCoeffs`` is indexed as ``[frame, coefficient]``; columns before
    ``startMFCC`` are skipped. The result is a list with one value per
    remaining column.
    """
    columns = range(startMFCC, len(mfccCoeffs[0]))
    return [min(mfccCoeffs[:, col]) for col in columns]

def maxMFCC(mfccCoeffs):
    """Return the per-coefficient maximum across all frames.

    ``mfccCoeffs`` is indexed as ``[frame, coefficient]``; columns before
    ``startMFCC`` are skipped. The result is a list with one value per
    remaining column.
    """
    columns = range(startMFCC, len(mfccCoeffs[0]))
    return [max(mfccCoeffs[:, col]) for col in columns]

def medianMFCC(mfccCoeffs):
    """Return the per-coefficient median across all frames.

    ``mfccCoeffs`` is indexed as ``[frame, coefficient]``; columns before
    ``startMFCC`` are skipped. The result is a list with one value per
    remaining column.
    """
    columns = range(startMFCC, len(mfccCoeffs[0]))
    return [np.median(mfccCoeffs[:, col]) for col in columns]

def meanMFCC(mfccCoeffs):
    """Return the per-coefficient arithmetic mean across all frames.

    ``mfccCoeffs`` is indexed as ``[frame, coefficient]``; columns before
    ``startMFCC`` are skipped. The result is a list with one value per
    remaining column.
    """
    columns = range(startMFCC, len(mfccCoeffs[0]))
    return [np.mean(mfccCoeffs[:, col]) for col in columns]

def varianceMFCC(mfccCoeffs):
    """Return the per-coefficient variance across all frames.

    ``mfccCoeffs`` is indexed as ``[frame, coefficient]``; columns before
    ``startMFCC`` are skipped. The variance is computed with ``np.var`` and
    its default arguments. The result is a list with one value per remaining
    column.
    """
    columns = range(startMFCC, len(mfccCoeffs[0]))
    return [np.var(mfccCoeffs[:, col]) for col in columns]

def skewnessMFCC(mfccCoeffs):
    """Return the per-coefficient skewness across all frames.

    ``mfccCoeffs`` is indexed as ``[frame, coefficient]``; columns before
    ``startMFCC`` are skipped. Skewness is computed with ``st.skew`` and its
    default arguments. The result is a list with one value per remaining
    column.
    """
    columns = range(startMFCC, len(mfccCoeffs[0]))
    return [st.skew(mfccCoeffs[:, col]) for col in columns]

def kurtosisMFCC(mfccCoeffs):
    """Return the per-coefficient kurtosis across all frames.

    ``mfccCoeffs`` is indexed as ``[frame, coefficient]``; columns before
    ``startMFCC`` are skipped. Kurtosis is computed with ``st.kurtosis`` and
    its default arguments. The result is a list with one value per remaining
    column.
    """
    columns = range(startMFCC, len(mfccCoeffs[0]))
    return [st.kurtosis(mfccCoeffs[:, col]) for col in columns]

In [None]:
# Build the training matrix: one row of summary statistics per annotated
# event, plus "neutral" rows taken from the gaps before the first third of
# the events in each recording. The last column of every row is the label.
freq_rate = 32000  # presumably the audio sampling rate in Hz -- TODO confirm
frame_size = 1024  # samples per analysis frame

trainData = []
SNRnum = 1


def _seconds_to_frame(seconds):
    """Convert a time in seconds to an MFCC row index.

    Rows are assumed to advance by half a ``frame_size`` -- TODO confirm
    against the MFCC extraction settings.
    """
    return int(seconds * freq_rate / (frame_size / 2))


def _summarize_frames(frames):
    """Concatenate the seven per-coefficient summary statistics of ``frames``."""
    row = []
    for summarize in (minMFCC, maxMFCC, medianMFCC, meanMFCC,
                      varianceMFCC, skewnessMFCC, kurtosisMFCC):
        row.extend(summarize(frames))
    return row


for xmlnum in range(1, 67):
    mfccs = np.loadtxt("./MFCC/training/{0:05d}_{1}.txt".format(xmlnum, SNRnum))

    x = minidom.parse("./XML/training/{0:05d}.xml".format(xmlnum))
    events = x.getElementsByTagName('events')[0]
    itemlist = events.getElementsByTagName('item')

    # (start second, end second, class id) for every annotated event
    time_ranges = []
    for item in itemlist:
        startsecond = float(item.getElementsByTagName("STARTSECOND")[0].firstChild.nodeValue)
        endsecond = float(item.getElementsByTagName("ENDSECOND")[0].firstChild.nodeValue)
        classid = int(item.getElementsByTagName("CLASS_ID")[0].firstChild.nodeValue)
        time_ranges.append((startsecond, endsecond, classid))

    # append event row
    for event in time_ranges:
        start_frame = _seconds_to_frame(event[0])
        end_frame = _seconds_to_frame(event[1])
        event_frames = mfccs[start_frame:end_frame]
        if len(event_frames) == 0:
            # The statistic helpers index the first frame and would raise
            # IndexError on an empty slice (zero-length event, or an event
            # lying beyond the end of the MFCC file).
            print("file {0:05d}: skipping event {1}s-{2}s (no frames)".format(
                xmlnum, event[0], event[1]))
            continue

        sum_mfcc = _summarize_frames(event_frames)
        sum_mfcc.append(event[2])
        trainData.append(sum_mfcc)

    # append neutral row
    start_n = 0
    for event in time_ranges[:(int(len(time_ranges) / 3))]:
        start_frame = _seconds_to_frame(event[0])
        end_frame = _seconds_to_frame(event[1])

        # Clamp at 0: a negative stop index would wrap around and select
        # nearly the whole recording, events included.
        end_n = max(start_frame - 1, 0)
        event_frames = mfccs[start_n:end_n]

        # Never move the cursor backwards, so an overlapping event cannot
        # pull the next neutral segment into an earlier event.
        start_n = max(start_n, end_frame + 1)

        if len(event_frames) == 0:
            # No gap before this event (it starts at the beginning of the
            # file or directly after the previous event).
            continue

        sum_mfcc = _summarize_frames(event_frames)
        sum_mfcc.append(1)  # label written for neutral rows
        trainData.append(sum_mfcc)

np.savetxt("./ml/trainset/trainset_{0}.txt".format(str(SNRnum)), trainData, fmt='%f')