### Load training data

In [1]:
import numpy as np
import pandas as pd
from itertools import chain

In [2]:
trainData4 = np.loadtxt('./ml/trainset/trainset_4.txt')
trainData2 = np.loadtxt('./ml/trainset/trainset_2.txt')

Add to training data with SNR 10dB neutral events with SNR 20dB

In [3]:
index = [i for i in range(0, len(trainData2))]
columns = [i for i in range(1,177)]
columns[175] = 'ev'
trainData2_df = pd.DataFrame(data=trainData2, index=index, columns = columns)

index = [i for i in range(0, len(trainData4))]
trainData4_df = pd.DataFrame(data=trainData4, index=index, columns = columns)

trainData = trainData2_df.append(trainData4_df[trainData4_df.ev == 1.0])

### Machine learning

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm


X = trainData.as_matrix(columns = trainData.columns[0:175])
y = trainData.ev.values

### Detection

In [5]:
import scipy.stats.stats as st

startMFCC = 0

def minMFCC(mfccCoeffs):
    mfccMins = []
    for i in range(startMFCC, len(mfccCoeffs[0])):
        mfccMins.append(min(mfccCoeffs[:,i]))
    return mfccMins

def maxMFCC(mfccCoeffs):
    mfccMaxs = []
    for i in range(startMFCC, len(mfccCoeffs[0])):
        mfccMaxs.append(max(mfccCoeffs[:,i]))
    return mfccMaxs

def medianMFCC(mfccCoeffs):
    mfccMedians = []
    for i in range(startMFCC, len(mfccCoeffs[0])):
        mfccMedians.append(np.median(mfccCoeffs[:,i]))
    return mfccMedians

def meanMFCC(mfccCoeffs):
    mfccMeans = []
    for i in range(startMFCC, len(mfccCoeffs[0])):
        mfccMeans.append(np.mean(mfccCoeffs[:,i]))
    return mfccMeans

def varianceMFCC(mfccCoeffs):
    mfccVars = []
    for i in range(startMFCC, len(mfccCoeffs[0])):
        mfccVars.append(np.var(mfccCoeffs[:,i]))
    return mfccVars

def skewnessMFCC(mfccCoeffs):
    mfccSkews = []
    for i in range(startMFCC, len(mfccCoeffs[0])):
        mfccSkews.append(st.skew(mfccCoeffs[:,i]))
    return mfccSkews

def kurtosisMFCC(mfccCoeffs):
    mfccKurts = []
    for i in range(startMFCC, len(mfccCoeffs[0])):
        mfccKurts.append(st.kurtosis(mfccCoeffs[:,i]))
    return mfccKurts

In [6]:
validation = pd.read_csv('./ml/validation.csv', sep='\t', index_col=0)
validation = validation[validation.neutral == False]
print(validation[:2])
print(len(validation))

    audio_num  frame_num neutral glass gunshot scream
20          1         20   False  True   False  False
21          1         21   False  True   False  False
3808


Test results without neutral events, only hazardous events.

In [8]:
for est_count in range(50, 501, 50):
    ml = RandomForestClassifier(n_estimators=est_count)
    ml = ml.fit(X,y)

    result_df = pd.DataFrame(columns=['SNR', 'concurrency'])

    for snr_num in range(6 , 0, -1):
        frames_big_df = pd.DataFrame(columns=['audio_num' ,'frame_num', 'neutral', 'glass', 'gunshot', 'scream'])

        for audio_num in range(1, 30):
            audio_mfcc = np.loadtxt('./MFCC/testing/{0}_{1}.txt'.format(str("%05d" % (audio_num)), 
                                                                        str(snr_num)))

            val_len = len(validation[validation.audio_num == audio_num])

            big_window_size = 32
            for frame_num, start_frame in enumerate(range(0, (val_len*16), int(big_window_size / 2))):
                frame_big = audio_mfcc[start_frame:start_frame + big_window_size]
                predict_obj = list(chain(minMFCC(np.asarray(frame_big)),
                                   maxMFCC(np.asarray(frame_big)),
                                   medianMFCC(np.asarray(frame_big)),
                                   meanMFCC(np.asarray(frame_big)),
                                   varianceMFCC(np.asarray(frame_big)),
                                   skewnessMFCC(np.asarray(frame_big)),
                                   kurtosisMFCC(np.asarray(frame_big))))
                r_cl = int(ml.predict(np.array(predict_obj).reshape(1,-1)))

                frames_big_df.loc[len(frames_big_df)] = [audio_num, frame_num, (1 == r_cl), (2 == r_cl), 
                                                        3 == r_cl, 4 == r_cl]

        frames_big_df = frames_big_df[frames_big_df.neutral == False]
        diff = frames_big_df[~frames_big_df.isin(validation).all(1)]
        res = ((len(validation) - len(diff)) / len(validation))

        result_df.loc[len(result_df)] = [snr_num * 5, round(res, 2)]

    print('predict with ' + str(est_count) + ' estimators')
    print(result_df)

predict with 50 estimators
   SNR  concurrency
0   30         0.77
1   25         0.77
2   20         0.78
3   15         0.76
4   10         0.72
5    5         0.41
predict with 100 estimators
   SNR  concurrency
0   30         0.78
1   25         0.78
2   20         0.77
3   15         0.76
4   10         0.72
5    5         0.41
predict with 150 estimators
   SNR  concurrency
0   30         0.78
1   25         0.78
2   20         0.78
3   15         0.77
4   10         0.74
5    5         0.42
predict with 200 estimators
   SNR  concurrency
0   30         0.78
1   25         0.79
2   20         0.78
3   15         0.77
4   10         0.73
5    5         0.42
predict with 250 estimators
   SNR  concurrency
0   30         0.77
1   25         0.77
2   20         0.77
3   15         0.77
4   10         0.73
5    5         0.42
predict with 300 estimators
   SNR  concurrency
0   30         0.77
1   25         0.77
2   20         0.77
3   15         0.76
4   10         0.71
5    5       

22

In [18]:
validation[:2] #true answer

Unnamed: 0,audio_num,frame_num,neutral,glass,gunshot,scream
20,1,20,False,True,False,False
21,1,21,False,True,False,False


In [19]:
frames_big_df[:2] #prediction

Unnamed: 0,audio_num,frame_num,neutral,glass,gunshot,scream
0,29,0,False,False,True,False
1,29,1,False,False,True,False


Test results with neutral events

In [8]:
result_df #400 tree

Unnamed: 0,SNR,concurrency
0,30,0.92
1,25,0.92
2,20,0.92
3,15,0.91
4,10,0.87
5,5,0.55


In [11]:
result_df #300 tree

Unnamed: 0,SNR,concurrency
0,30,0.93
1,25,0.93
2,20,0.92
3,15,0.91
4,10,0.87
5,5,0.54


In [14]:
result_df #200 tree

Unnamed: 0,SNR,concurrency
0,30,0.93
1,25,0.93
2,20,0.93
3,15,0.91
4,10,0.87
5,5,0.55


In [7]:
result_df #100 tree

Unnamed: 0,SNR,concurrency
0,30,0.92
1,25,0.92
2,20,0.91
3,15,0.9
4,10,0.86
5,5,0.55
