In [11]:
import os
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import Sequential
from keras.models import load_model
from keras.regularizers import l1,l2
from python_speech_features import mfcc, logfbank
from scipy.io import wavfile
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

%matplotlib inline

In [2]:
# our utils functions
from src.utils import *

# our classes
from classes.PreprocessData import *
from classes.EvaluateModel import *

In [3]:
# Sub-folder below models/ in which the pairwise (1-vs-1) models are saved.
folder_name="1_vs_1/mix"

### GET DATA 

In [5]:
# Load the train and test tables of the "mix" data set from their CSV files.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/mix/data41mix_train.csv', 'data/mix/data41mix_test.csv')
)

In [4]:
# Alternative 10-class mapping (currently disabled):
# label_dict={0:1,1:6,2:11,3:13,4:15,5:17,6:20,7:22,8:25,9:36}

# Identity mapping: model index k corresponds to data-set label k for all classes.
class_size=41
label_dict=dict(zip(range(class_size), range(class_size)))

### PREPROCESS DATA

In [6]:
# Instantiate the project's PreprocessData helper; its preprocess_data() method
# is what the cells below call to turn a DataFrame into features and labels.
preprocess = PreprocessData()

In [7]:
# Convert both frames into (features, labels); the helper's own normalisation is
# switched off because scaling is done explicitly with `scaler` below.
X_train, y_train = preprocess.preprocess_data(train_df, normalize=False)
X_test, y_test = preprocess.preprocess_data(test_df, normalize=False)
# new_test=test_df.loc[test_df['# target'].isin([1, 6, 11, 13, 15, 17, 20, 22, 25, 36])]

# Learn the scaling statistics on the training features only, then apply them to
# the test features.  X_train itself is not transformed in this cell.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_test = scaler.transform(X_test)

In [8]:
# Report the dimensions of the training and testing feature matrices.
for caption, matrix in (('Size of training matrix:', X_train),
                        ('Size of testing matrix:', X_test)):
    print(caption, matrix.shape)

Size of training matrix: (26027, 2808)
Size of testing matrix: (4593, 2808)


In [9]:
# collect the distinct label values that occur in the test set
unique_words = set(y_test)
print(unique_words)

{0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0}


## Model Class

In [10]:
# class to handle NN processing
class NNTrainer(object):
    '''
    Small feed-forward Keras classifier with helpers to train, score and save it.

    The network is Dense(hidden_units) -> Dropout(dropout_rate) -> Dense(n_outputs).

    Parameters
    ----------

    layer_activation: activation function of the hidden layer
    end_layer_activation: activation function of the output layer
    input_n_cols: number of columns (features) of the input layer
    optimizer_function: optimization function
    loss_function: loss function
    metrics_v: metric used to evaluate the result
    epochs_n: number of epochs used to update the weights
    batch_size_n: batch size of the fitted data
    validation_split_n: fraction of the training data held out for validation
    hidden_units: number of units of the hidden layer
    dropout_rate: dropout rate applied after the hidden layer
    n_outputs: number of units of the output layer (2 for a 1-vs-1 model)

    The choice of parameters depends on the data.
    '''

    def __init__(self, layer_activation='relu', end_layer_activation='softmax',input_n_cols=2808,
                 optimizer_function='adam',loss_function='categorical_crossentropy',metrics_v='accuracy',
                 epochs_n=15, batch_size_n=32, validation_split_n=0.2,
                 hidden_units=50, dropout_rate=0.2, n_outputs=2):

        # keep the configuration on the instance
        self.layer_activation = layer_activation
        self.end_layer_activation = end_layer_activation
        self.input_n_cols = input_n_cols
        self.optimizer_function = optimizer_function
        self.loss_function = loss_function
        self.metrics_v = metrics_v
        self.epochs_n = epochs_n
        self.batch_size_n = batch_size_n
        self.validation_split_n = validation_split_n
        self.hidden_units = hidden_units
        self.dropout_rate = dropout_rate
        self.n_outputs = n_outputs

        # define the model: one hidden layer with dropout, then the output layer
        self.model = Sequential()
        self.model.add(Dense(self.hidden_units, activation=self.layer_activation,
                             input_shape=(self.input_n_cols,)))
        self.model.add(Dropout(self.dropout_rate))
        self.model.add(Dense(self.n_outputs, activation=self.end_layer_activation))

        # compile model
        self.model.compile(optimizer=self.optimizer_function,
                           loss=self.loss_function,metrics=[self.metrics_v])

    def train(self, X_train, y_train):
        '''Fit the network on X_train / y_train (silently, verbose=0).'''
        self.model.fit(X_train, y_train, epochs=self.epochs_n,
                       batch_size=self.batch_size_n,validation_split=self.validation_split_n, verbose=0)

    def predict_probability(self, X_test):
        '''Return the output-layer activations of the model for every row of X_test.'''
        # predict_proba was only a thin wrapper around predict() and has been
        # removed from recent Keras releases; predict() works on old and new versions.
        return self.model.predict(X_test)

    def model_evaluate(self, X_test, y_test):
        '''Evaluate on X_test / y_test and return the metric as a percentage.'''
        scores = self.model.evaluate(X_test, y_test, verbose=1)
        # scores[0] is the loss, scores[1] the single compiled metric
        return scores[1]*100

    def get_nn_model(self):
        '''Return the underlying Keras model.'''
        return self.model

    def save_model(self, folder_name, model_name):
        '''Save the model as HDF5 file models/<folder_name>/<model_name>.h5.'''
        target_dir = 'models/{path}'.format(path=folder_name)
        # create the folder on first use so the save does not fail on a fresh checkout
        os.makedirs(target_dir, exist_ok=True)
        self.model.save('{folder}/{model}.h5'.format(folder=target_dir, model=model_name))

    # historical (misspelled) name, kept so existing callers keep working
    save_modle = save_model

In [27]:
# Maps a pair label "i-j" to the model of that class pair; filled by the
# training and model-loading cells below.
model_group = {}

In [13]:
# One empty list per class index; the prediction cell appends per-model scores.
prob_group = {class_idx: [] for class_idx in range(len(label_dict))}
prob_group_2 = {class_idx: [] for class_idx in range(len(label_dict))}

In [14]:
# show the freshly initialised container (every class index maps to an empty list)
prob_group

{0: [],
 1: [],
 2: [],
 3: [],
 4: [],
 5: [],
 6: [],
 7: [],
 8: [],
 9: [],
 10: [],
 11: [],
 12: [],
 13: [],
 14: [],
 15: [],
 16: [],
 17: [],
 18: [],
 19: [],
 20: [],
 21: [],
 22: [],
 23: [],
 24: [],
 25: [],
 26: [],
 27: [],
 28: [],
 29: [],
 30: [],
 31: [],
 32: [],
 33: [],
 34: [],
 35: [],
 36: [],
 37: [],
 38: [],
 39: [],
 40: []}

In [15]:
# Train one binary (1-vs-1) network for every unordered pair of classes,
# save it to disk and keep the trainer in model_group under the key "i-j".
n_classes = len(label_dict)
for i in range(n_classes):
    for j in range(i + 1, n_classes):

        first_label, second_label = label_dict[i], label_dict[j]
        save_label = str(first_label) + "-" + str(second_label)

        # keep only the rows that belong to one of the two classes of this pair
        pair_df = train_df.loc[train_df['# target'].isin([first_label, second_label])]

        # Pair-specific names on purpose: the full-size X_train / y_train created
        # earlier are no longer overwritten by the last pair's subset.
        X_pair, y_pair = preprocess.preprocess_data(pair_df, normalize=False)
        X_pair = scaler.transform(X_pair)

        # One-hot target: column 0 marks the first class of the pair,
        # column 1 marks every other row (i.e. the second class).
        is_first = np.asarray(y_pair).astype(int) == int(first_label)
        target = np.column_stack((is_first, ~is_first)).astype(int)

        # train and persist the model of this pair
        nn_trainer = NNTrainer()
        nn_trainer.train(X_pair, target)
        nn_trainer.save_modle(folder_name, save_label)

        model_group[save_label] = nn_trainer

In [None]:
# for i in range(0,10):
    
#     logprob = np.array([(model[0].predict_probability(X_test)[:,1],model[0].predict_probability(X_test)[:,0])[i<=model_i] for model_i, model in enumerate(model_group[i])]) ## prob of one gorup
#     prob_group[i] = logprob
   

In [24]:
 def atoi(text):
     """Return int(text) when text consists only of decimal digits, else text unchanged.

     isdecimal() is used instead of isdigit() because isdigit() also accepts
     characters such as superscript digits that int() cannot parse.
     """
     return int(text) if str(text).isdecimal() else text

def natural_keys(text):
    """Sort key for natural ("human") ordering, e.g. "0-2.h5" before "0-10.h5".

    The text is split into alternating non-digit / digit chunks and every digit
    chunk is converted to int, so numbers compare by value instead of character
    by character.  The input is always passed to re.split as a string, so
    all-digit names and plain integers are accepted as well.
    """
    chunks = re.split(r'(\d+)', str(text))
    # same conversion rule as atoi(): digit chunks become ints, the rest stays text
    return [int(chunk) if chunk.isdecimal() else chunk for chunk in chunks]

# Reload every saved pairwise model from disk in natural ("0-2" before "0-10")
# order and register it in model_group under its pair label.
models=[]

# same location the models were saved to
model_dir = 'models/{path}'.format(path=folder_name)
files_list = os.listdir(model_dir)
files_list.sort(key=natural_keys)

for model_name in files_list:
    # only Keras HDF5 files are models; skip anything else found in the folder
    if not model_name.endswith('.h5'):
        continue
    loaded_model = load_model('{folder}/{model}'.format(folder=model_dir, model=model_name))
    models.append(loaded_model)
    # list.append() returns None, so the model itself has to be stored here;
    # the key is the file name without its ".h5" suffix, e.g. "0-1"
    model_group[model_name[:-3]] = loaded_model

0-1
0-2
0-3
0-4
0-5
0-6
0-7
0-8
0-9
0-10
0-11
0-12
0-13
0-14
0-15
0-16
0-17
0-18
0-19
0-20
0-21
0-22
0-23
0-24
0-25
0-26
0-27
0-28
0-29
0-30
0-31
0-32
0-33
0-34
0-35
0-36
0-37
0-38
0-39
0-40
1-2
1-3
1-4
1-5
1-6
1-7
1-8
1-9
1-10
1-11
1-12
1-13
1-14
1-15
1-16
1-17
1-18
1-19
1-20
1-21
1-22
1-23
1-24
1-25
1-26
1-27
1-28
1-29
1-30
1-31
1-32
1-33
1-34
1-35
1-36
1-37
1-38
1-39
1-40
2-3
2-4
2-5
2-6
2-7
2-8
2-9
2-10
2-11
2-12
2-13
2-14
2-15
2-16
2-17
2-18
2-19
2-20
2-21
2-22
2-23
2-24
2-25
2-26
2-27
2-28
2-29
2-30
2-31
2-32
2-33
2-34
2-35
2-36
2-37
2-38
2-39
2-40
3-4
3-5
3-6
3-7
3-8
3-9
3-10
3-11
3-12
3-13
3-14
3-15
3-16
3-17
3-18
3-19
3-20
3-21
3-22
3-23
3-24
3-25
3-26
3-27
3-28
3-29
3-30
3-31
3-32
3-33
3-34
3-35
3-36
3-37
3-38
3-39
3-40
4-5
4-6
4-7
4-8
4-9
4-10
4-11
4-12
4-13
4-14
4-15
4-16
4-17
4-18
4-19
4-20
4-21
4-22
4-23
4-24
4-25
4-26
4-27
4-28
4-29
4-30
4-31
4-32
4-33
4-34
4-35
4-36
4-37
4-38
4-39
4-40
5-6
5-7
5-8
5-9
5-10
5-11
5-12
5-13
5-14
5-15
5-16
5-17
5-18
5-19
5-20
5-21
5-22
5-23

In [None]:
# Score the test set with every pairwise model.  For the model of pair (i, j):
#   prob_group[c]   collects the probability the model assigns to class c itself,
#   prob_group_2[c] collects the probability it assigns to c's opponent.
n_classes = len(label_dict)

# start from empty containers so that re-running this cell neither appends the
# same scores twice nor tries to append to the arrays built at the end
prob_group = {class_idx: [] for class_idx in range(n_classes)}
prob_group_2 = {class_idx: [] for class_idx in range(n_classes)}

for i in range(n_classes):
    for j in range(i + 1, n_classes):

        save_label = str(label_dict[i]) + "-" + str(label_dict[j])
        model = model_group[save_label]
        # the training cell stores NNTrainer wrappers, the loading cell bare Keras models
        if isinstance(model, NNTrainer):
            model = model.model

        # one forward pass per model; predict() returns the softmax outputs
        # (predict_proba is not available in recent Keras releases)
        pair_probs = np.asarray(model.predict(X_test))
        prob_i = pair_probs[:, 0]
        prob_j = pair_probs[:, 1]

        prob_group[i].append(prob_i)
        prob_group[j].append(prob_j)

        prob_group_2[i].append(prob_j)
        prob_group_2[j].append(prob_i)

# prob_group[c] becomes an (n_samples, n_models) array; prob_group_2 stays a
# dict of lists of per-model vectors
for i in range(n_classes):
    prob_group[i] = np.array(prob_group[i]).T

In [None]:
# Aggregate, per test sample and class, the scores of all pairwise models the
# class takes part in.
mean_l = []
max_l = []

mean_l_2 = []
min_l_2 = []
for i in range(0,len(label_dict)):
    # prob_group[i] is an (n_samples, n_models) array -> reduce across the models
    own_scores = prob_group[i]
    mean_l.append(np.mean(own_scores, axis=1))
    max_l.append(np.max(own_scores, axis=1))

    # prob_group_2[i] is a plain list of per-model vectors (the prediction cell
    # only transposes prob_group), i.e. (n_models, n_samples) once stacked, so
    # the models are on axis 0 here
    opponent_scores = np.asarray(prob_group_2[i])
    mean_l_2.append(np.mean(opponent_scores, axis=0))
    min_l_2.append(np.min(opponent_scores, axis=0))

# one row per test sample, one column per class
result_mean=np.vstack(mean_l).T
result_max=np.vstack(max_l).T

# built from the opponent statistics (previously these repeated mean_l / max_l)
result_mean_2=np.vstack(mean_l_2).T
result_min_2=np.vstack(min_l_2).T

In [None]:

# How a prediction is chosen for each test sample:
#   * take the class with the largest score in logprob_1;
#   * when several classes share exactly that score, prefer the one with the
#     smallest score in logprob_2.
logprob_1=result_mean
logprob_2=result_mean_2
n_samples = y_test.shape[0]
result=np.zeros((n_samples), dtype=int)

for i in range(n_samples):
    # running best: primary score, its secondary score and the class index
    max_n, max_min, idx = -100, 100, 1
    max_array = []
    for j in range(len(label_dict)):
        candidate = logprob_1[i][j]
        secondary = logprob_2[i][j]
        max_array.append(candidate)
        # strictly better primary score, or an exact tie won on the secondary score
        if candidate > max_n or (candidate == max_n and secondary < max_min):
            max_n, max_min, idx = candidate, secondary, j

    # ascending order: best score last, runner-up second to last
    max_array.sort()

    # a hit needs the right label; the two extra conditions are thresholds on the
    # best score and on its margin over the runner-up (both currently 0)
    label_matches = int(y_test[i]) == int(label_dict[idx])
    if label_matches and max_array[-1] >= .0 and max_array[-1] - max_array[-2] >= 0.0:
        result[i] = 1


overall_acc = np.mean(result)*100
print('Overall unseen test accuracy: %.4f percent' % overall_acc)

### RESULTS

Layer: 1, node: 50

Max: 71.8579

Mean: 99.36

With new approach: logprob1, logprob2 / Layer: 1, node: 50
    
Mean: 99.76

Max: 76.68