In [1]:
import numpy as np
import io
from hmmlearn import hmm
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
%matplotlib inline

from utils import *

In [2]:
# class to handle HMM processing
class HMMTrainer(object):
    '''
    Thin wrapper that builds one hmmlearn model, fits it and scores sequences.

    Parameters
    ----------

    model_name: name of the hmmlearn model to build; only 'GaussianHMM' is
        supported, any other value raises TypeError
    n_components: number of hidden states
    covariance_type: covariance structure of the emission Gaussians
        (forwarded unchanged to hmmlearn)
    init_params: letters selecting which parameters hmmlearn initializes
        before training (hmmlearn codes: 's' startprob, 't' transmat,
        'm' means, 'c' covars)
    params: letters selecting which parameters are updated during training
        (same codes as init_params)
    n_iter: maximum number of training iterations
    random_state: optional seed / RandomState forwarded to hmmlearn so a run
        can be reproduced; None (the default) keeps the unseeded behaviour

    Choice of parameters depends on the data.
    '''
    def __init__(self, model_name='GaussianHMM', n_components=3, covariance_type="diag",
                 init_params="cm", params="cmt", n_iter=1000, random_state=None):

        # keep the configuration so it can be inspected after construction
        self.model_name = model_name
        self.n_components = n_components
        self.covariance_type = covariance_type
        self.init_params = init_params
        self.params = params
        self.n_iter = n_iter
        self.random_state = random_state
        # every object returned by fit() is recorded here by train()
        self.models = []

        # define model
        if self.model_name == 'GaussianHMM':
            self.model = hmm.GaussianHMM(n_components=self.n_components,
                                         covariance_type=self.covariance_type,
                                         init_params=self.init_params,
                                         params=self.params,
                                         n_iter=self.n_iter,
                                         random_state=self.random_state)
        else:
            raise TypeError('Invalid model type')

    def train(self, X, indices):
        '''
        Fit the wrapped model and append the fitted object to self.models.

        X: 2D array with one k-dimensional observation per row; several
            sequences are stacked one after another
        indices: length (number of rows) of each stacked sequence, passed to
            fit() as its second argument
        '''
        # Silence floating-point warnings (e.g. division by zero) only while
        # fitting; np.seterr would change the setting for the whole process.
        with np.errstate(all='ignore'):
            fitted = self.model.fit(X, indices)
        self.models.append(fitted)

    def get_score(self, input_data):
        '''Return the wrapped model's score() for input_data.'''
        # Same local silencing as in train(), so scoring stays quiet without
        # relying on a global NumPy setting left behind by an earlier call.
        with np.errstate(all='ignore'):
            return self.model.score(input_data)

### Get data

In [3]:
def load_mffc_data(file, n_coeffs=13):
    '''
    Read labelled feature sequences from a text file.

    Every non-blank line is expected to look like ``<label>#<v>;<v>;...``.
    The values after '#' are split into consecutive frames of ``n_coeffs``
    numbers. Reading of a line stops at the first frame whose values sum to
    zero, so that frame and everything after it are dropped.
    NOTE(review): this looks like zero-padding removal - confirm against
    the tool that writes these files.

    Parameters
    ----------
    file: path of the UTF-8 text file to read
    n_coeffs: number of values per frame (default 13)

    Returns
    -------
    data: 1-D object array with one float32 array of shape
        (n_frames, n_coeffs) per line
    labels: float32 array with the label of each line

    Raises
    ------
    ValueError: a line has no '#', a value is not numeric, or a non-zero
        trailing frame holds fewer than n_coeffs values
    '''
    sequences = []
    labels = []

    # read features data
    with io.open(file, encoding='utf-8') as myfile:
        for line_no, line in enumerate(myfile, start=1):
            line = line.strip()
            if not line:
                # tolerate blank lines (typically a trailing newline at EOF)
                continue

            # the label is everything before the first '#'
            index = line.find('#')
            if index < 0:
                raise ValueError("line %d of %s has no '#' separator" % (line_no, file))
            labels.append(line[:index])

            # the rest of the line is a ';'-separated list of numbers
            values = np.array(line[index + 1:].replace(" ", "").split(';'), dtype='f')

            frames = []
            for start in range(0, len(values), n_coeffs):
                frame = values[start:start + n_coeffs]

                # first zero-sum frame ends the sequence
                if sum(frame) == 0:
                    break
                if len(frame) != n_coeffs:
                    raise ValueError('line %d of %s ends with an incomplete frame '
                                     '(%d of %d values)' % (line_no, file, len(frame), n_coeffs))
                frames.append(frame)

            sequences.append(np.array(frames, dtype=values.dtype).reshape(len(frames), n_coeffs))

    # Sequences have different lengths, so they cannot form a regular array:
    # np.array() on such a list raises on NumPy >= 1.24. Fill an object array
    # element by element, which also keeps the result 1-D when every sequence
    # happens to have the same length.
    data = np.empty(len(sequences), dtype=object)
    for position, sequence in enumerate(sequences):
        data[position] = sequence

    return data, np.array(labels, dtype='f')

In [4]:
# Load the three splits in order (train, test, cross); each call returns a
# (features, labels) pair that is unpacked into its own pair of names.
(X_train, y_train), (X_test, y_test), (X_cost, y_cost) = (
    load_mffc_data(split_path)
    for split_path in ('train_mfcc.txt', 'test_mfcc.txt', 'cross_mfcc.txt')
)

In [5]:
# Distinct training labels in ascending order. A plain set has no guaranteed
# iteration order, and the order here decides the order in which the
# per-label models are trained and stored.
unique_labels = sorted(set(y_train))

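#### Scale values

_Note: no scaling step follows this heading; the features are passed to the HMMs as loaded. TODO: confirm whether scaling was intended._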

### Train HMM Models

In [6]:
hmm_models = []  # (trained HMMTrainer, label) pairs, one per label
for label in unique_labels:

    # every training sequence recorded for this label
    tmp_label_data = X_train[y_train == label]

    # number of rows in each sequence; handed to train() together with the
    # stacked rows so the sequence boundaries are not lost
    indices = [sequence.shape[0] for sequence in tmp_label_data]

    # Stack the rows of all sequences with a single copy. Calling np.append
    # inside a loop re-copies the accumulated array on every iteration.
    train_data = np.concatenate(list(tmp_label_data), axis=0)

    print(train_data.shape)

    hmm_trainer = HMMTrainer()
    hmm_trainer.train(train_data, indices)
    hmm_models.append((hmm_trainer, label))

(36806, 13)




(24161, 13)




(19999, 13)




(23173, 13)




(32810, 13)




(31761, 13)




(32634, 13)




(35491, 13)




(24541, 13)




(26365, 13)




(29533, 13)




(59858, 13)




(50326, 13)




(64254, 13)




(49275, 13)




(72369, 13)




(51195, 13)




(18675, 13)




(51024, 13)




(29594, 13)




(21154, 13)




(61287, 13)




(32278, 13)




(35616, 13)




(53172, 13)




(48774, 13)




(51697, 13)




(60603, 13)




(49872, 13)




(18930, 13)




(19740, 13)




(23340, 13)




(19476, 13)




(24642, 13)




(52173, 13)




(29417, 13)




(56985, 13)




(54097, 13)




(39923, 13)




(30567, 13)




(54682, 13)




#### Get Result

In [7]:
# Score of every test sequence under every trained model:
# one row per entry of hmm_models, one column per test sequence.
logprob = np.array([[m[0].get_score(i) for i in X_test] for m in hmm_models])

# Label stored next to each model, in the same order as the rows of logprob.
model_labels = np.array([m[1] for m in hmm_models])

# argmax gives the ROW INDEX of the best-scoring model, not a label; translate
# it to that model's label before comparing with the true labels. Comparing
# the raw index only works when model order happens to equal label value.
predicted_label = model_labels[np.argmax(logprob, axis=0)]
error = (predicted_label != y_test)
print('Overall test accuracy: %.2f percent' % (100 * (1 - np.mean(error))))

Overall test accuracy: 82.69 percent
