## Building Hidden Markov Models (*HMMs*) for speech recognition

We'll use *Gaussian HMMs* to model our data. *HMMs* are great tool for modeling time series data. As an audio signa is a time series signal, so the *HMMs* fit our needs. 

*HMMs* are popular because they can be trained automatically and are simple and computationally feasible to use. A *HMM* represent probability distrubutions over sequencs of observations.

In this document we use **hmmlearn** package built with Sphinx and **python_speech features** built for MFCC.


In [132]:
import os
import argparse 

import numpy as np
from scipy.io import wavfile 
from hmmlearn import hmm
from python_speech_features import mfcc

In [1]:
# Class to handle all HMM related processing
class HMMTrainer(object):
    def __init__(self, model_name='GaussianHMM', n_components=4, cov_type='diag', n_iter=1000):
        self.model_name = model_name
        self.n_components = n_components
        self.cov_type = cov_type
        self.n_iter = n_iter
        self.models = []

        if self.model_name == 'GaussianHMM':
            self.model = hmm.GaussianHMM(n_components=self.n_components, 
                    covariance_type=self.cov_type, n_iter=self.n_iter)
        else:
            raise TypeError('Invalid model type')

    # X is a 2D numpy array where each row is 13D
    def train(self, X):
        np.seterr(all='ignore')
        self.models.append(self.model.fit(X))

    # Run the model on input data
    def get_score(self, input_data):
        return self.model.score(input_data)
    

if __name__=='__main__':
# init the variables of all HMM models
    hmm_models = []
    
    # define train folder
    dataset = 'train'
    input_folder = 'data\{}'.format(dataset)
    
    
    # get path of input files
    try:
        audiofiles = os.listdir(input_folder)
    except FileNotFoundError:
        assert False, "Folder not found"

    # pars the input directory that contains audio files
    for dirname in audiofiles:
        # get the name of the subfolder
        subfolder = os.path.join(input_folder, dirname)
        
        if not os.path.isdir(subfolder):
            continue
        
        print(subfolder)
        
        
        # extract the label
        label = subfolder[subfolder.rfind('\\')+1:]
        
        # initialize intput variables and labels
        X = np.array([])
        y_words = []
        
        # iterate through the audio files
        for filename in [x for x in os.listdir(subfolder) if x.endswith('.wav')]:
            # Read the input file
            filepath = os.path.join(subfolder, filename)
            sampling_freq, audio = wavfile.read(filepath)
            
            # Extract MFCC features
            mfcc_features = mfcc(audio, sampling_freq)

            # Append to the variable X
            if len(X) == 0:
                X = mfcc_features
            else:
                X = np.append(X, mfcc_features, axis=0)
            
            # Append the label
            y_words.append(label)

        print 'X.shape =', X.shape
        # Train and save HMM model
        hmm_trainer = HMMTrainer()
        hmm_trainer.train(X)
        hmm_models.append((hmm_trainer, label))
        hmm_trainer = None


SyntaxError: Missing parentheses in call to 'print'. Did you mean print('X.shape =', X.shape)? (<ipython-input-1-4c376b001d7e>, line 77)

We need train data to build our speech recognizer. We will use the database available at [here](https://code.google.com/archive/p/hmm-speech-recognition/downloads). This data set contains seven different words, where each word has 15 audio files associated with it. We'll build an *HMM* model for each class by training our model on given dataset. Then after build model, given new input file, we need to run all the models on this file and pick the one with the best score. 

In [134]:
if __name__ == '__main__':

    # init the variables of all HMM models
    hmm_models = []
    
    # define train folder
    dataset = 'train'
    input_folder = 'data\{}'.format(dataset)
    
    
    # get path of input files
    try:
        audiofiles = os.listdir(input_folder)
    except FileNotFoundError:
        assert False, "Folder not found"

    # pars the input directory that contains audio files
    for dirname in audiofiles:
        # get the name of the subfolder
        subfolder = os.path.join(input_folder, dirname)
        
        if not os.path.isdir(subfolder):
            continue
        
        print(subfolder)
        
        
        # extract the label
        label = subfolder[subfolder.rfind('\\')+1:]
        
        # initialize intput variables and labels
        X = np.array([])
        y_words = []
        
        # iterate through the audio files
        for filename in [x for x in os.listdir(subfolder) if x.endswith('.wav')]:
            # read the input file
            filepath = os.path.join(subfolder, filename)
            sampling_freq, audio = wavfile.read(filepath)
            
            # extarct MFFC features
            mfcc_features = mfcc(audio, sampling_freq)
            
            # append to the variable X
            
            if len(X) == 0:
                X = mfcc_features
            else:
                X = np.append(X, mfcc_features, axis=0)
                
            # append the label
            y_words.append(label)

data\train\apple
data\train\banana
data\train\kiwi
data\train\lime
data\train\orange
data\train\peach
data\train\pineapple


After extracted features from all the files, train and save the HMM model.
As HMM is a generative model for unsupervised learning, we don't need labels
to build HMM modes for each class. We explitly assume that seperate HMM models
will be built for each class

In [135]:
        # Train and save HMM model
        hmm_trainer = HMMTrainer()
        hmm_trainer.train(X)
        hmm_models.append(hmm_trainer, label)
        hmm_trainer = None



TypeError: append() takes exactly one argument (2 given)

In [97]:
    # define test folder
    dataset = 'test'
    input_folder = 'data\{}'.format(dataset)
    
    # test folder
    test_files = []

    # get path of test files
    try:
        audiofiles = os.listdir(input_folder)
    except FileNotFoundError:
        assert False, "Folder not found"

    # pars the input directory that contains audio files
    for dirname in audiofiles:

        # get the name of the subfolder
        subfolder = os.path.join(input_folder, dirname)

        if not os.path.isdir(subfolder):
                continue

        # iterate through the audio files
        for filename in [x for x in os.listdir(subfolder) if x.endswith('.wav')]:
            # read the input file

            filepath = os.path.join(subfolder, filename)
            test_files.append(filepath)      

In [98]:
    test_files

['data\\test\\apple\\apple15.wav',
 'data\\test\\banana\\banana15.wav',
 'data\\test\\kiwi\\kiwi15.wav',
 'data\\test\\lime\\lime15.wav',
 'data\\test\\orange\\orange15.wav',
 'data\\test\\peach\\peach15.wav',
 'data\\test\\pineapple\\pineapple15.wav']

In [115]:
    # classify input data
    for test_file in test_files:
        sampling_freq, audio = wavfile.read(test_file)
        
        #extract MFCC features
        mfcc_features = mfcc(audio, sampling_freq)
        
        # define variables
        max_score = None
        output_label = None
        
        # iterate thorugh all HMM models and pick the one with best score
        for item in hmm_models:
            hmm_model, label = item
            
            score = hmm_model.get_score(mfcc_features)
            if score > max_score:
                max_score = score
                output_label = label
                # print the output
                
        print ("\nTrue: " + test_file[test_file.find('\\')+6:test_file.rfind('\\')])
        print ("Predicted: " + str(output_label))


True: apple
Predicted: None

True: banana
Predicted: None

True: kiwi
Predicted: None

True: lime
Predicted: None

True: orange
Predicted: None

True: peach
Predicted: None

True: pineapple
Predicted: None
