# GMMHMM

In [1]:
import numpy as np

In [2]:
# mixture model class
class GMMInfo:
    def __init__(self):
        self.weight=[] #gmm weight
        self.mean=[] #gmm mean
        self.var=[] # gmm diagonal covariance
        self.num=0 # number of gmms
# hmm class
class HMMInfo:
    def __init__(self):
        self.init=[]
        self.edge_cost=[]
        self.mix=[]
        self.num=0

In [3]:
def log_Gaussian(m,C,x):
    left=0.5*np.sum(np.log((2*np.pi)*C),axis=1)
    right=0.5*np.sum(np.square((x-m))/C,axis=1)
    return left+right

def mixture_log_Gaussian(mix,x):
    mu=mix.mean
    var=mix.var
    w=mix.weight
    cost=log_Gaussian(mu,var,x)
    # print(w,cost)
    w=np.array(w)
    total_cost=np.sum(w*cost)
    return total_cost

def Gaussian(m,C,x):
    part1=np.sqrt((2*np.pi)**2*np.prod(C,axis=1))
    part2=0.5*np.sum(np.square(x-m)/C,axis=1)
    prob=(1/part1)*np.exp(-part2)
    return prob


def mix_Gaussian(mix,x):
    m=mix.mean
    var=mix.var
    w=mix.weight
    prob=log_Gaussian(m,var,x)
    total_prob=np.sum(w*prob)
    return total_prob

In [4]:
def GMMHMM_DTW(HMM,data):
    # matrix that records the edge costs
    T=HMM.edge_cost
    zeros=np.zeros([39])
    ones=np.zeros([39])+1
    mixture_models=[]
    #create a GMM for the initial state
    init_GMM=GMMInfo()
    init_GMM.weight=[1]
    init_GMM.num=1
    init_GMM.mean.append(zeros)
    init_GMM.var.append(ones)
    init_GMM.mean=np.array(init_GMM.mean)
    init_GMM.var=np.array(init_GMM.var)
    mixture_models.append(init_GMM)
    for mix_model in HMM.mix:
        mixture_models.append(mix_model)
    data=np.vstack([zeros,data])
    s=len(data)
    t=len(mixture_models)
    #Matrix that stores the costs
    P=np.zeros([t,s])
    #dynamic time programming algo
    for j in range(0,s):
        for i in range(t):
            #node score
            # print((i,j))
            Cij=mixture_log_Gaussian(mixture_models[i],data[j])
            
                
            if i>=2:
                P[i][j]=min(P[i][j-1]+T[i][i],P[i-1][j-1]+T[i-1][i],
                            P[i-2][j-1]+T[i-2][i])+Cij
            elif i-1>=0:
                P[i][j]=min(P[i][j-1]+T[i][i],P[i-1][j-1]+T[i-1][i])+Cij
            else:
                P[i][j]=P[i][j]+Cij
    P=P/s
    total_cost=P[-1][-1]
    return total_cost,get_states(P)

In [5]:
# Get the list that records which state each frame belongs to
def get_states(P):
    current_state,current_frame=np.array(P.shape)-1
    states=[current_state]
    while current_state>0 and current_frame>1:
      
      current_frame-=1
      if current_state>2:
          to_check=[P[current_state][current_frame-1],P[current_state-1][current_frame-1],P[current_state-2][current_frame-1]]
          track=np.argmin(to_check)
      elif current_state>1:
          to_check=[P[current_state][current_frame-1],P[current_state-1][current_frame-1]]
          track=np.argmin(to_check)
      else:
          track=0
      if track==0:
          states.insert(0,current_state)
      elif track==1:
          current_state-=1
          states.insert(0,current_state)
      else:
          current_state-=2
          states.insert(0,current_state)
    return states
    

In [6]:
def seperate_templates(templates, num):
    states_info=[]
    for i in range(len(templates)):
        # For the ith template, the number of nodes in each state is initialized evenly
        nodes_num=len(templates[i])//num
        remaining_nodes=len(templates[i])%num
        curr_states_info=np.zeros(len(templates[i])).astype(int)
        for state in range(1,num+1):
            curr_states_info[nodes_num*(state-1):nodes_num*state]=state
        if remaining_nodes>0:
            curr_states_info[-remaining_nodes:]=num
        states_info.append(curr_states_info)
    return states_info

In [7]:
def get_node_in_each_state(templates, state_num, states_info):
    node_in_each_state=[]
    for state in range(state_num+1):
        node_in_each_state.append([])
    for j in range(len(templates)):
        # print('len:',len(states_info[j]))
        for m in range(len(states_info[j])):
            k=int(states_info[j][m])
            node_in_each_state[k].append(templates[j][m])
    return node_in_each_state

In [8]:
def get_edge_cost(states_info,state_num):

    shift_prob=np.zeros((state_num+1,state_num+1))
    num_nodes_in_state=np.zeros(state_num+1)
    for i in range(len(states_info)):
        shift_prob[0][states_info[i][0]]+=1
    # count the state trainsition
    for i in range(len(states_info)):
        for j in range(len(states_info[i])-1):
            current_node=states_info[i][j]
            next_node=states_info[i][j+1]
            shift_prob[current_node][next_node]+=1
            num_nodes_in_state[current_node]+=1
        shift_prob[states_info[i][-2]][states_info[i][-1]]+=1
        num_nodes_in_state[states_info[i][-1]]+=1
    # get the probabilities of going from initial state to 1~state_num states 
    for j in range(state_num+1):
        N=len(states_info)
        N_0j=shift_prob[0][j]
        shift_prob[0][j]=N_0j/N
        if N_0j==0:
            shift_prob[0][j]=np.inf
        else:
            shift_prob[0][j]=-np.log(shift_prob[0][j])
    for i in range(1,state_num+1):
        for j in range(i,state_num+1):
            
            shift_prob[i][j]=shift_prob[i][j]/num_nodes_in_state[i]
            if shift_prob[i][j]==0:
                shift_prob[i][j]=np.inf
            else:
                shift_prob[i][j]=-np.log(shift_prob[i][j])
    return np.array(shift_prob)
        

In [9]:
def Kmeans2(nodes_for_Kmeans,num_Gaussian_distribution):
        #initialize with mean, var and weight, with one cluster
        num_templates=len(nodes_for_Kmeans)
        means=[]
        covs=[]
        weights=[1]
        mean=np.mean(nodes_for_Kmeans,axis=0)
        cov=np.diagonal(np.cov(np.array(nodes_for_Kmeans).T),offset=0, axis1=0, axis2=1)
        means.append(mean)
        covs.append(cov)
        
        current_num_of_cluster=1
        episolom=0.04
        #initial should be 1 mean
        mix = GMMInfo()
        mix.var = np.array(covs)
        mix.mean = np.array(means)
        mix.num = current_num_of_cluster
        mix.weight = np.array(weights)
        stop=False
        
        while num_Gaussian_distribution>current_num_of_cluster and not stop:
            #now split
            new_means=[]
            new_covs=[]
            current_num_of_cluster=current_num_of_cluster*2
            new_clusters=[]
            for cluster in range(len(means)):
                #append newly two cluster center
                new_clusters.append([])
                new_clusters.append([])
                #get splitted mean and cov
                new_mean1=means[cluster]*(1-episolom)
                new_mean2=means[cluster]*(1+episolom)
                new_cov1=covs[cluster]*(1-episolom)
                new_cov2=covs[cluster]*(1+episolom)
                new_means.append(new_mean1)
                new_means.append(new_mean2)
                new_covs.append(new_cov1)
                new_covs.append(new_cov2)
            #now assign the templated into new clusters
            new_means=np.array(new_means)
            new_covs=np.array(new_covs)
            for node in nodes_for_Kmeans:
                d=log_Gaussian(new_means,new_covs,node)
                cluster=np.argmin(d)
                new_clusters[cluster].append(node)
            #now, according to the new clustered result, we get updated weight,
            #mean and cov
            means=[]
            covs=[]
            weights=[]
            #
            # print("For {} clusters, each cluster has following nodes".format(current_num_of_cluster))
            for cluster in new_clusters:
                # print(len(cluster))
                if len(cluster)<2*num_Gaussian_distribution:
                    stop=True
                    # print("For this state, we only have 2 Gaussian Distributions")
                mean=np.mean(cluster,axis=0)
                cov=np.cov(np.array(cluster).T)
                # print(np.shape(cov))
                cov=np.diagonal(cov,offset=0, axis1=0, axis2=1)
                weight=len(cluster)/num_templates
                means.append(mean)
                covs.append(cov)
                weights.append(weight)
            #print(np.sum(weights))
            # print("get {} means".format(current_num_of_cluster))
            # now, we put all the information to mix
            mix = GMMInfo()
            mix.var = np.array(covs)
            mix.mean = np.array(means)
            mix.num = current_num_of_cluster
            mix.weight = np.array(weights)
        return mix

In [10]:
def initialize_HMM(templates, state_num, Gaussian_num):
    hmm=HMMInfo()
    hmm.init=np.zeros((state_num,1))
    hmm.init[0]=1
    hmm.num=state_num

    states_info=seperate_templates(templates, state_num)
    node_in_each_state=get_node_in_each_state(templates, state_num, states_info)
    hmm.edge_cost=get_edge_cost(states_info,state_num)
    
    mix_models=[]
    for i in range(state_num):
        node_in_curr_state=node_in_each_state[i+1]
        curr_state_mix_model=Kmeans2(node_in_curr_state, Gaussian_num[i])
        mix_models.append(curr_state_mix_model)
    hmm.mix=mix_models
    
    return hmm,states_info

In [11]:
# train hmm model
def trainhmm(templates,state_num,Gaussian_num):
    #initialize hmm model
    hmm,states_info=initialize_HMM(templates,state_num,Gaussian_num)
    best_dist=-np.inf
    curr_dist=0
    length=0
    # use at most 99 iterations to train the model
    for i in range(1,100):
        for j in range(len(templates)):
            # use dtw to get the score and update alignment of the templates
            dist,states_info[j]=GMMHMM_DTW(hmm,templates[j])
            
            curr_dist+=dist
        hmm.edge_cost=get_edge_cost(states_info,state_num)
        # according to the alignment, classify the vectors of templates into different states
        node_in_each_state=get_node_in_each_state(templates,state_num,states_info)
        GMMs=[]
        for state in range(state_num):
            curr_state_node=node_in_each_state[state+1]
            # kmeans these vectors into 4 clusters to get weight, mean, var
            # print(curr_state_node)
            curr_mixture=Kmeans2(curr_state_node,Gaussian_num[state])
            GMMs.append(curr_mixture)
        hmm.mix=GMMs
        # if it converges, break the iteration
        if abs(best_dist-curr_dist)<0.0015:
            length=len(node_in_each_state[state_num])
            break
        # update the best score
        best_dist=curr_dist
        # print(best_dist)
        curr_dist=0

    # Here we get the number of nodes in the last state and calculate the 
    # probability of going to the non-emitting state from the last state.
    new_edge_cost=np.zeros((state_num+1,state_num+2))
    num_nodes_at_last_state=length
    template_num=len(templates)
    non_emitting_state_prob=template_num/num_nodes_at_last_state
    log_prob=np.log(non_emitting_state_prob)
    # print(hmm.edge_cost,new_edge_cost)
    new_edge_cost[:state_num+1,:state_num+1]=hmm.edge_cost
    new_edge_cost[state_num][state_num+1]=log_prob
    hmm.edge_cost=new_edge_cost
    return hmm

In [12]:
# get MFCC of length 39
def getMFCC(wavename):
    import numpy as np
    import scipy.io.wavfile as wav
    from python_speech_features import mfcc
    fs, audio = wav.read(wavename)
    feature_mfcc = mfcc(audio, samplerate=fs)
    mfcc=[]
    mfcc.append(np.hstack([feature_mfcc[0],feature_mfcc[0],feature_mfcc[0]]))
    for i in range(1,len(feature_mfcc)-1):
        delta=np.zeros(13)
        for j in range(13):
            delta[j]=feature_mfcc[i+1][j]-feature_mfcc[i-1][j]
        mfcc.append(np.hstack([feature_mfcc[i],delta]))
    mfcc.append(np.hstack([feature_mfcc[-1],feature_mfcc[-1],feature_mfcc[-1]]))

    for i in range(1,len(mfcc)-1):
        acc=np.zeros(13)
        for j in range(13):
            acc[j]=mfcc[i+1][13+j]-mfcc[i-1][13+j]
        mfcc[i]=np.hstack([mfcc[i],acc])
    mfccs=np.array(mfcc)
    std=np.std(mfccs)
    var=np.var(mfccs,1)
    for i in range(len(mfccs)):
        for j in range(39):
            mfccs[i][j]=mfccs[i][j]/var[i]
    return mfccs

In [77]:
def GMMHMM(folder,Gaussian_num,words):
    models=[]
    for word in words:
        templates=[]
        
        
        state_num=len(Gaussian_num)
        # 5 templates
        for i in range(1,6):
            mfcc=getMFCC(folder+'/'+str(word)+''+str(i)+'.wav')
            templates.append(mfcc)
        # call trainhmm function to get the hmm model for each digit
        hmm=trainhmm(templates,state_num,Gaussian_num)
        models.append(hmm)
        print('hmm model '+str(word)+' trained')
    return models

In [78]:
folder='digit_record_new'
Gaussian_num=[2,2,2,2]
words=[i for i in range(10)]
gmmhmm_model=GMMHMM(folder,Gaussian_num,words)

hmm model 0 trained
hmm model 1 trained
hmm model 2 trained
hmm model 3 trained
hmm model 4 trained
hmm model 5 trained
hmm model 6 trained
hmm model 7 trained
hmm model 8 trained
hmm model 9 trained


In [15]:
hmm_models={}
for i in range(len(gmmhmm_model)):
    hmm_models[str(i)]=gmmhmm_model[i]

## Accuracy of single digit recognition

In [16]:
def get_templates(foldername,i,j):
    templates=[]
    for num in range(i,j):
        for digit in range(0,10):
            mfcc=getMFCC(foldername+"/"+str(digit)+str(num)+'.wav')
            templates.append(mfcc)
    return templates

In [17]:
def getGMM_HMM_accuracy(gmmhmm_model,data):
    accuracy=0
    for j in range(len(data)):
        current_digit=j%10
        smallest_distance=[np.inf,0]
        for i in range(10):
            distance=GMMHMM_DTW(gmmhmm_model[i],data[j])
            if distance[0]<smallest_distance[0]:
                smallest_distance[0]=distance[0]
                smallest_distance[1]=i
        recoginized_result=smallest_distance[1]
        if current_digit==recoginized_result:
            accuracy+=1
        print('test data: '+str(current_digit)+', recognized digit:',recoginized_result)
    print('accuracy:',accuracy/len(test_data))

In [18]:
foldername='digit_record_test'
test_data=get_templates(foldername,1,6)

In [19]:
getGMM_HMM_accuracy(gmmhmm_model,test_data)

test data: 0, recognized digit: 0
test data: 1, recognized digit: 9
test data: 2, recognized digit: 2
test data: 3, recognized digit: 3
test data: 4, recognized digit: 4
test data: 5, recognized digit: 5
test data: 6, recognized digit: 0
test data: 7, recognized digit: 7
test data: 8, recognized digit: 8
test data: 9, recognized digit: 9
test data: 0, recognized digit: 0
test data: 1, recognized digit: 9
test data: 2, recognized digit: 2
test data: 3, recognized digit: 3
test data: 4, recognized digit: 4
test data: 5, recognized digit: 9
test data: 6, recognized digit: 6
test data: 7, recognized digit: 7
test data: 8, recognized digit: 8
test data: 9, recognized digit: 9
test data: 0, recognized digit: 0
test data: 1, recognized digit: 9
test data: 2, recognized digit: 2
test data: 3, recognized digit: 3
test data: 4, recognized digit: 4
test data: 5, recognized digit: 9
test data: 6, recognized digit: 6
test data: 7, recognized digit: 7
test data: 8, recognized digit: 8
test data: 9, 

# Continuous Speech Recognition

## Node

In [20]:
class Node:
    def __init__(self,val, word):
        self.val=val
        self.next=[]
        self.word=word
        #three states: -1: root, 1: end, 0: others
        self.state=0        

## Lexical Tree

In [21]:
class LexicalTree:
    def __init__(self,models):
        self.getwords(models)
        zeros=np.zeros([39])
        ones=np.array([1 for i in range(39)])
        # set the initial state as a Gaussian mixture model
        initial_GMM=GMMInfo()
        initial_GMM.mean.append(zeros)
        initial_GMM.var.append(ones)
        initial_GMM.mean=np.array(initial_GMM.mean)
        initial_GMM.var=np.array(initial_GMM.var)
        initial_GMM.weight=[1]
        initial_GMM.num=1
        # let the initial state as the root of the lexical tree
        self.root=Node(initial_GMM,'*')
        self.root.state=-1
           
    # get model and transition cost for each digits
    def getwords(self,models):
        self.words=[]
        self.digits=list(models.keys())
        self.edge_cost={}
        for digit in self.digits:
            self.words.append(models[digit])
            self.edge_cost[digit]=models[digit].edge_cost

    def GenerateTree(self):
        for i in range(len(self.words)):
            word=self.words[i]
            digit=self.digits[i]
            # the initial state for ith digit
            previous=Node(word.mix[0],digit)
            # which is one of the next option for the root node
            self.root.next.append(previous)
            # other states for the digit
            for j in range(1,word.num):
                curr=Node(word.mix[j],digit)
                previous.next.append(curr)
                previous=curr
            # for the last state for the ith digit, its next node is root
            previous.next.append(self.root)
            previous.state=1 

In [22]:
lextree=LexicalTree(hmm_models)
lextree.GenerateTree()
root=lextree.root
edge_cost=lextree.edge_cost

## Continuous Speech Recognition

In [205]:
import copy
class ContinuousSpeechRecognition():
    def __init__(self):
        self.lextree=None
    # get the information from lextree    
    def get_tree_info(self,root,edge_cost):
        self.lextree=root
        self.get_nodes(self.lextree)
        self.edge_cost=edge_cost
        self.previous={}
        self.next={}
        self.end_nodes=[]
        for i in range(len(self.nodes)):
            node=self.nodes[i]
            # if the state of the node is 1, it is the end of a word(digit)
            if node.state==1:
                self.end_nodes.append(i)
            self.next[i]=[]
            if len(node.next)>0:
                for next_node in node.next:
                    # get the previous node(s) and next node(s) for each node
                    self.next[i].append(self.nodes.index(next_node))
                    self.previous[self.nodes.index(next_node)]=i

    def get_nodes(self,root):
        self.nodes=[root]
        self.init_nodes=[]
        self.states=[0]
        for node in root.next:
            state=0
            curr_node=node
            # add the first node of each word into init_nodes list
            self.init_nodes.append(node)
            # while the node is not the ending node of a word(digit)
            while curr_node.state!=1:
                state+=1
                self.nodes.append(curr_node)
                self.states.append(state)
                curr_node=curr_node.next[0]
            # add the ending node of a word to the nodes list
            state+=1
            self.nodes.append(curr_node)
            self.states.append(state)

    
    # continuous speech recognition for 4 or 7 digits phone number
    def digit_recognition_47(self,data,loop_cost=-100):
        zeros=np.zeros(np.shape(data)[1])
        data=np.vstack([zeros,data])
        cols=len(data)
        rows=len(self.nodes)
        # cost matrix
        costs=np.full([rows,cols],np.inf)
        # the cost from the initial state '*' to other nodes
        init_cost=copy.deepcopy(costs)
        init_cost[0][0]=0
        
        all_costs=[init_cost]
        for i in range(1,cols):
            for j in range(len(all_costs)):
                curr_costs=all_costs[j]
                for k in range(1,rows):
                    # calculate the log gaussian score of the vector in input data and the node
                    score=mixture_log_Gaussian(self.nodes[k].val,data[i])
                    cost=min(curr_costs[k][i-1]+self.edge_cost[self.nodes[k].word][self.states[k]][self.states[k]], #horizontal transition from itself
                             curr_costs[self.previous[k]][i-1]+self.edge_cost[self.nodes[k].word][self.states[self.previous[k]]][self.states[k]]) # diagonal transition from its previous node
                    # cost for this node = current best path score + edge cost + node cost (log Gaussian score)
                    curr_costs[k][i]=cost+score
                # find the index of the node with the minimum cost
                min_index=np.argmin(curr_costs[:,i])
                min_cost=min(curr_costs[:,i])
                # when the minimum cost is at one of the end nodes
                if min_index in self.end_nodes:
                    #if the jth costs matrix is in the middle of the all costs, we've already get the word(digit)
                    if len(all_costs)-1>j:
                        next_costs=all_costs[j]
                        next_costs[0,i]=min_cost+loop_cost
                    # if the j is the last costs matrix of the all costs
                    # create a new costs matrix for the alignment of a new digit, if the number of words (digits) is less than 7
                    elif len(all_costs)<7:
                        new_costs=copy.deepcopy(costs)
                        new_costs[0,i]=min_cost+loop_cost
                        all_costs.append(new_costs)
        # according to the all_costs matrix to get the words(digits)
        result=self.get_words_47(all_costs,i)
        return result
        
        
    def get_words_47(self,all_costs,i):
        if len(all_costs)>=7:
            # for 7 digits the costs matrix of the last digit is the 7th matrix
            min_cost_7=min(all_costs[6][self.end_nodes,i])
            # for 4 digits the costs matrix of the last digit is the 4th matrix
            min_cost_4=min(all_costs[3][self.end_nodes,i])
            
            # judge whether the data is more likely to be 4 digits or 7 digits by their minimum costs
            if min_cost_7<min_cost_4:
                end_index=6
            else:
                end_index=3
        else:
            end_index=3
            
        result=''
        # from the last word (digit) to the first
        for j in range(end_index,-1,-1):
            curr_word,i=self.get_curr_word(all_costs[j],i)
            result=curr_word+result
        return result

    def get_curr_word(self,curr_costs,i):
        # In the last col of the costs matrix, get the end node with minimum cost
        min_index=np.argmin(curr_costs[self.end_nodes,i])
        curr_node=self.end_nodes[min_index]
        
        while i>0 and curr_node>0:
            min_previous_cost=min(curr_costs[curr_node][i-1],curr_costs[self.previous[curr_node]][i-1])
            # horizontal move
            if min_previous_cost==curr_costs[curr_node][i-1]:
                i-=1
            # diagonal move
            elif min_previous_cost==curr_costs[self.previous[curr_node]][i-1]:
                i-=1
                curr_node=self.previous[curr_node]
                
        return self.nodes[self.end_nodes[min_index]].word,i

    def digit_recognition(self,data,loop_cost=150):
        zeros=np.zeros(np.shape(data)[1])
        data=np.vstack([zeros,data])
        cols=len(data)
        rows=len(self.nodes)
        # cost matrix
        costs=np.full([rows,cols],np.inf)
        costs[0][0]=0

        for i in range(1,cols):
            for j in range(1,rows):
                # calculate the log gaussian score of the vector in input data and the node
                score=mixture_log_Gaussian(self.nodes[j].val,data[i])
                cost=min(costs[j][i-1]+self.edge_cost[self.nodes[j].word][self.states[j]][self.states[j]],#horizontal transition from itself
                      costs[self.previous[j]][i-1]+self.edge_cost[self.nodes[j].word][self.states[self.previous[j]]][self.states[j]]) # diagonal transition from its previous node
                if not cost==np.inf:
                    costs[j][i]=cost+score
            # find the index of the node with the minimum cost
            min_index=np.argmin(costs[:,i])
            min_cost=min(costs[:,i])
            # when the minimum cost is at one of the end nodes
            if min_index in self.end_nodes and min_cost!=np.inf:
                # Now one word (digit) ends and we reach the non emitting state
                # We assume the probability from non-emitting state to the init state of each digit is same.
                # Thus, we only add the score for non-emitting state (the log probability from the end state of a digit to the non-emitting state)
                min_end_node=self.end_nodes.index(min_index)
                if min_end_node==10:
                    word='silence'
                else: 
                    word=str(min_end_node)
                non_emitting_state_score=self.edge_cost[word][-1][-1]
                costs[0,i]=min_cost+loop_cost+non_emitting_state_score
        # according to the costs matrix to get the words(digits)
        result=self.get_words(costs,i)

        return result

    def get_words(self,costs,i):
        result=''
        while i>0:
            curr_word,i=self.get_curr_word(costs,i)
            result=curr_word+result
        return result
        

In [24]:
csr=ContinuousSpeechRecognition()
csr.get_tree_info(root,edge_cost)

## Problem 2 Testing

In [25]:
import os
foldername="test_data/Problem_2/"
folder=os.listdir(foldername)
for filename in folder:
    phone_num=filename[:-4]
    data=getMFCC(foldername+filename)
    result=csr.digit_recognition(data)
    print("The test phone num is {}, the recongnition result is {}".format(phone_num,result))

The test phone num is 123456, the recongnition result is 123456
The test phone num is 2212, the recongnition result is 2212
The test phone num is 37472941, the recongnition result is 37472949
The test phone num is 55555, the recongnition result is 555556
The test phone num is 6890372344, the recongnition result is 6890372344
The test phone num is 72184347924, the recongnition result is 7018434079249
The test phone num is 7343332190377, the recongnition result is 7313332190377
The test phone num is 8212176342, the recongnition result is 82121763429
The test phone num is 826414052002, the recongnition result is 826414052002
The test phone num is 911385, the recongnition result is 011385


## Accuracy

In [26]:
wrong_digit_num=3+1+1+1+1+1
total_digit_num=6+4+5+8+10+11+13+10+12+6
print("Accuracy:", str((1-wrong_digit_num/total_digit_num)*100)+'%')

Accuracy: 90.58823529411765%


# Project 6

## Train an HMM model for 'silence'

In [82]:
folder='silence_record'
Gaussian_num=[2,2]
words=['silence']
silence_gmmhmm=GMMHMM(folder,Gaussian_num,words)

hmm model silence trained


In [92]:
hmm_models['silence']=silence_gmmhmm[0]

In [93]:
hmm_models['0'].num

4

## Continuous recording training

In [182]:
# store the inofrmation of each state
class State:
    def __init__(self,word,index):
        # the word the state belongs to
        self.word=word
        # ith state
        self.index=index
        self.mean=[]
        self.var=[]
        self.weight=[]
        # number of GMM
        self.num=0
        # the edge cost of a state is: [edge cost of self transition, edge cost to next state]
        self.edge_cost=np.array([0,0])

# store the information of each word
class Word:
    def __init__(self,word,state_num):
        self.state_num=state_num
        self.word=word
        # store all the state class in a word
        self.states=[]

# continuous recording training class
class CRT:
    def __init__(self,words,sequences):
        self.words=words
        self.sequences=sequences
        self.models={}
        
    # initialize the model by reconstruct the hmm model trained by single digit recording
    # self.models['word']->Word, Word.states[state]->State
    def init_models(self,hmm_models):
        for word in self.words:
            curr_hmm=hmm_models[word]
            new_word=Word(word,curr_hmm.num)
            for state in range(curr_hmm.num):
                curr_state=curr_hmm.mix[state]
                new_state=State(word,state)
                new_state.mean=curr_state.mean
                new_state.var=curr_state.var
                new_state.weight=curr_state.weight
                new_state.num=curr_state.num
                # In the trained single digit hmm model, the edge cost matrix is put in the hmm class
                # While, here we put the cost vector in each state class
                new_state.edge_cost=curr_hmm.edge_cost[state+1,state+1:state+3]
                new_word.states.append(new_state)
            self.models[word]=new_word
    # get training data (mfcc) of each word, and get concatenated sequence data
    def get_data(self,foldername,i,j):
        self.training_data={}
        self.concatenated_models={}
        for sequence in self.sequences:
            self.training_data[sequence]=[]
            # add mfcc spectrum of each recording to the training_data
            for index in range(i,j):
                spectrum=getMFCC(foldername+'/'+sequence+'_'+str(index)+'.wav')
                self.training_data[sequence].append(spectrum)
            self.concatenated_models[sequence]=[]
            words=['silence']
            # concatenate the GMMs in each state of a word as the required format 'silence+...+silence'
            for digit in sequence:
                words.append(digit)
            words.append('silence')
            for word in words:
                curr_word_model=self.models[word]
                for state in curr_word_model.states:
                    self.concatenated_models[sequence].append(state)
    
    # initialize nodes_in_states and end_of_states for training
    def init_training(self):
        self.nodes_in_state={}
        self.endnodes_in_state={}
        for word in self.words:
            self.nodes_in_state[word]={}
            self.endnodes_in_state[word]={}
            curr_word_model=self.models[word]
            for state in range(curr_word_model.state_num):
                self.nodes_in_state[word][state]=[]
                self.endnodes_in_state[word][state]=0
    
    # training the hmm model
    def train_model(self):
        previous_cost=-np.inf
        for i in range(99):
            curr_cost=0
            self.init_training()
            for sequence in self.sequences:
                # get trianing data
                curr_training_data=self.training_data[sequence]
                # get the concatenated model for the trianing data
                curr_concatenated_model=self.concatenated_models[sequence]
                for j in range(len(curr_training_data)):
                    data=curr_training_data[j]
                    cost=self.DTW(curr_concatenated_model,data)
                    curr_cost+=cost
            for word in self.words:
                self.update_model(word)
            if abs(previous_cost-curr_cost)<3:
                break
            previous_cost=curr_cost
    
    def update_model(self,word):
        word_model=self.models[word]
        for state in range(word_model.state_num):
            state_model=word_model.states[state]
            nodes=self.nodes_in_state[word][state]
            gmm_model=self.Kmeans(nodes,state_model.num)
            state_model.mean=gmm_model.mean
            state_model.var=gmm_model.var
            state_model.weight=gmm_model.weight
            state_model.num=gmm_model.num
            # the probability of self transition
            prob=(len(nodes)-self.endnodes_in_state[word][state])/len(nodes)
            state_model.edge_cost[0]=-np.log(prob)
            state_model.edge_cost[1]=-np.log((1-prob))
            print(word,state,state_model.edge_cost)
    
    # 没改变量名
    def Kmeans(self,nodes_for_Kmeans,num_Gaussian_distribution):
        #initialize with mean, var and weight, with one cluster
        num_templates=len(nodes_for_Kmeans)
        means=[]
        covs=[]
        weights=[1]
        mean=np.mean(nodes_for_Kmeans,axis=0)
        cov=np.diagonal(np.cov(np.array(nodes_for_Kmeans).T),offset=0, axis1=0, axis2=1)
        means.append(mean)
        covs.append(cov)
        
        current_num_of_cluster=1
        episolom=0.04
        #initial should be 1 mean
        mix = GMMInfo()
        mix.var = np.array(covs)
        mix.mean = np.array(means)
        mix.num = current_num_of_cluster
        mix.weight = np.array(weights)
        stop=False
        
        while num_Gaussian_distribution>current_num_of_cluster and not stop:
            #now split
            new_means=[]
            new_covs=[]
            current_num_of_cluster=current_num_of_cluster*2
            new_clusters=[]
            for cluster in range(len(means)):
                #append newly two cluster center
                new_clusters.append([])
                new_clusters.append([])
                #get splitted mean and cov
                new_mean1=means[cluster]*(1-episolom)
                new_mean2=means[cluster]*(1+episolom)
                new_cov1=covs[cluster]*(1-episolom)
                new_cov2=covs[cluster]*(1+episolom)
                new_means.append(new_mean1)
                new_means.append(new_mean2)
                new_covs.append(new_cov1)
                new_covs.append(new_cov2)
            #now assign the templated into new clusters
            new_means=np.array(new_means)
            new_covs=np.array(new_covs)
            for node in nodes_for_Kmeans:
                d=log_Gaussian(new_means,new_covs,node)
                cluster=np.argmin(d)
                new_clusters[cluster].append(node)
            #now, according to the new clustered result, we get updated weight,
            #mean and cov
            means=[]
            covs=[]
            weights=[]
            #
            # print("For {} clusters, each cluster has following nodes".format(current_num_of_cluster))
            for cluster in new_clusters:
                # print(len(cluster))
                if len(cluster)<2*num_Gaussian_distribution:
                    stop=True
                    # print("For this state, we only have 2 Gaussian Distributions")
                mean=np.mean(cluster,axis=0)
                cov=np.cov(np.array(cluster).T)
                # print(np.shape(cov))
                cov=np.diagonal(cov,offset=0, axis1=0, axis2=1)
                weight=len(cluster)/num_templates
                means.append(mean)
                covs.append(cov)
                weights.append(weight)
            #print(np.sum(weights))
            # print("get {} means".format(current_num_of_cluster))
            # now, we put all the information to mix
            mix = GMMInfo()
            mix.var = np.array(covs)
            mix.mean = np.array(means)
            mix.num = current_num_of_cluster
            mix.weight = np.array(weights)
        return mix

    def DTW(self,curr_concatenated_model,data):
        zeros=np.zeros(39)
        data=np.vstack([zeros,data])
        col=len(data)
        row=len(curr_concatenated_model)
        costs=np.full((row,col),np.inf)
        costs[0][0]=0
        for i in range(1,col):
            for j in range(row):
                cost=mixture_log_Gaussian(curr_concatenated_model[j],data[i])
                score=min(costs[j][i-1]+curr_concatenated_model[j].edge_cost[0],
                       costs[j-1][i-1]+curr_concatenated_model[j-1].edge_cost[1])
                if not score==np.inf:
                    costs[j][i]=score+cost
        self.get_result(costs,i,j,curr_concatenated_model,data)
        min_cost=costs[-1][-1]/len(data)
        return min_cost

    def get_result(self,costs,i,j,curr_concatenated_model,data):
        previous_state=curr_concatenated_model[-1].index
        while i>0 and j>0:
            previous_costs=[costs[j][i-1],costs[j-1][i-1]]
            idx=np.argmin(previous_costs)
            if idx==0:
                i=i-1
            elif idx==1:
                i=i-1
                j=j-1
            node=data[i]
            word=curr_concatenated_model[j].word
            state=curr_concatenated_model[j].index
            self.nodes_in_state[word][state].append(node)
            if previous_state!=state:
                self.endnodes_in_state[word][state]+=1
                previous_state=state
        
            

    
        
    

In [183]:
# create a list of all 10 words
words=[str(i) for i in range(10)]
words.append('silence')
sequences=['0123456789','9876543210','1234567890','0987654321','1357902468','8642097531']
foldername='project_6_training_data'

crt=CRT(words,sequences)
crt.init_models(hmm_models)
crt.get_data(foldername,0,5)

    

In [159]:
print(hmm_models['0'].edge_cost)

[[        inf -0.                 inf         inf         inf  0.        ]
 [ 0.          0.02915658  3.54961739         inf         inf  0.        ]
 [ 0.          0.          0.05026183  3.0155349          inf  0.        ]
 [ 0.          0.          0.          0.07878088  2.58021683  0.        ]
 [ 0.          0.          0.          0.         -0.         -3.42100001]]


In [184]:
crt.models['silence'].states[0].edge_cost

array([0.02469261, 3.37709983])

In [185]:
crt.train_model()

0 0 [0.02700434 3.62522943]
0 1 [0.05009395 3.01879755]
0 2 [0.06947237 2.70136121]
0 3 [0.05536469 2.92136786]
1 0 [0.04380262 3.14988295]
1 1 [0.06524052 2.76211742]
1 2 [0.04710447 3.07884718]
1 3 [0.31845373 1.29928298]
2 0 [0.05368015 2.95143201]
2 1 [0.11419237 2.22642373]
2 2 [0.05526268 2.92316158]
2 3 [0.05849621 2.8678989 ]
3 0 [0.03866461 3.27210059]
3 1 [0.14041718 2.03252462]
3 2 [0.04831858 3.05400118]
3 3 [0.12675171 2.12823171]
4 0 [0.06385147 2.78295151]
4 1 [0.10801858 2.27897523]
4 2 [0.11506933 2.21920348]
4 3 [0.0547582  2.93208225]
5 0 [0.06467771 2.77050322]
5 1 [0.09500811 2.40092099]
5 2 [0.06252036 2.80336038]
5 3 [0.08726464 2.48212501]
6 0 [0.05556985 2.91777073]
6 1 [0.0447836  3.12822146]
6 2 [0.0631789  2.79320801]
6 3 [1.55814462 0.23638878]
7 0 [0.13176928 2.09186406]
7 1 [0.10283024 2.32565037]
7 2 [0.02620237 3.6549779 ]
7 3 [0.16251893 1.89711998]
8 0 [0.0338251  3.40341714]
8 1 [0.05495888 2.92852352]
8 2 [0.12568822 2.13613689]
8 3 [0.06087072 2.82

In [186]:
crt.models['0'].states[3].edge_cost

array([0.06669137, 2.74084002])

In [187]:
new_hmm_models={}
for word in words:
    curr_word_model=crt.models[word]
    hmm=HMMInfo()
    hmm.num=curr_word_model.state_num
    hmm.mix=curr_word_model.states
    hmm.edge_cost=np.zeros([hmm.num+1,hmm.num+2])
    for state in range(hmm.num):
        curr_state_model=hmm.mix[state]
        hmm.edge_cost[state+1,state+1:state+3]=curr_state_model.edge_cost
    new_hmm_models[word]=hmm

## Testing

In [206]:
lextree=LexicalTree(new_hmm_models)
lextree.GenerateTree()
root=lextree.root
edge_cost=lextree.edge_cost

In [207]:
csr=ContinuousSpeechRecognition()
csr.get_tree_info(root,edge_cost)

In [208]:
import os
foldername="project_6_training_data/"
folder=os.listdir(foldername)
for filename in folder:
    phone_num=filename[:-4]
    data=getMFCC(foldername+filename)
    result=csr.digit_recognition(data)
    print("The test phone num is {}, the recongnition result is {}".format(phone_num,result))

The test phone num is 0123456789_0, the recongnition result is 10123456789
The test phone num is 0123456789_1, the recongnition result is 10123456789silence
The test phone num is 0123456789_2, the recongnition result is 10123456789silence
The test phone num is 0123456789_3, the recongnition result is 10123456789silence
The test phone num is 0123456789_4, the recongnition result is 10123456781silence
The test phone num is 0987654321_0, the recongnition result is 10987654321silence
The test phone num is 0987654321_1, the recongnition result is 10987654321silence
The test phone num is 0987654321_2, the recongnition result is silence098silence7654321
The test phone num is 0987654321_3, the recongnition result is 1987654silence321silence
The test phone num is 0987654321_4, the recongnition result is 1098silence7654321silence
The test phone num is 1234567890_0, the recongnition result is 1234567890silence
The test phone num is 1234567890_1, the recongnition result is 1234567890silence
The te