The goal is to design an hidden markov second order model to correct typos in texts without a dictionary.

Participants: TARGHI Amal et MOHAMED Mohamed


targhiamal@gmail.com , mohmed.abdelkhaleq@gmail.com

Question2: 
Second Order HMM: To improve the performance, we can increase the order of the HMM. Implement a second Order model for this task (this means that the probability of the next state depends on the current state and the previous one as well). A convenient way to implement a second order HMM, is to think about it as a variable
change. 

In [156]:
import nltk
from numpy import array, ones, zeros, multiply
import numpy as np
import sys
import pprint, pickle

UNK = "<unk>"  # token to map all out-of-vocabulary words (OOVs)
UNKid = 0      # index for UNK



In [157]:
#Data

train10 =   pickle.load(open('/home/amal/TC4-Second-Order-HMM-for-typos-correction-/typos-data /train10.pkl', 'rb'))

train20 =   pickle.load(open('/home/amal/TC4-Second-Order-HMM-for-typos-correction-/typos-data /train20.pkl', 'rb'))

test10 =  pickle.load(open('/home/amal/TC4-Second-Order-HMM-for-typos-correction-/typos-data /test10.pkl', 'rb'))

test20 =  pickle.load(open('/home/amal/TC4-Second-Order-HMM-for-typos-correction-/typos-data /test20.pkl', 'rb'))

In [158]:
class HMM2:
        def __init__(self, state_list, observation_list,transition_proba = None,transition_head_proba=None,observation_proba = None,initial_state_proba = None, smoothing_obs = 0.01):
         
            """
            Builds a Hidden Markov Model odrer 2
            * state_list is the list of state symbols [q_0...q_(N-1)]
            * observation_list is the list of observation symbols [v_0...v_(M-1)]
            * transition_proba is the transition probability matrix 
            [a_kji] = Pr(Y_t = q_k | Y_t-1 = q_j , Y_t-2 = q_i)
            * transition_head_proba is transition probability matrix(transition between the first and the second state)
            [a_kj] = Pr(Y_t = q_k | Y_t-1 = q_j)
            * observation_proba is the observation probablility matrix
                [b_ki] b_ki = Pr(X_t=v_k|Y_t=q_i)
            * initial_state_proba is the initial state distribution
                [pi_i] pi_i = Pr(Y_0=q_i)"""
            
            print "HMM creating with: "
            
            self.N = len(state_list)       # number of states
            self.M = len(observation_list) # number of possible emissions
            print str(self.N)+" states"
            print str(self.M)+" observations"
            self.omega_Y = state_list
            self.omega_X = observation_list
            
            if transition_proba is None:
                self.transition_proba = zeros((self.N, self.N**2), float) 
            else:
                self.transition_proba=transition_proba
                
            if observation_proba is None:
                self.observation_proba = zeros( (self.M, self.N), float) 
            else:
                self.observation_proba=observation_proba
                
                
            if initial_state_proba is None:
                self.initial_state_proba = zeros( (self.N,), float ) 
            else:
                self.initial_state_proba=initial_state_proba
                
            if transition_head_proba is None:
                self.transition_head_proba = zeros((self.N, self.N), float)
            else:
                self.transition_head_proba = transition_head_proba
                
            self.make_indexes() # build indexes, i.e the mapping between token and int
            self.smoothing_obs = smoothing_obs 
 
        def make_indexes(self):
            """Creates the reverse table that maps states/observations names
            to their index in the probabilities array"""
            self.Y_index = {}
            for i in range(self.N):
                self.Y_index[self.omega_Y[i]] = i
            self.X_index = {}
            for i in range(self.M):
                self.X_index[self.omega_X[i]] = i
            # the indice of each word in state and observation for facilite the     
            print "States:\n", self.Y_index
            print "Observations:\n", self.X_index
        #cobs,cstat,cpairs,ctrans,cinits
        #def get_observationIndices( self, cobs ):

        def get_observationIndices( self, observations ):
            """return observation indices, i.e 
            return [self.O_index[o] for o in observations]
            and deals with OOVs
            """
            indices = zeros( len(observations), int )
            k = 0
            for o in observations:
                if o in self.X_index:
                    indices[k] = self.X_index[o]
                else:
                    indices[k] = UNKid
                k += 1
            return indices
        
        """From one each word
        - extract the letters and correction of rache one
        - (letterObservation,LetterState)
        """
        def data2indices(self, word): 
    
            letterObservation = list()
            letterState  = list()
            for letter in word:
                observation = letter[0]
                state = letter[1]
                if observation in self.X_index:
                    letterObservation.append(self.X_index[observation])
                else:
                    letterObservation.append(UNKid)
                letterState.append(self.Y_index[state])
            return letterObservation,letterState
        
        def observation_estimation(self, cpairs):
            """ Build the observation distribution: 
                observation_proba is the observation probablility matrix
                [b_ki],  b_ki = Pr(X_t=v_k|Y_t=q_i)"""
         
            
            # fill with counts
            for pair in cpairs:
                
                observation=pair[0]
                state=pair[1]
                cpt=cpairs[pair]
                k = 0 # for <unk>
                if observation in self.X_index: 
                    k=self.X_index[observation]
                j=self.Y_index[state]
                self.observation_proba[k,j]=cpt
            # normalize
            self.observation_proba=self.observation_proba+self.smoothing_obs
            self.observation_proba=self.observation_proba/self.observation_proba.sum(axis=0).reshape(1,self.N)
        
        def transition_estimation(self, ctrans):
            """ Build the transition distribution: 
                transition_proba is the transition matrix with : 
                k is current state
            """
            # fill with counts
            for pair in ctrans:
                i = self.Y_index[pair[0][0]]
                j = self.Y_index[pair[0][1]]
                k = self.Y_index[pair[1]]
                self.transition_proba[k, (i * self.N + j)] = ctrans[pair]
            # normalize
            self.transition_proba = self.transition_proba + self.smoothing_obs
            self.transition_proba=self.transition_proba/self.transition_proba.sum(axis=0).reshape(1,self.N**2)
     
            
        def init_estimation(self, cinits):
            """Build the init. distribution"""
            # fill with counts
            for state in cinits:
                i=self.Y_index[state]
                self.initial_state_proba[i]=cinits[state]
            # normalize
            self.initial_state_proba=self.initial_state_proba/sum(self.initial_state_proba)
            
        def transition_head_estimation(self, trans_heads_counts):
       
        # fill with counts
            for pair in trans_heads_counts:
                j = self.Y_index[pair[0]]
                k = self.Y_index[pair[1]]
                self.transition_head_proba[k, j] = trans_heads_counts[pair]
        # normalize
            self.transition_head_proba = self.transition_head_proba + self.smoothing_obs
            self.transition_head_proba = self.transition_head_proba / self.transition_head_proba.sum(axis=0).reshape(1, self.N)

        
        def supervised_training(self, cpairs, ctrans,ctrans_head,cinits):
            """ Train the HMM's parameters. This function wraps everything"""
            self.observation_estimation(cpairs)
            self.transition_estimation(ctrans)
            self.transition_head_estimation(ctrans_head)
            self.init_estimation(cinits)
           
            # Viterbi Algortihm
            # Source: http://stackoverflow.com/questions/34219766/need-help-implementing-viterbi-for-a-second-order-hidden-markov-model-with-pytho
            
        def viterbi( self, obs ):
            """Find the most probable state sequence
            """

            # shortcuts
            B = self.observation_proba 
            A = self.transition_proba
            A_head = self.transition_head_proba
            T = len(obsids)
            N = self.N
            # initialisation
            delta = zeros( N, float )
            tmp = zeros(N, float)
            psi = zeros( (T, N), int )
            #updated delta
            delta_t = zeros( N, float )
            # apply initial_state probs to the first frame
            delta = B[obsids[0]] * self.initial_state_proba   
            # recursion
            for t in xrange(1, T):
                O_t = obsids[t]
                if t == 1:
                    # i current state
                    #the second oberservation 
                    #tmp proba of of different diff state when i
                    for i in range(N):
                        for j in range(N):
                            tmp[j] = delta[j] * A_head[i, j]
                        psi[t, i] = tmp.argmax()
                        delta_t[i] = tmp.max() * B[O_t, i]
                #third obs
                else:
                    for i in range (N):
                        for j in range(N):
                            tmp[j] = delta[j] * A[i, psi[t-1, j] * N + j]
                        psi[t, i] = tmp.argmax()
                        delta_t[i] = tmp.max() * B[O_t, i]
                delta, delta_t = delta_t, delta
                
            # reconstruction
            i_star = [delta.argmax()]                        
            for psi_t in psi[-1:0:-1]:
                i_star.append( psi_t[i_star[-1]] )                 
            i_star.reverse()
            return i_star

# Compter les lettres et les tags

In [159]:
def make_counts(corpus):
    """
    Build different count tables to train a HMM_2. Each count table is a dictionnary.
    Returns:
    * c_letterObservation: No correct Letter Count (Observation Letter)
    * c_letterState:  Correct Letters  ( State Letters)
    * c_pairs: count of pairs 
    * c_transitions: count of tag 3-gram
    * c_inits: count of 2-gram found in the first and second position
    """
    c_letterObservation = dict()
    c_letterState = dict()
    c_pairs = dict()
    c_transitions = dict()
    ctrans_head = dict()
    c_inits = dict()
    for word in corpus:
        # we use i because of the transition counts
        for i in range(len(word)):
            couple = word[i]
            letter = couple[0]
            tag = couple[1]
            # word counts
            if letter in c_letterObservation:
                c_letterObservation[letter] = c_letterObservation[letter] + 1
            else:
                c_letterObservation[letter] = 1
            # tag counts
            if tag in c_letterState:
                c_letterState[tag] = c_letterState[tag] + 1
            else:
                c_letterState[tag] = 1
            if i >= 2:
                # transition counts, z is combination of two previous states
                z = (word[i - 2][1], word[i - 1][1])
                trans = (z, tag)
                if trans in c_transitions:
                    c_transitions[trans] = c_transitions[trans] + 1
                else:
                    c_transitions[trans] = 1
            if i == 1:
                z = (word[i-1][1], tag)
                if z in ctrans_head:
                    ctrans_head[z] = ctrans_head[z] + 1
                else:
                    ctrans_head[z] = 1
            if i == 0:
                # init counts, i == 1 -> counts for initial states
                z = tag
                if z in c_inits:
                    c_inits[z] = c_inits[z] + 1
                else:
                    c_inits[z] = 1
            # observation counts
            o = (letter, tag)
            if o in c_pairs:
                c_pairs[o] = c_pairs[o] + 1
            else:
                c_pairs[o] = 1

    return c_letterObservation, c_letterState, c_pairs, c_transitions, ctrans_head, c_inits

# Creation of vocabulary according to the number of occurence for each letter.

In [160]:
def make_vocab(c_letterObservation, threshold):
    """ 
    return a vocabulary by thresholding word counts. 
    inputs: 
    * c_words : a dictionnary that maps word to its counts
    * threshold: count must be >= to the threshold to be included
    
    returns: 
    * a word list
    """
    voc = list()
    voc.append(UNK)
    for letter in c_letterObservation:
        if c_letterObservation[letter] >= threshold:
            voc.append(letter)
    return voc

In [161]:
clettero,cletterS,cpairs,ctrans,ctrans_head, cinits = make_counts(train10)
#i will test for train20 and train 10 (both  of them)
print "Nombre de phrases de train = "+str(len(train10))
print "Nombre de phrases de test  = "+str(len(test10))
print "Nombre de mots  : "+str(len(clettero))
print "Nombre de tags  : "+str(len(cletterS))
print "Nombre de paires: "+str(len(cpairs))
print "Nombre de trans : "+str(len(ctrans))+ " / "+ str(len(cletterS)*len(cletterS))
print "Nombre de init. : "+str(len(cinits))
vocab = make_vocab(clettero,10)
print "Vocabulaire :"+str(len(vocab))

Nombre de phrases de train = 29057
Nombre de phrases de test  = 1501
Nombre de mots  : 26
Nombre de tags  : 26
Nombre de paires: 127
Nombre de trans : 2489 / 676
Nombre de init. : 25
Vocabulaire :27


# Creation du HMM1

In [162]:
hmm2 = HMM2(state_list=cletterS.keys(), observation_list=vocab,
                 transition_proba = None,
                 transition_head_proba=None,
                 observation_proba = None,
                 initial_state_proba = None)

hmm2.supervised_training(cpairs, ctrans, ctrans_head , cinits)





HMM creating with: 
26 states
27 observations
States:
{'a': 0, 'c': 1, 'b': 2, 'e': 3, 'd': 4, 'g': 5, 'f': 6, 'i': 7, 'h': 8, 'k': 9, 'j': 10, 'm': 11, 'l': 12, 'o': 13, 'n': 14, 'q': 15, 'p': 16, 's': 17, 'r': 18, 'u': 19, 't': 20, 'w': 21, 'v': 22, 'y': 23, 'x': 24, 'z': 25}
Observations:
{'a': 1, 'c': 2, 'b': 3, 'e': 4, 'd': 5, 'g': 6, 'f': 7, 'i': 8, 'h': 9, 'k': 10, 'j': 11, 'm': 12, 'l': 13, 'o': 14, 'n': 15, 'q': 16, 'p': 17, 's': 18, 'r': 19, 'u': 20, 't': 21, 'w': 22, 'v': 23, 'y': 24, 'x': 25, 'z': 26, '<unk>': 0}


# Propabibilities

In [163]:
hmm2.observation_estimation(cpairs)
hmm2.transition_estimation(ctrans)
hmm2.init_estimation(cinits)
#print hmm.observation_proba.sum(axis=1)
 

# TEST

In [164]:
#test 10 
correct=0
total=0

for word in test10:
    obsids,statids = hmm2.data2indices(word)
    best_sequence = hmm2.viterbi(obsids)
    correct+=sum(np.array(best_sequence)==np.array(statids))
    total+=len(statids)
    
    
    #print statids
    #print obsids
    #print best_sequence
    
print('corrected ones are ' +str(correct))
print('total is ' +str(total))

print "Accuarcy is "+str(correct*100.0/total ) + " %"
print "Error is "+str(100 - correct*100.0/total ) + " %"



corrected ones are 6983
total is 7320
Accuarcy is 95.3961748634 %
Error is 4.60382513661 %


In [165]:
# results for  20 first data  ( to be more clear thaaan because the data est volumineuse)

    
#obsids is the word (observation) (false)
#statids is real word
#best_sequence is the correction of the word 
# there are just the indices of each word
# we can find the word by checking the result of creation of hmm
# for example for the first case
#[19, 4, 4, 13, 11, 9, 6, 18] = reeljhgs false word
#[6, 3, 3, 12, 7, 14, 5, 17] =  feelings  true word
#[18, 3, 3, 12, 7, 14, 5, 17] =  reelings after correction
#the hmm can't correct the word on this case

#[8, 15, 6, 19, 26, 8, 15, 4, 5] =  false word ingrzined
#[7, 14, 5, 18, 0, 7, 14, 3, 4] =  ture word ingrained 
#[7, 14, 5, 18, 0, 7, 14, 3, 4] = corrected word ingrained


from itertools import islice
head = list(islice(test10, 20))
#print head

for word in head:
    obsids,statids = hmm2.data2indices(word)
    best_sequence = hmm2.viterbi(obsids)
    correct+=sum(np.array(best_sequence)==np.array(statids))
    total+=len(word)
    
    print obsids
    print statids
    print best_sequence



print "Accuarcy is "+str(correct*100.0/total ) + " %"
print "Error is "+str(100 - correct*100.0/total ) + " %"


[21, 9, 4]
[20, 8, 3]
[20, 8, 3]
[13, 4, 7, 21, 8, 18, 21]
[12, 3, 6, 20, 7, 17, 20]
[12, 3, 6, 20, 7, 17, 20]
[8, 18]
[7, 17]
[7, 17]
[21, 14, 14]
[20, 13, 13]
[20, 13, 13]
[7, 1, 19]
[6, 0, 18]
[6, 0, 18]
[6, 14, 15, 4]
[5, 13, 14, 3]
[5, 13, 14, 3]
[7, 14, 19]
[6, 13, 18]
[6, 13, 18]
[21, 9, 1, 21]
[20, 8, 0, 20]
[20, 8, 0, 20]
[9, 8, 18]
[8, 7, 17]
[8, 7, 17]
[19, 4, 4, 13, 11, 9, 6, 18]
[6, 3, 3, 12, 7, 14, 5, 17]
[18, 3, 3, 12, 7, 14, 5, 17]
[14, 7]
[13, 6]
[13, 6]
[8, 15, 7, 4, 19, 8, 10, 19, 8, 6, 24]
[7, 14, 6, 3, 18, 7, 13, 18, 7, 20, 23]
[7, 14, 6, 3, 18, 7, 13, 18, 7, 20, 23]
[1, 19, 4]
[0, 18, 3]
[0, 18, 3]
[18, 13]
[17, 13]
[17, 13]
[8, 15, 6, 19, 26, 8, 15, 4, 5]
[7, 14, 5, 18, 0, 7, 14, 3, 4]
[7, 14, 5, 18, 0, 7, 14, 3, 4]
[21, 9, 1, 21]
[20, 8, 0, 20]
[20, 8, 0, 20]
[9, 4]
[8, 3]
[8, 3]
[2, 1, 15, 15, 14, 21]
[1, 0, 14, 14, 13, 20]
[1, 0, 14, 14, 13, 20]
[2, 14, 15, 2, 4, 8, 23, 4]
[1, 13, 14, 1, 3, 7, 22, 3]
[1, 13, 14, 1, 3, 7, 22, 3]
[14, 7]
[13, 6]
[13, 6]
Accuarcy

In [166]:
#test 20 
correct=0
total=0

for word in test20:
    obsids,statids = hmm2.data2indices(word)
    best_sequence = hmm2.viterbi(obsids)
    correct+=sum(np.array(best_sequence)==np.array(statids))
    total+=len(statids)
    
    

    
print('correct ones are ' +str(correct))
print('total is ' +str(total))

print "Accuarcy is "+str(correct*100.0/total ) + " %"
print "Error is "+str(100 - correct*100.0/total ) + " %"



correct ones are 15216
total is 16691
Accuarcy is 91.1629021628 %
Error is 8.83709783716 %
