In [32]:
# load pkl file using pickle 
import pprint, pickle

#training set
pkl_file = open('/home/amal/TC4-Second-Order-HMM-for-typos-correction-/typos-data /train10.pkl', 'rb')
data_1 = pickle.load(pkl_file)
#print(data_1)

# Data_set
pkl_file = open('/home/amal/TC4-Second-Order-HMM-for-typos-correction-/typos-data /test10.pkl', 'rb')
data_2 = pickle.load(pkl_file)

In [41]:
import nltk
from numpy import array, ones, zeros, multiply
import numpy as np
import sys
epsilon=1e-100

class HMM:
        def __init__(self, state_list, observation_list,
                 transition_proba = None,
                 observation_proba = None,
                 initial_state_proba = None, smoothing_obs = 0.01):
            """
            Builds a Hidden Markov Model
            * state_list is the list of state symbols [q_0...q_(N-1)]
            * observation_list is the list of observation symbols [v_0...v_(M-1)]
            * transition_proba is the transition probability matrix
                [a_ij] a_ij = Pr(Y_(t+1)=q_i|Y_t=q_j)
            * observation_proba is the observation probablility matrix
                [b_ki] b_ki = Pr(X_t=v_k|Y_t=q_i)
            * initial_state_proba is the initial state distribution
                [pi_i] pi_i = Pr(Y_0=q_i)"""
            print "HMM creating with: "
            self.N = len(state_list)       # number of states
            self.M = len(observation_list) # number of possible emissions
            print str(self.N)+" states"
            print str(self.M)+" observations"
            self.omega_Y = state_list
            self.omega_X = observation_list
            if transition_proba is None:
                self.transition_proba = zeros( (self.N, self.N), float) 
            else:
                self.transition_proba=transition_proba
            if observation_proba is None:
                self.observation_proba = zeros( (self.M, self.N), float) 
            else:
                self.observation_proba=observation_proba
            if initial_state_proba is None:
                self.initial_state_proba = zeros( (self.N,), float ) 
            else:
                self.initial_state_proba=initial_state_proba
            self.make_indexes() # build indexes, i.e the mapping between token and int
            self.smoothing_obs = smoothing_obs 
            
        def make_indexes(self):
            """Creates the reverse table that maps states/observations names
            to their index in the probabilities array"""
            self.Y_index = {}
            for i in range(self.N):
                self.Y_index[self.omega_Y[i]] = i
            self.X_index = {}
            for i in range(self.M):
                self.X_index[self.omega_X[i]] = i
        #cobs,cstat,cpairs,ctrans,cinits
        #def get_observationIndices( self, cobs ):
        
        def get_observationIndices( self, observations ):
            """return observation indices, i.e 
            return [self.O_index[o] for o in observations]
            and deals with OOVs
            """
            indices = zeros( len(observations), int )
            k = 0
            for o in observations:
                if o in self.X_index:
                    indices[k] = self.X_index[o]
                else:
                    indices[k] = UNKid
                k += 1
            return indices
           
        def data2indices(self, word): 
            """From one corrected letter: 
            - extract the lettres and the corrected ones
            - returns two list of indices, one for each
            -> (letterObservation, letterState)
            """
            obsids = list()
            statids  = list()
            for lettre in word:
                obs = lettre[0]
                stat = lettre[1]
                if obs in self.X_index:
                    obsids.append(self.X_index[obs])
                statids.append(self.Y_index[stat])
            return obsids,statids
            
        def observation_estimation(self, cpairs):
            """ Build the observation distribution: 
                observation_proba is the observation probablility matrix
                    [b_ki],  b_ki = Pr(X_t=v_k|Y_t=q_i)"""
            # fill with counts
            for pair in cpairs: 
                obs=pair[0]
                stat=pair[1]
                cpt=cpairs[pair]
                k = 0 # for <unk>
                if obs in self.X_index: 
                    k=self.X_index[obs]
                i=self.Y_index[stat]
                self.observation_proba[k,i]=cpt
            # normalize
            self.observation_proba=self.observation_proba+self.smoothing_obs
            self.observation_proba=self.observation_proba/self.observation_proba.sum(axis=0).reshape(1,self.N)
            
        
        def transition_estimation(self, ctrans):
            """ Build the transition distribution: 
                transition_proba is the transition matrix with : 
                [a_ij] a[i,j] = Pr(Y_(t+1)=q_i|Y_t=q_j)
            """
            # fill with counts
            for pair in ctrans:
                i=self.Y_index[pair[0]]
                j=self.Y_index[pair[1]]
                self.transition_proba[i,j]=ctrans[pair]
            # normalize
            self.transition_proba=self.transition_proba/self.transition_proba.sum(axis=0).reshape(1,self.N)
        
        def init_estimation(self, cinits):
            """Build the init. distribution"""
            # fill with counts
            for stat in cinits:
                i=self.Y_index[stat]
                self.initial_state_proba[i]=cinits[stat]
            # normalize
            self.initial_state_proba=self.initial_state_proba/sum(self.initial_state_proba)
             
        
        def supervised_training(self, cpairs, ctrans,cinits):
            """ Train the HMM's parameters. This function wraps everything"""
            self.observation_estimation(cpairs)
            self.transition_estimation(ctrans)
            self.init_estimation(cinits)
        
        def viterbi( self, obsids ):
            """Viterbi algorithm: 
            Find the states corresponding to the observations. 
            The observations must be converted in a list of indices. 
            """
            # shortcuts
            B = self.observation_proba 
            A = self.transition_proba
            T = len(obsids)
            N = self.N
            # initialisation
            delta = zeros( N, float )
            tmp = zeros( N, float )
            psi = zeros( (T, N), int )      
            delta_t = zeros( N, float )
            # apply initial_state probs to the first frame
            delta = B[obsids[0]] * self.initial_state_proba   
            # recursion
            for t in xrange(1, T):
                O_t = obsids[t]
                for j in range(N):
                    multiply( delta, A[:, j], tmp )
                    idx = psi[t, j] = tmp.argmax()       
                    delta_t[j] = tmp[idx] * B[O_t, j] 
                delta, delta_t = delta_t, delta
            # reconstruction
            i_star = [delta.argmax()]                        
            for psi_t in psi[-1:0:-1]:
                i_star.append( psi_t[i_star[-1]] )                 
            i_star.reverse()
            return i_star
        def logviterbi( self, obsids ):
            """Viterbi algorithm in logspace
            Find the states corresponding to the observations. 
            The observations must be converted in a list of indices. 
            """
            # shortcuts
            B = np.log(self.observation_proba+epsilon)
            A = np.log(self.transition_proba+epsilon)
            T = len(obsids)
            N = self.N
            # initialisation
            delta = zeros( N, float )
            tmp = zeros( N, float )
            psi = zeros( (T, N), int )      
            delta_t = zeros( N, float )
            # apply initial_state probs to the first frame
            delta = B[obsids[0]] +np.log(self.initial_state_proba+epsilon)
            # recursion
            for t in xrange(1, T):
                O_t = obsids[t]
                for j in range(N):
                    tmp=delta+A[:, j] 
                    idx = psi[t, j] = tmp.argmax()       
                    delta_t[j] = tmp[idx] + B[O_t, j] 
                delta, delta_t = delta_t,delta # switch the vectors, otherwise delta and delta_t will be the same

            # back-tracking
            i_star = [delta.argmax()]                        
            for psi_t in psi[-1:0:-1]:
                i_star.append( psi_t[i_star[-1]] )                 
            i_star.reverse()
            return i_star

In [23]:
#compter les letters observations et états
def make_counts(data_1):
    """ 
    Build different count tables to train a HMM. Each count table is a dictionnary. 
    Returns: 
    * c_obs: lettre non couriger counts x
    * c_stat: tag counts
    * c_pairs: count of pairs (nncorrectedletr,correctedletr)
    * c_transitions: count of tag bigram 
    * c_inits: count of tag found in the first position
    """
    c_obs = dict()
    c_stat = dict()
    c_pairs= dict()
    c_transitions = dict()
    c_inits = dict()
    for word in data_1:
        # we use i because of the transition counts
        for i in range(len(word)):
            couple=word[i]
            obs = couple[0]
            stat = couple[1]
            # word counts
            if obs in c_obs:
                c_obs[obs]=c_obs[obs]+1
            else:
                c_obs[obs]=1
            # tag counts
            if stat in c_stat:
                c_stat[stat]=c_stat[stat]+1
            else:
                c_stat[stat]=1
            # observation counts
            if couple in c_pairs:
                c_pairs[couple]=c_pairs[couple]+1
            else:
                c_pairs[couple]=1
            # i >  0 -> transition counts
            if i > 0:
                trans = (word[i-1][1],stat)
                if trans in c_transitions:
                    c_transitions[trans]=c_transitions[trans]+1
                else:
                    c_transitions[trans]=1
            # i == 0 -> counts for initial states
            else:
                if stat in c_inits:
                    c_inits[stat]=c_inits[stat]+1
                else:
                    c_inits[stat]=1
                    
    return c_obs,c_stat,c_pairs, c_transitions, c_inits


cobs,cstat,cpairs,ctrans,cinits = make_counts(data_1)
#pair=('a','a')
#print(ctrans)
print "Nombre de lettres non corriger  : "+str(len(cobs))
print "Nombre de lettres corriger  : "+str(len(cstat))
print "Nombre de paires des tuples : "+str(len(cpairs))
print "Nombre de transition : "+str(len(ctrans))+ " / "+ str(26*26)
print "Nombre de init. : "+str(len(cinits))



Nombre de lettres non corriger  : 26
Nombre de lettres corriger  : 26
Nombre de paires des tuples : 127
Nombre de transition : 403 / 676
Nombre de init. : 25


In [26]:
hmm = HMM(state_list=cstat.keys(), observation_list=cobs.keys(),
                 transition_proba = None,
                 observation_proba = None,
                 initial_state_proba = None)

HMM creating with: 
26 states
26 observations


In [27]:
#apprentissage pas
hmm.transition_estimation(ctrans)
#print hmm.transition_proba
print hmm.transition_proba.sum(axis=0)

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.]


In [6]:
hmm.init_estimation(cinits)
print sum(hmm.initial_state_proba)

1.0


In [7]:
hmm.observation_estimation(cpairs)
print hmm.observation_proba.sum(axis=0)

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.]


In [30]:
#viterbi
ncorrect=0
tot=0
ccount=0
inccount=0
for word in data_2:
    obsids,statids = hmm.data2indices(word)
    answer = hmm.logviterbi(obsids)
    ncorrect+=sum(np.array(answer)==np.array(statids))
    tot+=len(statids)
    print( answer)
    
precision=ncorrect*100.0/tot
print "%precision = "+str(precision)
tauxerreur=100.0-precision
print"%taux d'erreur ="+str(tauxerreur)

[20, 7, 25]
[20, 8, 0, 20, 8, 3, 24]
[3, 24]
[20, 7, 25]
[20, 7, 25]
[20, 8, 3, 24]
[20, 7, 25]
[20, 8, 3, 24]
[20, 7, 25]
[0, 20, 8, 0, 20, 8, 3, 24]
[3, 24]
[0, 20, 8, 0, 20, 8, 0, 20, 8, 3, 24]
[20, 7, 25]
[3, 24]
[20, 8, 0, 20, 8, 0, 20, 7, 25]
[20, 8, 3, 24]
[3, 24]
[20, 8, 0, 20, 7, 25]
[0, 20, 8, 0, 20, 8, 3, 24]
[3, 24]
[20, 8, 0, 20, 8, 3, 24]
[3, 24]
[20, 8, 0, 20, 8, 0, 20, 8, 0, 20, 7, 25]
[20, 8, 0, 20, 7, 25]
[20, 7, 25]
[0, 20, 8, 0, 20, 8, 3, 24]
[0, 20, 8, 3, 24]
[20, 7, 25]
[20, 8, 0, 20, 8, 0, 20, 8, 0, 20, 7, 25]
[3, 24]
[20, 7, 25]
[20, 8, 0, 20, 8, 3, 24]
[3, 24]
[20, 7, 25]
[20, 8, 3, 24]
[20, 8, 0, 20, 7, 25]
[20, 8, 3, 24]
[3, 24]
[0]
[20, 8, 0, 20, 7, 25]
[3, 24]
[0]
[0, 20, 8, 3, 24]
[20, 8, 0, 20, 8, 0, 20, 8, 0, 20, 7, 25]
[3, 24]
[0]
[20, 8, 3, 24]
[0, 20, 8, 0, 20, 8, 3, 24]
[20, 8, 3, 24]
[0, 20, 8, 3, 24]
[3, 24]
[20, 8, 0, 20, 8, 0, 20, 8, 3, 24]
[20, 8, 0, 20, 8, 3, 24]
[20, 8, 0, 20, 7, 25]
[20, 7, 25]
[0, 20, 8, 0, 20, 8, 0, 20, 8, 3, 24]
[0, 20, 8,