In [None]:
# The goal is to design a first-order hidden Markov model (HMM) to correct typos in texts without a dictionary.
# Participants: TARGHI Amal and MOHAMED Mohamed


# Email  : targhiamal@gmail.com , mohmed.abdelkhaleq@gmail.com

In [26]:
import nltk
from numpy import array, ones, zeros, multiply
import numpy as np
import sys
import pprint, pickle

# Out-of-vocabulary handling: an observation symbol that is missing from the
# model's observation index is mapped to the reserved index below.
UNK = "<unk>"  # token to map all out-of-vocabulary words (OOVs)
UNKid = 0      # index for UNK
# Very small positive constant. It is not referenced in the code visible here;
# presumably a guard against log(0) or division by zero -- TODO confirm.
epsilon=1e-100


In [28]:
#Data
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that come from a trusted source.
# Directory holding the pickled train/test sets. The trailing space in
# "typos-data " is kept exactly as in the original paths. Change DATA_DIR to run
# the notebook on another machine.
DATA_DIR = '/home/amal/TC4-Second-Order-HMM-for-typos-correction-/typos-data '


def load_pickle(filename, data_dir=DATA_DIR):
    """Unpickle the file `filename` located in `data_dir`.

    The file is opened in binary mode inside a `with` block so that the handle
    is closed as soon as the object has been read (the original code left the
    four handles open).
    """
    with open(data_dir + '/' + filename, 'rb') as handle:
        return pickle.load(handle)


train10 = load_pickle('train10.pkl')

train20 = load_pickle('train20.pkl')

test10 = load_pickle('test10.pkl')

test20 = load_pickle('test20.pkl')

In [18]:
class HMM:
    """First-order hidden Markov model with count-based training and Viterbi decoding.

    Matrix conventions used throughout the class:
      * transition_proba[i, j]  = Pr(Y_(t+1)=q_i | Y_t=q_j)   (columns sum to 1)
      * observation_proba[k, i] = Pr(X_t=v_k | Y_t=q_i)       (columns sum to 1)
      * initial_state_proba[i]  = Pr(Y_0=q_i)
    """

    def __init__(self, state_list, observation_list,
                 transition_proba=None,
                 observation_proba=None,
                 initial_state_proba=None, smoothing_obs=0.01):
        """
        Builds a Hidden Markov Model
        * state_list is the list of state symbols [q_0...q_(N-1)]
        * observation_list is the list of observation symbols [v_0...v_(M-1)]
        * transition_proba is the transition probability matrix
            [a_ij] a_ij = Pr(Y_(t+1)=q_i|Y_t=q_j)
        * observation_proba is the observation probability matrix
            [b_ki] b_ki = Pr(X_t=v_k|Y_t=q_i)
        * initial_state_proba is the initial state distribution
            [pi_i] pi_i = Pr(Y_0=q_i)
        * smoothing_obs is the constant added to every observation count
            before normalization in observation_estimation

        Any matrix left to None is initialised with zeros of the right shape.
        """
        # Parenthesised single-argument print works in Python 2 and 3 and
        # produces exactly the same output as the former print statements.
        print("HMM creating with: ")

        self.N = len(state_list)        # number of states
        self.M = len(observation_list)  # number of possible emissions
        print(str(self.N) + " states")
        print(str(self.M) + " observations")
        self.omega_Y = state_list
        self.omega_X = observation_list
        if transition_proba is None:
            self.transition_proba = zeros((self.N, self.N), float)
        else:
            self.transition_proba = transition_proba
        if observation_proba is None:
            self.observation_proba = zeros((self.M, self.N), float)
        else:
            self.observation_proba = observation_proba
        if initial_state_proba is None:
            self.initial_state_proba = zeros((self.N,), float)
        else:
            self.initial_state_proba = initial_state_proba
        self.make_indexes()  # build indexes, i.e the mapping between token and int
        self.smoothing_obs = smoothing_obs

    def make_indexes(self):
        """Creates the reverse tables that map state/observation symbols
        to their index in the probability arrays."""
        self.Y_index = {symbol: i for i, symbol in enumerate(self.omega_Y)}
        self.X_index = {symbol: i for i, symbol in enumerate(self.omega_X)}

    def get_observationIndices(self, observations):
        """Return the array of observation indices, i.e.
        [self.X_index[o] for o in observations],
        mapping out-of-vocabulary symbols to UNKid.
        """
        indices = zeros(len(observations), int)
        for k, o in enumerate(observations):
            indices[k] = self.X_index.get(o, UNKid)
        return indices

    def data2indices(self, word):
        """Convert one word into index sequences.

        `word` is a sequence of items where item[0] is the observed symbol and
        item[1] is the state symbol (the correction).
        Returns (letterObservation, letterState), two lists of ints.
        Unknown observations are mapped to UNKid; an unknown state raises
        KeyError.
        """
        letterObservation = list()
        letterState = list()
        for letter in word:
            observation = letter[0]
            state = letter[1]
            letterObservation.append(self.X_index.get(observation, UNKid))
            letterState.append(self.Y_index[state])
        return letterObservation, letterState

    @staticmethod
    def _normalize_columns(counts):
        """Return a float copy of `counts` whose columns each sum to 1.

        A column that sums to 0 (a state that was never counted) is replaced by
        a uniform distribution instead of producing NaN through 0/0; a NaN
        would later win every argmax in viterbi.
        """
        table = counts.astype(float)  # astype returns a copy
        totals = table.sum(axis=0)
        empty = totals == 0
        if empty.any():
            table[:, empty] = 1.0
            totals = table.sum(axis=0)
        return table / totals.reshape(1, -1)

    def observation_estimation(self, cpairs):
        """ Build the observation distribution:
            observation_proba is the observation probability matrix
                [b_ki],  b_ki = Pr(X_t=v_k|Y_t=q_i)

        `cpairs` maps (observation, state) pairs to counts. Counts are
        accumulated in a fresh matrix, so several out-of-vocabulary
        observations sharing a state add up in the UNK row, and calling the
        method again with the same counts gives the same result.
        """
        # fill with counts
        counts = zeros((self.M, self.N), float)
        for pair in cpairs:
            observation = pair[0]
            state = pair[1]
            k = self.X_index.get(observation, UNKid)  # UNK row for OOVs
            i = self.Y_index[state]
            counts[k, i] += cpairs[pair]
        # smooth, then normalize each column (one column per state)
        self.observation_proba = self._normalize_columns(counts + self.smoothing_obs)

    def transition_estimation(self, ctrans):
        """ Build the transition distribution:
            transition_proba is the transition matrix with :
            [a_ij] a[i,j] = Pr(Y_(t+1)=q_i|Y_t=q_j)

        `ctrans` maps pairs of state symbols to counts; pair[0] selects the row
        and pair[1] the column, so with the convention above pair[0] is the
        destination state and pair[1] the source state.
        NOTE(review): verify that the counting code builds its keys in that
        order.
        """
        # fill with counts
        counts = zeros((self.N, self.N), float)
        for pair in ctrans:
            i = self.Y_index[pair[0]]
            j = self.Y_index[pair[1]]
            counts[i, j] += ctrans[pair]
        # normalize each column
        self.transition_proba = self._normalize_columns(counts)

    def init_estimation(self, cinits):
        """Build the initial state distribution from `cinits`, a mapping from
        state symbol to the number of sequences starting in that state.
        Falls back to a uniform distribution when all counts are zero."""
        # fill with counts
        counts = zeros((self.N,), float)
        for state in cinits:
            counts[self.Y_index[state]] += cinits[state]
        # normalize
        total = counts.sum()
        if total > 0:
            self.initial_state_proba = counts / total
        else:
            self.initial_state_proba = counts + 1.0 / max(self.N, 1)

    def supervised_training(self, cpairs, ctrans, cinits):
        """ Train the HMM's parameters. This function wraps everything"""
        self.observation_estimation(cpairs)
        self.transition_estimation(ctrans)
        self.init_estimation(cinits)

    # Viterbi Algorithm
    # Source: http://stackoverflow.com/questions/34219766/need-help-implementing-viterbi-for-a-second-order-hidden-markov-model-with-pytho
    def viterbi(self, obs):
        """Find the most probable state sequence.

        `obs` is a sequence of observation indices (rows of observation_proba).
        Returns a list of state indices of the same length; an empty `obs`
        gives an empty list.
        """
        B = self.observation_proba
        A = self.transition_proba
        T = len(obs)
        N = self.N  # the number of states
        if T == 0:
            return []
        # initialisation
        tmp = zeros(N, float)
        psi = zeros((T, N), int)     # psi[t, j]: best predecessor of state j at time t
        delta_t = zeros(N, float)
        # apply initial_state probs to the first frame
        delta = B[obs[0], :] * self.initial_state_proba
        # recursion
        for t in range(1, T):
            O_t = obs[t]
            for j in range(N):
                # tmp[i] = delta[i] * Pr(Y_t=q_j | Y_(t-1)=q_i)
                multiply(delta, A[j, :], out=tmp)
                idx = psi[t, j] = tmp.argmax()
                delta_t[j] = tmp[idx] * B[O_t, j]
            # The scores just computed become the "previous" scores of the next
            # step. This swap must happen once per time step, inside the loop.
            delta, delta_t = delta_t, delta
        # reconstruction: follow the back-pointers from the best final state
        i_star = [int(delta.argmax())]
        for psi_t in psi[-1:0:-1]:
            i_star.append(int(psi_t[i_star[-1]]))
        i_star.reverse()
        return i_star

# Count the words and the tags


