# Modèle de Markov Caché du second ordre

### Application à la correction de typos dans des textes

#### Les typos sont maintenant des suppressions de caractères

In [1]:
import nltk
from numpy import array, ones, zeros, multiply
import numpy as np
import sys
from __future__ import division

UNK = "<unk>"  # placeholder token standing in for every out-of-vocabulary (OOV) symbol
INS = "<ins>"  # NOTE(review): presumably an insertion marker; not referenced in the visible code
UNKid = 0  # vocabulary index used for UNK
# INSid = 1  (disabled: no index is currently assigned to INS)
epsilon = 1e-100  # very small positive constant; NOTE(review): unused in the visible code


class HMM:
    """Second-order hidden Markov model trained from counts, decoded with Viterbi.

    Parameter layout (N states, M observation symbols):
      * initial_state_proba[i]: probability of starting in state i, shape (N,).
      * head_transition_proba[j, i]: probability of state j at position 1
        given state i at position 0, shape (N, N).
      * transition_proba[k, i * N + j]: probability of state k given the two
        previous states (i first, then j), shape (N, N**2).
      * observation_proba[k, i]: probability of emitting symbol k from
        state i, shape (M, N).
    """

    def __init__(self,
                 state_list,
                 observation_list,
                 head_transition_proba=None,
                 transition_proba=None,
                 observation_proba=None,
                 initial_state_proba=None,
                 transition_head_proba=None,
                 smoothing_obs=0.01):
        """Build the model; any matrix left to None is created filled with zeros.

        * state_list: list of state symbols [q_0 ... q_(N-1)].
        * observation_list: list of observation symbols [v_0 ... v_(M-1)].
        * head_transition_proba: first-order transition matrix used only for
          the step from position 0 to position 1.
        * transition_proba: second-order transition matrix.
        * observation_proba: emission matrix, b_ki = Pr(X_t=v_k | Y_t=q_i).
        * initial_state_proba: initial distribution, pi_i = Pr(Y_0=q_i).
        * transition_head_proba: ignored; kept so existing callers that pass
          it keep working.
        * smoothing_obs: additive constant applied to the observation,
          head-transition and transition counts before normalisation.
        """
        print("HMM creating with: ")

        self.N = len(state_list)  # number of states
        self.M = len(observation_list)  # number of possible emissions

        print(str(self.N) + " states")
        print(str(self.M) + " observations")

        self.omega_Y = state_list
        self.omega_X = observation_list

        if head_transition_proba is None:
            head_transition_proba = zeros((self.N, self.N))
        self.head_transition_proba = head_transition_proba

        if transition_proba is None:
            transition_proba = zeros((self.N, self.N**2))
        self.transition_proba = transition_proba

        if observation_proba is None:
            observation_proba = zeros((self.M, self.N))
        self.observation_proba = observation_proba

        if initial_state_proba is None:
            initial_state_proba = zeros((self.N, ))
        self.initial_state_proba = initial_state_proba

        # build indexes, i.e. the mapping between token and int
        self.make_indexes()
        self.smoothing_obs = smoothing_obs

    def make_indexes(self):
        """Create the reverse tables mapping state/observation names to their
        index in the probability arrays."""
        self.Y_index = {state: i for i, state in enumerate(self.omega_Y)}
        self.X_index = {obs: i for i, obs in enumerate(self.omega_X)}

    def get_observationIndices(self, observations):
        """Return an int array with the X_index entry of every observation;
        observations missing from the vocabulary are mapped to UNKid."""
        indices = zeros(len(observations), int)
        for k, o in enumerate(observations):
            indices[k] = self.X_index[o] if o in self.X_index else UNKid
        return indices

    def data2indices(self, sent):
        """Convert one sequence of (word, tag) couples to index lists.

        Returns (wordids, tagids). Unknown words are mapped to UNKid; an
        unknown tag raises KeyError.
        """
        wordids = []
        tagids = []
        for couple in sent:
            wrd = couple[0]
            tag = couple[1]
            wordids.append(self.X_index[wrd] if wrd in self.X_index else UNKid)
            tagids.append(self.Y_index[tag])
        return wordids, tagids

    def observation_estimation(self, pair_counts):
        """Estimate the observation matrix from counts.

        pair_counts maps a (word, tag) tuple to its count. Counts are written
        into observation_proba[word, tag], smoothed by adding smoothing_obs to
        every cell, then each column (fixed tag) is normalised to sum to 1.
        """
        for pair, cpt in pair_counts.items():
            wrd = pair[0]
            tag = pair[1]
            # Row 0 is the <unk> row (same value as the module-level UNKid).
            k = self.X_index.get(wrd, 0)
            i = self.Y_index[tag]
            # Accumulate: several out-of-vocabulary words share the <unk> row,
            # so plain assignment would keep only the last one seen.
            self.observation_proba[k, i] += cpt
        # smooth, then normalise each column
        self.observation_proba = self.observation_proba + self.smoothing_obs
        self.observation_proba = self.observation_proba / self.observation_proba.sum(
            axis=0).reshape(1, self.N)

    def head_transition_estimation(self, head_trans_counts):
        """Estimate the first-order transition matrix used at position 1.

        head_trans_counts maps (previous_tag, tag) to a count; the count is
        stored at [tag, previous_tag], smoothed with smoothing_obs, and each
        column is normalised to sum to 1.
        """
        for pair, cpt in head_trans_counts.items():
            i = self.Y_index[pair[0]]
            j = self.Y_index[pair[1]]
            self.head_transition_proba[j, i] = cpt
        self.head_transition_proba = self.head_transition_proba + self.smoothing_obs
        self.head_transition_proba = self.head_transition_proba / self.head_transition_proba.sum(
            axis=0).reshape(1, self.N)

    def transition_estimation(self, trans_counts):
        """Estimate the second-order transition matrix.

        trans_counts maps ((tag_t-2, tag_t-1), tag_t) to a count; the count is
        stored at [tag_t, tag_t-2 * N + tag_t-1], smoothed with smoothing_obs,
        and each column is normalised to sum to 1.
        """
        for pair, cpt in trans_counts.items():
            i = self.Y_index[pair[0][0]]
            j = self.Y_index[pair[0][1]]
            k = self.Y_index[pair[1]]
            self.transition_proba[k, (i * self.N + j)] = cpt
        self.transition_proba = self.transition_proba + self.smoothing_obs
        self.transition_proba = self.transition_proba / self.transition_proba.sum(
            axis=0).reshape(1, self.N**2)

    def init_estimation(self, init_counts):
        """Estimate the initial state distribution from a tag -> count table
        (no smoothing is applied here)."""
        for tag, cpt in init_counts.items():
            self.initial_state_proba[self.Y_index[tag]] = cpt
        self.initial_state_proba = (self.initial_state_proba /
                                    self.initial_state_proba.sum())

    def supervised_training(self, pair_counts, head_trans_counts, trans_counts,
                            init_counts):
        """Train every parameter of the model from the four count tables."""
        self.observation_estimation(pair_counts)
        self.head_transition_estimation(head_trans_counts)
        self.transition_estimation(trans_counts)
        self.init_estimation(init_counts)

    def viterbi(self, obsids):
        """Return the decoded state index sequence for the observation ids.

        Position 1 uses head_transition_proba. Later positions use
        transition_proba, where the state at t-2 is taken to be the
        predecessor already selected for each candidate state at t-1
        (psi[t-1]) rather than being searched over.
        An empty input yields an empty list.
        """
        T = len(obsids)
        if T == 0:
            return []

        # psi[t, i]: selected predecessor (at t-1) of state i at position t
        psi = zeros((T, self.N), int)
        states = np.arange(self.N)

        # Apply initial state probabilities to the first frame
        delta = self.observation_proba[obsids[0]] * self.initial_state_proba

        for t in range(1, T):
            if t == 1:
                # trans[i, j] = Pr(state i at 1 | state j at 0)
                trans = self.head_transition_proba
            else:
                # trans[i, j] = Pr(state i | psi[t-1, j], j): pick, for every
                # previous state j, the column of the pair (psi[t-1, j], j)
                trans = self.transition_proba[:, psi[t - 1] * self.N + states]
            # scores[i, j] = delta[j] * trans[i, j]
            scores = trans * delta
            psi[t] = scores.argmax(axis=1)
            delta = scores.max(axis=1) * self.observation_proba[obsids[t]]

        # Follow the back-pointers from the best final state
        i_star = [delta.argmax()]
        for psi_t in psi[-1:0:-1]:
            i_star.append(psi_t[i_star[-1]])
        i_star.reverse()

        return i_star

    def print_error_rate(self, test):
        """Print the symbol error rate on test before and after decoding.

        "Before" compares observation ids with state ids directly, which is
        only meaningful when both vocabularies share the same indexing.
        "After" compares the Viterbi output of this model with the state ids.
        """
        nb_correct_before_hmm = 0
        nb_correct_after_hmm = 0
        total = 0

        for word in test:
            # Use this instance, not whatever global happens to be named `hmm`.
            obsids, statids = self.data2indices(word)
            best_sequence = self.viterbi(obsids)

            nb_correct_after_hmm += sum(
                1 for i, j in zip(best_sequence, statids) if i == j)
            nb_correct_before_hmm += sum(
                1 for i, j in zip(statids, obsids) if i == j)

            total += len(statids)

        error_before_hmm = 100 - nb_correct_before_hmm * 100.0 / total
        error_after_hmm = 100 - nb_correct_after_hmm * 100.0 / total

        print("Error rate before HMM : {}%".format(error_before_hmm))
        print("Error rate after HMM : {}%".format(error_after_hmm))

# Compter les mots et les tags

In [2]:
def make_counts(corpus):
    """
    Build the count tables needed to train the HMM. Every table is a dict.

    corpus is an iterable of sequences of (word, tag) couples.

    Returns, in this order:
    * c_words: word -> count
    * c_tags: tag -> count
    * c_pairs: (word, tag) couple -> count
    * c_transitions1: (tag at 0, tag at 1) -> count, taken at position 1 only
    * c_transitions2: ((tag at t-2, tag at t-1), tag at t) -> count, for t >= 2
    * c_inits: tag found in first position -> count
    """
    def _bump(table, key):
        # Add one occurrence of key, creating the entry when needed.
        table[key] = table.get(key, 0) + 1

    c_words, c_tags, c_pairs = {}, {}, {}
    c_transitions1, c_transitions2 = {}, {}
    c_inits = {}

    for sent in corpus:
        # the position decides which transition table is updated
        for pos, couple in enumerate(sent):
            wrd = couple[0]
            tag = couple[1]

            _bump(c_words, wrd)
            _bump(c_tags, tag)
            _bump(c_pairs, couple)

            if pos == 0:
                # first symbol: initial state counts
                _bump(c_inits, tag)
            elif pos == 1:
                # second symbol: only one predecessor is available
                _bump(c_transitions1, (sent[pos - 1][1], tag))
            else:
                # from the third symbol on: condition on the two previous tags
                _bump(c_transitions2,
                      ((sent[pos - 2][1], sent[pos - 1][1]), tag))

    return c_words, c_tags, c_pairs, c_transitions1, c_transitions2, c_inits

# Création du vocabulaire (filtrage selon le nombre d'occurrences)

In [3]:
def make_vocab(c, threshold):
    """
    Return the fixed vocabulary of the model.

    inputs:
    * c: word -> count dictionary; currently ignored
    * threshold: minimum count; currently ignored (count thresholding is
      disabled)

    returns:
    * a list starting with UNK, followed, for each lowercase letter, by the
      letter itself and then the 26 two-letter strings beginning with it
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    voc = [UNK]
    for first in letters:
        voc.append(first)
        voc.extend(first + second for second in letters)
    return voc


# Les données


In [6]:
import pickle

path = ''


def _load_pickle(filename):
    """Load one pickled data set from `path`, closing the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path + filename, 'rb') as handle:
        return pickle.load(handle)


# Data with 10% typos
train10 = _load_pickle('train10.pkl')
test10 = _load_pickle('test10.pkl')

# Data with 20% typos
train20 = _load_pickle('train20.pkl')
test20 = _load_pickle('test20.pkl')

In [7]:
# Display one training word: a list of 2-tuples, one per character.
train10[1]

[('t', 't'), ('h', 'h'), ('e', 'e'), ('i', 'i'), ('r', 'r')]

# Mise des lettres en couple

### [(b, b), (y, y)] devient [(by, by)]

In [8]:
# Method 1

def couple_data(data):
    """Group the symbols of every word two by two.

    data is a list of words, each a list of (observed, true) couples. Two
    consecutive couples are merged by concatenating their fields, e.g.
    [(b, b), (y, y)] becomes [(by, by)]. When a word has an odd number of
    couples, the last one is kept alone.

    Returns the new list of words; the input is not modified.
    """
    cdata = []
    for w in data:
        cw = []
        for i in range(0, len(w), 2):
            # Explicit bounds check instead of a bare except: only the
            # odd-length tail is special, any other error must surface.
            if i + 1 < len(w):
                cw.append((w[i][0] + w[i + 1][0], w[i][1] + w[i + 1][1]))
            else:
                cw.append((w[i][0], w[i][1]))
        cdata.append(cw)
    return cdata

# Method 2
# If suppression: from [(b, b), (y, y)] to [(b, by)]

In [9]:
# Pair up the characters of all four data sets, in the same order as before.
train10, test10, train20, test20 = map(
    couple_data, (train10, test10, train20, test20))

In [10]:
# Display the same training word after its characters were grouped in pairs.
train10[1]

[('th', 'th'), ('ei', 'ei'), ('r', 'r')]

# Les suppressions

In [11]:
from random import *

def delete_char(train, percentage, with_substitution=False):
    """Randomly drop one character from two-character symbols.

    train is a list of words, each a list of (observed, true) couples.
    For every couple whose observed part has length 2, with probability
    percentage/100 the observed part is replaced by one randomly chosen
    character of the true part. Otherwise the couple is kept as is when
    with_substitution is true, or reset to (true, true) when it is false.

    Returns a new list of words.
    """
    corrupted = []
    for word in train:
        noisy_word = []
        for token in word:
            truth = token[1]
            if len(token[0]) == 2 and random() < percentage / 100:
                # deletion: keep only character 0 or 1 of the true symbol
                kept = truth[randrange(2)]
                noisy_word.append((kept, truth))
            elif with_substitution:
                noisy_word.append(token)
            else:
                noisy_word.append((truth, truth))
        corrupted.append(noisy_word)
    return corrupted

In [12]:
# Seulement des suppressions
# Corrupt each data set with its own deletion rate (10% or 20%); the calls
# run in the same order as before, so the random draws are consumed alike.
train10, test10 = delete_char(train10, 10), delete_char(test10, 10)
train20, test20 = delete_char(train20, 20), delete_char(test20, 20)

In [13]:
# Show one training word after the random deletions were applied.
print(train10[7])

[('th', 'th'), ('em', 'em')]


In [17]:
# Work on the 10% data set.
train = train10
test = test10

# Add the two lengths instead of concatenating both lists just to count them.
tot = len(train) + len(test)
print ("Nombre de phrases totale = " + str(tot))
print ("Nombre de phrases de train = " + str(len(train)))
print ("Nombre de phrases de test  = " + str(len(test)))

Nombre de phrases totale = 30558
Nombre de phrases de train = 29057
Nombre de phrases de test  = 1501


In [18]:
# Collect every count table from the training set.
cwords, ctags, cpairs, ctrans1, ctrans2, cinits = make_counts(train)

# Report the size of each table, one line per table.
print("\n".join([
    "Nombre de lettres  : " + str(len(cwords)),
    "Nombre de tags  : " + str(len(ctags)),
    "Nombre de paires: " + str(len(cpairs)),
    "Nombre de init. : " + str(len(cinits)),
]))
print(ctags)

# Build the vocabulary shared by states and observations.
vocab = make_vocab(cwords, 10)
print("Vocabulaire :" + str(len(vocab)))

Nombre de lettres  : 384
Nombre de tags  : 381
Nombre de paires: 904
Nombre de init. : 207
{'by': 195, 'th': 3657, 'ei': 187, 'r': 1113, 'ow': 137, 'n': 742, 'ac': 240, 'co': 661, 'un': 210, 't': 1312, 'vi': 325, 'ol': 117, 'en': 868, 'ce': 327, 'is': 1017, 'fo': 566, 'em': 449, 'a': 672, 'rm': 142, 'of': 1106, 'li': 361, 'be': 796, 'ra': 331, 'ti': 985, 'on': 1100, 'in': 1818, 'ot': 198, 'he': 371, 'wo': 288, 'rd': 80, 's': 1706, 'mm': 59, 'it': 734, 'ng': 404, 'ey': 218, 'br': 79, 'ea': 298, 'k': 52, 'ro': 290, 'ug': 88, 'h': 235, 'e': 3316, 'ps': 95, 'yc': 67, 'ho': 153, 'lo': 464, 'gi': 237, 'ca': 617, 'l': 561, 're': 1083, 'st': 948, 'ts': 224, 'at': 1134, 'ha': 524, 've': 659, 'tr': 215, 'ai': 136, 'ne': 545, 'd': 1394, 'to': 1191, 'us': 366, 'ar': 527, 'ov': 101, 'er': 711, 'so': 645, 'ci': 521, 'al': 925, 'iz': 36, 'ed': 534, 'es': 803, 'mo': 474, 'nf': 38, 'g': 413, 'an': 1340, 'rs': 169, 'nc': 194, 'fr': 223, 'ee': 144, 'bu': 255, 'ua': 39, 'll': 464, 'y': 1045, 'ju': 42, 'if

# Création du HMM et apprentissage

In [19]:
# States and observations share the same vocabulary.
hmm = HMM(vocab, vocab, smoothing_obs=0.001)

# Fit all parameters from the count tables of the training set.
hmm.supervised_training(pair_counts=cpairs,
                        head_trans_counts=ctrans1,
                        trans_counts=ctrans2,
                        init_counts=cinits)

HMM creating with: 
703 states
703 observations


# Tests sur test10 avec seulement des suppressions

In [20]:
# Print the error rate on the selected test set, before and after decoding.
hmm.print_error_rate(test)

Error rate before HMM : 8.02901450725362%
Error rate after HMM : 2.776388194097052%


# Tests sur test20 avec seulement des suppressions

In [21]:
# Same experiment on the data set with 20% deletions: recount, retrain, evaluate.
cwords, ctags, cpairs, ctrans1, ctrans2, cinits = make_counts(train20)

hmm = HMM(vocab, vocab, smoothing_obs=0.001)
hmm.supervised_training(pair_counts=cpairs,
                        head_trans_counts=ctrans1,
                        trans_counts=ctrans2,
                        init_counts=cinits)

hmm.print_error_rate(test20)

HMM creating with: 
703 states
703 observations
Error rate before HMM : 16.752718883884427%
Error rate after HMM : 5.108206085905749%
