# Author
    - Selim Lakhdar
        - selim.lakhdar@gmail.com
        - selim.lakhdar.etu@univ-lille.fr
-----------------------------------------------------------------

# Libraries

In [1]:
import os

import numpy as np
import pandas as pd
import random
import time

from sklearn.model_selection import train_test_split

# Read Data

In [2]:
data = pd.DataFrame()
for root, dirs, files in os.walk('./data/names/'):
    for f in files:
        content = pd.read_table('./data/names/' + f, header=None)
        content['LNG'] = f[:2]
        
        data = data.append(content)

        
data = data.reset_index(drop=True)
data['name'] = data[0]
data = data.drop([0], axis=1)

In [3]:
data['LNG'].unique()

array(['Cz', 'It', 'Po', 'Ir', 'Vi', 'En', 'Ko', 'Ja', 'Sp', 'Ch', 'Du',
       'Sc', 'Gr', 'Ge', 'Ar', 'Ru', 'Fr'], dtype=object)

# Tags

## BIO Tag

In [4]:
io_tags = []

for name, lng in zip(data['name'], data['LNG']):
    tags = []
    first = True
    for l_i in range(len(name)):
        if first:
            first = False
            tags += [(name[l_i], 'B_' + lng)]
        elif name[l_i] == ' ':
            tags += [(name[l_i], 'O_' + lng)]
        else:
            tags += [(name[l_i], 'I_' + lng)]
            
    io_tags += [tags]

data['BIO'] = io_tags
data

Unnamed: 0,LNG,name,BIO
0,Cz,Abl,"[(A, B_Cz), (b, I_Cz), (l, I_Cz)]"
1,Cz,Adsit,"[(A, B_Cz), (d, I_Cz), (s, I_Cz), (i, I_Cz), (..."
2,Cz,Ajdrna,"[(A, B_Cz), (j, I_Cz), (d, I_Cz), (r, I_Cz), (..."
3,Cz,Alt,"[(A, B_Cz), (l, I_Cz), (t, I_Cz)]"
4,Cz,Antonowitsch,"[(A, B_Cz), (n, I_Cz), (t, I_Cz), (o, I_Cz), (..."
...,...,...,...
20069,Fr,Villeneuve,"[(V, B_Fr), (i, I_Fr), (l, I_Fr), (l, I_Fr), (..."
20070,Fr,Vincent,"[(V, B_Fr), (i, I_Fr), (n, I_Fr), (c, I_Fr), (..."
20071,Fr,Vipond,"[(V, B_Fr), (i, I_Fr), (p, I_Fr), (o, I_Fr), (..."
20072,Fr,Voclain,"[(V, B_Fr), (o, I_Fr), (c, I_Fr), (l, I_Fr), (..."


## BIOES Tag

In [5]:
bioes_tags = []

for name, lng in zip(data['name'], data['LNG']):
    tags = []
    first = True
    for l_i in range(len(name)):
        if first:
            first = False
            tags += [(name[l_i], 'B_' + lng)]
        elif name[l_i] == ' ':
            tags += [(name[l_i], 'O_' + lng)]
        else:
            if l_i == len(name) - 1:
                tags += [(name[l_i], 'E_' + lng)]
            else:
                tags += [(name[l_i], 'I_' + lng)]
            
    bioes_tags += [tags]

data['BIOES'] = bioes_tags
data['BIOES'].head().values

array([list([('A', 'B_Cz'), ('b', 'I_Cz'), ('l', 'E_Cz')]),
       list([('A', 'B_Cz'), ('d', 'I_Cz'), ('s', 'I_Cz'), ('i', 'I_Cz'), ('t', 'E_Cz')]),
       list([('A', 'B_Cz'), ('j', 'I_Cz'), ('d', 'I_Cz'), ('r', 'I_Cz'), ('n', 'I_Cz'), ('a', 'E_Cz')]),
       list([('A', 'B_Cz'), ('l', 'I_Cz'), ('t', 'E_Cz')]),
       list([('A', 'B_Cz'), ('n', 'I_Cz'), ('t', 'I_Cz'), ('o', 'I_Cz'), ('n', 'I_Cz'), ('o', 'I_Cz'), ('w', 'I_Cz'), ('i', 'I_Cz'), ('t', 'I_Cz'), ('s', 'I_Cz'), ('c', 'I_Cz'), ('h', 'E_Cz')])],
      dtype=object)

# Split

In [6]:
train_set, test_set = train_test_split(data, test_size=0.3, random_state=42)

print("train_set.shape:", train_set.shape)
print("test_set.shape:", test_set.shape)

train_set.shape: (14051, 4)
test_set.shape: (6023, 4)


In [7]:
train_bio = []
for v in train_set['BIO']:
    train_bio += v
test_bio = []
for v in test_set['BIO']:
    test_bio += v
print('len(train_bio):', len(train_bio))
print('len(test_bio):', len(test_bio))
    
tags_bio = {tag for word,tag in train_bio}
print('len(tags_bio):', len(tags_bio), ':', tags_bio)

# check total words in vocabulary
vocab_bio = {word for word,tag in train_bio}
print('len(vocab_bio):', len(vocab_bio))

len(train_bio): 100451
len(test_bio): 43171
len(tags_bio): 44 : {'O_Du', 'B_It', 'O_Ge', 'O_En', 'B_Vi', 'O_Ch', 'O_Ru', 'O_Fr', 'I_Du', 'B_Du', 'I_It', 'I_Ar', 'B_Ja', 'I_Sp', 'B_Ru', 'I_Ch', 'I_En', 'B_Ar', 'I_Gr', 'B_Fr', 'B_Po', 'B_Ch', 'I_Ge', 'O_Sp', 'B_Ir', 'B_Gr', 'I_Cz', 'B_Ko', 'B_Cz', 'I_Ja', 'B_Sp', 'B_Ge', 'I_Fr', 'O_It', 'B_En', 'I_Ko', 'O_Ko', 'I_Sc', 'I_Ir', 'B_Sc', 'I_Po', 'I_Ru', 'I_Vi', 'O_Cz'}
len(vocab_bio): 83


In [8]:
train_bioes = []
for v in train_set['BIOES']:
    train_bioes += v
test_bioes = []
for v in test_set['BIOES']:
    test_bioes += v
print('len(train_bioes):', len(train_bioes))
print('len(test_bioes):', len(test_bioes))
    
tags_bioes = {tag for word,tag in train_bioes}
print('len(tags_bioes):', len(tags_bioes), ':', tags_bioes)

# check total words in vocabulary
vocab_bioes = {word for word,tag in train_bioes}
print('len(vocab_bioes):', len(vocab_bioes))

len(train_bioes): 100451
len(test_bioes): 43171
len(tags_bioes): 61 : {'E_Ge', 'B_It', 'E_Sc', 'O_Ge', 'O_En', 'E_Ar', 'O_Du', 'B_Vi', 'O_Ch', 'E_Vi', 'O_Ru', 'E_Ch', 'O_Fr', 'I_Du', 'E_Ko', 'E_Cz', 'B_Du', 'I_It', 'I_Ar', 'B_Ja', 'E_Sp', 'I_Sp', 'B_Ru', 'I_Ch', 'I_En', 'B_Ar', 'I_Gr', 'E_Ja', 'B_Fr', 'E_Ru', 'B_Po', 'B_Ch', 'I_Ge', 'O_Sp', 'B_Ir', 'B_Gr', 'E_En', 'B_Ko', 'B_Cz', 'I_Cz', 'E_Ir', 'E_Fr', 'I_Ja', 'B_Sp', 'E_Gr', 'E_It', 'E_Po', 'I_Fr', 'B_Ge', 'B_En', 'O_It', 'I_Ko', 'O_Ko', 'I_Sc', 'I_Ir', 'B_Sc', 'I_Po', 'I_Ru', 'E_Du', 'I_Vi', 'O_Cz'}
len(vocab_bioes): 83


# HMM

In [9]:
# compute Emission Probability
def word_given_tag(word, tag, train_bag):
    
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    
    #total number of times the passed tag occurred in train_bag
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    
    #now calculate the total number of times the passed word occurred as the passed tag.
    count_w_given_tag = len(w_given_tag_list)
     
    return (count_w_given_tag, count_tag)

In [10]:
# compute Transition Probability
def t2_given_t1(t2, t1, train_bag):
    
    tags = [pair[1] for pair in train_bag]
    
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

## BIO transition matrix

In [11]:
# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)
 
tags_matrix_bio = np.zeros((len(tags_bio), len(tags_bio)), dtype='float32')

for i, t1 in enumerate(list(tags_bio)):
    for j, t2 in enumerate(list(tags_bio)): 
        tags_matrix_bio[i, j] = t2_given_t1(t2, t1, train_bio)[0]/t2_given_t1(t2, t1, train_bio)[1]
        
tags_bio_df = pd.DataFrame(tags_matrix_bio, columns = list(tags_bio), index=list(tags_bio))
tags_bio_df

Unnamed: 0,O_Du,B_It,O_Ge,O_En,B_Vi,O_Ch,O_Ru,O_Fr,I_Du,B_Du,...,B_En,I_Ko,O_Ko,I_Sc,I_Ir,B_Sc,I_Po,I_Ru,I_Vi,O_Cz
O_Du,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B_It,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
O_Ge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
O_En,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B_Vi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
O_Ch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
O_Ru,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
O_Fr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I_Du,0.000791,0.006329,0.0,0.0,0.0,0.0,0.0,0.0,0.828323,0.003165,...,0.031646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B_Du,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## BIOES transition matrix

In [12]:
# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)
 
tags_matrix_bioes = np.zeros((len(tags_bioes), len(tags_bioes)), dtype='float32')

for i, t1 in enumerate(list(tags_bioes)):
    for j, t2 in enumerate(list(tags_bioes)): 
        tags_matrix_bioes[i, j] = t2_given_t1(t2, t1, train_bioes)[0]/t2_given_t1(t2, t1, train_bioes)[1]
        
tags_bioes_df = pd.DataFrame(tags_matrix_bioes, columns = list(tags_bioes), index=list(tags_bioes))
tags_bioes_df

Unnamed: 0,E_Ge,B_It,E_Sc,O_Ge,O_En,E_Ar,O_Du,B_Vi,O_Ch,E_Vi,...,I_Ko,O_Ko,I_Sc,I_Ir,B_Sc,I_Po,I_Ru,E_Du,I_Vi,O_Cz
E_Ge,0.0,0.039749,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.004184,0.00000,0.000000,0.0,0.000000,0.0
B_It,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.0
E_Sc,0.0,0.012987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.0
O_Ge,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.0
O_En,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
I_Po,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.79972,0.000000,0.0,0.000000,0.0
I_Ru,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.00000,0.834726,0.0,0.000000,0.0
E_Du,0.0,0.037037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.0
I_Vi,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.518519,...,0.0,0.0,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.481481,0.0


In [13]:
def Viterbi(words, train_bag, tags_df):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
     
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                # transition_p = tags_df.loc['.', tag]
                transition_p = tags_df.loc[tag].max()
                # transition_p = tags_df.loc[tag, tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag, train_bag)[0]/word_given_tag(words[key], tag, train_bag)[1]
            state_probability = emission_p * transition_p
            
            # print("state_probability:", state_probability)
            p.append(state_probability)
                
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [14]:
def Viterbi2(words, train_bag, tags_df):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
     
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                # transition_p = tags_df.loc['.', tag]
                # transition_p = tags_df.loc[tag].max()
                transition_p = tags_df.loc[tag, tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag, train_bag)[0]/word_given_tag(words[key], tag, train_bag)[1]
            state_probability = emission_p * transition_p
            
            # print("state_probability:", state_probability)
            p.append(state_probability)
                
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

# Test

In [15]:
random.seed(1234)
 
# choose random 10 numbers
rndom = [random.randint(1,len(test_set)) for x in range(10)]
test_run = test_set.iloc[rndom]
 
# list of tagged words
test_run_base_bio = test_run['BIO'].values
test_run_base_bioes = test_run['BIOES'].values
 
# list of untagged words
test_tagged_words = test_run['name'].values

test_run

Unnamed: 0,LNG,name,BIO,BIOES
6741,Sp,San nicolas,"[(S, B_Sp), (a, I_Sp), (n, I_Sp), ( , O_Sp), (...","[(S, B_Sp), (a, I_Sp), (n, I_Sp), ( , O_Sp), (..."
5203,En,Walshe,"[(W, B_En), (a, I_En), (l, I_En), (s, I_En), (...","[(W, B_En), (a, I_En), (l, I_En), (s, I_En), (..."
447,Cz,StrakaO,"[(S, B_Cz), (t, I_Cz), (r, I_Cz), (a, I_Cz), (...","[(S, B_Cz), (t, I_Cz), (r, I_Cz), (a, I_Cz), (..."
16696,Ru,Patz,"[(P, B_Ru), (a, I_Ru), (t, I_Ru), (z, I_Ru)]","[(P, B_Ru), (a, I_Ru), (t, I_Ru), (z, E_Ru)]"
17864,Ru,Tihy,"[(T, B_Ru), (i, I_Ru), (h, I_Ru), (y, I_Ru)]","[(T, B_Ru), (i, I_Ru), (h, I_Ru), (y, E_Ru)]"
4885,En,Summers,"[(S, B_En), (u, I_En), (m, I_En), (m, I_En), (...","[(S, B_En), (u, I_En), (m, I_En), (m, I_En), (..."
17569,Ru,Shakhmagon,"[(S, B_Ru), (h, I_Ru), (a, I_Ru), (k, I_Ru), (...","[(S, B_Ru), (h, I_Ru), (a, I_Ru), (k, I_Ru), (..."
84,Cz,Faltejsek,"[(F, B_Cz), (a, I_Cz), (l, I_Cz), (t, I_Cz), (...","[(F, B_Cz), (a, I_Cz), (l, I_Cz), (t, I_Cz), (..."
14426,Ru,Jabinsky,"[(J, B_Ru), (a, I_Ru), (b, I_Ru), (i, I_Ru), (...","[(J, B_Ru), (a, I_Ru), (b, I_Ru), (i, I_Ru), (..."
88,Cz,Finfera,"[(F, B_Cz), (i, I_Cz), (n, I_Cz), (f, I_Cz), (...","[(F, B_Cz), (i, I_Cz), (n, I_Cz), (f, I_Cz), (..."


## BIO

In [16]:
for w, w_t in zip(test_tagged_words, test_run_base_bio):
    print("trying:", w)
    print("expecting:", w_t)
    print("="*20)
    start = time.time()
    tagged_seq = Viterbi(w, train_bio, tags_bio_df)
    end = time.time()
    difference = end-start

    print("Time taken in seconds: ", difference)

    tmp = pd.DataFrame(tagged_seq)
    display(tmp)
    print("=====================================================")

trying: San nicolas
expecting: [('S', 'B_Sp'), ('a', 'I_Sp'), ('n', 'I_Sp'), (' ', 'O_Sp'), ('n', 'I_Sp'), ('i', 'I_Sp'), ('c', 'I_Sp'), ('o', 'I_Sp'), ('l', 'I_Sp'), ('a', 'I_Sp'), ('s', 'I_Sp')]
Time taken in seconds:  7.740196943283081


Unnamed: 0,0,1
0,S,B_Ko
1,a,I_Ko
2,n,I_Ko
3,,O_Ko
4,n,O_Du
5,i,I_Du
6,c,I_Du
7,o,I_Du
8,l,I_Du
9,a,I_Du


trying: Walshe
expecting: [('W', 'B_En'), ('a', 'I_En'), ('l', 'I_En'), ('s', 'I_En'), ('h', 'I_En'), ('e', 'I_En')]
Time taken in seconds:  4.11969518661499


Unnamed: 0,0,1
0,W,B_Sc
1,a,I_Sc
2,l,I_Sc
3,s,I_Sc
4,h,I_Sc
5,e,I_Sc


trying: StrakaO
expecting: [('S', 'B_Cz'), ('t', 'I_Cz'), ('r', 'I_Cz'), ('a', 'I_Cz'), ('k', 'I_Cz'), ('a', 'I_Cz'), ('O', 'I_Cz')]
Time taken in seconds:  4.882996320724487


Unnamed: 0,0,1
0,S,B_Ko
1,t,O_Du
2,r,I_Du
3,a,I_Du
4,k,I_Du
5,a,I_Du
6,O,B_Ja


trying: Patz
expecting: [('P', 'B_Ru'), ('a', 'I_Ru'), ('t', 'I_Ru'), ('z', 'I_Ru')]
Time taken in seconds:  2.7975993156433105


Unnamed: 0,0,1
0,P,B_Gr
1,a,I_Gr
2,t,I_Gr
3,z,I_Gr


trying: Tihy
expecting: [('T', 'B_Ru'), ('i', 'I_Ru'), ('h', 'I_Ru'), ('y', 'I_Ru')]
Time taken in seconds:  2.6637187004089355


Unnamed: 0,0,1
0,T,B_Vi
1,i,I_Vi
2,h,I_Vi
3,y,I_Vi


trying: Summers
expecting: [('S', 'B_En'), ('u', 'I_En'), ('m', 'I_En'), ('m', 'I_En'), ('e', 'I_En'), ('r', 'I_En'), ('s', 'I_En')]
Time taken in seconds:  4.8645148277282715


Unnamed: 0,0,1
0,S,B_Ko
1,u,I_Ko
2,m,I_Ko
3,m,I_Ko
4,e,I_Ko
5,r,I_Ko
6,s,I_Ko


trying: Shakhmagon
expecting: [('S', 'B_Ru'), ('h', 'I_Ru'), ('a', 'I_Ru'), ('k', 'I_Ru'), ('h', 'I_Ru'), ('m', 'I_Ru'), ('a', 'I_Ru'), ('g', 'I_Ru'), ('o', 'I_Ru'), ('n', 'I_Ru')]
Time taken in seconds:  7.318213939666748


Unnamed: 0,0,1
0,S,B_Ko
1,h,I_Ko
2,a,I_Ko
3,k,I_Ko
4,h,I_Ko
5,m,I_Ko
6,a,I_Ko
7,g,I_Ko
8,o,I_Ko
9,n,I_Ko


trying: Faltejsek
expecting: [('F', 'B_Cz'), ('a', 'I_Cz'), ('l', 'I_Cz'), ('t', 'I_Cz'), ('e', 'I_Cz'), ('j', 'I_Cz'), ('s', 'I_Cz'), ('e', 'I_Cz'), ('k', 'I_Cz')]
Time taken in seconds:  6.634110450744629


Unnamed: 0,0,1
0,F,B_Po
1,a,I_Po
2,l,I_Po
3,t,I_Po
4,e,I_Po
5,j,I_Po
6,s,I_Po
7,e,I_Po
8,k,I_Po


trying: Jabinsky
expecting: [('J', 'B_Ru'), ('a', 'I_Ru'), ('b', 'I_Ru'), ('i', 'I_Ru'), ('n', 'I_Ru'), ('s', 'I_Ru'), ('k', 'I_Ru'), ('y', 'I_Ru')]
Time taken in seconds:  5.81659197807312


Unnamed: 0,0,1
0,J,B_Ko
1,a,I_Ko
2,b,O_Du
3,i,I_Du
4,n,I_Du
5,s,I_Du
6,k,I_Du
7,y,I_Du


trying: Finfera
expecting: [('F', 'B_Cz'), ('i', 'I_Cz'), ('n', 'I_Cz'), ('f', 'I_Cz'), ('e', 'I_Cz'), ('r', 'I_Cz'), ('a', 'I_Cz')]
Time taken in seconds:  5.213959455490112


Unnamed: 0,0,1
0,F,B_Po
1,i,I_Po
2,n,I_Po
3,f,I_Po
4,e,I_Po
5,r,I_Po
6,a,I_Po




## V2

In [17]:
for w, w_t in zip(test_tagged_words, test_run_base_bio):
    print("trying:", w)
    print("expecting:", w_t)
    print("="*20)
    start = time.time()
    tagged_seq = Viterbi2(w, train_bio, tags_bio_df)
    end = time.time()
    difference = end-start

    print("Time taken in seconds: ", difference)

    tmp = pd.DataFrame(tagged_seq)
    display(tmp)
    print("=====================================================")

trying: San nicolas
expecting: [('S', 'B_Sp'), ('a', 'I_Sp'), ('n', 'I_Sp'), (' ', 'O_Sp'), ('n', 'I_Sp'), ('i', 'I_Sp'), ('c', 'I_Sp'), ('o', 'I_Sp'), ('l', 'I_Sp'), ('a', 'I_Sp'), ('s', 'I_Sp')]
Time taken in seconds:  7.995017051696777


Unnamed: 0,0,1
0,S,I_Ir
1,a,I_Ir
2,n,I_Ir
3,,O_Du
4,n,I_Du
5,i,I_Du
6,c,I_Du
7,o,I_Du
8,l,I_Du
9,a,I_Du


trying: Walshe
expecting: [('W', 'B_En'), ('a', 'I_En'), ('l', 'I_En'), ('s', 'I_En'), ('h', 'I_En'), ('e', 'I_En')]
Time taken in seconds:  4.333609580993652


Unnamed: 0,0,1
0,W,O_Du
1,a,I_Du
2,l,I_Du
3,s,I_Du
4,h,I_Du
5,e,I_Du


trying: StrakaO
expecting: [('S', 'B_Cz'), ('t', 'I_Cz'), ('r', 'I_Cz'), ('a', 'I_Cz'), ('k', 'I_Cz'), ('a', 'I_Cz'), ('O', 'I_Cz')]
Time taken in seconds:  5.162780284881592


Unnamed: 0,0,1
0,S,I_Ir
1,t,I_Ir
2,r,I_Ir
3,a,I_Ir
4,k,I_Ir
5,a,I_Ir
6,O,B_Ir


trying: Patz
expecting: [('P', 'B_Ru'), ('a', 'I_Ru'), ('t', 'I_Ru'), ('z', 'I_Ru')]
Time taken in seconds:  2.708646535873413


Unnamed: 0,0,1
0,P,I_Ru
1,a,I_Ru
2,t,I_Ru
3,z,I_Ru


trying: Tihy
expecting: [('T', 'B_Ru'), ('i', 'I_Ru'), ('h', 'I_Ru'), ('y', 'I_Ru')]
Time taken in seconds:  3.090329885482788


Unnamed: 0,0,1
0,T,I_Ir
1,i,I_Ir
2,h,I_Ir
3,y,I_Ir


trying: Summers
expecting: [('S', 'B_En'), ('u', 'I_En'), ('m', 'I_En'), ('m', 'I_En'), ('e', 'I_En'), ('r', 'I_En'), ('s', 'I_En')]
Time taken in seconds:  4.922358512878418


Unnamed: 0,0,1
0,S,I_Ir
1,u,I_Ir
2,m,I_Ir
3,m,I_Ir
4,e,I_Ir
5,r,I_Ir
6,s,I_Ir


trying: Shakhmagon
expecting: [('S', 'B_Ru'), ('h', 'I_Ru'), ('a', 'I_Ru'), ('k', 'I_Ru'), ('h', 'I_Ru'), ('m', 'I_Ru'), ('a', 'I_Ru'), ('g', 'I_Ru'), ('o', 'I_Ru'), ('n', 'I_Ru')]
Time taken in seconds:  6.8434693813323975


Unnamed: 0,0,1
0,S,I_Ir
1,h,I_Ir
2,a,I_Ir
3,k,I_Ir
4,h,I_Ir
5,m,I_Ir
6,a,I_Ir
7,g,I_Ir
8,o,I_Ir
9,n,I_Ir


trying: Faltejsek
expecting: [('F', 'B_Cz'), ('a', 'I_Cz'), ('l', 'I_Cz'), ('t', 'I_Cz'), ('e', 'I_Cz'), ('j', 'I_Cz'), ('s', 'I_Cz'), ('e', 'I_Cz'), ('k', 'I_Cz')]
Time taken in seconds:  6.29771089553833


Unnamed: 0,0,1
0,F,I_Ru
1,a,I_Ru
2,l,I_Ru
3,t,I_Ru
4,e,I_Ru
5,j,I_Ru
6,s,I_Ru
7,e,I_Ru
8,k,I_Ru


trying: Jabinsky
expecting: [('J', 'B_Ru'), ('a', 'I_Ru'), ('b', 'I_Ru'), ('i', 'I_Ru'), ('n', 'I_Ru'), ('s', 'I_Ru'), ('k', 'I_Ru'), ('y', 'I_Ru')]
Time taken in seconds:  6.13077712059021


Unnamed: 0,0,1
0,J,O_Du
1,a,I_Du
2,b,I_Du
3,i,I_Du
4,n,I_Du
5,s,I_Du
6,k,I_Du
7,y,I_Du


trying: Finfera
expecting: [('F', 'B_Cz'), ('i', 'I_Cz'), ('n', 'I_Cz'), ('f', 'I_Cz'), ('e', 'I_Cz'), ('r', 'I_Cz'), ('a', 'I_Cz')]
Time taken in seconds:  4.948254823684692


Unnamed: 0,0,1
0,F,I_Ru
1,i,I_Ru
2,n,I_Ru
3,f,I_Ru
4,e,I_Ru
5,r,I_Ru
6,a,I_Ru




------------------------------------------------

## BIOES

In [18]:
for w, w_t in zip(test_tagged_words, test_run_base_bioes):
    print("trying:", w)
    print("expecting:", w_t)
    print("="*20)
    start = time.time()
    tagged_seq = Viterbi(w, train_bioes, tags_bioes_df)
    end = time.time()
    difference = end-start

    print("Time taken in seconds: ", difference)

    tmp = pd.DataFrame(tagged_seq)
    display(tmp)
    print("=====================================================")

trying: San nicolas
expecting: [('S', 'B_Sp'), ('a', 'I_Sp'), ('n', 'I_Sp'), (' ', 'O_Sp'), ('n', 'I_Sp'), ('i', 'I_Sp'), ('c', 'I_Sp'), ('o', 'I_Sp'), ('l', 'I_Sp'), ('a', 'I_Sp'), ('s', 'E_Sp')]
Time taken in seconds:  11.792368412017822


Unnamed: 0,0,1
0,S,B_Du
1,a,I_Du
2,n,I_Du
3,,O_Du
4,n,I_Du
5,i,I_Du
6,c,I_Du
7,o,I_Du
8,l,I_Du
9,a,I_Du


trying: Walshe
expecting: [('W', 'B_En'), ('a', 'I_En'), ('l', 'I_En'), ('s', 'I_En'), ('h', 'I_En'), ('e', 'E_En')]
Time taken in seconds:  6.474341630935669


Unnamed: 0,0,1
0,W,B_Sc
1,a,I_Sc
2,l,I_Sc
3,s,I_Sc
4,h,I_Sc
5,e,I_Sc


trying: StrakaO
expecting: [('S', 'B_Cz'), ('t', 'I_Cz'), ('r', 'I_Cz'), ('a', 'I_Cz'), ('k', 'I_Cz'), ('a', 'I_Cz'), ('O', 'E_Cz')]
Time taken in seconds:  7.834620952606201


Unnamed: 0,0,1
0,S,B_Du
1,t,I_Du
2,r,I_Du
3,a,I_Du
4,k,I_Du
5,a,I_Du
6,O,E_Ge


trying: Patz
expecting: [('P', 'B_Ru'), ('a', 'I_Ru'), ('t', 'I_Ru'), ('z', 'E_Ru')]
Time taken in seconds:  4.58123779296875


Unnamed: 0,0,1
0,P,B_Gr
1,a,I_Gr
2,t,I_Gr
3,z,I_Gr


trying: Tihy
expecting: [('T', 'B_Ru'), ('i', 'I_Ru'), ('h', 'I_Ru'), ('y', 'E_Ru')]
Time taken in seconds:  4.1656410694122314


Unnamed: 0,0,1
0,T,B_Vi
1,i,I_Vi
2,h,I_Vi
3,y,I_Vi


trying: Summers
expecting: [('S', 'B_En'), ('u', 'I_En'), ('m', 'I_En'), ('m', 'I_En'), ('e', 'I_En'), ('r', 'I_En'), ('s', 'E_En')]
Time taken in seconds:  7.042537689208984


Unnamed: 0,0,1
0,S,B_Du
1,u,I_Du
2,m,I_Du
3,m,I_Du
4,e,I_Du
5,r,I_Du
6,s,E_Du


trying: Shakhmagon
expecting: [('S', 'B_Ru'), ('h', 'I_Ru'), ('a', 'I_Ru'), ('k', 'I_Ru'), ('h', 'I_Ru'), ('m', 'I_Ru'), ('a', 'I_Ru'), ('g', 'I_Ru'), ('o', 'I_Ru'), ('n', 'E_Ru')]
Time taken in seconds:  10.201515436172485


Unnamed: 0,0,1
0,S,B_Du
1,h,I_Du
2,a,I_Du
3,k,I_Du
4,h,I_Du
5,m,I_Du
6,a,I_Du
7,g,I_Du
8,o,I_Du
9,n,I_Du


trying: Faltejsek
expecting: [('F', 'B_Cz'), ('a', 'I_Cz'), ('l', 'I_Cz'), ('t', 'I_Cz'), ('e', 'I_Cz'), ('j', 'I_Cz'), ('s', 'I_Cz'), ('e', 'I_Cz'), ('k', 'E_Cz')]
Time taken in seconds:  8.204911947250366


Unnamed: 0,0,1
0,F,B_Po
1,a,I_Po
2,l,I_Po
3,t,I_Po
4,e,I_Po
5,j,I_Po
6,s,I_Po
7,e,I_Po
8,k,I_Po


trying: Jabinsky
expecting: [('J', 'B_Ru'), ('a', 'I_Ru'), ('b', 'I_Ru'), ('i', 'I_Ru'), ('n', 'I_Ru'), ('s', 'I_Ru'), ('k', 'I_Ru'), ('y', 'E_Ru')]
Time taken in seconds:  7.496773958206177


Unnamed: 0,0,1
0,J,B_Ko
1,a,I_Ko
2,b,E_Ge
3,i,E_Ge
4,n,E_Ge
5,s,E_Ge
6,k,E_Ge
7,y,E_Ge


trying: Finfera
expecting: [('F', 'B_Cz'), ('i', 'I_Cz'), ('n', 'I_Cz'), ('f', 'I_Cz'), ('e', 'I_Cz'), ('r', 'I_Cz'), ('a', 'E_Cz')]
Time taken in seconds:  6.4466516971588135


Unnamed: 0,0,1
0,F,B_Po
1,i,I_Po
2,n,I_Po
3,f,I_Po
4,e,I_Po
5,r,I_Po
6,a,I_Po




## V2

In [19]:
for w, w_t in zip(test_tagged_words, test_run_base_bioes):
    print("trying:", w)
    print("expecting:", w_t)
    print("="*20)
    start = time.time()
    tagged_seq = Viterbi2(w, train_bioes, tags_bioes_df)
    end = time.time()
    difference = end-start

    print("Time taken in seconds: ", difference)

    tmp = pd.DataFrame(tagged_seq)
    display(tmp)
    print("=====================================================")

trying: San nicolas
expecting: [('S', 'B_Sp'), ('a', 'I_Sp'), ('n', 'I_Sp'), (' ', 'O_Sp'), ('n', 'I_Sp'), ('i', 'I_Sp'), ('c', 'I_Sp'), ('o', 'I_Sp'), ('l', 'I_Sp'), ('a', 'I_Sp'), ('s', 'E_Sp')]
Time taken in seconds:  10.43902039527893


Unnamed: 0,0,1
0,S,I_Ir
1,a,I_Ir
2,n,E_Ir
3,,E_Ge
4,n,E_Ge
5,i,E_Ge
6,c,E_Ge
7,o,E_Ge
8,l,E_Ge
9,a,E_Ge


trying: Walshe
expecting: [('W', 'B_En'), ('a', 'I_En'), ('l', 'I_En'), ('s', 'I_En'), ('h', 'I_En'), ('e', 'E_En')]
Time taken in seconds:  5.716686725616455


Unnamed: 0,0,1
0,W,E_Ge
1,a,E_Ge
2,l,E_Ge
3,s,E_Ge
4,h,E_Ge
5,e,E_Ge


trying: StrakaO
expecting: [('S', 'B_Cz'), ('t', 'I_Cz'), ('r', 'I_Cz'), ('a', 'I_Cz'), ('k', 'I_Cz'), ('a', 'I_Cz'), ('O', 'E_Cz')]
Time taken in seconds:  6.901586055755615


Unnamed: 0,0,1
0,S,I_Ir
1,t,I_Ir
2,r,I_Ir
3,a,I_Ir
4,k,E_Ir
5,a,E_Ge
6,O,B_En


trying: Patz
expecting: [('P', 'B_Ru'), ('a', 'I_Ru'), ('t', 'I_Ru'), ('z', 'E_Ru')]
Time taken in seconds:  4.0935633182525635


Unnamed: 0,0,1
0,P,I_Ru
1,a,I_Ru
2,t,I_Ru
3,z,I_Ru


trying: Tihy
expecting: [('T', 'B_Ru'), ('i', 'I_Ru'), ('h', 'I_Ru'), ('y', 'E_Ru')]
Time taken in seconds:  3.7145345211029053


Unnamed: 0,0,1
0,T,I_Ir
1,i,I_Ir
2,h,I_Ir
3,y,E_Ir


trying: Summers
expecting: [('S', 'B_En'), ('u', 'I_En'), ('m', 'I_En'), ('m', 'I_En'), ('e', 'I_En'), ('r', 'I_En'), ('s', 'E_En')]
Time taken in seconds:  6.835211515426636


Unnamed: 0,0,1
0,S,I_Ir
1,u,I_Ir
2,m,I_Ir
3,m,I_Ir
4,e,I_Ir
5,r,I_Ir
6,s,I_Ir


trying: Shakhmagon
expecting: [('S', 'B_Ru'), ('h', 'I_Ru'), ('a', 'I_Ru'), ('k', 'I_Ru'), ('h', 'I_Ru'), ('m', 'I_Ru'), ('a', 'I_Ru'), ('g', 'I_Ru'), ('o', 'I_Ru'), ('n', 'E_Ru')]
Time taken in seconds:  9.764057159423828


Unnamed: 0,0,1
0,S,I_Ir
1,h,I_Ir
2,a,I_Ir
3,k,E_Ir
4,h,E_Ge
5,m,E_Ge
6,a,E_Ge
7,g,E_Ge
8,o,E_Ge
9,n,E_Ge


trying: Faltejsek
expecting: [('F', 'B_Cz'), ('a', 'I_Cz'), ('l', 'I_Cz'), ('t', 'I_Cz'), ('e', 'I_Cz'), ('j', 'I_Cz'), ('s', 'I_Cz'), ('e', 'I_Cz'), ('k', 'E_Cz')]
Time taken in seconds:  8.882898569107056


Unnamed: 0,0,1
0,F,I_Ru
1,a,I_Ru
2,l,I_Ru
3,t,I_Ru
4,e,I_Ru
5,j,I_Ru
6,s,I_Ru
7,e,I_Ru
8,k,I_Ru


trying: Jabinsky
expecting: [('J', 'B_Ru'), ('a', 'I_Ru'), ('b', 'I_Ru'), ('i', 'I_Ru'), ('n', 'I_Ru'), ('s', 'I_Ru'), ('k', 'I_Ru'), ('y', 'E_Ru')]
Time taken in seconds:  8.579764127731323


Unnamed: 0,0,1
0,J,E_Ge
1,a,E_Ge
2,b,E_Ge
3,i,E_Ge
4,n,E_Ge
5,s,E_Ge
6,k,E_Ge
7,y,E_Ge


trying: Finfera
expecting: [('F', 'B_Cz'), ('i', 'I_Cz'), ('n', 'I_Cz'), ('f', 'I_Cz'), ('e', 'I_Cz'), ('r', 'I_Cz'), ('a', 'E_Cz')]
Time taken in seconds:  7.043835878372192


Unnamed: 0,0,1
0,F,I_Ru
1,i,I_Ru
2,n,I_Ru
3,f,I_Ru
4,e,I_Ru
5,r,I_Ru
6,a,I_Ru




------------------------------------------------

- On remarque que notre HMM n'arrive pas à bien prédire les séquences ....
- Le problème pourrait venir de l'implémentation de viterbi ....

------------------------------------------------

# Example retrouvé sur internet qui m'a guidé

In [20]:
# Importing libraries
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time
 
#download the treebank corpus from nltk
nltk.download('treebank')
 
#download the universal tagset from nltk
nltk.download('universal_tagset')
 
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
 
train_set,test_set =train_test_split(nltk_data,train_size=0.80,test_size=0.20,random_state = 101)
train_tagged_words = [ tup for sent in train_set for tup in sent ]
test_tagged_words = [ tup for sent in test_set for tup in sent ]
print(len(train_tagged_words))
print(len(test_tagged_words))


#use set datatype to check how many unique tags are present in training data
tags = {tag for word,tag in train_tagged_words}
print(len(tags))
print(tags)
 
# check total words in vocabulary
vocab = {word for word,tag in train_tagged_words}

# compute Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)#total number of times the passed tag occurred in train_bag
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
#now calculate the total number of times the passed word occurred as the passed tag.
    count_w_given_tag = len(w_given_tag_list)
     
    return (count_w_given_tag, count_tag)

# compute  Transition Probability
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
display(tags_df.sum())
print("=============================================================")
display(tags_df)


def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
     
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p
            p.append(state_probability)
             
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))


# Let's test our Viterbi algorithm on a few sample sentences of test dataset
random.seed(1234)      #define a random seed to get same sentences when run multiple times
 
# choose random 10 numbers
rndom = [random.randint(1,len(test_set)) for x in range(10)]
 
# list of 10 sents on which we test the model
test_run = [test_set[i] for i in rndom]
 
# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]
 
# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]
                     

#Here We will only test 10 sentences to check the accuracy
#as testing the whole training set takes huge amount of time
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start
 
tmp = pd.DataFrame(tagged_seq)
tmp

[nltk_data] Downloading package treebank to /home/alpha/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/alpha/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


80310
20366
12
{'ADJ', 'ADP', 'X', 'ADV', '.', 'NOUN', 'PRT', 'DET', 'CONJ', 'VERB', 'NUM', 'PRON'}


ADJ     0.952829
ADP     0.866611
X       0.781935
ADV     0.399379
.       1.030826
NOUN    3.505724
PRT     0.335972
DET     1.017380
CONJ    0.165853
VERB    2.069407
NUM     0.539431
PRON    0.334546
dtype: float32



Unnamed: 0,ADJ,ADP,X,ADV,.,NOUN,PRT,DET,CONJ,VERB,NUM,PRON
ADJ,0.063301,0.080583,0.020971,0.005243,0.066019,0.696893,0.011456,0.005243,0.016893,0.011456,0.021748,0.000194
ADP,0.107062,0.016958,0.034548,0.014553,0.038724,0.323589,0.001266,0.320931,0.001012,0.008479,0.063275,0.069603
X,0.017682,0.142226,0.075726,0.025754,0.160869,0.061695,0.185086,0.05689,0.010379,0.206419,0.003075,0.0542
ADV,0.130721,0.119472,0.022886,0.081458,0.139255,0.032196,0.01474,0.071373,0.006982,0.339022,0.029868,0.012025
.,0.046132,0.092908,0.025641,0.052569,0.092372,0.218539,0.002789,0.172192,0.060079,0.08969,0.07821,0.068769
NOUN,0.012584,0.176827,0.028825,0.016895,0.240094,0.262344,0.043935,0.013106,0.042454,0.149134,0.009144,0.004659
PRT,0.082975,0.019569,0.012133,0.009393,0.04501,0.250489,0.001174,0.10137,0.002348,0.401174,0.056751,0.017613
DET,0.206411,0.009918,0.045134,0.012074,0.017393,0.635906,0.000287,0.006037,0.000431,0.040247,0.022855,0.003306
CONJ,0.113611,0.055982,0.00933,0.05708,0.035126,0.349067,0.004391,0.123491,0.000549,0.150384,0.040615,0.060373
VERB,0.06639,0.092357,0.21593,0.083886,0.034807,0.110589,0.030663,0.13361,0.005433,0.167956,0.022836,0.035543


Unnamed: 0,0,1
0,The,DET
1,company,NOUN
2,is,VERB
3,contesting,ADJ
4,the,DET
...,...,...
204,are,VERB
205,scrutinizing,VERB
206,program,NOUN
207,trades,NOUN


In [21]:
for v1, v2 in zip(test_run_base, tmp[1]):
    print('actual:', v1, 'predictied:' ,v2)

actual: ('The', 'DET') predictied: DET
actual: ('company', 'NOUN') predictied: NOUN
actual: ('is', 'VERB') predictied: VERB
actual: ('contesting', 'VERB') predictied: ADJ
actual: ('the', 'DET') predictied: DET
actual: ('fine', 'NOUN') predictied: NOUN
actual: ('.', '.') predictied: .
actual: ('``', '.') predictied: .
actual: ('I', 'PRON') predictied: PRON
actual: ("'m", 'VERB') predictied: VERB
actual: ('starting', 'VERB') predictied: VERB
actual: ('*-1', 'X') predictied: X
actual: ('to', 'PRT') predictied: PRT
actual: ('see', 'VERB') predictied: VERB
actual: ('more', 'ADJ') predictied: ADV
actual: ('business', 'NOUN') predictied: NOUN
actual: ('transactions', 'NOUN') predictied: NOUN
actual: (',', '.') predictied: .
actual: ("''", '.') predictied: .
actual: ('says', 'VERB') predictied: VERB
actual: ('*T*-2', 'X') predictied: X
actual: ('Andrea', 'NOUN') predictied: ADJ
actual: ('West', 'NOUN') predictied: NOUN
actual: ('of', 'ADP') predictied: ADP
actual: ('American', 'NOUN') predicti