In [1]:
import numpy as np
import re
from IPython.display import clear_output
from sklearn.model_selection import train_test_split

In [29]:
#Jeu de données de pytorch 
files_abbr = {"Arabic": "AR","Chinese":"CHN","Czech":"CZ","Dutch":"DU","English":"EN","French":"FR","German":"GER","Greek":"GK","Irish":"IR","Italian":"IT","Japanese":"JPN","Korean":"KOR","Polish":"POL","Portuguese":"POR","Russian":"RU","Scottish":"SCT","Spanish":"SPN","Vietnamese":"VTN"}
noms = []
pays = []
voc = []
for k in files_abbr:
    clear_output(wait=True)
    print ("Processing names from data/names/"+k+".txt...")
    f = open("data/names/"+k+".txt",'r')
    d = f.read()
    for l in d.split("\n"):
        if l!='':
            noms.append(l)
            pays.append(files_abbr[k])
            for c in l:
                if c not in voc:
                    voc.append(c)

#On prépare des ensembles de train et de test        
noms_train, noms_test, pays_train, pays_test = train_test_split(np.array(noms),np.array(pays))


Processing names from data/names/Vietnamese.txt...


In [30]:
noms_train, pays_train

(array(['Zelenko', 'Whitehouse', 'Wiles', ..., 'Kenning', 'Rees', 'Lojkin'],
       dtype='<U20'),
 array(['RU', 'EN', 'EN', ..., 'EN', 'EN', 'RU'], dtype='<U3'))

In [31]:
#On fait la liste des caractères et de leurs tags associés, on sépare chaque nom avec un espace
tags = []
liste_tags = ['O']
for i in range(len(noms_train)):
    pays = pays_train[i]
    #Soit on a un nom a une seul lettre (on tague S-pays)
    if (len(noms_train[i])==1):
        print(i)
        if (noms_train[i][0]!=' '):
            tags.append((noms_train[i][0],'S-'+pays))
            t = 'S-'+pays
            if t not in liste_tags:
                liste_tags.append(t)
        else:
            tags.append((noms_train[i][0],'O'))
    elif (len(noms_train[i])>1):
        #On tag le premier cractère
        prec = ' '
        current = noms_train[i][0]
        suivant = noms_train[i][1]
        if (current==' '):
            tags.append((current,'O'))
        else:
            tags.append((current,'B-'+pays))
            t = 'B-'+pays
            if t not in liste_tags:
                liste_tags.append(t)
        for k in range(1,len(noms_train[i])-1):
            prec = noms_train[i][k-1]
            current = noms_train[i][k]
            suivant = noms_train[i][k+1]
            if (current==' '):
                tags.append((current,'O'))
            else:
                tags.append((current,'I-'+pays))
                t = 'I-'+pays
                if t not in liste_tags:
                    liste_tags.append(t)
        if (suivant==' '):
            tags.append((suivant,'O'))
        else:
            tags.append((suivant,'E-'+pays))
            t = 'E-'+pays
            if t not in liste_tags:
                liste_tags.append(t)
    tags.append((' ','O'))

In [32]:
#On prépare les matrices A et B pour définir notre HMM

#La matrice A définit les probabilité de transition d'un tag à un autre
A = np.zeros((len(liste_tags),len(liste_tags))).astype(float)
#la matrice B définit les probabilités d'observation'
B = np.zeros((len(liste_tags),len(voc))).astype(float)
for i in range(len(tags)-1):
    suivant = tags[i+1]
    current = tags[i]
    index_voc = voc.index(current[0])
    index_crt_tag = liste_tags.index(current[1])
    index_next_tag = liste_tags.index(suivant[1])
    B[index_crt_tag][index_voc] +=1
    A[index_crt_tag][index_next_tag] +=1
B[liste_tags.index(tags[-1][1])][voc.index(tags[-1][0])]
for i in range(len(A)):
    A[i] = A[i]/np.sum(A[i])
    B[i] = B[i]/np.sum(B[i])

In [33]:
B[:,1]

array([0.        , 0.        , 0.06729334, 0.05035562, 0.        ,
       0.03129387, 0.0295082 , 0.        , 0.06731653, 0.05405405,
       0.        , 0.03038936, 0.01376147, 0.        , 0.0079602 ,
       0.        , 0.        , 0.01004637, 0.        , 0.        ,
       0.06514037, 0.0025974 , 0.        , 0.02727273, 0.        ,
       0.        , 0.05077173, 0.04166667, 0.        , 0.07514451,
       0.0982659 , 0.        , 0.02834467, 0.04134367, 0.        ,
       0.01240458, 0.        , 0.        , 0.11965812, 0.01415094,
       0.        , 0.04      , 0.02777778, 0.        , 0.00684932,
       0.        , 0.        , 0.01581028, 0.        , 0.        ,
       0.16190476, 0.03225806, 0.        , 0.11111111, 0.15      ])

In [34]:
A.shape

(55, 55)

In [35]:
#Algorithme de Viterbi
def viterbi(o,A,B,Pi,voc):
    T = len(o)
    N = len(A)
    Vit = np.zeros((N,T))
    backpointer = np.zeros((N,T)).astype(int)
    for s in range(N):
        Vit[s][0] = Pi[s]*B[s][voc.index(o[0])]
        backpointer[s][0]=0
    for t in range(1,T):
        for s in range(N):
            Vit[s][t] = max(Vit[:,t-1]*A[:,s]*B[s][voc.index(o[t])])
            backpointer[s][t] = np.argmax(Vit[:,t-1]*A[:,s]*B[s][voc.index(o[t])])
    bestpathprob = max(Vit[:,T-1])
    bestpathpointer = np.argmax(Vit[:,T-1])
    bestpath = []
    for i in range(T-1,-1,-1):
        bestpath.append(liste_tags[bestpathpointer])
        bestpathpointer = backpointer[bestpathpointer,i]
    return list(reversed(bestpath)), bestpathprob

In [36]:
viterbi("Henry",A,B,np.ones(len(A))/len(A),voc)

(['B-EN', 'I-EN', 'I-EN', 'I-EN', 'E-EN'], 1.1524024069252931e-08)

In [37]:
#Une fonction donnant la nationalité dominante d'après une liste de tags
def nationality_from_tags(tags):
    nats = {}
    for t in tags:
        if t == 'O':
            nats['O'] = nats.get('O',0)+1
        else:
            pays = t.split('-')[1]
            nats[pays] = nats.get(pays,0)+1
    return max(nats, key=nats.get)

In [38]:
nationality_from_tags(viterbi("Henry a faim",A,B,np.ones(len(A))/len(A),voc)[0])

'EN'

In [40]:
pays_pred = []
n = len(pays_test)
for i in range(n):
    clear_output(wait=True)
    print(str(i)+"/"+str(n))
    pays_pred.append(nationality_from_tags(viterbi(noms_test[i],A,B,np.ones(len(A))/len(A),voc)[0]))
pays_pred

5018/5019


['GER',
 'GK',
 'JPN',
 'RU',
 'SCT',
 'JPN',
 'IT',
 'JPN',
 'EN',
 'FR',
 'VTN',
 'RU',
 'IR',
 'RU',
 'GK',
 'FR',
 'RU',
 'AR',
 'AR',
 'RU',
 'POL',
 'RU',
 'IT',
 'GK',
 'RU',
 'JPN',
 'SCT',
 'JPN',
 'RU',
 'RU',
 'RU',
 'SCT',
 'GK',
 'SCT',
 'JPN',
 'RU',
 'RU',
 'GER',
 'DU',
 'IT',
 'RU',
 'IT',
 'JPN',
 'JPN',
 'RU',
 'FR',
 'VTN',
 'RU',
 'RU',
 'RU',
 'DU',
 'RU',
 'JPN',
 'VTN',
 'RU',
 'RU',
 'JPN',
 'RU',
 'SCT',
 'IT',
 'GER',
 'RU',
 'CZ',
 'RU',
 'AR',
 'IT',
 'GK',
 'DU',
 'CZ',
 'SPN',
 'IT',
 'CHN',
 'AR',
 'RU',
 'VTN',
 'RU',
 'CZ',
 'GER',
 'EN',
 'JPN',
 'JPN',
 'RU',
 'RU',
 'EN',
 'CHN',
 'JPN',
 'IT',
 'POL',
 'CZ',
 'AR',
 'DU',
 'RU',
 'GK',
 'FR',
 'JPN',
 'FR',
 'RU',
 'GK',
 'RU',
 'GER',
 'RU',
 'EN',
 'IT',
 'POL',
 'FR',
 'SPN',
 'GK',
 'JPN',
 'DU',
 'GK',
 'JPN',
 'RU',
 'RU',
 'RU',
 'JPN',
 'JPN',
 'CZ',
 'POL',
 'RU',
 'KOR',
 'RU',
 'EN',
 'VTN',
 'POR',
 'GK',
 'GER',
 'RU',
 'GER',
 'GK',
 'KOR',
 'IR',
 'RU',
 'DU',
 'CZ',
 'SCT',
 'RU',
 

In [42]:
from sklearn.metrics import classification_report
print(classification_report(pays_test,pays_pred))

              precision    recall  f1-score   support

          AR       0.55      0.35      0.43       483
         CHN       0.53      0.36      0.43        55
          CZ       0.15      0.16      0.16       132
          DU       0.12      0.30      0.18        79
          EN       0.67      0.15      0.25       923
          FR       0.10      0.30      0.15        71
         GER       0.33      0.36      0.34       196
          GK       0.06      0.60      0.11        50
          IR       0.18      0.54      0.27        59
          IT       0.23      0.37      0.28       188
         JPN       0.31      0.61      0.41       221
         KOR       0.21      0.46      0.29        26
           O       0.00      0.00      0.00         0
         POL       0.11      0.60      0.19        30
         POR       0.02      0.22      0.04        18
          RU       0.93      0.59      0.72      2378
         SCT       0.03      0.25      0.05        28
         SPN       0.12    