In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation,Embedding,LSTM, TimeDistributed,Bidirectional
from keras.optimizers import RMSprop,SGD
from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.


Lecture des données
===================

In [2]:
def read_conll_sentence(istream):
    x_seq = []
    y_seq = []
    line = istream.readline()
    while line and not line.isspace():
        fields = line.split()
        x_seq.append(fields[1])
        y_seq.append(fields[3])
        line = istream.readline()
    return (x_seq,y_seq)

In [3]:
def read_conll_corpus(filename):
    X = []
    Y = []
    istream = open(filename)
    (x,y) = read_conll_sentence(istream)
    while x and y:
        X.append(x)
        Y.append(y)
        (x,y) = read_conll_sentence(istream)
    istream.close()
    return X,Y

In [4]:
X,Y = read_conll_corpus('../projets-2017-2018/sequoia-corpus.np_conll')

In [14]:
print(X[:3],Y[:3])

[['Gutenberg'], ['Cette', 'exposition', 'nous', 'apprend', 'que', 'dès', 'le', 'XIIe', 'siècle', ',', 'à', 'Dammarie-sur-Saulx', ',', 'entre', 'autres', 'sites', ',', 'une', 'industrie', 'métallurgique', 'existait', '.'], ['à_peu_près', 'au', 'même', 'moment', 'que', 'Gutenberg', 'inventait', "l'", 'imprimerie', ',', 'Gillet', 'Bonnemire', 'créait', 'en', '1450', 'la', 'première', 'forge', 'à', 'Saint-Dizier', ',', 'à', "l'", 'actuel', 'emplacement', 'du', 'CHS', '.']] [['N'], ['D', 'N', 'CL', 'V', 'C', 'P', 'D', 'A', 'N', 'PONCT', 'P', 'N', 'PONCT', 'P', 'A', 'N', 'PONCT', 'D', 'N', 'A', 'V', 'PONCT'], ['ADV', 'P+D', 'A', 'N', 'C', 'N', 'V', 'D', 'N', 'PONCT', 'N', 'N', 'V', 'P', 'N', 'D', 'A', 'N', 'P', 'N', 'PONCT', 'P', 'D', 'A', 'N', 'P+D', 'N', 'PONCT']]


Codage des données
==================

In [15]:
x_set = set([])
y_set = set([])
init_token = "__START__"
for x in X:
    x_set.update(x)
for y in Y:
    y_set.update(y)
rev_x_codes = [init_token]
rev_x_codes.extend(list(x_set))
rev_y_codes = list(y_set)
x_codes     = dict((x,idx) for idx,x in enumerate(rev_x_codes))
y_codes     = dict((y,idx) for idx,y in enumerate(rev_y_codes))
print(y_codes)



{'C': 0, 'D': 1, 'PRO': 2, 'CL': 4, 'I': 7, 'P+PRO': 8, 'ADV': 5, 'ET': 9, 'N': 10, 'V': 6, 'PONCT': 11, 'P+D': 12, 'P': 14, 'A': 13, 'PREF': 3}


In [26]:
Xcodes = []
for x in X:
    Xcodes.append([x_codes[elt] for elt in x])
Ycodes = []
for y in Y:
    ymat = np.zeros((len(y),len(y_codes)))
    for idx,elt in enumerate(y):
        ymat[idx,y_codes[elt]] = 1.0
    Ycodes.append(ymat)

Padding et troncation...
-------------------------

In [28]:
L = [len(y) for y in Ycodes]
mL = int(sum(L)/len(L))
print(mL) #longueur moyenne
Xcodes = pad_sequences(Xcodes,maxlen=mL)
Ycodes = pad_sequences(Ycodes,maxlen=mL)

21


Structure du modèle
===================

In [9]:
x_size = len(x_codes)
y_size = len(y_codes)
embedding_size = 75
memory_size    = 50
model = Sequential()
model.add(Embedding(x_size,embedding_size))
model.add(Bidirectional(LSTM(memory_size,return_sequences=True))) #bi-LSTM
#model.add(LSTM(memory_size,return_sequences=True))               #simple LSTM
model.add(TimeDistributed(Dense(y_size, activation='softmax'))) 
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 50)          518950    
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 100)         40400     
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 15)          1515      
Total params: 560,865
Trainable params: 560,865
Non-trainable params: 0
_________________________________________________________________


Descente de gradient
====================

In [31]:
sgd = RMSprop(lr=0.001)
model.compile(optimizer=sgd,loss='categorical_crossentropy',metrics=['accuracy'])
model.fit(Xcodes,Ycodes,epochs=30,batch_size=32)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x120054cc0>

Prédictions 
===========

(Attention !) Predictions sur les données d'entrainement...

In [21]:
def eval_model(x_data,y_ref):
    
    C = 0
    N = 0
    for x,yvec in zip(x_data,y_ref):
        prob_vec = model.predict(x)
        L = [np.argmax(tok_probs) == np.argmax(y) for(y,tok_probs,tok_code) in zip(yvec,prob_vec,x) if tok_code != 0]
        C += sum(L)
        N += len(L)
        
    return C/N

eval_model(Xcodes,Ycodes)    

0.96023376953373141

Illustration (données entrainement parfois tronquées) :

In [30]:
for x in Xcodes[:5]:
    probs = model.predict(x)
    for xc,yprob in zip(x,probs):
        if xc != 0:
            print(rev_x_codes[xc],rev_y_codes[np.argmax(yprob)])
    print()

Gutenberg N

exposition N
nous CL
apprend V
que C
dès P
le D
XIIe PONCT
siècle N
, PONCT
à P
Dammarie-sur-Saulx N
, PONCT
entre P
autres A
sites N
, PONCT
une D
industrie N
métallurgique A
existait V
. PONCT

l' D
imprimerie N
, PONCT
Gillet N
Bonnemire N
créait V
en P
1450 N
la D
première A
forge N
à P
Saint-Dizier N
, PONCT
à P
l' D
actuel A
emplacement N
du P+D
CHS N
. PONCT

Ensuite ADV
, PONCT
fut V
installée V
une D
autre A
forge N
à P
la D
Vacquerie N
, PONCT
à P
l' D
emplacement N
aujourd'_hui ADV
de P
Cora N
. PONCT

de P
la D
Marne A
ou_bien C
en P
aval N
de P
la D
Marne A
- PONCT
, PONCT
une D
forge N
qui PRO
connut V
son D
apogée N
au P+D
XIXe PONCT
siècle N
. PONCT

