In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten,Embedding,LSTM, TimeDistributed
from keras.optimizers import RMSprop,SGD
from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.


Lecture des données
===================

In [2]:
def read_conll_sentence(istream):
    x_seq = []
    y_seq = []
    line = istream.readline()
    while line and not line.isspace():
        fields = line.split()
        x_seq.append(fields[1])
        y_seq.append(fields[3])
        line = istream.readline()
    return (x_seq,y_seq)

In [3]:
def read_conll_corpus(filename):
    X = []
    Y = []
    istream = open(filename)
    (x,y) = read_conll_sentence(istream)
    while x and y:
        X.append(x)
        Y.append(y)
        (x,y) = read_conll_sentence(istream)
    istream.close()
    return X,Y

In [4]:
X,Y = read_conll_corpus('../projets-2017-2018/sequoia-corpus.np_conll')

In [5]:
print(X[:2],Y[:2])

[['Gutenberg'], ['Cette', 'exposition', 'nous', 'apprend', 'que', 'dès', 'le', 'XIIe', 'siècle', ',', 'à', 'Dammarie-sur-Saulx', ',', 'entre', 'autres', 'sites', ',', 'une', 'industrie', 'métallurgique', 'existait', '.']] [['N'], ['D', 'N', 'CL', 'V', 'C', 'P', 'D', 'A', 'N', 'PONCT', 'P', 'N', 'PONCT', 'P', 'A', 'N', 'PONCT', 'D', 'N', 'A', 'V', 'PONCT']]


Codage des données
==================

In [6]:
x_set = set([])
y_set = set([])
init_token = "__START__"
for x in X:
    x_set.update(x)
for y in Y:
    y_set.update(y)
rev_x_codes = [init_token]
rev_x_codes.extend(list(x_set))
rev_y_codes = list(y_set)
x_codes     = dict((x,idx) for idx,x in enumerate(rev_x_codes))
y_codes     = dict((y,idx) for idx,y in enumerate(rev_y_codes))
print(y_codes)



{'PONCT': 0, 'V': 8, 'P': 9, 'I': 2, 'PRO': 3, 'N': 12, 'D': 6, 'P+D': 7, 'P+PRO': 13, 'A': 10, 'C': 4, 'CL': 11, 'PREF': 1, 'ADV': 14, 'ET': 5}


In [7]:
Xcodes = []
for x in X:
    Xcodes.append([x_codes[elt] for elt in x])
Ycodes = []
for y in Y:
    ymat = np.zeros((len(y),len(y_codes)))
    for idx,elt in enumerate(y):
        ymat[idx,y_codes[elt]] = 1.0
    Ycodes.append(ymat)

In [8]:
print(Xcodes[1],Ycodes[1])

[7946, 2368, 7379, 624, 1006, 8543, 778, 2882, 7592, 8680, 5708, 374, 8680, 2414, 355, 2409, 8680, 7574, 6713, 4878, 7546, 5030] [[ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0

Padding et troncation...
-------------------------

In [9]:
L = [len(y) for y in Ycodes]
mL = sum(L)/len(L)
print(mL) #longueur moyenne
Xcodes = pad_sequences(Xcodes,maxlen=40)
Ycodes = pad_sequences(Ycodes,maxlen=40)

21.632139399806388


Structure du modèle
===================

In [10]:
x_size = len(x_codes)
y_size = len(y_codes)
embedding_size = 50
memory_size    = 30
model = Sequential()
model.add(Embedding(x_size,embedding_size))
model.add(LSTM(memory_size,return_sequences=True))
model.add(TimeDistributed(Dense(y_size, activation='softmax'))) 
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 50)          518950    
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 30)          9720      
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 15)          465       
Total params: 529,135
Trainable params: 529,135
Non-trainable params: 0
_________________________________________________________________


Descente de gradient
====================

In [13]:
sgd = RMSprop(lr=0.001)
model.compile(optimizer=sgd,loss='categorical_crossentropy',metrics=['accuracy'])
model.fit(Xcodes,Ycodes,epochs=20,batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x10787cc50>

Prédictions 
===========

Predictions sur les données d'entrainement...

In [14]:
def eval_model(x_data,y_ref):
    
    C = 0
    N = 0
    for x,yvec in zip(x_data,y_ref):
        prob_vec = model.predict(x)
        L = [np.argmax(tok_probs) == np.argmax(y) for(y,tok_probs,tok_code) in zip(yvec,prob_vec,x) if tok_code != 0]
        C += sum(L)
        N += len(L)
        
    return C/N

eval_model(Xcodes,Ycodes)    

0.96409687736046157