In [2]:
import json 
import tensorflow as tf
import numpy as np
from sklearn.metrics import classification_report

In [97]:
with open('datasets/trainset.json','r', encoding = 'utf8') as f:
     train = json.load(f)
        
with open('datasets/testset.json','r', encoding = 'utf8') as f:
     test = json.load(f)

In [99]:
X_train = [x['token'] for x in train]
print(f'nombre de phrases dans X_train: {len(X_train)}')
X_train[0]

nombre de phrases dans X_train: 13250


['Histoire',
 'clinique',
 'L’',
 'interrogatoire',
 'est',
 'rendu',
 'difficile',
 'à',
 'la',
 'fois',
 'par',
 'la',
 'barrière',
 'linguistique',
 'et',
 'par',
 'une',
 'réticence',
 'de',
 'la',
 'patiente',
 'à',
 'fournir',
 'des',
 'informations',
 'médicales',
 '.']

In [100]:
Y_train = [x['label'] for x in train]
print(f'nombre de phrases dans Y_train: {len(Y_train)}')
Y_train[0]

nombre de phrases dans Y_train: 13250


['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [98]:
X_test = [x['token'] for x in test]
Y_test = [x['label'] for x in test]

In [101]:
cat_2_id = {'<UNK>': 0}
for sent in Y_train:
    for label in sent:
        if label not in cat_2_id.keys():
            cat_2_id[label] = len(cat_2_id)
            
id_2_cat = {v:k for k,v in cat_2_id.items()}
            
def preprocess_Y(Y, cat_to_id): 
    res = []
    for sent in Y: 
        sent_res = []
        for label in sent:
        
            if label not in cat_to_id.keys():
                sent_res.append(cat_to_id['<UNK>'])
            else:
                sent_res.append(cat_to_id[label])
        res.append(sent_res)
    return res

In [102]:
cat_2_id

{'<UNK>': 0,
 'O': 1,
 'disorder': 2,
 'procedure': 3,
 'labvalue': 4,
 'drugs': 5,
 'profession': 6,
 'risk': 7,
 'anatomy': 8}

In [103]:
id_2_cat

{0: '<UNK>',
 1: 'O',
 2: 'disorder',
 3: 'procedure',
 4: 'labvalue',
 5: 'drugs',
 6: 'profession',
 7: 'risk',
 8: 'anatomy'}

In [105]:
cat_vocab = len(cat_2_id)
cat_vocab

9

In [106]:
Y_train = preprocess_Y(Y_train, cat_2_id)
Y_test = preprocess_Y(Y_test, cat_2_id)



In [48]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()

In [49]:
tokenizer.fit_on_texts(X_train)

In [91]:
vocab = len(tokenizer.word_index)
vocab

22456

In [108]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [109]:
max_len_95 = round(np.quantile([len(x) for x in X_train_seq], .95))

In [110]:
X_train_seq_pad = tf.keras.preprocessing.sequence.pad_sequences(X_train_seq, maxlen = max_len_95, 
                                                               truncating = "post")
Y_train_seq_pad = tf.keras.preprocessing.sequence.pad_sequences(Y_train, maxlen = max_len_95, 
                                                               truncating = "post")

X_test_seq_pad = tf.keras.preprocessing.sequence.pad_sequences(X_test_seq, maxlen = max_len_95, 
                                                               truncating = "post")
Y_test_seq_pad = tf.keras.preprocessing.sequence.pad_sequences(Y_test, maxlen = max_len_95, 
                                                               truncating = "post")

In [111]:
X_train_seq_pad.shape

(13250, 58)

In [112]:
Y_train_seq_pad.shape

(13250, 58)

In [160]:
Y_train_seq_pad_cat = tf.keras.utils.to_categorical(Y_train_seq_pad)
Y_test_seq_pad_cat = tf.keras.utils.to_categorical(Y_train_seq_pad)

In [161]:
embed_dim = 128
lstm_out = 128

In [171]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab+1, embed_dim,input_length = max_len_95))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_out, return_sequences = True)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(cat_vocab,activation='softmax')))

In [172]:
print(model.summary())

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 58, 128)           2874496   
                                                                 
 bidirectional_7 (Bidirectio  (None, 58, 256)          263168    
 nal)                                                            
                                                                 
 dropout_7 (Dropout)         (None, 58, 256)           0         
                                                                 
 time_distributed_7 (TimeDis  (None, 58, 9)            2313      
 tributed)                                                       
                                                                 
Total params: 3,139,977
Trainable params: 3,139,977
Non-trainable params: 0
_________________________________________________________________
None


In [173]:
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['categorical_accuracy', 
                                                                             tf.keras.metrics.Precision(),
                                                                            tf.keras.metrics.Recall()])


In [174]:
model.fit(X_train_seq_pad, Y_train_seq_pad_cat, batch_size = 16, epochs = 5,validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ff4b2850b50>

In [175]:
test_pred = model.predict(X_test_seq_pad)
test_pred = np.argmax(test_pred, axis= 2)

In [176]:
labels = [k for k,v in id_2_cat.items() if v not in ['<UNK>', 'O']]
target_names = [id_2_cat[x] for x in labels]

test_pred.shape[0]

print(classification_report(Y_test_seq_pad.reshape(test_pred.shape[0]*max_len_95), 
                            test_pred.reshape(test_pred.shape[0]*max_len_95),
                            labels=labels,
                            target_names= target_names,
                           zero_division=0))

              precision    recall  f1-score   support

    disorder       0.44      0.36      0.40      5803
   procedure       0.46      0.27      0.34      4940
    labvalue       0.47      0.35      0.40      2279
       drugs       0.46      0.30      0.37       700
  profession       0.26      0.15      0.19       245
        risk       0.34      0.04      0.08       247
     anatomy       0.00      0.00      0.00       127

   micro avg       0.45      0.31      0.37     14341
   macro avg       0.35      0.21      0.25     14341
weighted avg       0.45      0.31      0.36     14341



In [179]:
train_pred = model.predict(X_train_seq_pad)
train_pred = np.argmax(train_pred, axis= 2)



print(classification_report(Y_train_seq_pad.reshape(train_pred.shape[0]*max_len_95), 
                            train_pred.reshape(train_pred.shape[0]*max_len_95),
                            labels=labels,
                            target_names= target_names,
                           zero_division=0))

              precision    recall  f1-score   support

    disorder       0.82      0.77      0.80     24066
   procedure       0.86      0.66      0.75     18305
    labvalue       0.82      0.80      0.81     10275
       drugs       0.84      0.76      0.80      3273
  profession       0.75      0.53      0.62      1183
        risk       0.51      0.28      0.36       660
     anatomy       0.24      0.38      0.29       482

   micro avg       0.82      0.73      0.77     58244
   macro avg       0.69      0.60      0.63     58244
weighted avg       0.83      0.73      0.77     58244

