In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Import local modules

In [3]:
import sys
sys.path.insert(0, '../src')

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from utils.exp_saver import ExpSaver

from data_loading.load_cas import load_cas
from data_loading.load_embeddings import load_vectors
from data_loading.load_neg_filters import load_filters

from data_processing.preprocessing import preprocess_data, add_tags_per_word

from model.LSTM import embed_bilstm

from train.train import train
from utils.utils import format_results

# Import universal modules

In [4]:
import pandas as pd
pd.set_option('display.max_rows',100)
pd.set_option('display.max_columns',None)

import numpy as np
import glob

from gensim.models.fasttext import load_facebook_model, load_facebook_vectors


# Parameters

In [5]:
import ruamel.yaml
with open('params.yml') as f:
    params = ruamel.yaml.load(f)
    
params

The default 'Loader' for 'load(stream)' without further arguments can be unsafe.
Use 'load(stream, Loader=ruamel.yaml.Loader)' explicitly if that is OK.
Alternatively include the following in your code:


In most other cases you should consider using 'safe_load(stream)'
  This is separate from the ipykernel package so we can avoid doing imports until


{'pretrainedEmbed': 'fasttext',
 'label_level': 'sentence',
 'neg_only': False,
 'n_fold': 10,
 'batch_size': 32,
 'epochs': 100,
 'hyper_params': {'lr': 0.001, 'lr_decay': 0, 'dropout': 0.3},
 'model_structure': {'with_attention': True,
  'lib': 'torch',
  'with_crf': False,
  'word_embed': 300,
  'pos_embed': None,
  'cue_embed': None,
  'lstm_size': 300}}

# Load corpus

In [6]:
cas = load_cas()

from ipywidgets import IntProgress

# get ids of sentences containing negation
negative_sentence_ids = cas.groupby('sentence_id').apply(lambda x : x['label'].str.contains('neg')).dropna().index.get_level_values(0).unique()

classes = ['O_neg', 'B_neg', 'I_neg']

  1%|          | 412/75258 [00:00<00:18, 4115.02it/s]

Grouping negation


100%|██████████| 75258/75258 [00:47<00:00, 1579.80it/s]


# Retain only neg or not

In [7]:
neg_only = load_filters() if params['neg_only']=='filter' else params['neg_only']

if isinstance(neg_only, str):
    df1 = add_tags_per_word(cas, neg_only)  
    merge = cas.reset_index().merge(df1, left_index=True, right_index=True)
    neg_sentences = merge[merge['label_y'].str.contains('B_neg|I_neg')]['sentence_id_x'].unique()
    cas_neg = cas[cas['sentence_id'].isin(neg_sentences)]
elif neg_only==True:
    cas_neg = cas.groupby('sentence_id').filter(lambda x: (x['label'].str.contains('B').any()) or (x['label'].str.contains('I').any()))
else:
    cas_neg = cas.copy()
        
if neg_only:
    reset_sentence_dic = {w: i for i,w in enumerate(cas_neg['sentence_id'].unique())}
    cas_neg['sentence_id'] = cas_neg['sentence_id'].map(reset_sentence_dic).astype(int)

# Check filter accuracy, sentence wise

In [8]:
if not isinstance(neg_only, bool):
    from sklearn.metrics import classification_report

    true_neg = []
    pred_neg = []

    for sentence_id, sentence in cas.groupby('sentence_id'):

        true_labels = sentence['label'].unique()

        true_neg.append(('B_scope_neg' in true_labels) or ('I_scope_neg' in true_labels))

        pred_neg.append(sentence_id in neg_sentences)

        if (sentence_id in neg_sentences) and not (('B_scope_neg' in true_labels) or ('I_scope_neg' in true_labels)):
            display(sentence)


    print("Predicted support: \nO_neg: %d\nI_neg: %d"%(len(pred_neg)-sum(pred_neg), sum(pred_neg)))
    print(classification_report(true_neg, pred_neg, target_names=['O_neg', 'I_neg']))

In [9]:
# WITHOUT PUNCTUATION (with punctuation, results crumble)

# Predicted support: 
# O_neg: 2888
# I_neg: 902
#               precision    recall  f1-score   support

#        O_neg       0.99      0.96      0.98      2984
#        I_neg       0.87      0.97      0.92       806

#     accuracy                           0.96      3790
#    macro avg       0.93      0.97      0.95      3790
# weighted avg       0.97      0.96      0.96      3790

# Sample one random negative sentence to check

In [10]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

random_neg_sentence = np.random.choice(negative_sentence_ids)
display(cas[cas['sentence_id']==random_neg_sentence])

pd.set_option('display.max_rows',100)
pd.set_option('display.max_columns',None)

Unnamed: 0,sentence_id,word_id,word,lem,postag,cue_tag,label
79305,3464,0,Un,un,DET:ART,_,_
79306,3464,1,homme,homme,NOM,_,_
79307,3464,2,âgé,âgé,ADJ,_,_
79308,3464,3,de,de,PRP,_,_
79309,3464,4,41,card,NUM,_,_
79310,3464,5,ans,an,NOM,_,_
79312,3464,7,sans,sans,PRP,B_cue_neg,B_scope_neg
79313,3464,8,antécédents,antécédent,NOM,_,I_scope_neg
79314,3464,9,pathologiques,pathologique,ADJ,_,I_scope_neg
79315,3464,10,particuliers,particulier,ADJ,_,I_scope_neg


# Load embeddings

In [11]:
if params['pretrainedEmbed']==True:
    ft_vec = load_vectors()

    ft_set = set(ft_vec.keys())
    cas_set = set(cas_neg['lem'].unique())

    inter_set = set(ft_set & cas_set)

    print("Fast text vocab: %d \nCAS vocab: %d \nIntersection vocab: %d"%(len(ft_set), len(cas_set), len(inter_set)))
elif params['pretrainedEmbed']=='bin':
    ft_vec = load_facebook_vectors('../embeddings/cc.fr.300.bin')
    # use ft_vec.wv['word'] to get vector values, works with long expressions too ("cancer du sein")
else:
    ft_vec = None

# Training

In [12]:
from multiprocessing import Pool
import gc

with Pool(1):
    gc.collect()
    result_dict = train(
        cas_neg,
        params,
        embeddings=ft_vec,
    )

    model = result_dict['models']
    model_function = result_dict['model_function']


<function attention_bilstm at 0x2aaaeb2f7ae8>


ValueError: Input 0 is incompatible with layer bidirectional_1: expected ndim=3, found ndim=2

In [None]:
import re

# IMPORTANT SPECIAL CASE:
# -APOSTROPHES (do not remove, attach to first character)

def attach_apostrophe(
    sentences,
):
    res    
    
    return res
    
def preprocess_sentences(
    sentences,
    embeddings,
    max_sentence_size=99, # should match model shape
    ):
    
    raw_sentences = np.zeros((len(sentences), max_sentence_size, embeddings.wv.vector_size))
    split_sentences = []
    
    for idx, sent in enumerate(sentences):
        sentence = re.split('\W+', sent)
        split_sentences.append(sentence)
        raw_sentences[idx, :len(sentence)] = np.array([embeddings.wv[w] for w in sentence])
        
    return split_sentences, raw_sentences

sentences = [
    "Il n'a pas suivi de régime pendant un an.",
    "Il est suivi par un médecin et ne consulte qu'au besoin.",
    "Il n'a pas de bonne grammaire et de bon orthograf.",
    "Bonjour j'ai un cancer mais je n'ai pas soif.",
    "Bonjour je n'ai pas de cancer mais j'ai soif.",
    "Le patient n'a jamais eu d'embolie pulmonaire ni de saignements.",
    "Le patient n'a jamais eu d'embolie pulmonaire et de saignements.",
    "Le patient n'a jamais eu d'embolie pulmonaire mais des saignements.",
    "Nous n'avons pas injecté 100mg de lorazépam.",
    "Le lupus n'apparaît pas sur l'IRM.",
    "Aucune tumeur n'est présente dans les scans de l'oncle du patient.",
]

split_sentences, embed_sentences = preprocess_sentences(
    sentences,
    ft_vec,
)

In [None]:
def predict(
    embed_sentences,
    model,
):

    predict_dict = {
        'word_input': embed_sentences,
    }

    classes = {
        0: 'O_neg',
        1: 'B_neg',
        2: 'I_neg',
    }

    y_prob = model.predict(predict_dict)
    y_classes = y_prob.argmax(axis=-1)
    results = np.vectorize(classes.get)(y_classes)
    
    labeled_sentences = {
        'sentence_id': [],
        'word': [],
        'label': [],
    }

    for sentence_id, (sentence, labels) in enumerate(zip(split_sentences, results)):
        for word, label in zip(sentence, labels):
            labeled_sentences['sentence_id'].append(sentence_id)
            labeled_sentences['word'].append(word)
            labeled_sentences['label'].append(label)
    
    return pd.DataFrame.from_dict(labeled_sentences)

In [None]:
labeled_sentences = predict(
    embed_sentences,
    model,
)

In [None]:
for _, sentence in labeled_sentences.groupby('sentence_id'):
    display(sentence.transpose())