In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Import local modules

In [4]:
import sys
sys.path.insert(0, '../src')

import os

from utils.exp_saver import ExpSaver
from utils.utils import format_results

from data_loading.load_cas import load_cas
from data_loading.load_embeddings import load_vectors
from data_loading.load_neg_filters import load_filters

from data_processing.preprocessing import preprocess_data, add_tags_per_word


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Import universal modules

In [6]:
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

import numpy as np
import glob

from gensim.models.fasttext import load_facebook_model, load_facebook_vectors


# Parameters (edit in notebooks/params.yml)

In [4]:
import ruamel.yaml
with open('params.yml') as f:
    params = ruamel.yaml.load(f)
    
print(params)

if params['model_structure']['lib'] in ['torch', 'pytorch']:
    from train.train_torch import CV_train
else:
    from train.train import CV_train

The default 'Loader' for 'load(stream)' without further arguments can be unsafe.
Use 'load(stream, Loader=ruamel.yaml.Loader)' explicitly if that is OK.
Alternatively include the following in your code:


In most other cases you should consider using 'safe_load(stream)'
  This is separate from the ipykernel package so we can avoid doing imports until


{'pretrainedEmbed': 'fasttext', 'label_level': 'sentence', 'neg_only': False, 'n_fold': 10, 'batch_size': 32, 'epochs': 100, 'hyper_params': {'lr': 0.001, 'lr_decay': 0, 'dropout': 0.3}, 'model_structure': {'with_attention': True, 'lib': 'torch', 'with_crf': False, 'word_embed': 300, 'pos_embed': None, 'cue_embed': None, 'lstm_size': 300}}


# Block to rename folders in case of new parameter

In [5]:
# logs_folder = '../logs/*'

# from glob import glob
# import os

# for directory in glob(logs_folder):
#     if os.path.isdir(directory):
#         dir_path, dir_name = directory.rsplit('/', 1)
#         insert_index = dir_name.index('neg_only')
#         new_dir_name = dir_name[:insert_index] + 'label_level+word___' + dir_name[insert_index:]
#         os.rename(directory, dir_path + '/' + new_dir_name)

# Load corpus

In [6]:
cas = load_cas()

from ipywidgets import IntProgress

# get ids of sentences containing negation
negative_sentence_ids = cas.groupby('sentence_id').apply(lambda x : x['label'].str.contains('neg')).dropna().index.get_level_values(0).unique()

classes = ['O_neg', 'B_neg', 'I_neg']

  1%|          | 426/75258 [00:00<00:17, 4253.87it/s]

Grouping negation


100%|██████████| 75258/75258 [00:47<00:00, 1583.66it/s]


# Retain only neg or not

In [7]:
neg_only = load_filters() if params['neg_only']=='filter' else params['neg_only']

if isinstance(neg_only, str):
    df1 = add_tags_per_word(cas, neg_only)  
    merge = cas.reset_index().merge(df1, left_index=True, right_index=True)
    neg_sentences = merge[merge['label_y'].str.contains('B_neg|I_neg')]['sentence_id_x'].unique()
    cas_neg = cas[cas['sentence_id'].isin(neg_sentences)]
elif neg_only==True:
    cas_neg = cas.groupby('sentence_id').filter(lambda x: (x['label'].str.contains('B').any()) or (x['label'].str.contains('I').any()))
else:
    cas_neg = cas.copy()
        
if neg_only:
    reset_sentence_dic = {w: i for i,w in enumerate(cas_neg['sentence_id'].unique())}
    cas_neg['sentence_id'] = cas_neg['sentence_id'].map(reset_sentence_dic).astype(int)

# Check filter accuracy, sentence wise

In [8]:
if not isinstance(neg_only, bool):
    from sklearn.metrics import classification_report

    true_neg = []
    pred_neg = []

    for sentence_id, sentence in cas.groupby('sentence_id'):

        true_labels = sentence['label'].unique()

        true_neg.append(('B_scope_neg' in true_labels) or ('I_scope_neg' in true_labels))

        pred_neg.append(sentence_id in neg_sentences)

        if (sentence_id in neg_sentences) and not (('B_scope_neg' in true_labels) or ('I_scope_neg' in true_labels)):
            display(sentence)


    print("Predicted support: \nO_neg: %d\nI_neg: %d"%(len(pred_neg)-sum(pred_neg), sum(pred_neg)))
    print(classification_report(true_neg, pred_neg, target_names=['O_neg', 'I_neg']))

In [9]:
# WITHOUT PUNCTUATION (with punctuation, results crumble)

# Predicted support: 
# O_neg: 2888
# I_neg: 902
#               precision    recall  f1-score   support

#        O_neg       0.99      0.96      0.98      2984
#        I_neg       0.87      0.97      0.92       806

#     accuracy                           0.96      3790
#    macro avg       0.93      0.97      0.95      3790
# weighted avg       0.97      0.96      0.96      3790

# Sample one random negative sentence to check

In [10]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

random_neg_sentence = np.random.choice(negative_sentence_ids)
display(cas[cas['sentence_id']==random_neg_sentence])

pd.set_option('display.max_rows',100)
pd.set_option('display.max_columns',None)

Unnamed: 0,sentence_id,word_id,word,lem,postag,cue_tag,label
22676,1020,0,L',le,DET:ART,_,_
22677,1020,1,état,état,NOM,_,_
22678,1020,2,de,de,PRP,_,_
22679,1020,3,la,le,DET:ART,_,_
22680,1020,4,malade,malade,NOM,_,_
22681,1020,5,ne,ne,ADV,B_cue_neg,B_scope_neg
22682,1020,6,permettait,permettre,VER:impf,_,I_scope_neg
22683,1020,7,pas,pas,ADV,I_cue_neg,I_scope_neg
22684,1020,8,de,de,PRP,_,I_scope_neg
22685,1020,9,confirmer,confirmer,VER:infi,_,I_scope_neg


# Trying to implement Flaubert

# Load embeddings

In [11]:
if params['pretrainedEmbed']==True:
    ft_vec = load_vectors()

    ft_set = set(ft_vec.keys())
    cas_set = set(cas_neg['lem'].unique())

    inter_set = set(ft_set & cas_set)

    print("Fast text vocab: %d \nCAS vocab: %d \nIntersection vocab: %d"%(len(ft_set), len(cas_set), len(inter_set)))
elif params['pretrainedEmbed']=='fasttext':
    ft_vec = load_facebook_vectors('../embeddings/cc.fr.300.bin')
    # use ft_vec.wv['word'] to get vector values, works with long expressions too ("cancer du sein")
else:
    ft_vec = None

# Training

In [12]:
from multiprocessing import Pool
import gc

# BLOCK TO RELOAD PARAMS
import ruamel.yaml
with open('params.yml') as f:
    params = ruamel.yaml.load(f)
    
print(params)

if params['model_structure']['lib'] in ['torch', 'pytorch']:
    from train.train_torch import CV_train
else:
    from train.train import CV_train
    
with Pool(1):
    gc.collect()
    result_dict = CV_train(
        cas_neg,
        params,
        embeddings=ft_vec if params['pretrainedEmbed'] != 'flaubert' else None,
        flaubert_path="../Flaubert/xlm_bert_fra_base_lower" if params['pretrainedEmbed'] == 'flaubert' else None,
    )

    models = result_dict['models']
    model_function = result_dict['model_function']
    scores = result_dict['scores']

{'pretrainedEmbed': 'fasttext', 'label_level': 'sentence', 'neg_only': False, 'n_fold': 10, 'batch_size': 32, 'epochs': 100, 'hyper_params': {'lr': 0.001, 'lr_decay': 0, 'dropout': 0.3}, 'model_structure': {'with_attention': True, 'lib': 'torch', 'with_crf': False, 'word_embed': 300, 'pos_embed': None, 'cue_embed': None, 'lstm_size': 300}}


The default 'Loader' for 'load(stream)' without further arguments can be unsafe.
Use 'load(stream, Loader=ruamel.yaml.Loader)' explicitly if that is OK.
Alternatively include the following in your code:


In most other cases you should consider using 'safe_load(stream)'
  import sys



        ############################
        # Cross validaton split 0 
        ############################
        


NameError: name 'overrides' is not defined

In [34]:
score_df = format_results(scores, classes)

score_df

NameError: name 'scores' is not defined

# Saving Experiment

In [1]:
es = ExpSaver(
    config=params,
    model=models[0],
    model_function=model_function,
    folder='../logs',
    results=score_df,
)

es.save()

NameError: name 'ExpSaver' is not defined

# Get best results (may not be the best way to do so)

In [7]:
from train.results import get_best_results

pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth', 57)

get_best_results(
    n_best=10,
    must_contain=['no_pos___no_cue'],
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[metric] = df[metric].astype(float)


Unnamed: 0_level_0,Unnamed: 1_level_0,rank,exp_name,timestamp,cv_fold,class,precision,recall,f1,support
Unnamed: 0_level_1,best_class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,B_neg,0,torch_BiLSTM_attn___pretrainedEmbed+fasttext___label_level+word___neg_only+filter___n_fold+10___batch_size+256___epochs+100___lr+0.005___lr_decay+1e-06___no_pos___no_cue,21012020_132944,mean,O_neg,0.988419,0.991046,0.989713,8164.1
1,B_neg,0,torch_BiLSTM_attn___pretrainedEmbed+fasttext___label_level+word___neg_only+filter___n_fold+10___batch_size+256___epochs+100___lr+0.005___lr_decay+1e-06___no_pos___no_cue,21012020_132944,mean,B_neg,0.898717,0.883677,0.890423,82.4
2,B_neg,0,torch_BiLSTM_attn___pretrainedEmbed+fasttext___label_level+word___neg_only+filter___n_fold+10___batch_size+256___epochs+100___lr+0.005___lr_decay+1e-06___no_pos___no_cue,21012020_132944,mean,I_neg,0.844146,0.814777,0.825517,475.4
3,B_neg,1,torch_BiLSTM_attn___pretrainedEmbed+fasttext___label_level+word___neg_only+True___n_fold+10___batch_size+512___epochs+100___lr+0.01___lr_decay+1e-06___no_pos___no_cue,20012020_150719,mean,O_neg,0.986089,0.992838,0.989433,7387.2
4,B_neg,1,torch_BiLSTM_attn___pretrainedEmbed+fasttext___label_level+word___neg_only+True___n_fold+10___batch_size+512___epochs+100___lr+0.01___lr_decay+1e-06___no_pos___no_cue,20012020_150719,mean,B_neg,0.906985,0.859056,0.882241,87.2
5,B_neg,1,torch_BiLSTM_attn___pretrainedEmbed+fasttext___label_level+word___neg_only+True___n_fold+10___batch_size+512___epochs+100___lr+0.01___lr_decay+1e-06___no_pos___no_cue,20012020_150719,mean,I_neg,0.882993,0.802472,0.836436,505.0
6,B_neg,2,torch_BiLSTM_attn___pretrainedEmbed+fasttext___label_level+word___neg_only+filter___n_fold+10___batch_size+512___epochs+100___lr+0.01___lr_decay+1e-06___no_pos___no_cue,20012020_151654,mean,O_neg,0.987612,0.991933,0.989747,8164.1
7,B_neg,2,torch_BiLSTM_attn___pretrainedEmbed+fasttext___label_level+word___neg_only+filter___n_fold+10___batch_size+512___epochs+100___lr+0.01___lr_decay+1e-06___no_pos___no_cue,20012020_151654,mean,B_neg,0.88455,0.875074,0.878984,82.4
8,B_neg,2,torch_BiLSTM_attn___pretrainedEmbed+fasttext___label_level+word___neg_only+filter___n_fold+10___batch_size+512___epochs+100___lr+0.01___lr_decay+1e-06___no_pos___no_cue,20012020_151654,mean,I_neg,0.853645,0.799456,0.8211,475.4
9,B_neg,3,torch_BiLSTM_attn___pretrainedEmbed+fasttext___label_level+word___neg_only+False___n_fold+10___batch_size+512___epochs+100___lr+0.01___lr_decay+1e-06___no_pos___no_cue,20012020_160836,mean,O_neg,0.997419,0.998374,0.997895,44887.8
