In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes (edits to project code are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
from nlstruct.dataloaders.medic import get_raw_medic

In [None]:
from __future__ import absolute_import
import argparse
import numpy as np
import torch
import os
import sys
import logging
import pdb

sys.path.insert(0,'/home/ytaille/AttentionSegmentation')

from allennlp.data import Vocabulary
from allennlp.data.iterators import DataIterator
# import allennlp.data.dataset_readers as Readers
import AttentionSegmentation.reader as Readers

# import model as Models
import AttentionSegmentation.model.classifiers as Models

from AttentionSegmentation.commons.utils import \
    setup_output_dir, read_from_config_file
from AttentionSegmentation.commons.model_utils import \
    construct_vocab, load_model_from_existing
# from AttentionSegmentation.visualization.visualize_attns import \
#     html_visualizer
import AttentionSegmentation.model.attn2labels as SegmentationModels

"""The main entry point

This is the main entry point for training HAN SOLO models.

Usage::

    ${PYTHONPATH} -m AttentionSegmentation/main
        --config_file ${CONFIG_FILE}

"""
# Stand-in for the command-line arguments of the original script entry point:
# an empty ad-hoc object whose attributes mimic the parsed argparse namespace.
args = type('MyClass', (object,), {'content':{}})()
args.config_file = 'Configs/config_ncbi.json'
args.log = 'INFO'
args.loglevel = 'INFO'
args.seed = 1

# Setup Experiment Directory
config = read_from_config_file(args.config_file)
if args.seed > 0:
    # Seed every RNG this notebook uses so runs are repeatable.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # cuda_device defaults to -1 here (no GPU); index 0 is the first GPU, so the
    # check must be ">= 0" -- the previous "> 0" skipped CUDA seeding on device 0.
    if config.get('trainer', None) is not None and \
       config.get('trainer', None).get('cuda_device', -1) >= 0:
        torch.cuda.manual_seed(args.seed)
# setup_output_dir returns the experiment directory and the (possibly updated) config.
serial_dir, config = setup_output_dir(config, args.loglevel)
logger = logging.getLogger(__name__)

# Load Training Data
# NOTE: config.pop(...) consumes the entry, so this cell cannot be re-run
# against the same `config` object without re-reading the configuration.
TRAIN_PATH = config.pop("train_data_path")
logger.info("Loading Training Data from {0}".format(TRAIN_PATH))
dataset_reader_params = config.pop("dataset_reader")
reader_type = dataset_reader_params.pop("type", None)
assert reader_type is not None and hasattr(Readers, reader_type),\
    f"Cannot find reader {reader_type}"
# The reader class is looked up by name in the project's reader package.
reader = getattr(Readers, reader_type).from_params(dataset_reader_params)
instances_train = reader.read(file_path=TRAIN_PATH)
logger.info("Length of {0}: {1}".format(
    "Training Data", len(instances_train)))

# Load Validation Data
VAL_PATH = config.pop("validation_data_path")
logger.info("Loading Validation Data from {0}".format(VAL_PATH))
instances_val = reader.read(VAL_PATH)
logger.info("Length of {0}: {1}".format(
    "Validation Data", len(instances_val)))

# Load Test Data (optional: instances_test stays None when no path is configured)
TEST_PATH = config.pop("test_data_path", None)
instances_test = None
if TEST_PATH is not None:
    logger.info("Loading Test Data from {0}".format(TEST_PATH))
    instances_test = reader.read(TEST_PATH)
    logger.info("Length of {0}: {1}".format(
        "Testing Data", len(instances_test)))

# # Load Pretrained Existing Model
# load_config = config.pop("load_from", None)

# # Construct Vocabulary
# A configured size of -1 means "no limit", which Vocabulary expects as None.
vocab_size = config.pop("max_vocab_size", -1)
logger.info("Constructing Vocab of size: {0}".format(vocab_size))
vocab_size = None if vocab_size == -1 else vocab_size
vocab = Vocabulary.from_instances(instances_train,
                                  max_vocab_size=vocab_size)
vocab_dir = os.path.join(serial_dir, "vocab")
# The directory is expected to exist already under the experiment directory.
assert os.path.exists(vocab_dir), "Couldn't find the vocab directory"
# Log before writing so the message is emitted even if the save itself fails.
logger.info("Saving vocab to {0}".format(vocab_dir))
vocab.save_to_files(vocab_dir)

# if load_config is not None:
#     # modify the vocab from the source model vocab
#     src_vocab_path = load_config.pop("vocab_path", None)
#     if src_vocab_path is not None:
#         vocab = construct_vocab(src_vocab_path, vocab_dir)
#         # Delete the old vocab
#         for file in os.listdir(vocab_dir):
#             os.remove(os.path.join(vocab_dir, file))
#         # save the new vocab
#         vocab.save_to_files(vocab_dir)
logger.info("Vocab Construction Done")

# # Construct the data iterators
logger.info("Constructing Data Iterators")
data_iterator = DataIterator.from_params(config.pop("iterator"))
# The iterator needs the vocabulary to index instances when batching.
data_iterator.index_with(vocab)

logger.info("Data Iterators Done")

# Create the model
logger.info("Constructing The model")
model_params = config.pop("model")
model_type = model_params.pop("type")
# The model class is looked up by name in the project's classifiers package.
# (The message previously said "reader", copied from the reader lookup above.)
assert model_type is not None and hasattr(Models, model_type),\
    f"Cannot find model {model_type}"
model = getattr(Models, model_type).from_params(
    vocab=vocab,
    params=model_params,
    label_indexer=reader.get_label_indexer()
)
logger.info("Model Construction done")

# visualize = config.pop("visualize", False)
# visualizer = None
# if visualize:
#     visualizer = html_visualizer(vocab, reader)

# Build the segmenter; its class is looked up by name in the attn2labels module.
segmenter_params = config.pop("segmentation")
segment_class = segmenter_params.pop("type")
segmenter = getattr(SegmentationModels, segment_class).from_params(
    vocab=vocab,
    reader=reader,
    params=segmenter_params
)

# logger.info("Segmenter Done")

# print("##################################\nAYYYYYYYYYYYYYYYYYYYYYYYY\n\n\n\n\n\n\n\n###########################")

# exit()


# if load_config is not None:
#     # Load the weights, as specified by the load_config
#     model_path = load_config.pop("model_path", None)
#     layers = load_config.pop("layers", None)
#     load_config.assert_empty("Load Config")
#     assert model_path is not None,\
#         "You need to specify model path to load from"
#     model = load_model_from_existing(model_path, model, layers)
#     logger.info("Pretrained weights loaded")

# logger.info("Starting the training process")





2021-03-24 16:34:35,673: INFO: train_data_path = /home/ytaille/data/resources/medic/ncbi_conll_ner_train.conll
2021-03-24 16:34:35,678: INFO: Loading Training Data from /home/ytaille/data/resources/medic/ncbi_conll_ner_train.conll
2021-03-24 16:34:35,681: INFO: dataset_reader.type = WeakConll2003DatasetReader
2021-03-24 16:34:35,683: INFO: dataset_reader.token_indexers.bert.type = bert-pretrained
2021-03-24 16:34:35,684: INFO: dataset_reader.token_indexers.bert.pretrained_model = ./Data/embeddings/bert-base-multilingual-cased-vocab.txt
2021-03-24 16:34:35,686: INFO: dataset_reader.token_indexers.bert.use_starting_offsets = True
2021-03-24 16:34:35,687: INFO: dataset_reader.token_indexers.bert.do_lowercase = False
2021-03-24 16:34:35,688: INFO: dataset_reader.token_indexers.bert.never_lowercase = None
2021-03-24 16:34:35,689: INFO: dataset_reader.token_indexers.bert.max_pieces = 512
2021-03-24 16:34:35,692: INFO: loading vocabulary file ./Data/embeddings/bert-base-multilingual-cased-voc

In [9]:
# from transformers.models.bert.modeling_bert
from transformers.models.bert.modeling_bert import BertModel

In [None]:
# Register the "@@UNKNOWN@@" token in the "chunk_tags" namespace of the
# iterator's vocabulary. Per the original note, this is necessary to avoid
# errors later on.
data_iterator.vocab.add_token_to_namespace("@@UNKNOWN@@", "chunk_tags")

In [None]:
# Re-read the configuration from disk: the setup cell consumed entries of the
# previous `config` with config.pop(...), and the "trainer" section is popped
# from this fresh copy when the trainer is built.
# NOTE(review): this replaces the config object returned by setup_output_dir --
# confirm nothing set by that helper is needed by the trainer.
config = read_from_config_file(args.config_file)


In [None]:
from AttentionSegmentation.trainer import Trainer

from nlstruct.utils import  torch_global as tg

# Assemble the trainer from the objects built in the setup cell and the
# "trainer" section of the reloaded configuration.
trainer = Trainer.from_params(
    # model and segmenter constructed in the setup cell
    model=model,
    segmenter=segmenter,
    # datasets and the iterator used to batch them
    train_data=instances_train,
    validation_data=instances_val,
    iterator=data_iterator,
    # experiment directory returned by setup_output_dir
    base_dir=serial_dir,
    # evaluated last, so `config` is only mutated once every other argument resolved
    params=config.pop("trainer")
)


In [None]:
# BIT FOR BOOSTING SURROUNDING ATTENTIONS

# attn = torch.Tensor([[0,1,0,1,0], [0,0,1,0,0]])

# attn_boosted = attn.clone()
# nnz = (attn>0).nonzero().t().chunk(chunks=2,dim=0)

# print(nnz)

# new_nnz = [[], []]

# for nz0, nz1 in zip(nnz[0][0].numpy(), nnz[1][0].numpy()):
#     new_nnz[0].extend([nz0,nz0])
#     new_nnz[1].extend([nz1-1,nz1+1])
    
# new_nnz[0] = torch.Tensor([new_nnz[0]]).long()
# new_nnz[1] = torch.Tensor([new_nnz[1]]).long()
# new_nnz = (new_nnz[0], new_nnz[1])
# print(new_nnz)
# attn_boosted[new_nnz] += 0.1

# attn_boosted

In [None]:

# USE BIO BERT
# TRAIN STEP 1 ONLY ON MEDIC LABELS (+ NCBI MENTIONS)
# PREPROCESS / TRAIN / REACH GOOD SCORES
# GET MEDIC ALTERNATIVE LABELS IN NLSTRUCT -> TRANSLATE NCBI LABELS TO MEDIC

# USE ENTROPY INSTEAD OF CROSS ENTROPY -> not rely on labelled data only (rely on model certainty)

# GROUPS: SEMANTIC TYPE AT THE MENTION LEVEL (do not use)

# NGRAMS FOR ENTITIES -> not possible with discontinued entities

# Use "separation token" in phrases ?

# Use a limited number of attention heads (not one per class)

# Use same method as Perceval for trajectories (draw closest ones, reduce list, repeat) -> iterative prediction

# Maybe remove weakly supervised completely?

# Test with Reinforce only after a few epochs

# Representation factor to weight Perceval's loss?

# Several factors to build the reward

# Similarity factor between extracted mention and synonym rather than mention / label similarity?

# Make sure that every trajectory is different -> draw first then use Perceval

# Final metric: do we manage to retrieve the CUIs? -> because entity boundaries are hard to determine

# Use only one class ? -> simpler because all mentions are diseases -> MAKE SURE THAT SEVERAL MENTIONS ARE PREDICTABLE

# maybe problem with reinforce comes from hyperparameters??

In [None]:
from __future__ import absolute_import
import logging
import os
import shutil
import json
from collections import deque
import time
import re
import datetime
import traceback
import numpy as np
from typing import Dict, Optional, List, Tuple, Union, Iterable, Any, Set
import pdb

import torch
import torch.optim.lr_scheduler
from torch.nn.parallel import replicate, parallel_apply
from torch.nn.parallel.scatter_gather import scatter_kwargs, gather
from tensorboardX import SummaryWriter

from itertools import tee

from allennlp.common import Params
from allennlp.common.checks import ConfigurationError
from allennlp.common.util import peak_memory_mb, gpu_memory_mb
from allennlp.common.tqdm import Tqdm
from allennlp.data.instance import Instance
from allennlp.data.iterators.data_iterator import DataIterator
from allennlp.models.model import Model
from allennlp.nn import util
from allennlp.training.learning_rate_schedulers import LearningRateScheduler
from allennlp.training.optimizers import Optimizer

from AttentionSegmentation.commons.trainer_utils import is_sparse,\
    sparse_clip_norm, move_optimizer_to_cuda, TensorboardWriter
# from AttentionSegmentation.visualization.visualize_attns \
#     import html_visualizer
from AttentionSegmentation.model.attn2labels import BasePredictionClass
logger = logging.getLogger(__name__)

TQDM_COLUMNS = 200

import sys
sys.path.insert(0,'/home/ytaille/deep_multilingual_normalization')
from create_classifiers import create_classifiers
from nlstruct.dataloaders import load_from_brat

logger2 = logging.getLogger("nlstruct")
logger2.setLevel(logging.ERROR)

from notebook_utils import *

def _select_sampled_positions(sequences, attn_mask):
    """
    Keep, for every sentence and every attention class, the items whose sampled
    mask entry is non-zero.

    ``sequences`` is a list of numpy arrays (one per sentence) and ``attn_mask``
    is indexed as ``attn_mask[sentence, position, class]``. The result is a flat
    list holding one array per (sentence, class) pair, sentences varying slowest.
    """
    return [
        # Single-item sequences get the mask wrapped in a list: the original
        # author observed "weird behaviour" with the plain index when len == 1.
        seq[attn_mask[w_id, :len(seq), class_n].cpu().to(bool)]
        if len(seq) > 1 else seq[[attn_mask[w_id, :len(seq), class_n].cpu().to(bool)]]
        for w_id, seq in enumerate(sequences)
        for class_n in range(attn_mask.shape[-1])
    ]


def _train_epoch(self, epoch: int,
                 n_trajectories: int = 10,
                 ws_inputs_dir: str = '/home/ytaille/data/tmp/ws_inputs/',
                 bert_name: str = "bert-base-multilingual-uncased") -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    Replacement epoch loop meant to be bound onto a trainer instance. For each
    batch it adds, on top of the loss returned by ``self._batch_loss``, a
    sampling-based term: ``n_trajectories`` 0/1 masks are drawn from the model's
    attention probabilities, the tokens and chunk tags selected by each mask are
    written to ``ws_inputs_dir``, reloaded with ``load_from_brat``, scored with
    ``self.classifier1``, and the resulting loss weighted by the attention
    probabilities is accumulated.

    Parameters
    ----------
    self :
        The trainer. Besides the usual private trainer attributes read below,
        ``self.vocabularies1`` and ``self.classifier1`` must have been set on it
        (not done in this function).
    epoch : int
        Index of the current epoch.
    n_trajectories : int
        Number of attention masks sampled per batch.
    ws_inputs_dir : str
        Scratch directory the masked sentences are written to and reloaded from.
    bert_name : str
        Name passed to ``preprocess_train`` as ``bert_name``.

    Returns
    -------
    Dict[str, float]
        Whatever ``self._get_metrics(train_loss, batches_this_epoch, reset=True)``
        returns.

    Notes
    -----
    Relies on names available in the notebook namespace: ``tg`` (nlstruct
    ``torch_global``) and ``save_to_ann`` / ``preprocess_train`` / ``predict`` /
    ``compute_scores`` (presumably provided by the ``notebook_utils`` star
    import -- TODO confirm).
    """
    from allennlp.data.fields.array_field import ArrayField
    from torch.distributions import Binomial

    logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
    if torch.cuda.is_available():
        for gpu, memory in gpu_memory_mb().items():
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0

    # Tag every instance with its position in self._train_data so that the
    # order produced by the shuffling iterator can be recovered below.
    for i, td in enumerate(self._train_data):
        td.fields['sample_id'] = ArrayField(np.array([i]))

    # Get tqdm for the training batches
    train_generator = self._iterator(self._train_data,
                                     num_epochs=1,
                                     cuda_device=self._iterator_device,
                                     shuffle=True,
                                     )

    # One copy of the batch stream is used to read the shuffled sample ids, the
    # other one to train. (A third, never-consumed copy used to be created; it
    # kept every batch buffered by tee() for the whole epoch.)
    train_generator, id_generator = tee(train_generator, 2)

    ids = []
    for ig in id_generator:
        ids.extend([int(sid.item()) for sid in ig['sample_id']])

    # Instances in the exact order in which the batches will deliver them.
    shuffled_train_data = [self._train_data[i] for i in ids]

    num_training_batches = self._iterator.get_num_batches(self._train_data)
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_training_batches
                                     )
    self._last_log = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    # Offset of the current batch inside shuffled_train_data.
    cpt_batch = 0

    # Set the model to "train" mode.
    self._model.train()

    for batch in train_generator_tqdm:

        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total
        batch_len = len(batch['labels'])

        # Author's note on `batch`: 'labels' is sentence level, 'tags' is word level.

        # With a threshold of -1 this branch is never taken for non-negative
        # epochs, i.e. the sampling-based term is always computed.
        if epoch <= -1:
            trajectory_scores = [0]
        else:
            # NOTE(review): self._batch_loss below presumably runs the model on
            # the same batch a second time -- confirm whether that is intended.
            output_dict = self._model(**batch)

            attns = output_dict['attentions']
#             attns = output_dict['attentions_rl'].permute(0,2,1)

            # Policy is "attention mask": attention scores should be higher if we want to predict CUI
            # Only take words with attention above threshold when predicting with deep norm -> see if it's enough (reward indicates that)
            # REINFORCE algo: (also known as Monte Carlo PG)
            # - draw N trajectories (N attention paths?) -> discretise attentions to make them 1 / 0? -> see if it works with bernoulli first
            # - evaluate each trajectory then sum (maybe add baseline -> subtract mean of all trajectories rewards)
            # - Expected return is given by sum(prob(Ti | W) * reward(Ti)) -> see again if it works with bernoulli first
            # W are WeakL weights
            # - Gradient ascent of return / gradient descent of negative return

            # Set horizon ? -> number / proportion of attention at 1 per batch
            # Set number of trajectories ? -> maybe make trajectories number vary based on sentence length
            # gamma = 0.9 ? -> used to simulate temporal importance of reward (multiply each step by a certain power of gamma, furthest rewards are less impactful) -> may not be possible to model here
            # (horizon, gamma and an attention threshold were declared here but
            # never read; only the number of trajectories is actually used.)

            prob_attn = attns

            # Binomial with its default total_count of 1: every draw is 0 or 1.
            m = Binomial(probs=prob_attn)
            trajectory_scores = []

            # Tokens and normalisation labels of the sentences in this batch.
            # They do not depend on the sampled mask, so build them once per
            # batch rather than once per trajectory.
            batch_instances = shuffled_train_data[cpt_batch:cpt_batch + batch_len]
            real_tokens = [np.array(b.fields['tokens'].tokens) for b in batch_instances]
            gold_norm_labels = [np.array(b.fields['chunk_tags'].labels) for b in batch_instances]

            for nb_traj in range(n_trajectories):
                # Sampled mask, indexed as [sentence, token position, class].
                attn_sample = m.sample()

                # logsum: probability that there is at least one mention of a type? -> then sigmoid
                # aggregate first, then compute the loss

                # multi label: iteration? attention heads? tags

                # match CUI with the most probable CUI ->

                masked_tokens = _select_sampled_positions(real_tokens, attn_sample)
                masked_gold_norm = _select_sampled_positions(gold_norm_labels, attn_sample)

                save_to_ann(masked_tokens, masked_gold_norm, ws_inputs_dir)

                # NLSTRUCT PART

                dataset = load_from_brat(ws_inputs_dir)

                # Nothing to score when the sampled mask selected no mention.
                if len(dataset['mentions']) == 0:
                    continue

                # Prefix mention ids with their document id.
                dataset['mentions']['mention_id'] = dataset['mentions']['doc_id'] +'.'+ dataset['mentions']['mention_id'].astype(str)

                batcher, _, _ = preprocess_train(
                    dataset,
                    vocabularies=self.vocabularies1,
                    bert_name=bert_name,
                )

                tg.set_device('cuda:0')

                pred_batcher = predict(batcher, self.classifier1)

                scores = compute_scores(pred_batcher, batcher)

                # NOTE(review): the classifier loss is weighted by the attention
                # probabilities themselves, not by the log-probability of the
                # sampled mask as in textbook REINFORCE -- confirm intended.
                try:
                    trajectory_scores.append((scores['loss'] * prob_attn).mean())
                except Exception:
                    # Debugging aid: show what was accumulated so far, then fail.
                    print(trajectory_scores)
                    raise

            cpt_batch += batch_len

        self._optimizer.zero_grad()
        # sum([]) is 0, so the loss is unchanged when no trajectory was scored.
        loss = self._batch_loss(batch, for_training=True) + sum(trajectory_scores)
        loss.backward()

        # Make sure Variable is on the cpu before converting to numpy.
        # .cpu() is a no-op if you aren't using GPUs.
        train_loss += loss.data.cpu().numpy()
        # Called for its effect on the gradients; the returned norm is not used.
        self._rescale_gradients()

        # This does nothing if batch_num_total is None or you are using an
        # LRScheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)

        self._optimizer.step()

        # Update the description with the latest metrics
        metrics = self._get_metrics(train_loss, batches_this_epoch)
        description = self._description_from_metrics(metrics)

        train_generator_tqdm.set_description(description, refresh=False)
        if hasattr(self, "_tf_params") and self._tf_params is not None:
            # We have TF logging
            if self._batch_num_total % self._tf_params["log_every"] == 0:
                self._tf_log(metrics, self._batch_num_total)

    return self._get_metrics(train_loss, batches_this_epoch, reset=True)
    
import functools

# Override the epoch loop on this trainer instance only: partial() pre-binds
# `trainer` as the `self` argument of the module-level _train_epoch defined
# above, so trainer._train_epoch(epoch) calls that function.
trainer._train_epoch = functools.partial(_train_epoch, trainer)

In [None]:
# Launch training on the trainer whose _train_epoch was overridden in the previous cell.
trainer.train()

In [None]:
logger.info("Training Done.")
# instances_test is None when no "test_data_path" was configured; in that case
# the final evaluation is skipped.
if instances_test is not None:
    logger.info("Computing final Test Accuracy")
    trainer.test(instances_test)
logger.info("Done.")