# Example usage of the Stacked model for joint anonymization (ANON) and concept extraction (CE)

Copyright (c) 2020 Robert Bosch GmbH

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.

In [None]:
# Name of the experiment; it doubles as the storage path.
# A nested directory such as 'test/v1/spanish_stacked/' works as well.
# This path will be created if non-existent.
experiment = "spanish_stacked"
print(f"Storing at: {experiment}/")

# Training schedule
epochs = 100
batch_size = 64

# Learning rates of the ANON and CE trainers.
# The main task (CE) uses a slightly higher learning rate.
learning_rate_anon = 0.1
learning_rate_ce = 0.2

# Number of epochs the STACKED model is trained on anonymization data
# only, before the actual multitask training starts. This gives the
# model good anonymization performance before the concept extraction
# uses the anonymization internally.
pretrain_epochs_anon = 1

# Train

In [None]:
# Both corpora below can be replaced by any corpus inheriting from
# ColumnCorpus. More information on corpora in flair:
# https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md
from flair.datasets import ColumnCorpus

# NOTE: these files are samples and not the complete datasets, which are
# not shipped with this repo. Adapt the paths and filenames to your setting.
sample_files = {
    'train_file': 'train.sample.bio',
    'dev_file': 'dev.sample.bio',
    'test_file': 'test.sample.bio',
}

# --- Corpus for concept extraction (CE) ---
print('Load CE data')
ce_tag_type = 'ce'
ce_columns = {0: 'text', 1: ce_tag_type}

ce_corpus = ColumnCorpus(
    'data/pharmaconer/',
    ce_columns,
    tag_to_bioes=ce_tag_type,  # converts the BIO labels to BIOES labels
    **sample_files,
)
print(ce_corpus)

# Build the CE tag dictionary from the corpus
ce_tag_dictionary = ce_corpus.make_tag_dictionary(tag_type=ce_tag_type)
print(f'found {len(ce_tag_dictionary)} labels')
print(ce_tag_dictionary.idx2item)


# --- Corpus for anonymization (ANON) ---
# No BIOES conversion is requested for this corpus.
anon_tag_type = 'anon'
anon_columns = {0: 'text', 1: anon_tag_type}

print('Load ANON data')
anon_corpus = ColumnCorpus('data/meddocan/', anon_columns, **sample_files)
print(anon_corpus)

# Build the ANON tag dictionary from the corpus
anon_tag_dictionary = anon_corpus.make_tag_dictionary(tag_type=anon_tag_type)
print(f'found {len(anon_tag_dictionary)} labels')
print(anon_tag_dictionary.idx2item)

In [None]:
# Import the embeddings you want to use in your experiments.
# Only the classes that are actually used are imported (instead of a
# wildcard import), so it stays obvious where each name comes from.
#
# Check out the following link for more information:
# https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md
#
from flair.embeddings import (
    BertEmbeddings,
    FlairEmbeddings,
    StackedEmbeddings,
    TokenEmbeddings,
    WordEmbeddings,
)
from src.embeddings import CustomBytePairEmbeddings

# Embedding can be any (combination of) embedding(s)
# inheriting from flair.embeddings.TokenEmbeddings.
# We use the concatenation of the following embeddings
embeddings: TokenEmbeddings = StackedEmbeddings([
    WordEmbeddings('cc.es.300.vec.gensim'),            # * Spanish FastText converted to gensim format
    WordEmbeddings('Scielo_wiki_Fasttext.vec.gensim'), # * Domain-specific fastText (https://www.aclweb.org/anthology/W19-1916.pdf)
    CustomBytePairEmbeddings(language='es'),           # * Spanish BPEmb
    BertEmbeddings('bert-base-multilingual-uncased',   # * multilingual BERT embeddings
                   layers='-1,-2,-3,-4',               #   using a scalar mix of the last layers
                   use_scalar_mix=True),               #
    FlairEmbeddings('es-forward'),                     # * Flair forward language model
    FlairEmbeddings('es-backward'),                    # * Flair backward language model
])

In [None]:
# STACKED model. To train the MULTITASK model instead, use
#   from src.models import MultitaskSequenceTagger
# Both models follow the same API: swapping the import and the class
# used in the constructor call below is enough.
from src.models import StackedSequenceTagger

# Stacked model for joint ANON and CE, built on top of the embeddings
tagger = StackedSequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    ce_tag_dictionary=ce_tag_dictionary,
    anon_tag_dictionary=anon_tag_dictionary,
    ce_tag_type=ce_tag_type,
    anon_tag_type=anon_tag_type,
)

In [None]:
from src.trainers import BatchWiseMultiTaskTrainer, SingleTaskTrainer

# One single-task trainer per task; both share the same tagger
anon_trainer = SingleTaskTrainer(tagger, anon_corpus, model_mode='ANON')
ce_trainer = SingleTaskTrainer(tagger, ce_corpus, model_mode='CE')

# The multitask trainer combines both trainers. All lists are aligned
# by position: ANON first, CE second.
task_trainers = [anon_trainer, ce_trainer]
mt_trainer = BatchWiseMultiTaskTrainer(
    trainer_list=task_trainers,
    # each task stores its results in its own sub-directory of the experiment
    base_path_list=[
        f'{experiment}/{trainer.display_name}' for trainer in task_trainers
    ],
    learning_rate_list=[learning_rate_anon, learning_rate_ce],
    # only ANON gets pretraining epochs
    pretrain_list=[pretrain_epochs_anon, 0],
)

In [None]:
# Run the multitask training for <epochs> epochs
# with mini-batches of <batch_size> sentences
mt_trainer.train(max_epochs=epochs, mini_batch_size=batch_size)

In [None]:
# Load the best model according to CE development score
best_model_path = f'{experiment}/{ce_trainer.display_name}/best-model.pt'
try:
    tagger = StackedSequenceTagger.load(best_model_path)
except ModuleNotFoundError:
    # Fallback for when a MultitaskSequenceTagger was trained instead.
    # The class has to be imported here: it is not imported by default
    # anywhere else in this notebook, so using it without this import
    # would raise a NameError.
    from src.models import MultitaskSequenceTagger

    tagger = MultitaskSequenceTagger.load(best_model_path)

In [None]:
from flair.data import Sentence

# A new sentence to get labels for
sent = Sentence(
    'Paciente de 70 años de edad fue remitido al departamento '
    'de Oncología iniciando tratamiento quimioterápico '
    'adyuvante con Cisplatino y Gemcitabina .'
)

# First pass: predict ANON labels (they are in BIO format)
tagger.set_output_to_anon()
tagger.predict(sent)

# Second pass: predict CE labels (they are in BIOES format)
tagger.set_output_to_ce()
tagger.predict(sent)

# Print one (token text, ANON label, CE label) tuple per token
for token in sent:
    row = (
        token.text,
        token.get_tag(anon_tag_type).value,
        token.get_tag(ce_tag_type).value,
    )
    print(row)
    
# Output could look like:
# ('Paciente', 'O', 'O')
# ('de', 'O', 'O')
# ('70', 'B-EDAD_SUJETO_ASISTENCIA', 'O')
# ('años', 'I-EDAD_SUJETO_ASISTENCIA', 'O')
# ('de', 'O', 'O')
# ('edad', 'O', 'O')
# ('fue', 'O', 'O')
# ('remitido', 'O', 'O')
# ('al', 'O', 'O')
# ('departamento', 'O', 'O')
# ('de', 'O', 'O')
# ('Oncología', 'O', 'O')
# ('iniciando', 'O', 'O')
# ('tratamiento', 'O', 'O')
# ('quimioterápico', 'O', 'O')
# ('adyuvante', 'O', 'O')
# ('con', 'O', 'O')
# ('Cisplatino', 'O', 'S-NORMALIZABLES')
# ('y', 'O', 'O')
# ('Gemcitabina', 'O', 'S-NORMALIZABLES')
# ('.', 'O', 'O')

# Evaluate
Convert your data to the right format and use the following evaluation scripts:

* [PharmaCoNER](https://github.com/PlanTL-SANIDAD/PharmaCoNER-CODALAB-Evaluation-Script) (Spanish CE)
* [MEDDOCAN](https://github.com/PlanTL-SANIDAD/MEDDOCAN-Evaluation-Script) (Spanish ANON)
* [i2b2](https://github.com/EmilyAlsentzer/clinicalBERT/tree/master/downstream_tasks/ner_eval) (English)