# Initialization

In [5]:
%load_ext autoreload
%autoreload 2

import os
# Restrict this process to the GPU with index 3. The variable is set here, before
# `import torch` runs in the following cell; the device listing printed by that cell
# shows a single GPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

import sys

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import torch

# Handle to the default CUDA device.
device = torch.device('cuda')

# Sanity check: print the name of every GPU that PyTorch can see.
n_visible_gpus = torch.cuda.device_count()
for gpu_idx in range(n_visible_gpus):
    print(torch.cuda.get_device_name(gpu_idx))

Tesla V100-DGXS-16GB


In [7]:
from logger import initialize_logger
# Notebook-wide logger used by the experiment loop. The arguments are presumably
# (log file path, logger name) — confirm against `initialize_logger`; the name
# 'biomed_ie' is the one that shows up in the log records printed further down.
logger = initialize_logger('../workdir/logs/biomed_ie.log', 'biomed_ie')
#initialize_logger('../workdir/logs/sequence_tagger_bert.log', 'sequence_tagger_bert')

In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import json

# Global parameters

In [9]:
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
CACHE_DIR = '../workdir/cache'              # cache_dir for from_pretrained(...) calls
BIO_BERT = '../workdir/bio_bert/torch2/'    # model path given to from_pretrained in create_libact_adaptor
OUTPUT_FILE_PATH = '../workdir/experiments/17_biobert/'  # (debug alternative: '../workdir/experiments/debug')

# ---------------------------------------------------------------------------
# Batching / sequence length
# ---------------------------------------------------------------------------
MAX_LEN = 100                       # forwarded as 'max_len' (previously tried: 150)
BATCH_SIZE = 32                     # training batch size (previously tried: 16, 100)
BATCH_SIZE_PRED = 1500              # prediction batch size (previously tried: 1200, 1600)
PRED_BATCH_SIZE = BATCH_SIZE_PRED   # alias

# ---------------------------------------------------------------------------
# Reproducibility
# ---------------------------------------------------------------------------
random_state = 2019

# ---------------------------------------------------------------------------
# Training schedule
# ---------------------------------------------------------------------------
EARLY_STOPPING = 1
MAX_RETRAIN_EPOCHS = 30             # forwarded as 'retrain_epochs' (previously tried: 15)
MAX_N_EPOCHS = MAX_RETRAIN_EPOCHS   # alias
VALIDATION_RATIO = 0.25             # forwarded as 'valid_ratio'

# ---------------------------------------------------------------------------
# Optimizer / learning-rate annealing
# ---------------------------------------------------------------------------
BASE_LR = 5e-5
LEARNING_RATE = BASE_LR
WEIGHT_DECAY = 0.01                 # weight-decay strength (same value as used for AdamW below)
ANNEAL_FACTOR = 0.5                 # ReduceLROnPlateau `factor`
PATIENCE = 2                        # ReduceLROnPlateau `patience`
MAX_TO_ANNEAL = 3                   # enters the `smallest_lr` computation

# ---------------------------------------------------------------------------
# Active-learning emulation
# ---------------------------------------------------------------------------
N_SAMPLES_PER_AL_ITER = 30          # forwarded as 'max_samples_number'
N_AL_PASSES = 5                     # forwarded as 'n_passes' (smoke-test value: 1)
N_AL_ITERATIONS = 25                # forwarded as 'n_al_iterations' (smoke-test value: 1)
SEED_POSITIVE = 10                  # forwarded as 'seed_elems_per_class'
SEED_RANDOM = 40                    # forwarded as 'n_seeds_random'

# Load dataset

In [12]:
from flair.datasets import ColumnCorpus

# Every corpus is a two-column file: column 0 is mapped to 'text', column 1 to 'ner'.
# NOTE(review): for each corpus `dev_file` and `test_file` point at the same file,
# so anything tuned on the dev split has effectively seen the test data — confirm
# this is intended.

# i2b2 corpora: one ColumnCorpus per attribute.
data_folder = '../workdir/i2b2/conll/'
corpora = {
    attr: ColumnCorpus(
        data_folder,
        {0 : 'text', 1 : 'ner'},
        train_file=f'i2b2_training_{attr}.conll',
        test_file=f'i2b2_testing_{attr}.conll',
        dev_file=f'i2b2_testing_{attr}.conll',
    )
    for attr in ['hypertension', 'cad', 'diabetes']
}

# Genia corpus, stored under the key 'genia'.
data_folder = '../workdir/genia/conll/'
corpora['genia'] = ColumnCorpus(
    data_folder,
    {0 : 'text', 1 : 'ner'},
    train_file='Genia4ERtask1.iob2',
    test_file='Genia4EReval1.iob2',
    dev_file='Genia4EReval1.iob2',
)

2020-02-18 22:46:10,959 Reading data from ../workdir/i2b2/conll
2020-02-18 22:46:10,960 Train: ../workdir/i2b2/conll/i2b2_training_hypertension.conll
2020-02-18 22:46:10,960 Dev: ../workdir/i2b2/conll/i2b2_testing_hypertension.conll
2020-02-18 22:46:10,961 Test: ../workdir/i2b2/conll/i2b2_testing_hypertension.conll
2020-02-18 22:46:17,866 Reading data from ../workdir/i2b2/conll
2020-02-18 22:46:17,867 Train: ../workdir/i2b2/conll/i2b2_training_cad.conll
2020-02-18 22:46:17,868 Dev: ../workdir/i2b2/conll/i2b2_testing_cad.conll
2020-02-18 22:46:17,869 Test: ../workdir/i2b2/conll/i2b2_testing_cad.conll
2020-02-18 22:46:34,645 Reading data from ../workdir/i2b2/conll
2020-02-18 22:46:34,646 Train: ../workdir/i2b2/conll/i2b2_training_diabetes.conll
2020-02-18 22:46:34,647 Dev: ../workdir/i2b2/conll/i2b2_testing_diabetes.conll
2020-02-18 22:46:34,647 Test: ../workdir/i2b2/conll/i2b2_testing_diabetes.conll
2020-02-18 22:46:43,005 Reading data from ../workdir/genia/conll
2020-02-18 22:46:43,006

# Run experiments

In [18]:
from bert_active_learning_exp import run_experiment_al
from active_learning_seq import LibActNN, LibActNNPositiveLessCertain, RandomSamplingWithRetraining
from libact.query_strategies import UncertaintySampling, RandomSampling

from bert_sequence_tagger import SequenceTaggerBert, BertForTokenClassificationCustom, ModelTrainerBert
from bert_sequence_tagger.bert_utils import get_parameters_without_decay

from pytorch_transformers import BertTokenizer, AdamW, WarmupLinearSchedule, WarmupConstantSchedule
from torch.optim.lr_scheduler import ReduceLROnPlateau

from bert_sequence_tagger.metrics import f1_entity_level, f1_token_level


# Cased BERT tokenizer (do_lower_case=False) shared by every tagger that
# `create_libact_adaptor` builds.
# NOTE(review): the vocabulary is loaded from 'bert-base-cased' while the model
# weights are loaded from BIO_BERT — presumably the two vocabularies match; confirm.
BERT_TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased', 
                                               cache_dir=CACHE_DIR, 
                                               do_lower_case=False)

def create_libact_adaptor(tag2index, index2tag, adaptor_type, *args):
    """Instantiate ``adaptor_type`` wired with BERT model and trainer factories.

    Args:
        tag2index: Tag-to-index mapping. Its length sets ``num_labels`` of the
            classifier and it is handed to the tagger as ``tag2idx``.
        index2tag: Index-to-tag mapping, handed to the tagger as ``idx2tag``.
        adaptor_type: Callable (e.g. ``LibActNN``) that receives ``*args`` followed
            by the keyword configuration assembled here.
        *args: Extra positional arguments forwarded unchanged to ``adaptor_type``.

    Returns:
        Whatever ``adaptor_type(...)`` returns.
    """
    def model_ctor():
        """Load a token classifier from BIO_BERT, move it to the GPU and wrap it as a tagger."""
        model = BertForTokenClassificationCustom.from_pretrained(BIO_BERT,
                                                                 cache_dir=CACHE_DIR,
                                                                 num_labels=len(tag2index)).cuda()

        seq_tagger = SequenceTaggerBert(model, BERT_TOKENIZER, idx2tag=index2tag, tag2idx=tag2index)

        return seq_tagger

    def trainer_ctor(seq_tagger, corpus_len, train_dataloader, val_dataloader):
        """Build the optimizer, LR scheduler and ``ModelTrainerBert`` for ``seq_tagger``.

        ``corpus_len`` is not used in this function; it is presumably part of the
        callback signature expected by the adaptor.
        """
        # Use the notebook-level WEIGHT_DECAY constant instead of a duplicated literal,
        # so that editing the constant actually changes the optimizer.
        optimizer = AdamW(get_parameters_without_decay(seq_tagger._bert_model),
                          lr=LEARNING_RATE, betas=(0.9, 0.999),
                          eps=1e-6, weight_decay=WEIGHT_DECAY, correct_bias=True)

        # NOTE(review): mode='min' makes the scheduler treat the monitored value as
        # something to minimise. Which value ModelTrainerBert passes to the scheduler
        # is not visible here — confirm it is a loss rather than the F1 metric.
        lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=ANNEAL_FACTOR, patience=PATIENCE)

        # NOTE(review): the `smallest_lr` exponent looks transposed. If the intent is
        # "allow MAX_TO_ANNEAL reductions by ANNEAL_FACTOR", the threshold would be based
        # on (1 / ANNEAL_FACTOR) ** MAX_TO_ANNEAL. With the current constants this
        # evaluates to LEARNING_RATE / 9.1. Left unchanged; confirm against
        # ModelTrainerBert before editing.
        trainer = ModelTrainerBert(seq_tagger,
                                   optimizer,
                                   lr_scheduler,
                                   train_dataloader,
                                   val_dataloader,
                                   update_scheduler='ee',
                                   keep_best_model=True,
                                   restore_bm_on_lr_change=True,
                                   max_grad_norm=1.,
                                   validation_metrics=[f1_entity_level],
                                   decision_metric=lambda metrics: metrics[0],
                                   smallest_lr=LEARNING_RATE / (MAX_TO_ANNEAL**(1./ANNEAL_FACTOR) + 0.1))

        return trainer

    return adaptor_type(*args,
                        model_ctor=model_ctor,
                        trainer_ctor=trainer_ctor,
                        batch_size=BATCH_SIZE,
                        bs_pred=BATCH_SIZE_PRED,
                        train_from_scratch=True,
                        retrain_epochs=MAX_RETRAIN_EPOCHS,
                        valid_ratio=VALIDATION_RATIO,
                        string_input=False)

In [20]:
def create_i2b2_experiment_pack(corpus, attr):
    """Build the list of active-learning experiment configurations for one corpus.

    Args:
        corpus: Corpus object stored under the ``'corpus'`` key of the config.
        attr: Short label of the corpus; becomes the prefix of the experiment name.

    Returns:
        A one-element list containing a dict of keyword arguments. The dicts are
        later unpacked into ``run_experiment_al(**config)``.
    """
    mnlp_config = {
        'corpus' : corpus,
        'output_file_path' : OUTPUT_FILE_PATH,
        # Query strategy: libact uncertainty sampling with method 'lc'.
        'ranking_strategy' : lambda trn_ds, _libact_model: UncertaintySampling(
            trn_ds, model=_libact_model, method='lc'),
        # Factory for the LibActNN adaptor, given the corpus tag mappings.
        'libact_adaptor_ctor' : lambda tag2index, index2tag: create_libact_adaptor(
            tag2index, index2tag, LibActNN),
        'name' : f'{attr}_MNLP',
        'n_passes' : N_AL_PASSES,
        'n_al_iterations' : N_AL_ITERATIONS,
        'max_len' : MAX_LEN,
        'max_samples_number' : N_SAMPLES_PER_AL_ITER,
        'seed_elems_per_class' : SEED_POSITIVE,
        'n_seeds_random' : SEED_RANDOM,
    }
    return [mnlp_config]

# Flatten the per-corpus experiment packs into one list, in `corpora` insertion order.
experiments = [
    experiment
    for attr, corpus in corpora.items()
    for experiment in create_i2b2_experiment_pack(corpus, attr)
]

experiments

[{'corpus': <flair.datasets.ColumnCorpus at 0x7f2f7e84d350>,
  'output_file_path': '../workdir/experiments/17_biobert/',
  'ranking_strategy': <function __main__.create_i2b2_experiment_pack.<locals>.<lambda>(trn_ds, _libact_model)>,
  'libact_adaptor_ctor': <function __main__.create_i2b2_experiment_pack.<locals>.<lambda>(tag2index, index2tag)>,
  'name': 'hypertension_MNLP',
  'n_passes': 5,
  'n_al_iterations': 25,
  'max_len': 100,
  'max_samples_number': 30,
  'seed_elems_per_class': 10,
  'n_seeds_random': 40},
 {'corpus': <flair.datasets.ColumnCorpus at 0x7f2d0b6ff650>,
  'output_file_path': '../workdir/experiments/17_biobert/',
  'ranking_strategy': <function __main__.create_i2b2_experiment_pack.<locals>.<lambda>(trn_ds, _libact_model)>,
  'libact_adaptor_ctor': <function __main__.create_i2b2_experiment_pack.<locals>.<lambda>(tag2index, index2tag)>,
  'name': 'cad_MNLP',
  'n_passes': 5,
  'n_al_iterations': 25,
  'max_len': 100,
  'max_samples_number': 30,
  'seed_elems_per_clas

In [None]:
# Only the experiments at list positions 2 and 3 are executed by this cell; widen the
# slice to run the others. Each config dict is expanded into keyword arguments of
# `run_experiment_al`.
selected_experiments = experiments[2:4]
for experiment_config in selected_experiments:
    logger.info('#################### Experiment ######################')
    logger.info(experiment_config)

    run_experiment_al(**experiment_config)

    logger.info('################### Experiment finished ##############')

2020-02-18 23:03:38,842 - biomed_ie - INFO - #################### Experiment ######################
2020-02-18 23:03:38,843 - biomed_ie - INFO - {'corpus': <flair.datasets.ColumnCorpus object at 0x7f2cd04cf510>, 'output_file_path': '../workdir/experiments/17_biobert/', 'ranking_strategy': <function create_i2b2_experiment_pack.<locals>.<lambda> at 0x7f2c5fe7e560>, 'libact_adaptor_ctor': <function create_i2b2_experiment_pack.<locals>.<lambda> at 0x7f2c5fe7e5f0>, 'name': 'diabetes_MNLP', 'n_passes': 5, 'n_al_iterations': 25, 'max_len': 100, 'max_samples_number': 30, 'seed_elems_per_class': 10, 'n_seeds_random': 40}
2020-02-18 23:03:39,825 - biomed_ie - INFO - Active learning...
2020-02-18 23:03:39,895 - biomed_ie - INFO - Number of seeding examples: 59
2020-02-18 23:03:39,899 - biomed_ie - INFO - Start emulating active learning.


Epoch: 100%|██████████| 30/30 [00:23<00:00,  1.26it/s]


2020-02-18 23:04:33,044 - biomed_ie - INFO - Performance on seed examples: {'f1_entity_level': 0.6100536847242557}
2020-02-18 23:04:33,046 - biomed_ie - INFO - Active learning iteration: #0


Epoch:  63%|██████▎   | 19/30 [00:20<00:11,  1.07s/it]


2020-02-18 23:05:49,384 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.505664877757901}
2020-02-18 23:05:49,386 - biomed_ie - INFO - Active learning iteration: #1


Epoch:  77%|███████▋  | 23/30 [00:26<00:08,  1.15s/it]


2020-02-18 23:07:11,744 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6040892193308551}
2020-02-18 23:07:11,745 - biomed_ie - INFO - Active learning iteration: #2


Epoch:  70%|███████   | 21/30 [00:28<00:12,  1.34s/it]


2020-02-18 23:08:36,048 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6171835282155567}
2020-02-18 23:08:36,050 - biomed_ie - INFO - Active learning iteration: #3


Epoch:  60%|██████    | 18/30 [00:28<00:19,  1.61s/it]


2020-02-18 23:10:01,022 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6252446183953033}
2020-02-18 23:10:01,023 - biomed_ie - INFO - Active learning iteration: #4


Epoch:  60%|██████    | 18/30 [00:30<00:20,  1.71s/it]


2020-02-18 23:11:27,932 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6142241379310344}
2020-02-18 23:11:27,934 - biomed_ie - INFO - Active learning iteration: #5


Epoch:  60%|██████    | 18/30 [00:33<00:22,  1.88s/it]


2020-02-18 23:12:57,663 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6378155589902113}
2020-02-18 23:12:57,665 - biomed_ie - INFO - Active learning iteration: #6


Epoch:  63%|██████▎   | 19/30 [00:38<00:22,  2.02s/it]


2020-02-18 23:14:31,754 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.624332977588047}
2020-02-18 23:14:31,756 - biomed_ie - INFO - Active learning iteration: #7


Epoch:  53%|█████▎    | 16/30 [00:33<00:29,  2.10s/it]


2020-02-18 23:16:01,036 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.5791549295774648}
2020-02-18 23:16:01,037 - biomed_ie - INFO - Active learning iteration: #8


Epoch:  50%|█████     | 15/30 [00:35<00:35,  2.38s/it]


2020-02-18 23:17:32,928 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6044980800877674}
2020-02-18 23:17:32,930 - biomed_ie - INFO - Active learning iteration: #9


Epoch:  53%|█████▎    | 16/30 [00:41<00:36,  2.58s/it]


2020-02-18 23:19:10,172 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6454013822434875}
2020-02-18 23:19:10,173 - biomed_ie - INFO - Active learning iteration: #10


Epoch:  50%|█████     | 15/30 [00:43<00:43,  2.88s/it]


2020-02-18 23:20:49,190 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.68259385665529}
2020-02-18 23:20:49,192 - biomed_ie - INFO - Active learning iteration: #11


Epoch:  47%|████▋     | 14/30 [00:42<00:48,  3.05s/it]


2020-02-18 23:22:27,591 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.5769020251778872}
2020-02-18 23:22:27,592 - biomed_ie - INFO - Active learning iteration: #12


Epoch:  57%|█████▋    | 17/30 [00:54<00:41,  3.19s/it]


2020-02-18 23:24:17,342 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6882793017456359}
2020-02-18 23:24:17,344 - biomed_ie - INFO - Active learning iteration: #13


Epoch:  50%|█████     | 15/30 [00:50<00:50,  3.38s/it]


2020-02-18 23:26:03,519 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6840148698884757}
2020-02-18 23:26:03,521 - biomed_ie - INFO - Active learning iteration: #14


Epoch:  60%|██████    | 18/30 [01:01<00:41,  3.42s/it]


2020-02-18 23:28:00,454 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6886051080550097}
2020-02-18 23:28:00,455 - biomed_ie - INFO - Active learning iteration: #15


Epoch:  53%|█████▎    | 16/30 [00:57<00:50,  3.60s/it]


2020-02-18 23:29:53,333 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.7101449275362318}
2020-02-18 23:29:53,335 - biomed_ie - INFO - Active learning iteration: #16


Epoch:  47%|████▋     | 14/30 [00:54<01:02,  3.88s/it]


2020-02-18 23:31:43,017 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6009070294784581}
2020-02-18 23:31:43,019 - biomed_ie - INFO - Active learning iteration: #17


Epoch:  60%|██████    | 18/30 [01:12<00:48,  4.05s/it]


2020-02-18 23:33:51,016 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6681342872867364}
2020-02-18 23:33:51,018 - biomed_ie - INFO - Active learning iteration: #18


Epoch:  57%|█████▋    | 17/30 [01:09<00:53,  4.11s/it]


2020-02-18 23:35:55,904 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6240913811007269}
2020-02-18 23:35:55,905 - biomed_ie - INFO - Active learning iteration: #19


Epoch:  50%|█████     | 15/30 [01:04<01:04,  4.32s/it]


2020-02-18 23:37:55,767 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.7196779063915452}
2020-02-18 23:37:55,768 - biomed_ie - INFO - Active learning iteration: #20


Epoch:  50%|█████     | 15/30 [01:07<01:07,  4.51s/it]


2020-02-18 23:39:58,770 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6871165644171778}
2020-02-18 23:39:58,771 - biomed_ie - INFO - Active learning iteration: #21


Epoch:  47%|████▋     | 14/30 [01:06<01:15,  4.75s/it]


2020-02-18 23:42:00,580 - biomed_ie - INFO - Performance: {'f1_entity_level': 0.6819338422391857}
2020-02-18 23:42:00,582 - biomed_ie - INFO - Active learning iteration: #22


Epoch:   7%|▋         | 2/30 [00:09<02:13,  4.76s/it]