# Install dependencies

In [None]:
!git clone https://github.com/NVIDIA/apex ./packages/apex && cd ./packages/apex && pip install -v --no-cache-dir \
    --global-option="--cpp_ext" --global-option="--cuda_ext" ./

# Initialization

In [1]:
%load_ext autoreload
%autoreload 2

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import sys
sys.path.append('../../bert_sequence_tagger/src/')

In [2]:
import torch

# Handle used for CUDA placement, and the number of GPUs visible to this process.
device = torch.device('cuda')
n_gpu = torch.cuda.device_count()

# Print one line per visible GPU with its marketing name.
for gpu_name in map(torch.cuda.get_device_name, range(n_gpu)):
    print(gpu_name)

Tesla V100-DGXS-16GB


In [3]:
from logger import initialize_logger

# Logger for this experiment; messages go to the log file under ../workdir/logs.
logger = initialize_logger(
    '../workdir/logs/i2b2_active_learning.log',
    name='sequence_tagger_bert',
)

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import json

# Global parameters

In [5]:
import random

# --- Paths -------------------------------------------------------------------
CACHE_DIR = '../workdir/cache'          # download cache for pretrained tokenizer/model files
BIO_BERT = '../workdir/bio_bert/torch2'  # local directory with the pretrained BioBERT checkpoint

# --- Sequence length and batching ---------------------------------------------
# MAX_LEN = 150
MAX_LEN = 100

#BATCH_SIZE = 105
#BATCH_SIZE = 45 
BATCH_SIZE = 32
#BATCH_SIZE = 16

PRED_BATCH_SIZE = 1200  # batch size used for validation / prediction loaders

# --- Optimisation ---------------------------------------------------------------
MAX_N_EPOCHS = 10

#LEARNING_RATE = 3e-5
LEARNING_RATE = 5e-5

WEIGHT_DECAY = 0.01

# --- Reproducibility ------------------------------------------------------------
random_state = 2019

# Seed every RNG the notebook relies on so that sampling and training are repeatable
# (as far as the underlying libraries allow). `np` and `torch` are imported in the
# preceding cells.
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
torch.cuda.manual_seed_all(random_state)  # silently ignored when CUDA is unavailable

In [6]:
from bert_sequence_tagger import SequenceTaggerBert, ModelTrainerBert
from flair.datasets import ColumnCorpus
from pytorch_transformers import BertTokenizer
from bert_sequence_tagger.bert_utils import make_bert_tag_dict_from_flair_corpus
from bert_sequence_tagger import BertForTokenClassificationCustom

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Load dataset

In [7]:
# Active dataset: i2b2 in CoNLL column format (column 0 = token, column 1 = NER tag).
# `attr` picks the i2b2 attribute and therefore the train/test file pair;
# the other attributes tried were 'diabetes' and 'cad'.
data_folder = '../workdir/i2b2/conll/'
attr = 'hypertension'
#attr = 'diabetes'
#attr = 'cad'

# No dev file is supplied here.
# NOTE(review): `corpus.dev` is used for validation during training, so flair presumably
# carves a dev split out of the training data when dev_file is None — confirm.
corpus = ColumnCorpus(
    data_folder,
    {0 : 'text', 1 : 'ner'},
    train_file=f'i2b2_training_{attr}.conll',
    test_file=f'i2b2_testing_{attr}.conll',
    dev_file=None,
)


# Alternative corpora used in other runs of this notebook (kept for reference):

# data_folder = '../workdir/genia/conll/'
# attr = 'genia'
# corpus = ColumnCorpus(data_folder, 
#                       {0 : 'text', 1 : 'ner'},
#                       train_file='Genia4ERtask1.iob2',
#                       test_file='Genia4EReval1.iob2',
#                       dev_file=None)

# data_folder = '../data/conll2003/conll2003'
# attr = 'conll2003'
# corpus = ColumnCorpus(data_folder, 
#                       {0 : 'text', 3 : 'ner'},
#                       train_file='eng.train.txt',
#                       test_file='eng.testb.txt',
#                       dev_file='eng.testa.txt')

# Per-split document and token statistics.
print(corpus.obtain_statistics())

2019-10-18 03:30:19,229 Reading data from ../workdir/i2b2/conll
2019-10-18 03:30:19,230 Train: ../workdir/i2b2/conll/i2b2_training_hypertension.conll
2019-10-18 03:30:19,231 Dev: None
2019-10-18 03:30:19,232 Test: ../workdir/i2b2/conll/i2b2_testing_hypertension.conll
{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 8884,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 156702,
            "min": 1,
            "max": 488,
            "avg": 17.63867627194957
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 6813,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 117214,
            "min": 1,
            "max": 390,
            "avg": 17.204462057830618
        }
    },
    "DEV": {
        "dataset": "DEV",
        "total_number_of_documen

In [8]:
# Count the training sentences that exceed the model's length budget.
# NOTE(review): the "- 2" presumably reserves room for two special tokens — confirm.
lengths = np.array([len(sentence) for sentence in corpus.train])
n_max_lengths = np.sum(lengths > MAX_LEN - 2)
print('N with more than max lengths:', n_max_lengths)
print('Ratio:', n_max_lengths / len(lengths))

N with more than max lengths: 151
Ratio: 0.0169968482665466


# Prepare model

In [9]:
# Cased tokenizer from the stock 'bert-base-cased' vocabulary.
# NOTE(review): assumes the BIO_BERT checkpoint shares this vocabulary — confirm.
bpe_tokenizer = BertTokenizer.from_pretrained(
    'bert-base-cased',
    cache_dir=CACHE_DIR,
    do_lower_case=False,
)

# Tag <-> index mappings derived from the loaded corpus.
idx2tag, tag2idx = make_bert_tag_dict_from_flair_corpus(corpus)

# Token-classification head sized to the tag set, initialised from the BIO_BERT
# checkpoint and moved to the GPU.
# (Alternative tried earlier: pass 'bert-base-cased' instead of BIO_BERT.)
model = BertForTokenClassificationCustom.from_pretrained(
    BIO_BERT,
    cache_dir=CACHE_DIR,
    num_labels=len(tag2idx),
).cuda()

# Wrapper bundling the model, the tokenizer and the tag mappings.
seq_tagger = SequenceTaggerBert(
    bert_model=model,
    bpe_tokenizer=bpe_tokenizer,
    idx2tag=idx2tag,
    tag2idx=tag2idx,
    max_len=MAX_LEN,
)

# Training

In [11]:
from bert_active_learning_exp import prepare_corpus, initialize_seeds2


def _to_object_array(items):
    """Pack `items` into a 1-D object array holding exactly one element per item.

    `np.array(items)` is avoided on purpose: for sequences of differing lengths it
    raises ValueError on NumPy >= 1.24, and for equal-length sequences it would
    silently build a 2-D array instead of one entry per sentence.
    """
    items = list(items)
    packed = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        packed[position] = item
    return packed


# Token sequences and tag sequences of the training split.
X_train, y_train = prepare_corpus(corpus.train)

# Seed annotation for active learning; entries that were not selected are None.
y_seed = initialize_seeds2(y_train, ['I', 'B', 'O'], 30)

# Keep only the sentences that received a seed annotation.
selector = np.array([e is not None for e in y_seed], dtype=bool)
y_train = _to_object_array(y_seed)[selector]
X_train = _to_object_array(X_train)[selector]

print(X_train.shape, y_train.shape)

(59,) (59,)


In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


def collate_fn(inpt):
    """Transpose a batch of (x, y) pairs into a tuple of per-field tuples."""
    return tuple(zip(*inpt))


# Pair every selected sentence with its tags and draw shuffled mini-batches from them.
train_data = list(zip(X_train, y_train))
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)

In [12]:
from torch.utils.data import RandomSampler, SequentialSampler

from bert_sequence_tagger.bert_utils import create_loader_from_flair_corpus, get_parameters_without_decay
from bert_sequence_tagger.model_trainer_bert import ModelTrainerBert

from bert_sequence_tagger.metrics import f1_entity_level, f1_token_level

from pytorch_transformers import AdamW, WarmupLinearSchedule

from torch.optim.lr_scheduler import ReduceLROnPlateau


# Loaders over the full flair corpus: shuffled batches for training, sequential
# large batches for validation.
# NOTE: this rebinds `train_dataloader`, replacing any loader built from the
# seed-selected subset.
train_dataloader = create_loader_from_flair_corpus(corpus.train, 
                                                   sampler_ctor=RandomSampler, 
                                                   batch_size=BATCH_SIZE)
valid_dataloader = create_loader_from_flair_corpus(corpus.dev,
                                                   sampler_ctor=SequentialSampler,
                                                   batch_size=PRED_BATCH_SIZE)

# Weight decay comes from the WEIGHT_DECAY config constant (previously a duplicated
# literal 0.01) so the config cell stays the single source of truth.
optimizer = AdamW(get_parameters_without_decay(model), lr=LEARNING_RATE, betas=(0.9, 0.999), 
                  eps =1e-6, weight_decay=WEIGHT_DECAY, correct_bias=True)

# lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0.1, 
#                                     t_total=(len(corpus.train) / BATCH_SIZE)*MAX_N_EPOCHS)

# Halve the learning rate after 2 scheduler steps without improvement of the
# monitored quantity (mode='min': lower is better).
lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

# NOTE(review): `decision_metric` negates metrics[1]; presumably metrics[0] is the
# validation loss and metrics[1] the entity-level F1, so that a higher F1 yields a
# lower (better) value for the 'min' scheduler — confirm against ModelTrainerBert.
trainer = ModelTrainerBert(model=seq_tagger, 
                           optimizer=optimizer, 
                           lr_scheduler=lr_scheduler,
                           train_dataloader=train_dataloader, 
                           val_dataloader=valid_dataloader,
                           update_scheduler='ep',
                           keep_best_model=True,
                           restore_bm_on_lr_change=True,
                           max_grad_norm=1.,
                           validation_metrics=[f1_entity_level],
                           decision_metric=lambda metrics: -metrics[1])

trainer.train(epochs=MAX_N_EPOCHS)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

2019-10-18 03:34:27,319 - sequence_tagger_bert - INFO - Train loss: 0.06409758169990429
2019-10-18 03:34:27,319 Train loss: 0.06409758169990429
2019-10-18 03:34:30,203 - sequence_tagger_bert - INFO - Validation loss: 0.02446748875081539
2019-10-18 03:34:30,203 Validation loss: 0.02446748875081539
2019-10-18 03:34:30,205 - sequence_tagger_bert - INFO - Validation metrics: (0.6486486486486486,)
2019-10-18 03:34:30,205 Validation metrics: (0.6486486486486486,)
2019-10-18 03:34:30,219 - sequence_tagger_bert - INFO - Current learning rate: 5e-05
2019-10-18 03:34:30,219 Current learning rate: 5e-05


Epoch:  10%|█         | 1/10 [01:21<12:12, 81.34s/it]

2019-10-18 03:35:47,347 - sequence_tagger_bert - INFO - Train loss: 0.017690664797267757
2019-10-18 03:35:47,347 Train loss: 0.017690664797267757
2019-10-18 03:35:50,251 - sequence_tagger_bert - INFO - Validation loss: 0.02103930525481701
2019-10-18 03:35:50,251 Validation loss: 0.02103930525481701
2019-10-18 03:35:50,253 - sequence_tagger_bert - INFO - Validation metrics: (0.6810551558752997,)
2019-10-18 03:35:50,253 Validation metrics: (0.6810551558752997,)
2019-10-18 03:35:50,268 - sequence_tagger_bert - INFO - Current learning rate: 5e-05
2019-10-18 03:35:50,268 Current learning rate: 5e-05


Epoch:  20%|██        | 2/10 [02:41<10:47, 80.95s/it]

2019-10-18 03:37:08,646 - sequence_tagger_bert - INFO - Train loss: 0.012191056116240596
2019-10-18 03:37:08,646 Train loss: 0.012191056116240596
2019-10-18 03:37:11,539 - sequence_tagger_bert - INFO - Validation loss: 0.026673121377825737
2019-10-18 03:37:11,539 Validation loss: 0.026673121377825737
2019-10-18 03:37:11,541 - sequence_tagger_bert - INFO - Validation metrics: (0.6858513189448441,)
2019-10-18 03:37:11,541 Validation metrics: (0.6858513189448441,)
2019-10-18 03:37:11,555 - sequence_tagger_bert - INFO - Current learning rate: 5e-05
2019-10-18 03:37:11,555 Current learning rate: 5e-05


Epoch:  30%|███       | 3/10 [04:02<09:27, 81.05s/it]

2019-10-18 03:38:27,117 - sequence_tagger_bert - INFO - Train loss: 0.0095674098272395
2019-10-18 03:38:27,117 Train loss: 0.0095674098272395
2019-10-18 03:38:29,998 - sequence_tagger_bert - INFO - Validation loss: 0.027471965178847313
2019-10-18 03:38:29,998 Validation loss: 0.027471965178847313
2019-10-18 03:38:29,999 - sequence_tagger_bert - INFO - Validation metrics: (0.6509433962264151,)
2019-10-18 03:38:29,999 Validation metrics: (0.6509433962264151,)
2019-10-18 03:38:30,001 - sequence_tagger_bert - INFO - Current learning rate: 5e-05
2019-10-18 03:38:30,001 Current learning rate: 5e-05


Epoch:  40%|████      | 4/10 [05:21<08:01, 80.27s/it]

2019-10-18 03:39:42,143 - sequence_tagger_bert - INFO - Train loss: 0.009451734010293903
2019-10-18 03:39:42,143 Train loss: 0.009451734010293903
2019-10-18 03:39:45,058 - sequence_tagger_bert - INFO - Validation loss: 0.033516354858875275
2019-10-18 03:39:45,058 Validation loss: 0.033516354858875275
2019-10-18 03:39:45,060 - sequence_tagger_bert - INFO - Validation metrics: (0.691358024691358,)
2019-10-18 03:39:45,060 Validation metrics: (0.691358024691358,)
2019-10-18 03:39:45,077 - sequence_tagger_bert - INFO - Current learning rate: 5e-05
2019-10-18 03:39:45,077 Current learning rate: 5e-05


Epoch:  50%|█████     | 5/10 [06:36<06:33, 78.71s/it]

2019-10-18 03:41:02,755 - sequence_tagger_bert - INFO - Train loss: 0.007586548956705817
2019-10-18 03:41:02,755 Train loss: 0.007586548956705817
2019-10-18 03:41:05,690 - sequence_tagger_bert - INFO - Validation loss: 0.03357239067554474
2019-10-18 03:41:05,690 Validation loss: 0.03357239067554474
2019-10-18 03:41:05,692 - sequence_tagger_bert - INFO - Validation metrics: (0.7040816326530612,)
2019-10-18 03:41:05,692 Validation metrics: (0.7040816326530612,)
2019-10-18 03:41:05,706 - sequence_tagger_bert - INFO - Current learning rate: 5e-05
2019-10-18 03:41:05,706 Current learning rate: 5e-05


Epoch:  60%|██████    | 6/10 [07:56<05:17, 79.29s/it]

2019-10-18 03:42:23,937 - sequence_tagger_bert - INFO - Train loss: 0.005374300019637486
2019-10-18 03:42:23,937 Train loss: 0.005374300019637486
2019-10-18 03:42:26,850 - sequence_tagger_bert - INFO - Validation loss: 0.033413343131542206
2019-10-18 03:42:26,850 Validation loss: 0.033413343131542206
2019-10-18 03:42:26,851 - sequence_tagger_bert - INFO - Validation metrics: (0.7,)
2019-10-18 03:42:26,851 Validation metrics: (0.7,)
2019-10-18 03:42:26,853 - sequence_tagger_bert - INFO - Current learning rate: 5e-05
2019-10-18 03:42:26,853 Current learning rate: 5e-05


Epoch:  70%|███████   | 7/10 [09:17<03:59, 79.84s/it]

2019-10-18 03:43:44,921 - sequence_tagger_bert - INFO - Train loss: 0.004776136249700802
2019-10-18 03:43:44,921 Train loss: 0.004776136249700802
2019-10-18 03:43:47,840 - sequence_tagger_bert - INFO - Validation loss: 0.037290848791599274
2019-10-18 03:43:47,840 Validation loss: 0.037290848791599274
2019-10-18 03:43:47,842 - sequence_tagger_bert - INFO - Validation metrics: (0.6989795918367347,)
2019-10-18 03:43:47,842 Validation metrics: (0.6989795918367347,)
2019-10-18 03:43:47,844 - sequence_tagger_bert - INFO - Current learning rate: 5e-05
2019-10-18 03:43:47,844 Current learning rate: 5e-05


Epoch:  80%|████████  | 8/10 [10:38<02:40, 80.19s/it]

2019-10-18 03:45:05,833 - sequence_tagger_bert - INFO - Train loss: 0.006042339135897139
2019-10-18 03:45:05,833 Train loss: 0.006042339135897139
2019-10-18 03:45:08,635 - sequence_tagger_bert - INFO - Validation loss: 0.034984175115823746
2019-10-18 03:45:08,635 Validation loss: 0.034984175115823746
2019-10-18 03:45:08,636 - sequence_tagger_bert - INFO - Validation metrics: (0.6401869158878505,)
2019-10-18 03:45:08,636 Validation metrics: (0.6401869158878505,)
2019-10-18 03:45:08,638 - sequence_tagger_bert - INFO - Current learning rate: 5e-05
2019-10-18 03:45:08,638 Current learning rate: 5e-05


Epoch:  90%|█████████ | 9/10 [11:59<01:20, 80.37s/it]

2019-10-18 03:46:25,671 - sequence_tagger_bert - INFO - Train loss: 0.0036513411675429303
2019-10-18 03:46:25,671 Train loss: 0.0036513411675429303
2019-10-18 03:46:28,520 - sequence_tagger_bert - INFO - Validation loss: 0.03348606079816818
2019-10-18 03:46:28,520 Validation loss: 0.03348606079816818
2019-10-18 03:46:28,522 - sequence_tagger_bert - INFO - Validation metrics: (0.6763990267639902,)
2019-10-18 03:46:28,522 Validation metrics: (0.6763990267639902,)
2019-10-18 03:46:28,524 - sequence_tagger_bert - INFO - Current learning rate: 5e-05
2019-10-18 03:46:28,524 Current learning rate: 5e-05


Epoch: 100%|██████████| 10/10 [13:19<00:00, 79.96s/it]


In [13]:
# Sequential loader over the held-out test split.
test_dataloader = create_loader_from_flair_corpus(
    corpus.test,
    sampler_ctor=SequentialSampler,
    batch_size=PRED_BATCH_SIZE,
)

# Predict with evaluation enabled, requesting entity-level and token-level F1.
# The third returned value is displayed as the cell output.
pred1, __, metrics = seq_tagger.predict(
    test_dataloader,
    evaluate=True,
    metrics=[f1_entity_level, f1_token_level],
)
metrics

(0.021918293243894976, 0.7372134038800705, 0.8101604278074866)

In [15]:
from seqeval.metrics import f1_score
# Imported here so the cell does not depend on an earlier, optional cell having
# been executed.
from torch.utils.data import DataLoader, SequentialSampler

from bert_active_learning_exp import prepare_corpus

# Raw token sequences and gold tag sequences of the test split.
X_test, y_test = prepare_corpus(corpus.test)

# Sequential loader over the raw token sequences; the identity collate function
# hands each batch to the tagger as a plain list.
# NOTE: this rebinds `test_dataloader`.
test_sampler = SequentialSampler(X_test)
test_dataloader = DataLoader(X_test, 
                             sampler=test_sampler, 
                             batch_size=PRED_BATCH_SIZE,
                             collate_fn = lambda inpt: inpt)

pred, proba = seq_tagger.predict(test_dataloader, evaluate=False)

# NOTE(review): in the recorded run this seqeval score is well below the entity-level F1
# reported by seq_tagger.predict(..., evaluate=True) on the same split — worth checking
# that `pred` and `y_test` are aligned (e.g. sentence truncation, tag scheme).
f1_score(y_test, pred)

0.5357271095152604

Genia: 0.736 0.7399374123232978  
CoNLL: 0.917   
Diabetes: 0.744  0.7389112903225806
Hypertension: 0.7357512953367875 0.7510584250635056 0.7366623986342296 0.7434072833821682 
CAD: 0.381 0.4272

In [26]:
# Persist the model's state dict, one file per dataset attribute.
# The target directory is created first so a fresh checkout does not fail here.
model_path = f'../workdir/models/bert/{attr}.pt'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)

# Evaluate document level

In [18]:
from i2b2_utils import drop_noise_samples

# Document-level i2b2 test set.
dataset_test_path = '../workdir/i2b2/i2b2_testing.json'
dataset_test = pd.read_json(dataset_test_path)

# The former mid-cell `dataset_test.head()` was a no-op (only a cell's last expression
# is displayed) and has been removed; the records are clinical text, so they are
# deliberately not echoed into the notebook output.

# Filter the test set for the current attribute (attribute name upper-cased).
test_selected_dataset = drop_noise_samples(dataset_test, attr.upper())

In [13]:
from flair.data import Sentence

def flair_process_i2b2(model, dataset, attr_name):
    """Tag every text of `dataset` with a flair model and return the tag strings.

    Args:
        model: object whose `predict` accepts a list of flair `Sentence`s and
            returns the tagged sentences.
        dataset: object exposing an iterable `texts` attribute of raw strings.
        attr_name: tag type to read from each token (`token.tags[attr_name]`).

    Returns:
        A list with one inner list per text, holding the tag value of each token.
    """
    sentences = [Sentence(text) for text in dataset.texts]
    tagged_sentences = model.predict(sentences)
    return [
        [token.tags[attr_name].value for token in sentence]
        for sentence in tagged_sentences
    ]

In [14]:
from flair.models import SequenceTagger

# Pretrained flair tagger for the document-level baseline. Other checkpoints used:
#model = SequenceTagger.load('../models/new/DIABETES/fasttext/1.0/best-model.pt')
#model = SequenceTagger.load('../models/new/HYPERTENSION//elmo-pubmed/1.0/best-model.pt')
# NOTE: this rebinds `model`, which previously referred to the BERT model.
model = SequenceTagger.load(
    '../models/new/CAD//elmo-pubmed/1.0/best-model.pt'
)

2019-07-06 15:53:14,376 loading file ../models/new/CAD//elmo-pubmed/1.0/best-model.pt


In [None]:
# Imported here so the cell runs top-to-bottom on a fresh kernel.
from i2b2_utils import evaluation_level_document

# `attr_name` was never defined at notebook level; derive it from `attr` the same way
# the other document-level cells do.
# NOTE(review): assumes the flair model's tag type carries the same upper-cased name — confirm.
attr_name = attr.upper()

flair_results = flair_process_i2b2(model, test_selected_dataset, attr_name)
pos, pred_pos, tp = evaluation_level_document(flair_results, test_selected_dataset, attr_name)

In [23]:
from i2b2_utils import evaluation_level_document

In [21]:
# For bert
# Produce per-token tag predictions with the BERT model for document-level scoring.
# NOTE(review): `loaded_model` and `tags_vals` are not defined anywhere in the visible
# notebook, so this cell relies on hidden kernel state and will raise NameError on a
# fresh Restart & Run All — define them (or switch to `seq_tagger` / `idx2tag`) after
# confirming what `annotate_text` expects.
# NOTE(review): `test_dataloader` is bound more than once earlier in the notebook;
# verify which loader this call is meant to receive.
from bert_utils import annotate_text 

pred_tags = annotate_text(loaded_model, test_dataloader, tags_vals)

In [24]:
# Document-level counts for the current attribute; they feed the recall/precision
# computation that follows.
pos, pred_pos, tp = evaluation_level_document(
    pred_tags,
    test_selected_dataset,
    attr.upper(),
)

In [25]:
# Document-level recall, precision and F1 from the counts computed above.
# Each ratio falls back to 0.0 when its denominator is zero (no positive documents,
# no predicted positives, or recall + precision == 0) instead of raising
# ZeroDivisionError.
recall = tp / pos if pos else 0.0
precision = tp / pred_pos if pred_pos else 0.0
f1 = 2. * recall * precision / (recall + precision) if (recall + precision) else 0.0

print('Recall: ', recall)
print('Precision: ', precision)
print('F1:', f1)

Recall:  0.9821428571428571
Precision:  0.9553349875930521
F1: 0.9685534591194969


In [None]:
Hypertension:
Recall:  0.9719387755102041
Precision:  0.9645569620253165
F1: 0.9682337992376113

In [None]:
CAD:
Recall:  0.9511111111111111
Precision:  0.6793650793650794
F1: 0.7925925925925925

In [None]:
Diabetes:
Recall:  0.9667590027700831
Precision:  0.8790931989924433
F1: 0.9208443271767809