In [1]:
!pip install flair

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/77/e3/389c2dd8d0e6ca1d8fad11aa4940e8df6909a26a5d954c0eff01f0d78b57/flair-0.4.3-py3-none-any.whl (180kB)
[K     |████████████████████████████████| 184kB 879kB/s eta 0:00:01
Collecting hyperopt>=0.1.1 (from flair)
[?25l  Downloading https://files.pythonhosted.org/packages/d1/c5/4b57fb376d24127b2960678ef98307fac1c6fb7a2ace7f67971949dd6b56/hyperopt-0.2.1-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 37.6MB/s eta 0:00:01
[?25hCollecting deprecated>=1.2.4 (from flair)
  Downloading https://files.pythonhosted.org/packages/88/0e/9d5a1a8cd7130c49334cce7b8167ceda63d6a329c8ea65b626116bc9e9e6/Deprecated-1.2.6-py2.py3-none-any.whl
Collecting segtok>=1.5.7 (from flair)
  Downloading https://files.pythonhosted.org/packages/1d/59/6ed78856ab99d2da04084b59e7da797972baa0efecb71546b16d48e49d9b/segtok-1.5.7.tar.gz
Collecting urllib3<1.25,>=1.20 (from flair)
[?25l  Downloading https://files.pythonhos

# Install dependencies

In [None]:
# Clone NVIDIA apex into ./packages/apex and pip-install it from that checkout,
# passing --cpp_ext and --cuda_ext as global build options.
!git clone https://github.com/NVIDIA/apex ./packages/apex && cd ./packages/apex && pip install -v --no-cache-dir \
    --global-option="--cpp_ext" --global-option="--cuda_ext" ./

# Initialization

In [1]:
# Pick up edits to project modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
# Expose only GPU 0 to this process.
# NOTE(review): presumably this has to run before torch touches CUDA, so keep
# this cell ahead of the torch import — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys

In [2]:
import torch

# CUDA device handle and the number of GPUs visible to this process.
device = torch.device('cuda')
n_gpu = torch.cuda.device_count()

# Print the name of every visible GPU as a quick sanity check.
for i in range(n_gpu):
    print(torch.cuda.get_device_name(i))

Tesla V100-DGXS-16GB


In [3]:
from logger import initialize_logger
# Project logger shared by the cells below (logger.info calls).
# NOTE(review): the log path is named after i2b2 although the corpus loaded in
# this notebook is GENIA — confirm this is the intended log file.
logger = initialize_logger('../workdir/logs/i2b2_active_learning.log')

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import json

# Global parameters

In [5]:
# Directory passed as cache_dir when loading the tokenizer and creating the model.
CACHE_DIR = '../workdir/models/0.4.0'

# Maximum sequence length handed to the BERT preprocessing helpers.
# MAX_LEN = 150
MAX_LEN = 100

# Batch size of the training DataLoader (earlier values kept for reference).
#BATCH_SIZE = 105
#BATCH_SIZE = 45 
BATCH_SIZE = 32

# Batch size of the validation / test DataLoaders.
PRED_BATCH_SIZE = 1200

# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it is meant to seed numpy / torch.
random_state = 2019

# Load dataset

In [6]:
from flair.datasets import ColumnCorpus

from bert_utils import train, test, create_model_optimizer, create_tensors, prepare_flair_corpus_for_bert, to_torch_tensors
from bert_utils import make_bert_tag_dict_from_flair_corpus

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from pytorch_transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from flair.datasets import ColumnCorpus

In [7]:
# data_folder = '../workdir/i2b2/conll/'
# #attr = 'hypertension'
# attr = 'diabetes'
# #attr = 'cad'
# corpus = ColumnCorpus(data_folder, {0 : 'text', 1 : 'ner'},
#                       train_file=f'i2b2_training_{attr}.conll',
#                       test_file=f'i2b2_testing_{attr}.conll',
#                       dev_file=None)


# Load the GENIA corpus from two-column files: column 0 is mapped to 'text'
# and column 1 to 'ner'.
data_folder = '../workdir/genia/conll/'
attr = 'genia'
# No dev file is supplied. NOTE(review): the statistics printed by this cell
# still report a DEV split, so one is evidently derived by ColumnCorpus —
# confirm how it is sampled.
corpus = ColumnCorpus(data_folder, 
                      {0 : 'text', 1 : 'ner'},
                      train_file='Genia4ERtask1.iob2',
                      test_file='Genia4EReval1.iob2',
                      dev_file=None)

print(corpus.obtain_statistics())

2019-08-21 20:00:54,453 Reading data from ../workdir/genia/conll
2019-08-21 20:00:54,454 Train: ../workdir/genia/conll/Genia4ERtask1.iob2
2019-08-21 20:00:54,454 Dev: None
2019-08-21 20:00:54,455 Test: ../workdir/genia/conll/Genia4EReval1.iob2
{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 16691,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 443221,
            "min": 2,
            "max": 204,
            "avg": 26.55449044395183
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 3856,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 101039,
            "min": 2,
            "max": 208,
            "avg": 26.203060165975103
        }
    },
    "DEV": {
        "dataset": "DEV",
        "total_number_of_documents": 1855,
        "num

In [8]:
# How many training sentences have more than MAX_LEN - 2 tokens, in absolute
# numbers and as a share of the training set.
# NOTE(review): the two subtracted positions are presumably reserved for BERT's
# special tokens — confirm.
lengths = np.array(list(map(len, corpus.train)))
n_max_lengths = np.sum(lengths > (MAX_LEN - 2))
print('N with more than max lengths:', n_max_lengths)
print('Ratio:', n_max_lengths / len(lengths))

N with more than max lengths: 18
Ratio: 0.0010784254987717932


# Prepare model and preprocessing

In [9]:
# Tokenizer for 'bert-base-cased', with lower-casing disabled; downloaded files
# are cached under CACHE_DIR.
bpe_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', cache_dir=CACHE_DIR, do_lower_case=False)
# Tag inventory derived from the corpus.
# NOTE(review): presumably tags_vals is the list of tags and tag2idx the
# tag -> index mapping — confirm in bert_utils.
tags_vals, tag2idx = make_bert_tag_dict_from_flair_corpus(corpus)

In [10]:
# Run the bert_utils preprocessing over every split. bert_corpus is a mapping
# of split name -> pair, indexed below as sub_corp[0] / sub_corp[1]
# (presumably tokens and labels — confirm in bert_utils).
bert_corpus = prepare_flair_corpus_for_bert(corpus, bpe_tokenizer, max_length=MAX_LEN)
numpy_tensors = {name : create_tensors(bpe_tokenizer, tag2idx, sub_corp[0], sub_corp[1], MAX_LEN)
                 for name, sub_corp in bert_corpus.items()}

# NOTE(review): torch_tensors is rebound by a later cell (via prep_tens), so
# this value is overwritten before training when the notebook runs top to bottom.
torch_tensors = {name : to_torch_tensors(tensors) 
                 for name, tensors in numpy_tensors.items()}

In [10]:
# Build the model together with its optimizer and LR scheduler through the
# project helper: full fine-tuning, base learning rate 5e-5, weights taken from
# the checkpoint directory given in bert_model.
model, optimizer, lr_scheduler = create_model_optimizer(tag2idx, 
                                                        cache_dir=CACHE_DIR,
                                                        full_finetuning=True, 
                                                        base_lr=5e-5,
                                                        bert_model='../workdir/bio_bert/torch2')

2019-08-21 20:01:14,741 - biomed_ie - INFO - Full finetuning: True
2019-08-21 20:01:14,744 - biomed_ie - INFO - N parameters: 108321807


In [11]:
from sequence_tagger_bert import SequenceTaggerBert

# Wrap the model, tokenizer and tag vocabulary in the project's tagger; its
# preprocessing helpers and predict() are used by the cells below.
seq_tagger = SequenceTaggerBert(model, bpe_tokenizer, tags_vals, tag2idx)

def prepare_corpus(corpus):
    """Split a tagged corpus into parallel token and NER-label lists.

    Args:
        corpus: iterable of sentence objects exposing ``tokens``; every token
            must provide ``text`` and ``tags['ner'].value``.

    Returns:
        Tuple ``(tokens, labels)``: ``tokens[i]`` holds the token strings of
        sentence ``i`` and ``labels[i]`` the matching 'ner' tag values.
    """
    tokens, labels = [], []
    # Single pass over the corpus: one-shot iterables (e.g. generators) would
    # otherwise be exhausted after the tokens and yield no labels.
    for sent in corpus:
        tokens.append([token.text for token in sent.tokens])
        labels.append([token.tags['ner'].value for token in sent.tokens])
    return tokens, labels

def prep_tens(tokens, labels):
    """Turn tokenised sentences and their labels into model input tensors.

    Delegates to the private preprocessing helpers of the module-level
    ``seq_tagger`` and uses its configured ``_max_len``.

    Args:
        tokens: list of sentences, each a list of token strings.
        labels: list of label sequences aligned with ``tokens``.

    Returns:
        Tuple ``(token_ids, label_ids)`` as produced by
        ``seq_tagger._make_tokens_tensors`` and
        ``seq_tagger._make_label_tensors``.
        NOTE(review): shapes and dtypes are defined by SequenceTaggerBert —
        confirm there.
    """
    # Only max_len, token_ids and bpe_masks are needed downstream; the other
    # outputs (and the former unused mask_sum computation) are discarded.
    _, max_len, token_ids, _, bpe_masks = seq_tagger._make_tokens_tensors(tokens, seq_tagger._max_len)
    label_ids, _ = seq_tagger._make_label_tensors(labels, bpe_masks, max_len)
    return token_ids, label_ids

# Rebuild the per-split tensors through seq_tagger's own preprocessing
# (prepare_corpus -> prep_tens), keyed by split name.
torch_tensors = {
    split: prep_tens(*prepare_corpus(getattr(corpus, split)))
    for split in ('train', 'test', 'dev')
}

# Training

In [None]:
# NOTE(review): model creation is commented out below, so these two log lines
# bracket only the CUDA cache flush; model / optimizer / lr_scheduler come from
# an earlier cell.
logger.info('Creating model...')
torch.cuda.empty_cache()
# model, optimizer, lr_scheduler = create_model_optimizer(tag2idx, 
#                                                         cache_dir=CACHE_DIR,
#                                                         full_finetuning=True, 
#                                                         base_lr=5e-5)


logger.info('Done.')

# Training batches of BATCH_SIZE, drawn in random order.
train_data = TensorDataset(*torch_tensors['train'])
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, 
                              batch_size=BATCH_SIZE)

# Validation runs over the dev split in its original order with the larger
# prediction batch size.
valid_data = TensorDataset(*torch_tensors['dev'])
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, 
                              batch_size=PRED_BATCH_SIZE, shuffle=False)

# Train for up to 20 epochs with early_stopping=3.
# NOTE(review): early_stopping is presumably a patience in epochs — confirm in
# bert_utils.train.
train(model, optimizer, lr_scheduler, train_dataloader, valid_dataloader, 
      epochs=20, tags_vals=tags_vals, early_stopping=3)

In [13]:
# test_data = TensorDataset(*torch_tensors['test'])
# test_sampler = SequentialSampler(test_data)
# test_dataloader = DataLoader(test_data, sampler=test_sampler, 
#                              batch_size=PRED_BATCH_SIZE, shuffle=False)

def prepare_corpus2(corpus):
    """Convert a tagged corpus into a list of ``(tokens, labels)`` samples.

    Args:
        corpus: iterable of sentence objects exposing ``tokens``; every token
            must provide ``text`` and ``tags['ner'].value``.

    Returns:
        List with one tuple per sentence: the token strings and the matching
        'ner' tag values.
    """
    # Single pass over the corpus so one-shot iterables are handled correctly
    # (iterating twice would leave the label side empty).
    return [([token.text for token in sent.tokens],
             [token.tags['ner'].value for token in sent.tokens])
            for sent in corpus]

def collate_fn(inpt):
    """Transpose a batch of samples into a tuple of per-field tuples.

    For ``(tokens, labels)`` samples this returns ``(all_tokens, all_labels)``.
    """
    return tuple(zip(*inpt))

# Evaluate seq_tagger on raw (tokens, labels) pairs served in corpus order.
# NOTE: the val_* names are kept for other cells, but the data is corpus.test.
val_dataset = prepare_corpus2(corpus.test)
val_sampler = SequentialSampler(val_dataset)
# Use the shared prediction batch size instead of repeating the literal 1200.
val_dataloader = DataLoader(val_dataset, 
                            sampler=val_sampler, 
                            batch_size=PRED_BATCH_SIZE,
                            collate_fn=collate_fn)

_, loss, f1 = seq_tagger.predict(val_dataloader, evaluate=True)
loss, f1

(7.301903985164783e-06, 0.7253837318575755)

In [42]:
# Sequential DataLoader over the test tensors; test_dataloader is also reused
# by the annotate_text cell further down.
test_data = TensorDataset(*torch_tensors['test'])
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, 
                             batch_size=PRED_BATCH_SIZE, shuffle=False)

# Evaluate the model on the test split; the helper logs loss, F1 and accuracy.
logger.info('Evaluate:')
test(model, test_dataloader, tags_vals=tags_vals)

2019-08-21 18:20:45,613 - biomed_ie - INFO - Evaluate:
2019-08-21 18:20:53,372 - biomed_ie - INFO - Validation loss: 3.99536884437664e-06
2019-08-21 18:20:53,920 - biomed_ie - INFO - Validation F1-Score: 0.7184764733966368
2019-08-21 18:20:53,921 - biomed_ie - INFO - Validation accuracy: 0.9350969208679004


(3.99536884437664e-06, 0.7184764733966368)

In [26]:
# Persist only the weights (state_dict); the file is named after the current `attr`.
torch.save(model.state_dict(), f'../workdir/models/bert/{attr}.pt')

In [27]:
from pytorch_pretrained_bert import BertForTokenClassification

# NOTE(review): this directory differs from the one passed to
# create_model_optimizer earlier ('../workdir/bio_bert/torch2') — confirm both
# describe the same architecture, otherwise load_state_dict will not match.
BIO_BERT = '../workdir/bio_bert/torch'
# Recompute the tag vocabulary so num_labels is derived from the same corpus.
tags_vals, tag2idx = make_bert_tag_dict_from_flair_corpus(corpus)
loaded_model = BertForTokenClassification.from_pretrained(BIO_BERT, num_labels=len(tag2idx))

# Restore the weights written by the torch.save cell above.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
loaded_model.load_state_dict(torch.load(f'../workdir/models/bert/{attr}.pt'))

# Evaluate document level

In [18]:
from i2b2_utils import drop_noise_samples

# Load the i2b2 test documents and filter them with the project helper for the
# upper-cased attribute name. The former mid-cell `dataset_test.head()` was
# removed: it was not the last expression, so it never displayed anything.
dataset_test_path = '../workdir/i2b2/i2b2_testing.json'
dataset_test = pd.read_json(dataset_test_path)
# NOTE(review): `attr` is set in the dataset-loading cell; confirm it names an
# i2b2 attribute before running this cell.
test_selected_dataset = drop_noise_samples(dataset_test, attr.upper())

In [13]:
from flair.data import Sentence

def flair_process_i2b2(model, dataset, attr_name):
    """Tag every text of ``dataset`` with ``model`` and collect the tag values.

    Args:
        model: object whose ``predict`` accepts a list of ``Sentence`` objects
            and returns the tagged sentences.
        dataset: object exposing an iterable ``texts`` attribute.
        attr_name: tag type to read from each token (``token.tags[attr_name]``).

    Returns:
        One list per sentence holding the ``attr_name`` tag value of each token.
    """
    sentences = [Sentence(text) for text in dataset.texts]
    tagged = model.predict(sentences)
    return [[token.tags[attr_name].value for token in sentence]
            for sentence in tagged]

In [14]:
from flair.models import SequenceTagger

# Alternative flair checkpoints, one per i2b2 attribute; enable exactly one.
#model = SequenceTagger.load('../models/new/DIABETES/fasttext/1.0/best-model.pt')
#model = SequenceTagger.load('../models/new/HYPERTENSION//elmo-pubmed/1.0/best-model.pt')
# NOTE(review): rebinding `model` replaces the BERT model created earlier in
# this notebook; cells that expect the BERT model must not run after this one.
model = SequenceTagger.load('../models/new/CAD//elmo-pubmed/1.0/best-model.pt')

2019-07-06 15:53:14,376 loading file ../models/new/CAD//elmo-pubmed/1.0/best-model.pt


In [None]:
# Document-level evaluation of the flair tagger on the filtered i2b2 test set.
# NOTE(review): `attr_name` is not defined in any visible cell, and
# evaluation_level_document is imported only in a later cell — on a fresh
# kernel this cell raises NameError as written.
flair_results = flair_process_i2b2(model, test_selected_dataset, attr_name)
pos, pred_pos, tp = evaluation_level_document(flair_results, test_selected_dataset, attr_name)

In [23]:
from i2b2_utils import evaluation_level_document

In [21]:
# For bert
from bert_utils import annotate_text 

# Tag the batches served by test_dataloader with the reloaded BERT model.
# NOTE(review): test_dataloader is built from the corpus loaded above, while
# the document-level evaluation below uses the i2b2 JSON set — confirm they
# correspond.
pred_tags = annotate_text(loaded_model, test_dataloader, tags_vals)

In [24]:
# Document-level counts used by the metrics cell below: recall is tp / pos and
# precision is tp / pred_pos (so presumably gold positives, predicted positives
# and true positives — confirm in i2b2_utils).
pos, pred_pos, tp = evaluation_level_document(pred_tags, test_selected_dataset, attr.upper())

In [25]:
# Document-level recall, precision and F1 from the counts computed above.
# Each ratio falls back to 0.0 when its denominator is zero instead of raising
# ZeroDivisionError (no gold positives, no predicted positives, or no true
# positives at all).
recall = tp / pos if pos else 0.0
precision = tp / pred_pos if pred_pos else 0.0
f1 = 2. * recall * precision / (recall + precision) if (recall + precision) else 0.0

print('Recall: ', recall)
print('Precision: ', precision)
print('F1:', f1)

Recall:  0.9821428571428571
Precision:  0.9553349875930521
F1: 0.9685534591194969


## BERT Results

In [None]:
Hypertension:
Recall:  0.9719387755102041
Precision:  0.9645569620253165
F1: 0.9682337992376113

In [None]:
CAD:
Recall:  0.9511111111111111
Precision:  0.6793650793650794
F1: 0.7925925925925925

In [None]:
Diabetes:
Recall:  0.9667590027700831
Precision:  0.8790931989924433
F1: 0.9208443271767809

In [None]:
BERT Hypertension 0.7452339688041594

# Flair

In [None]:
# Entity-level F1 score 0.7830423940149627 (Diabetes, FastText biomedical)
# Entity-level F1 score 0.7307017543859649 (Hypertension, FastText)
# Entity-level F1 score 0.360471645143178 (CAD, FastText)