# Initialization

In [1]:
%load_ext autoreload
%autoreload 2

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys

In [2]:
import torch

device = torch.device('cuda')
n_gpu = torch.cuda.device_count()

for i in range(n_gpu):
    print(torch.cuda.get_device_name(i))

Tesla V100-DGXS-16GB


In [3]:
import logging
logger = logging.getLogger('biomed_ie')

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

fhandler = logging.FileHandler(filename='../workdir/i2b2_active_learning.log', mode='a')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)

handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
logger.addHandler(handler)

logger.setLevel(logging.DEBUG)

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import json

# Global parameters

In [5]:
CACHE_DIR = '../workdir/models/0.4.0'

#MAX_LEN = 100
MAX_LEN = 100
#BATCH_SIZE = 105
BATCH_SIZE = 45 
# MAX_LEN = 150
# BATCH_SIZE = 32
PRED_BATCH_SIZE = 600
random_state = 2019

# Load dataset

In [6]:
from i2b2_utils import drop_noise_samples, split_train_test_by_document
from bert_utils import train, test, create_model_optimizer, tokenize_and_generate_labels, to_torch_tensors, create_tensors

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#from pytorch_transformers import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertTokenizer, BertConfig

from sklearn.model_selection import train_test_split

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [7]:
dataset_path = '../workdir/i2b2/i2b2_training.json'
dataset = pd.read_json(dataset_path)
print(dataset.shape)
dataset.head()

(46635, 5)


Unnamed: 0,HYPERTENSION,CAD,DIABETES,texts,doc_ids
0,[],[],[],Record date: 2154-07-21\n\n\n\tCARDIOLOGY\n\t\...,0
1,[],[],[],D.,0
10,"[[24, 36]]",[],[],She has well-controlled hypertension on stable...,0
100,[],[],[],a.,1
1000,[],[],[],"SOCIAL HISTORY, FAMILY HISTORY, AND REVIEW OF ...",18


In [8]:
attr_name = 'HYPERTENSION'
#attr_name = 'CAD'
#attr_name = 'DIABETES'

selected_dataset = drop_noise_samples(dataset, attr_name)
selected_dataset.shape

(9871, 5)

In [9]:
dataset_test_path = '../workdir/i2b2/i2b2_testing.json'
dataset_test = pd.read_json(dataset_test_path)
dataset_test.shape

(30208, 5)

In [10]:
test_selected_dataset = drop_noise_samples(dataset_test, attr_name)
train_selected_dataset = selected_dataset
print('train', selected_dataset.shape)
print('test', test_selected_dataset.shape)

train (9871, 5)
test (6813, 5)


# Prepare model and preprocessing

In [11]:
from isanlp.en.processor_tokenizer_nltk_en import ProcessorTokenizerNltkEn

word_tokenizer = ProcessorTokenizerNltkEn()
bpe_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', cache_dir=CACHE_DIR, do_lower_case=False)
train_sents, train_labels = tokenize_and_generate_labels(word_tokenizer, bpe_tokenizer, train_selected_dataset, attr_name, MAX_LEN)
test_sents, test_labels = tokenize_and_generate_labels(word_tokenizer, bpe_tokenizer, test_selected_dataset, attr_name, MAX_LEN)

In [12]:
lengths = np.array([len(sent) for sent in train_sents])
n_max_lengths = (lengths == MAX_LEN).sum()
print('N with max lengths:', n_max_lengths)
print('Ratio:', n_max_lengths / lengths.shape[0])

N with max lengths: 381
Ratio: 0.03859791307871543


In [13]:
tags_vals = ['B', 'I', 'O', 'X', '[CLS]', '[SEP]']
tag2idx = {t : i for i, t in enumerate(tags_vals)}

In [14]:
train_input_ids,train_attention_masks, train_tags = create_tensors(bpe_tokenizer, tag2idx, 
                                                                    train_sents, train_labels, MAX_LEN)
test_input_ids, test_attention_masks, test_tags = create_tensors(bpe_tokenizer, tag2idx, 
                                                                 test_sents, test_labels, MAX_LEN)

# Training

In [None]:
from i2b2_utils import subsample_dataset
# Only for CAD and Diabetes
train_input_ids, train_tags, train_attention_masks = subsample_dataset(train_input_ids, train_tags, 
                                                                       train_attention_masks, 
                                                                       positive_tag=tag2idx['B'], 
                                                                       negative_ratio=0.5, positive_ratio=1.)

In [15]:
logger.info('Creating model...')
torch.cuda.empty_cache()
#model, optimizer, lr_scheduler = create_model_optimizer(tag2idx, full_finetuning=True, base_lr=5e-4)
model, optimizer, lr_scheduler = create_model_optimizer(tag2idx, cache_dir=CACHE_DIR, full_finetuning=True, base_lr=5e-5)
logger.info('Done.')

t_tr_inputs = torch.tensor(train_input_ids)
t_tr_tags = torch.tensor(train_tags)
t_tr_masks = torch.tensor(train_attention_masks)

t_val_inputs = torch.tensor(test_input_ids)
t_val_tags = torch.tensor(test_tags)
t_val_masks = torch.tensor(test_attention_masks)

train_data = TensorDataset(t_tr_inputs, t_tr_masks, t_tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, 
                              batch_size=BATCH_SIZE)

valid_data = TensorDataset(t_val_inputs, t_val_masks, t_val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, 
                              batch_size=PRED_BATCH_SIZE, shuffle=False)

train(model, optimizer, lr_scheduler, train_dataloader, valid_dataloader, 
      epochs=20, device=device, tags_vals=tags_vals)

2019-08-11 14:32:29,784 - biomed_ie - INFO - Creating model...
2019-08-11 14:32:46,204 - biomed_ie - INFO - Full finetuning: True
2019-08-11 14:32:46,208 - biomed_ie - INFO - N parameters: 108314886
2019-08-11 14:32:46,209 - biomed_ie - INFO - Done.


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

2019-08-11 14:34:05,131 - biomed_ie - INFO - Train loss: 2.4389469706056364e-05
2019-08-11 14:34:17,770 - biomed_ie - INFO - Validation loss: 2.742502970673978e-07
2019-08-11 14:34:22,337 - biomed_ie - INFO - Validation F1-Score: 0.6705882352941176
2019-08-11 14:34:22,338 - biomed_ie - INFO - Validation accuracy: 0.9987509778105413


Epoch:   5%|▌         | 1/20 [01:36<30:26, 96.12s/it]

2019-08-11 14:35:41,157 - biomed_ie - INFO - Train loss: 3.4065052729973288e-06
2019-08-11 14:35:53,792 - biomed_ie - INFO - Validation loss: 2.1278160450146412e-07
2019-08-11 14:35:58,327 - biomed_ie - INFO - Validation F1-Score: 0.7181978798586572
2019-08-11 14:35:58,328 - biomed_ie - INFO - Validation accuracy: 0.9990664524872603


Epoch:  10%|█         | 2/20 [03:12<28:49, 96.08s/it]

2019-08-11 14:37:18,507 - biomed_ie - INFO - Train loss: 2.3676043060231826e-06
2019-08-11 14:37:31,156 - biomed_ie - INFO - Validation loss: 2.3649169305127434e-07
2019-08-11 14:37:35,700 - biomed_ie - INFO - Validation F1-Score: 0.7286084701815038
2019-08-11 14:37:35,701 - biomed_ie - INFO - Validation accuracy: 0.9989038864548694


Epoch:  15%|█▌        | 3/20 [04:49<27:19, 96.47s/it]

2019-08-11 14:38:55,767 - biomed_ie - INFO - Train loss: 1.7079229652016948e-06
2019-08-11 14:39:08,433 - biomed_ie - INFO - Validation loss: 2.633134165030893e-07
2019-08-11 14:39:12,957 - biomed_ie - INFO - Validation F1-Score: 0.7124183006535947
2019-08-11 14:39:12,958 - biomed_ie - INFO - Validation accuracy: 0.9988250177856897


Epoch:  20%|██        | 4/20 [06:26<25:47, 96.71s/it]

2019-08-11 14:40:32,445 - biomed_ie - INFO - Train loss: 1.3385674212699562e-06
2019-08-11 14:40:45,105 - biomed_ie - INFO - Validation loss: 2.549370564108908e-07
2019-08-11 14:40:49,637 - biomed_ie - INFO - Validation F1-Score: 0.7431072210065645
2019-08-11 14:40:49,638 - biomed_ie - INFO - Validation accuracy: 0.9990922055220945


Epoch:  25%|██▌       | 5/20 [08:03<24:10, 96.70s/it]

KeyboardInterrupt: 

# Evaluate

In [None]:
!pip install --upgrade pytorch-pretrained-bert

In [None]:
!pip install pytorch-pretrained-bert==0.4.0

In [20]:
from i2b2_utils import evaluation_level_document

In [13]:
from flair.data import Sentence
from flair.models import SequenceTagger

def flair_process_i2b2(model, dataset, attr_name):
    res = model.predict([Sentence(t) for t in dataset.texts])
    res = [[e.tags[attr_name].value for e in sent] for sent in res]
    return res

In [14]:
#model = SequenceTagger.load('../models/new/DIABETES/fasttext/1.0/best-model.pt')
#model = SequenceTagger.load('../models/new/HYPERTENSION//elmo-pubmed/1.0/best-model.pt')
model = SequenceTagger.load('../models/new/CAD//elmo-pubmed/1.0/best-model.pt')

2019-07-06 15:53:14,376 loading file ../models/new/CAD//elmo-pubmed/1.0/best-model.pt


In [15]:
flair_results = flair_process_i2b2(model, test_selected_dataset, attr_name)
pos, pred_pos, tp = evaluation_level_document(flair_results, test_selected_dataset, attr_name)

NameError: name 'evaluation_level_document' is not defined

In [17]:
pos, pred_pos, tp = evaluation_level_document(flair_results, test_selected_dataset, attr_name)

In [21]:
# For bert
from i2b2_utils import annotate_text 

pred_tags = annotate_text(model, valid_dataloader)
pos, pred_pos, tp = evaluation_level_document(pred_tags, test_selected_dataset, attr_name, tag2idx)

NameError: name 'valid_dataloader' is not defined

In [18]:
recall = tp / pos
precision = tp / pred_pos
f1 = 2. * recall * precision / (recall + precision)

print('Recall: ', recall)
print('Precision: ', precision)
print('F1:', f1)

Recall:  1.0
Precision:  0.4377431906614786
F1: 0.6089309878213802


# BERT Results

In [None]:
Hypertension:
Recall:  0.9719387755102041
Precision:  0.9645569620253165
F1: 0.9682337992376113

In [None]:
CAD:
Recall:  0.9511111111111111
Precision:  0.6793650793650794
F1: 0.7925925925925925

In [None]:
Diabetes:
Recall:  0.9667590027700831
Precision:  0.8790931989924433
F1: 0.9208443271767809