## Step 1: Pre-Preprocessing

In [2]:
%load_ext autoreload
%autoreload 2

# import preprocessing code
from src.preprocess import PreProcessor, df_to_train_set, df_to_XY

Using TensorFlow backend.


In [3]:
# save paths to the available datasets
from typing import NamedTuple, List

class Dataset(NamedTuple):
    """Immutable record describing where one dataset lives on disk."""

    # Short name of the dataset; handed to the preprocessor as its title.
    title: str
    # Folder passed to the preprocessor when already-preprocessed data is loaded.
    preprocessed_folder: str
    # Folder(s) passed to the preprocessor when starting from the raw files.
    raw_folders: List[str]

# The shared data root is spelled once so individual paths cannot drift apart.
DATA_ROOT = "../de-ID_data/"
RAW_ROOT = DATA_ROOT + "raw/"
PREPROCESSED_ROOT = DATA_ROOT + "preprocessed/"

SAMPLE_DATA = Dataset(
    title = "sample_data",
    preprocessed_folder = PREPROCESSED_ROOT + "sample_data/",
    raw_folders = ["docs/Track1-de-indentification/PHI/"]
)

GOLD_1 = Dataset(
    title = "gold_1",
    preprocessed_folder = PREPROCESSED_ROOT + "gold_1/",
    raw_folders = [RAW_ROOT + "training-PHI-Gold-Set1/"]
)

GOLD_FULL = Dataset(
    title = "gold_full",
    preprocessed_folder = PREPROCESSED_ROOT + "gold_full/",
    # NOTE(review): Set2 previously pointed at "../data/raw/training-PHI-Gold-Set2/", the only
    # path in this cell outside the de-ID_data root. It is aligned with Set1 here on the
    # assumption that the old prefix was stale — confirm the folder location.
    raw_folders = [RAW_ROOT + "training-PHI-Gold-Set1/", RAW_ROOT + "training-PHI-Gold-Set2/"]
)

GOLD_TEST = Dataset(
    title = "gold_test",
    preprocessed_folder = PREPROCESSED_ROOT + "gold_test/",
    raw_folders = [RAW_ROOT + "testing-PHI-Gold-fixed/"]
)

COVID_DATA = Dataset(
    title = "covid_data",
    preprocessed_folder = PREPROCESSED_ROOT + "covid/",
    raw_folders = [DATA_ROOT + "covid/"]
)

# Order matters: a later cell selects datasets from this list by position.
DATASETS = [SAMPLE_DATA, GOLD_1, GOLD_FULL, GOLD_TEST, COVID_DATA]

## Step 2: Preprocessing

In [4]:
# Choose the Dataset used for both training and testing: position 4 of DATASETS
# (COVID_DATA in the list built above).
train_data = test_data = DATASETS[4]
# When True, the next cell hands the preprocessor the preprocessed folder
# instead of the raw folders.
isLoading = True

In [5]:
# Create the PreProcessor for the chosen training dataset and fetch its data.
pp = PreProcessor(train_data.title)
# get_data receives the preprocessed folder when isLoading is set, otherwise the raw folders.
train_source = train_data.preprocessed_folder if isLoading else train_data.raw_folders
X_train,y_train,X_train_words,df_train = pp.get_data(train_source,isLoading = isLoading)
print("max length: ",pp.max_len)

100%|██████████| 60/60 [00:00<00:00, 2608.48it/s]
100%|██████████| 60/60 [00:00<00:00, 3749.49it/s]Loading preprocessed data...

100%|██████████| 60/60 [00:00<00:00, 3529.37it/s]Preprocessing complete.
max length:  872



In [6]:
# data exploration: summarise the training frame (columns, non-null counts, dtypes, memory use)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2462 entries, 0 to 2461
Data columns (total 9 columns):
Unnamed: 0        2462 non-null int64
docid             2462 non-null object
sentence          2462 non-null int64
token             2461 non-null object
token_id          2462 non-null int64
label             2461 non-null object
label_id          2462 non-null int64
characters        2462 non-null object
original_token    2461 non-null object
dtypes: int64(4), object(5)
memory usage: 173.2+ KB


In [9]:
# preview the first rows of the training frame
df_train.head()

Unnamed: 0.1,Unnamed: 0,docid,sentence,token,token_id,label,label_id,characters,original_token
0,0,ehr_1,0,DISCHARGE,17933,DISCHARGE,69,"(2, 11)",DISCHARGE
1,1,ehr_1,0,SUMMARY,20387,SUMMARY,69,"(12, 19)",SUMMARY
2,2,ehr_1,0,-,24073,-,69,"(20, 21)",-
3,3,ehr_1,0,UNK,1,DEATH,69,"(22, 27)",DEATH
4,4,ehr_1,0,SUMMARY,20387,SUMMARY,69,"(28, 35)",SUMMARY


In [None]:
# Build the test set with the same PreProcessor used for training.
isLoadingTest = True
# create_test_set receives the preprocessed folder when isLoadingTest is set, otherwise the raw folders.
test_source = test_data.preprocessed_folder if isLoadingTest else test_data.raw_folders
X_test,y_test,X_test_words,df_test = pp.create_test_set(test_source,isLoadingTest,test_data.title)

In [None]:
# test data exploration: show the last rows of the test frame
df_test.tail()

In [None]:
# dimensions of the test inputs
X_test.shape

In [10]:
# import model stuff
from src.models.baseline import BaselineModel
from src.models.bilstm import BiLSTM
from src.models.bilstm_crf import BiLSTM_CRF
from src.models.transformer import Transformer
from src.models.transformer_crf import Transformer_CRF
from src.models.transformer_bilstm import TransformerBiLSTM
from src.models.bilstm_chars import BiLSTM_Chars
from src.models.bilstm_chars_crf import BiLSTM_Chars_CRF
from pipeline.visualization import sample_output
from pipeline.train import train
from random import randint
from sklearn.utils import shuffle
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
from src.converter import get_label_positions, bio_to_i2d2
import xml.etree.ElementTree as ET
from typing import NamedTuple, List

In [11]:
# check if GPU is available
# Fail with an explanation when this TensorFlow build has no CUDA support.
assert tf.test.is_built_with_cuda(), "TensorFlow was not built with CUDA support"
# Query the visible GPUs once and reuse the result for both report lines
# (tf.config.experimental.list_physical_devices is an alias of the same call).
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))
print("Num GPUs:", len(physical_devices))
# NOTE(review): this RunOptions object is only displayed as the cell's result; nothing in this
# cell passes it to a session or training call, so on its own it configures nothing — confirm intent.
tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)

Num GPUs Available:  0
Num GPUs: 0


report_tensor_allocations_upon_oom: true

In [21]:
# build model
# Exactly one constructor should be active; the commented lines are the alternative
# architectures imported earlier. Each takes (vocab_size, tag_size, max_len) from the
# preprocessor, and the two *_Chars variants additionally take pp.idx2word.
# model = BaselineModel(pp.vocab_size,pp.tag_size,pp.max_len)
# model = BiLSTM(pp.vocab_size,pp.tag_size,pp.max_len)
model = BiLSTM_CRF(pp.vocab_size,pp.tag_size,pp.max_len)
# model = Transformer(pp.vocab_size,pp.tag_size,pp.max_len)
# model = Transformer_CRF(pp.vocab_size, pp.tag_size, pp.max_len)
# model = TransformerBiLSTM(pp.vocab_size, pp.tag_size, pp.max_len)
# model = BiLSTM_Chars(pp.vocab_size, pp.tag_size, pp.max_len,pp.idx2word)
# model = BiLSTM_Chars_CRF(pp.vocab_size, pp.tag_size, pp.max_len,pp.idx2word)

Num GPUs Available:  0


In [24]:
# configure checkpoints and checkpoint manager
# NOTE: the folder is keyed on the literal 'gold_full', not on the dataset selected earlier,
# so checkpoints are always read from / written to the gold_full location for this model title.
checkpoint_dir = 'models/checkpoints/' + 'gold_full' + '/' + model.title + '/'
# exist_ok=True creates the folder if needed without a separate existence check.
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint = tf.train.Checkpoint(model=model)
# Keep at most the 10 most recent checkpoints in that folder.
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=10)

In [25]:
# restore checkpoint
# Restore from whatever the manager reports as the latest checkpoint.
checkpoint.restore(manager.latest_checkpoint)
if manager.latest_checkpoint:
    print("Restored from {}".format(manager.latest_checkpoint))
else:
    # Report the missing checkpoint explicitly: the sampling and prediction cells that follow
    # would otherwise run on a model that was never restored, with nothing printed.
    print("No checkpoint found in {}; model was not restored.".format(checkpoint_dir))

Restored from models/checkpoints/gold_full/bi-lstm-crf/ckpt-10


In [None]:
# train
# print("Training ",model.title)
# losses = train(model,X_train,y_train,X_train_words,batch_size = 32, epochs=10, lr = 0.0005, sample_interval=10, manager=manager, pp=pp)

In [34]:
# sample a random train output: prints each word with its predicted tag and, in parentheses, the true tag
# rand_idx=None presumably lets sample_output pick the sentence itself — verify in pipeline.visualization
sample_output(model,X_train,y_train, pp = pp,rand_idx=None, use_true_tags = False)

Sentence #:  18
input shape:  (1, 872)
output shape:  (872,)
Word            Pred : (True)
He             :O     (O)
was            :O     (O)
started        :O     (O)
on             :O     (O)
heparin        :O     (O)
gtt            :O     (O)
due            :O     (O)
to             :O     (O)
concern        :O     (O)
of             :O     (O)
hypercoagulable:O     (O)
state          :O     (O)
and            :O     (O)
elevated       :O     (O)
D              :O     (O)
dimer          :O     (O)
.              :O     (O)


In [None]:
# sample a random test output (same helper as the train sample, applied to the test arrays)
# NOTE(review): this call passes words=None whereas the train sample passes use_true_tags=False — confirm the difference is intended
# print(X_test_words[0][:30])
sample_output(model,X_test,y_test, pp = pp,rand_idx=None, words = None)

In [None]:
# test model
# from pipeline.test import test_to_i2d2

# test_to_i2d2(model,df_test, pp, checkpoint, manager)

In [117]:
# Build a test set from the COVID folder; only the dataframe (4th return value) is kept.
# NOTE(review): the folder literal repeats COVID_DATA.raw_folders — consider reusing that constant.
_,_,_,covid_df = pp.create_test_set(["../de-ID_data/covid/"], title="covid3")

100%|██████████| 1/1 [00:00<00:00, 21.00it/s]
100%|██████████| 127/127 [00:00<00:00, 5806.44it/s]
100%|██████████| 127/127 [00:00<00:00, 5318.31it/s]
  0%|          | 0/127 [00:00<?, ?it/s]Preprocessing data...
ehr_1.xml
ehr_2.xml
ehr_3.xml
ehr_4.xml
ehr_5.xml
ehr_6.xml
# of Tag Processing Errors:  0
Files with errors:  []
100%|██████████| 127/127 [00:00<00:00, 6567.82it/s]


In [118]:
from pipeline.test import predict_document

# Run the model on one document ('ehr_6') from the COVID dataframe; returns the predictions
# and the document's own rows.
predictions, doc_df = predict_document(model, 'ehr_6', covid_df)
# print(doc_df)
# The following cells treat each row of predictions as one sentence — TODO confirm against predict_document.
print(predictions.shape)
doc_df.head()

100%|██████████| 37/37 [00:00<00:00, 4622.30it/s]
(37, 371)
(37, 371)


Unnamed: 0,docid,sentence,token,token_id,label,label_id,characters,original_token
3474,ehr_6,0,Report,8908,Report,69,"(4, 10)",Report
3475,ehr_6,0,created,15894,created,69,"(11, 18)",created
3476,ehr_6,0,by,4143,by,69,"(19, 21)",by
3477,ehr_6,0,Brian,17876,Brian,69,"(22, 27)",Brian
3478,ehr_6,0,W,6431,W,69,"(28, 29)",W


In [119]:
# Count the rows (tokens) belonging to each sentence of the document, ordered by sentence id.
sentence_groups = doc_df.groupby(['sentence'])
sentence_lengths = sentence_groups.size().tolist()
print(sentence_lengths)

[35, 371, 3, 17, 16, 14, 20, 30, 4, 7, 11, 34, 21, 15, 8, 11, 16, 16, 36, 10, 6, 6, 12, 12, 11, 13, 13, 13, 11, 5, 4, 11, 16, 5, 9, 9, 5]


In [120]:
import numpy as np
# Work on a plain list-of-lists copy so rows can have different lengths.
predictions_copy = predictions.tolist()
# Row i keeps only as many entries as sentence i has tokens; the positions beyond that
# (presumably padding) are removed.
for i, length in enumerate(sentence_lengths):
    del predictions_copy[i][length:]

In [121]:
# Concatenate the per-sentence rows into one sequence covering the whole document.
predictions_flattened = [label_id for sentence_ids in predictions_copy for label_id in sentence_ids]
# Translate every label id to its tag through the preprocessor's idx2tag lookup.
predicted_tokens = [pp.idx2tag[label_id] for label_id in predictions_flattened]
# Attach the tags to the document frame, one per row.
doc_df['predictions'] = predicted_tokens

In [122]:
# preview the document frame with the new 'predictions' column
doc_df.head()

Unnamed: 0,docid,sentence,token,token_id,label,label_id,characters,original_token,predictions
3474,ehr_6,0,Report,8908,Report,69,"(4, 10)",Report,O
3475,ehr_6,0,created,15894,created,69,"(11, 18)",created,O
3476,ehr_6,0,by,4143,by,69,"(19, 21)",by,O
3477,ehr_6,0,Brian,17876,Brian,69,"(22, 27)",Brian,B-DOCTOR
3478,ehr_6,0,W,6431,W,69,"(28, 29)",W,I-DOCTOR


In [123]:
# Save the annotated frame for ehr_6; index=False leaves the dataframe index out of the CSV.
# NOTE(review): the frame's existing "Unnamed: 0" column is still written — confirm that is wanted.
doc_df.to_csv("../de-ID_data/covid_predictions/ehr_6_predictions.csv", index = False)

In [126]:
import pandas as pd
# For each document, record the ids of sentences whose tokens did not all receive the same
# predicted tag (more than one distinct value in the 'predictions' column).
# NOTE(review): this expects ehr_1 … ehr_6 prediction CSVs to exist already; only the ehr_6
# export is visible in this notebook — confirm how the others are produced.
doc_ids = range(1,7)
doc2predictions = {i:[] for i in doc_ids}
for i in doc_ids: # docs
    doc_df = pd.read_csv("../de-ID_data/covid_predictions/ehr_" + str(i) + "_predictions.csv")
    # Group by the bare column name rather than a one-element list: with a list key,
    # pandas >= 2.0 yields 1-tuples such as (0,) when iterating, which would end up stored below.
    sentence_groups = doc_df.groupby('sentence')
    for j, group in sentence_groups: # sentences
        sent_predictions = group['predictions'].unique()
        if len(sent_predictions) > 1:
            doc2predictions[i].append(j)

In [127]:
doc2predictions

{1: [0, 1, 2, 13, 14, 20, 21, 22, 23, 26, 27, 31, 33, 34, 38, 40, 41],
 2: [0, 1, 2, 3],
 3: [0, 9, 13],
 4: [0, 6, 8, 9, 17, 21],
 5: [0, 2, 7],
 6: [0, 1, 11, 12, 18, 32]}