## Step 1: Setup before preprocessing (imports and dataset paths)

In [1]:
%load_ext autoreload
%autoreload 2

# import preprocessing code
from src.preprocess import PreProcessor, df_to_train_set

Using TensorFlow backend.


In [2]:
# save paths to the available datasets
from typing import NamedTuple, List

class Dataset(NamedTuple):
    """
    Interface for accessing data folders.

    Groups the locations of one dataset under a single object so the rest
    of the notebook can switch datasets by swapping that object.
    """
    # name of the dataset; passed to PreProcessor and used in checkpoint paths
    title: str
    # folder handed to pp.get_data / pp.create_test_set when isLoading is True
    preprocessed_folder: str
    # folders handed to pp.get_data / pp.create_test_set when isLoading is False
    raw_folders: List[str]

# One Dataset entry per available corpus.
# NOTE(review): unlike the other entries, the raw path of SAMPLE_DATA is not
# under ../data/raw/ -- confirm it resolves from the notebook's working directory.
SAMPLE_DATA = Dataset(
    title="sample_data",
    preprocessed_folder="../data/preprocessed/sample_data/",
    raw_folders=["docs/Track1-de-indentification/PHI/"],
)

GOLD_1 = Dataset(
    title="gold_1",
    preprocessed_folder="../data/preprocessed/gold_1/",
    raw_folders=["../data/raw/training-PHI-Gold-Set1/"],
)

# GOLD_FULL combines both training sets.
GOLD_FULL = Dataset(
    title="gold_full",
    preprocessed_folder="../data/preprocessed/gold_full/",
    raw_folders=[
        "../data/raw/training-PHI-Gold-Set1/",
        "../data/raw/training-PHI-Gold-Set2/",
    ],
)

GOLD_TEST = Dataset(
    title="gold_test",
    preprocessed_folder="../data/preprocessed/gold_test/",
    raw_folders=["../data/raw/testing-PHI-Gold-fixed/"],
)

# Order matters for any code that looks datasets up by position.
DATASETS = [
    SAMPLE_DATA,
    GOLD_1,
    GOLD_FULL,
    GOLD_TEST,
]

## Step 2: Preprocessing

In [3]:
# pick dataset and define loading boolean
# Datasets are referenced by name rather than by position in DATASETS, so
# reordering that list cannot silently change which data is used.
# Set train_data = GOLD_FULL to train on both gold training sets.
train_data = SAMPLE_DATA
test_data = GOLD_TEST
# True: read the preprocessed folder; False: process the raw folders
isLoading = True

In [4]:
# attach data to PreProcessor object.
pp = PreProcessor(train_data.title)
# One call for both modes: the preprocessed folder is used when loading,
# the list of raw folders otherwise.
X_train, y_train, df_train = pp.get_data(
    train_data.preprocessed_folder if isLoading else train_data.raw_folders,
    isLoading=isLoading,
)
print("max length: ", pp.max_len)

  8% |######                                                                  |

Loading preprocessed data...


100% |########################################################################|
100% |########################################################################|


Shape of X:  (234, 471)
Shape of y:  (234, 471)
Preprocessing complete.
max length:  471


In [5]:
# data exploration
# display the first rows of the training frame returned by pp.get_data
df_train.head()

Unnamed: 0.1,Unnamed: 0,docid,sentence,sentence_ids,labels,labels_ids,characters,padded_sentence,padded_labels
0,0,320-01,"['Record', 'date', ':', '2080', '-', '03', '-'...","[760, 811, 499, 1090, 561, 989, 561, 269, 1631...","['O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', ...","[1, 1, 1, 18, 25, 25, 25, 25, 1, 1, 5, 24, 1, ...","[(3, 9), (10, 14), (14, 15), (16, 20), (20, 21...","[760, 811, 499, 1090, 561, 989, 561, 269, 1631...","[1, 1, 1, 18, 25, 25, 25, 25, 1, 1, 5, 24, 1, ..."
1,1,320-01,"['Met', 'with', 'PCP', 'in', 'Feb', 'for', 'mu...","[29, 966, 1198, 1712, 740, 217, 1045, 1062, 16...","['O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', ...","[1, 1, 1, 1, 18, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[(143, 146), (147, 151), (152, 155), (156, 158...","[29, 966, 1198, 1712, 740, 217, 1045, 1062, 16...","[1, 1, 1, 1, 18, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
2,2,320-01,"['No', 'h', '/', 'o', 'macro', 'or', 'microvas...","[1537, 1388, 214, 1187, 963, 742, 581, 241, 1491]","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']","[1, 1, 1, 1, 1, 1, 1, 1, 1]","[(1504, 1506), (1507, 1508), (1508, 1509), (15...","[1537, 1388, 214, 1187, 963, 742, 581, 241, 14...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
3,3,320-01,"['Hypertensive', 'disorder', ':', 'dx', '2060'...","[1199, 1361, 499, 132, 1207, 1404, 1680, 645, ...","['O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O', ...","[1, 1, 1, 1, 18, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1...","[(1549, 1561), (1562, 1570), (1571, 1572), (15...","[1199, 1361, 499, 132, 1207, 1404, 1680, 645, ...","[1, 1, 1, 1, 18, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1..."
4,4,320-01,"['H', '.', 'pylori', 'serology', '+', 'Helicob...","[1678, 1491, 1086, 126, 248, 1406, 1086, 499, ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18, 1, 1, 1, 1,...","[(1771, 1772), (1772, 1773), (1774, 1780), (17...","[1678, 1491, 1086, 126, 248, 1406, 1086, 499, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18, 1, 1, 1, 1,..."


In [None]:
# load test set
# Same switch as for the training data: preprocessed folder when loading,
# raw folders otherwise.
X_test, y_test, df_test = pp.create_test_set(
    test_data.preprocessed_folder if isLoading else test_data.raw_folders,
    isLoading,
    test_data.title,
)


In [None]:
# test data exploration
# display the first rows of the test frame; df_test is assigned by the "load test set" cell
df_test.head()

In [6]:
# import model stuff
from src.models.baseline import BaselineModel
from src.models.bilstm import BiLSTM
from src.models.bilstm_crf import BiLSTM_CRF
from pipeline.visualization import sample_output
from pipeline.train import train_CRF, train_vanilla
from random import randint
from sklearn.utils import shuffle
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
from src.converter import get_label_positions, bio_to_i2d2
import xml.etree.ElementTree as ET
from typing import NamedTuple, List

In [None]:
# build model
model = BaselineModel(pp.vocab_size, pp.tag_size, pp.max_len)
# checkpoints are stored per dataset title and per model title
checkpoint_dir = 'models/checkpoints/' + train_data.title + '/' + model.title + '/'
# exist_ok=True replaces the separate existence check and avoids the
# window between checking for the directory and creating it
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint = tf.train.Checkpoint(model=model)
# max_to_keep=3: the manager retains only the three most recent checkpoints
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)

In [None]:
# restore checkpoint
# manager.latest_checkpoint is falsy when the checkpoint directory holds no
# checkpoint yet; report that case explicitly so the following evaluation is
# not mistaken for an evaluation of trained weights.
checkpoint.restore(manager.latest_checkpoint)
if manager.latest_checkpoint:
    print("Restored from {}".format(manager.latest_checkpoint))
else:
    print("No checkpoint found; model was not restored.")

In [None]:
# test model
# presumably prints sample predictions of the current model on the training data -- confirm in pipeline.visualization
sample_output(model,X_train,y_train, pp = pp)

# NOTE(review): this import sits mid-notebook; test_vanilla is only referenced by the commented-out call below.
from pipeline.test import test_to_i2d2, test_vanilla
# test_vanilla(model, X_test, y_test)

# df_test is assigned by the "load test set" cell, so that cell must have been run before this one.
test_to_i2d2(model,df_test, pp, checkpoint, manager)

In [None]:
# build model
model = BiLSTM(pp.vocab_size, pp.tag_size, pp.max_len)
# checkpoints are stored per dataset title and per model title
checkpoint_dir = 'models/checkpoints/' + train_data.title + '/' + model.title + '/'
# exist_ok=True replaces the separate existence check and avoids the
# window between checking for the directory and creating it
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint = tf.train.Checkpoint(model=model)
# max_to_keep=3: the manager retains only the three most recent checkpoints
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)

In [None]:
# train model
# the manager built in the previous cell is handed to the training routine
train_vanilla(
    model,
    X_train,
    y_train,
    batch_size=64,
    epochs=2,
    sample_interval=10,
    manager=manager,
    pp=pp,
)

In [None]:
# build model
model = BiLSTM_CRF(pp.vocab_size, pp.tag_size, pp.max_len)
# checkpoints are stored per dataset title and per model title
checkpoint_dir = 'models/checkpoints/' + train_data.title + '/' + model.title + '/'
# exist_ok=True replaces the separate existence check and avoids the
# window between checking for the directory and creating it
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint = tf.train.Checkpoint(model=model)
# max_to_keep=3: the manager retains only the three most recent checkpoints
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)

In [None]:
# train model
# the manager built in the previous cell is handed to the training routine
train_CRF(
    model,
    X_train,
    y_train,
    batch_size=64,
    epochs=10,
    sample_interval=10,
    manager=manager,
    pp=pp,
)

In [None]:
# progress marker printed once execution reaches this point, after the baseline, BiLSTM and BiLSTM-CRF cells
print("done vanilla models!")

In [9]:
from src.models.transformer import Transformer
# build model
model = Transformer(pp.vocab_size, pp.tag_size, pp.max_len)
# checkpoints are stored per dataset title and per model title
checkpoint_dir = 'models/checkpoints/' + train_data.title + '/' + model.title + '/'
# exist_ok=True replaces the separate existence check and avoids the
# window between checking for the directory and creating it
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint = tf.train.Checkpoint(model=model)
# max_to_keep=3: the manager retains only the three most recent checkpoints
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)

In [10]:
# train transformer
# NOTE(review): the saved output of this cell ends in
#   TypeError: tf__call() missing 1 required positional argument: 'labels'
# raised right after "Sentence #:  173" was printed. Presumably the sampling
# step invokes the model without labels -- confirm in pipeline.train and
# src.models.transformer before re-running.
from pipeline.train import train_transformer

train_transformer(
    model,
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    sample_interval=10,
    manager=manager,
    pp=pp,
)


--------- EPOCH  0 -----------
Epoch: 0, Batch: 0, Loss: 0.236512
Epoch: 0, Batch: 5, Loss: 0.024003
Sentence #:  173


TypeError: in converted code:


    TypeError: tf__call() missing 1 required positional argument: 'labels'
