## Step 1: Pre-Preprocessing

In [None]:
%load_ext autoreload
%autoreload 2

# import preprocessing code
from src.preprocess import PreProcessor, df_to_train_set

In [None]:
# save paths to the available datasets
from typing import NamedTuple, List

class Dataset(NamedTuple):
    """Where one named corpus lives on disk.

    Groups a short identifier with the folder of the preprocessed copy and
    the folder(s) that hold the raw files, so cells can pass a single object
    around instead of loose path strings.
    """
    # Short identifier of the corpus.
    title: str
    # Path string of the folder holding the preprocessed version.
    preprocessed_folder: str
    # Path strings of the folder(s) holding the raw files.
    raw_folders: List[str]

# Small sample corpus.
# NOTE(review): unlike the entries below, this raw path is not rooted at
# "../data/raw/" — confirm it resolves from the notebook's working directory.
SAMPLE_DATA = Dataset(
    title="sample_data",
    preprocessed_folder="../data/preprocessed/sample_data/",
    raw_folders=["docs/Track1-de-indentification/PHI/"],
)

# First gold training set only.
GOLD_1 = Dataset(
    title="gold_1",
    preprocessed_folder="../data/preprocessed/gold_1/",
    raw_folders=["../data/raw/training-PHI-Gold-Set1/"],
)

# Both gold training sets combined.
GOLD_FULL = Dataset(
    title="gold_full",
    preprocessed_folder="../data/preprocessed/gold_full/",
    raw_folders=[
        "../data/raw/training-PHI-Gold-Set1/",
        "../data/raw/training-PHI-Gold-Set2/",
    ],
)

# Gold test set.
GOLD_TEST = Dataset(
    title="gold_test",
    preprocessed_folder="../data/preprocessed/gold_test/",
    raw_folders=["../data/raw/testing-PHI-Gold-fixed/"],
)

# Index-addressable collection of every dataset defined above
# (0 = sample_data, 1 = gold_1, 2 = gold_full, 3 = gold_test).
DATASETS = [SAMPLE_DATA, GOLD_1, GOLD_FULL, GOLD_TEST]

## Step 2: Preprocessing

In [None]:
# Choose the corpora for this run by name rather than by list position.
# For a full training run use: train_data = GOLD_FULL  (i.e. DATASETS[2])
train_data = SAMPLE_DATA  # same object as DATASETS[0]
test_data = GOLD_TEST  # same object as DATASETS[3]
# True  -> later cells hand the preprocessed folder to the PreProcessor;
# False -> they hand it the raw folders instead.
isLoading = True

In [None]:
# Create the PreProcessor for the chosen training corpus and fetch the
# training data through it.
pp = PreProcessor(train_data.title)

# The source is the preprocessed folder when loading, the raw folders otherwise.
train_source = train_data.preprocessed_folder if isLoading else train_data.raw_folders
X_train, y_train, df_train = pp.get_data(train_source, isLoading=isLoading)

print("max length: ", pp.max_len)

In [None]:
# Data exploration: preview df_train via head(). As the last expression in the
# cell, the returned value is what the notebook renders.
# (df_train is presumably a pandas DataFrame — TODO confirm against PreProcessor.get_data.)
df_train.head()

In [None]:
# Load the test set through the same PreProcessor used for the training data.
# The source is the preprocessed folder when loading, the raw folders otherwise.
test_source = test_data.preprocessed_folder if isLoading else test_data.raw_folders
X_test, y_test, df_test = pp.create_test_set(test_source, isLoading, test_data.title)


In [None]:
# Test data exploration: preview df_test via head(). As the last expression in
# the cell, the returned value is what the notebook renders.
# (df_test is presumably a pandas DataFrame — TODO confirm against PreProcessor.create_test_set.)
df_test.head()

In [None]:
# import model stuff
from src.models.baseline import BaselineModel
from src.models.bilstm import BiLSTM
from src.models.bilstm_crf import BiLSTM_CRF
from pipeline.visualization import sample_output
from pipeline.train import train
from random import randint
from sklearn.utils import shuffle
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
from src.converter import get_label_positions, bio_to_i2d2
import xml.etree.ElementTree as ET
from typing import NamedTuple, List

In [36]:
# Build the baseline model together with its checkpoint objects.
model = BaselineModel(pp.vocab_size, pp.tag_size, pp.max_len)

# Checkpoints are grouped by training corpus title, then by model title.
checkpoint_dir = f'models/checkpoints/{train_data.title}/{model.title}/'
# exist_ok=True replaces the separate exists() check: re-running the cell is
# harmless and there is no window between "check" and "create".
os.makedirs(checkpoint_dir, exist_ok=True)

checkpoint = tf.train.Checkpoint(model=model)
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)

In [41]:
# Restore the newest checkpoint known to `manager` into `checkpoint`.
# NOTE: this acts on whichever `checkpoint` / `manager` pair was built most
# recently in the kernel, so run it right after the matching "build model" cell.
latest_checkpoint = manager.latest_checkpoint
checkpoint.restore(latest_checkpoint)
if latest_checkpoint:
    print("Restored from {}".format(latest_checkpoint))
else:
    # Previously this case printed nothing, leaving it unclear whether any
    # weights had been loaded.
    print("No checkpoint found; model starts from scratch.")

Restored from models/checkpoints/sample_data/transformer/ckpt-20


In [None]:
# Train the current `model` on the training data (batch size 32, 10 epochs).
# The call is the last expression, so its return value is shown as cell output.
train(
    model,
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    sample_interval=10,
    manager=manager,
    pp=pp,
)

In [27]:
# Spot-check the current model on one sentence.
# NOTE(review): X_train / y_train are passed here, so the sample is drawn from
# the training data rather than the test set — confirm that is intended.
sample_output(model, X_train, y_train, pp=pp, rand_idx=None)

# Test-set helpers; both evaluation calls below are currently disabled.
from pipeline.test import test_to_i2d2, test_vanilla
# test_vanilla(model, X_test, y_test)

# test_to_i2d2(model,df_test, pp, checkpoint, manager)

Sentence #:  16
Word            Pred : (True)
No             :O     (O)
acute          :O     (O)
ST             :O     (O)
changes        :O     (O)
,              :O     (O)
Q              :O     (O)
waves          :O     (O)
or             :O     (O)
T              :O     (O)
wave           :O     (O)
inversion      :O     (O)
.              :O     (O)


In [None]:
# Build the BiLSTM model together with its checkpoint objects.
model = BiLSTM(pp.vocab_size, pp.tag_size, pp.max_len)

# Checkpoints are grouped by training corpus title, then by model title.
checkpoint_dir = f'models/checkpoints/{train_data.title}/{model.title}/'
# exist_ok=True replaces the separate exists() check: re-running the cell is
# harmless and there is no window between "check" and "create".
os.makedirs(checkpoint_dir, exist_ok=True)

checkpoint = tf.train.Checkpoint(model=model)
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)

In [None]:
# Train the current `model` on the training data (batch size 32, 10 epochs).
# The call is the last expression, so its return value is shown as cell output.
train(
    model,
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    sample_interval=10,
    manager=manager,
    pp=pp,
)

In [28]:
# Build the BiLSTM-CRF model together with its checkpoint objects.
model = BiLSTM_CRF(pp.vocab_size, pp.tag_size, pp.max_len)

# Checkpoints are grouped by training corpus title, then by model title.
checkpoint_dir = f'models/checkpoints/{train_data.title}/{model.title}/'
# exist_ok=True replaces the separate exists() check: re-running the cell is
# harmless and there is no window between "check" and "create".
os.makedirs(checkpoint_dir, exist_ok=True)

checkpoint = tf.train.Checkpoint(model=model)
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)

Num GPUs Available:  1


In [29]:
# Train the current `model` on the training data (batch size 32, 10 epochs).
# The call is the last expression, so its return value is shown as cell output.
train(
    model,
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    sample_interval=10,
    manager=manager,
    pp=pp,
)

--------- EPOCH  0 -----------
Epoch: 0, Batch: 0, Loss: 46.821537
Epoch: 0, Batch: 5, Loss: 14.260431
Sentence #:  205
Word            Pred : (True)
R              :O     (O)
BUTTOCK        :O     (O)
PAIN           :O     (O)
:              :O     (O)
muscular       :O     (O)
spasm          :O     (O)
vs             :O     (O)
herniated      :O     (O)
disc           :O     (O)
-              :O     (O)
avoid          :O     (O)
significant    :O     (O)
NSAI           :O     (O)
Ds             :O     (O)
given          :O     (O)
CRI            :O     (O)
-              :O     (O)
morphine       :O     (O)
,              :O     (O)
percocet       :O     (O)
.              :O     (O)
--------- EPOCH  1 -----------
Epoch: 1, Batch: 0, Loss: 9.097906
Epoch: 1, Batch: 5, Loss: 7.009502
--------- EPOCH  2 -----------
Epoch: 2, Batch: 0, Loss: 7.349750
Epoch: 2, Batch: 5, Loss: 4.509981
--------- EPOCH  3 -----------
Epoch: 3, Batch: 0, Loss: 6.501145
Epoch: 3, Batch: 5, Loss: 7.315049
-

[<tf.Tensor: shape=(), dtype=float32, numpy=195.1694>,
 <tf.Tensor: shape=(), dtype=float32, numpy=79.40908>,
 <tf.Tensor: shape=(), dtype=float32, numpy=58.5198>,
 <tf.Tensor: shape=(), dtype=float32, numpy=37.968784>,
 <tf.Tensor: shape=(), dtype=float32, numpy=24.6073>,
 <tf.Tensor: shape=(), dtype=float32, numpy=16.798903>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.279422>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.9144254>,
 <tf.Tensor: shape=(), dtype=float32, numpy=4.465953>,
 <tf.Tensor: shape=(), dtype=float32, numpy=2.3829906>]

In [None]:
# Progress marker: printed once the cells above (the non-transformer models) have run.
print("done vanilla models!")

In [63]:
# Spot-check the current model on one sentence taken from the training data.
sample_output(
    model,
    X_train,
    y_train,
    pp=pp,
    rand_idx=None,
)

Sentence #:  41
Word            Pred : (True)
Abdomen        :O     (O)
Soft           :O     (O)
without        :O     (O)
hepatosplenomegaly:O     (O)
,              :O     (O)
mass           :O     (O)
or             :O     (O)
ascites        :O     (O)
.              :O     (O)


In [47]:
from src.models.transformer import Transformer

# Build the Transformer model together with its checkpoint objects.
model = Transformer(pp.vocab_size, pp.tag_size, pp.max_len)

# Checkpoints are grouped by training corpus title, then by model title.
checkpoint_dir = f'models/checkpoints/{train_data.title}/{model.title}/'
# exist_ok=True replaces the separate exists() check: re-running the cell is
# harmless and there is no window between "check" and "create".
os.makedirs(checkpoint_dir, exist_ok=True)

checkpoint = tf.train.Checkpoint(model=model)
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)

In [48]:
# Train the transformer on the training data (batch size 32, 20 epochs —
# twice the epoch count used for the other models in this notebook).
# The call is the last expression, so its return value is shown as cell output.
train(
    model,
    X_train,
    y_train,
    batch_size=32,
    epochs=20,
    sample_interval=10,
    manager=manager,
    pp=pp,
)


--------- EPOCH  0 -----------
Epoch: 0, Batch: 0, Loss: 2420.453369
Epoch: 0, Batch: 5, Loss: 330.727173
Sentence #:  178
Word            Pred : (True)
Mild           :O     (O)
R              :O     (O)
buttock        :O     (O)
tenderness     :O     (O)
.              :O     (O)
--------- EPOCH  1 -----------
Epoch: 1, Batch: 0, Loss: 381.727234
Epoch: 1, Batch: 5, Loss: 690.594238
--------- EPOCH  2 -----------
Epoch: 2, Batch: 0, Loss: 260.386139
Epoch: 2, Batch: 5, Loss: 190.585938
--------- EPOCH3 -----------
Epoch: 3, Batch: 0, Loss: 484.448547
Epoch: 3, Batch: 5, Loss: 219.343170
--------- EPOCH  4 -----------
Epoch: 4, Batch: 0, Loss: 179.624207
Epoch: 4, Batch: 5, Loss: 184.797394
--------- EPOCH  5 -----------
Epoch: 5, Batch: 0, Loss: 188.531479
Epoch: 5, Batch: 5, Loss: 108.586884
--------- EPOCH  6 -----------
Epoch: 6, Batch: 0, Loss: 31.532297
Epoch: 6, Batch: 5, Loss: 83.518013
--------- EPOCH  7 -----------
Epoch: 7, Batch: 0, Loss: 23.492496
Epoch: 7, Batch: 5, Loss

[<tf.Tensor: shape=(), dtype=float32, numpy=4873.0864>,
 <tf.Tensor: shape=(), dtype=float32, numpy=2548.2107>,
 <tf.Tensor: shape=(), dtype=float32, numpy=1810.1182>,
 <tf.Tensor: shape=(), dtype=float32, numpy=1390.8214>,
 <tf.Tensor: shape=(), dtype=float32, numpy=902.7467>,
 <tf.Tensor: shape=(), dtype=float32, numpy=743.9336>,
 <tf.Tensor: shape=(), dtype=float32, numpy=570.60254>,
 <tf.Tensor: shape=(), dtype=float32, numpy=530.0657>,
 <tf.Tensor: shape=(), dtype=float32, numpy=465.03625>,
 <tf.Tensor: shape=(), dtype=float32, numpy=395.00317>,
 <tf.Tensor: shape=(), dtype=float32, numpy=379.0114>,
 <tf.Tensor: shape=(), dtype=float32, numpy=328.02307>,
 <tf.Tensor: shape=(), dtype=float32, numpy=269.7553>,
 <tf.Tensor: shape=(), dtype=float32, numpy=263.69153>,
 <tf.Tensor: shape=(), dtype=float32, numpy=272.52>,
 <tf.Tensor: shape=(), dtype=float32, numpy=244.89488>,
 <tf.Tensor: shape=(), dtype=float32, numpy=225.9397>,
 <tf.Tensor: shape=(), dtype=float32, numpy=212.97105>,
 