## Step 1: Setup (Before Preprocessing)

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# import preprocessing code
from src.preprocess import PreProcessor, df_to_train_set

Using TensorFlow backend.


In [2]:
# save paths to the available datasets
from typing import NamedTuple, List

class Dataset(NamedTuple):
    """
    Immutable record describing where one dataset lives on disk.

    Fields:
        title: short identifier of the dataset.
        preprocessed_folder: folder holding the preprocessed version of the data.
        raw_folders: one or more folders holding the raw data.
    """
    title: str
    preprocessed_folder: str
    raw_folders: List[str]

# Sample data.
# NOTE(review): unlike the other datasets, the raw path here is not under
# ../data/raw/ — confirm it resolves from the notebook's working directory.
SAMPLE_DATA = Dataset(
    title="sample_data",
    preprocessed_folder="../data/preprocessed/sample_data/",
    raw_folders=["docs/Track1-de-indentification/PHI/"],
)

# First gold training set only.
GOLD_1 = Dataset(
    title="gold_1",
    preprocessed_folder="../data/preprocessed/gold_1/",
    raw_folders=["../data/raw/training-PHI-Gold-Set1/"],
)

# Both gold training sets combined.
GOLD_FULL = Dataset(
    title="gold_full",
    preprocessed_folder="../data/preprocessed/gold_full/",
    raw_folders=[
        "../data/raw/training-PHI-Gold-Set1/",
        "../data/raw/training-PHI-Gold-Set2/",
    ],
)

# Gold test set.
GOLD_TEST = Dataset(
    title="gold_test",
    preprocessed_folder="../data/preprocessed/gold_test/",
    raw_folders=["../data/raw/testing-PHI-Gold-fixed/"],
)

# All known datasets, in a fixed order.
DATASETS = [SAMPLE_DATA, GOLD_1, GOLD_FULL, GOLD_TEST]

## Step 2: Preprocessing

In [3]:
# Pick the train/test datasets by name rather than by position in DATASETS.
# For a quick run on the small sample, use `train_data = SAMPLE_DATA` instead.
train_data = GOLD_FULL
test_data = GOLD_TEST
# True  -> read from the datasets' preprocessed folders;
# False -> start from the raw folders.
isLoading = True

In [4]:
# Attach the training data to a PreProcessor object. The source location is
# the preprocessed folder when loading, otherwise the list of raw folders.
pp = PreProcessor(train_data.title)
X_train, y_train, df_train = pp.get_data(
    train_data.preprocessed_folder if isLoading else train_data.raw_folders,
    isLoading=isLoading,
)
print("max length: ", pp.max_len)

Loading preprocessed data...
100%|██████████| 31535/31535 [00:03<00:00, 9944.19it/s]
100%|██████████| 31535/31535 [00:03<00:00, 9460.24it/s]
Preprocessing complete.
max length:  1567


In [5]:
# data exploration: preview the first rows of the training dataframe
# NOTE(review): the recorded output shows `Unnamed: 0` and `Unnamed: 0.1`
# columns, which looks like a dataframe index that was persisted to CSV more
# than once — confirm in src.preprocess whether these columns are intended.
df_train.head()

Unnamed: 0.1,Unnamed: 0,docid,sentence,token,token_id,label,label_id,characters
0,0,220-01,0,Record,10503,O,1,"(3, 9)"
1,1,220-01,0,date,23254,O,1,"(10, 14)"
2,2,220-01,0,:,20619,O,1,"(14, 15)"
3,3,220-01,0,2067,18377,B-DATE,42,"(16, 20)"
4,4,220-01,0,-,24073,I-DATE,18,"(20, 21)"


In [6]:
# Load the test set through the same PreProcessor instance. The source is the
# preprocessed folder when loading, otherwise the list of raw folders.
X_test, y_test, df_test = pp.create_test_set(
    test_data.preprocessed_folder if isLoading else test_data.raw_folders,
    isLoading,
    test_data.title,
)

Loading preprocessed test data...
100%|██████████| 21670/21670 [00:02<00:00, 9910.84it/s]
100%|██████████| 21670/21670 [00:02<00:00, 10054.57it/s]


In [7]:
# test data exploration: preview the first rows of the test dataframe
df_test.head()

Unnamed: 0.1,Unnamed: 0,docid,sentence,token,token_id,label,label_id,characters
0,0,110-01,0,Record,10503,O,1,"(3, 9)"
1,1,110-01,0,date,23254,O,1,"(10, 14)"
2,2,110-01,0,:,20619,O,1,"(14, 15)"
3,3,110-01,0,2069,9322,B-DATE,42,"(16, 20)"
4,4,110-01,0,-,24073,I-DATE,18,"(20, 21)"


In [8]:
# import model stuff
from src.models.baseline import BaselineModel
from src.models.bilstm import BiLSTM
from src.models.bilstm_crf import BiLSTM_CRF
from src.models.transformer import Transformer
from pipeline.visualization import sample_output
from pipeline.train import train
from random import randint
from sklearn.utils import shuffle
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
from src.converter import get_label_positions, bio_to_i2d2
import xml.etree.ElementTree as ET
from typing import NamedTuple, List

In [9]:
# Fail fast unless TensorFlow was built with CUDA and can see at least one GPU.
# `tf.config.list_physical_devices('GPU')` is used for the availability check
# because `tf.test.is_gpu_available()` is deprecated and prints a deprecation
# notice pointing to this replacement.
physical_devices = tf.config.list_physical_devices('GPU')
assert physical_devices, "TensorFlow cannot see any GPU device"
assert tf.test.is_built_with_cuda(), "TensorFlow was not built with CUDA support"
print("Num GPUs:", len(physical_devices))

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
Num GPUs: 1


In [10]:
# build model
# The alternative architectures take the same three constructor arguments;
# swap the class name to try one of them:
#   BaselineModel(...), BiLSTM(...), BiLSTM_CRF(...)
model = Transformer(
    pp.vocab_size,
    pp.tag_size,
    pp.max_len,
)

In [11]:
# configure checkpoints and checkpoint manager
# Checkpoints are stored under models/checkpoints/<dataset title>/<model title>/.
checkpoint_dir = 'models/checkpoints/' + train_data.title + '/' + model.title + '/'
# exist_ok=True creates the folder only when it is missing, which avoids the
# separate exists-then-create check (and the race between the two steps).
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint = tf.train.Checkpoint(model=model)
# max_to_keep=10 bounds how many checkpoints the manager retains.
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=10)

In [12]:
# Restore from the manager's most recent checkpoint. restore() is called even
# when there is none; the message is printed only when one was found.
latest_checkpoint = manager.latest_checkpoint
checkpoint.restore(latest_checkpoint)
if latest_checkpoint:
    print("Restored from {}".format(latest_checkpoint))

Restored from models/checkpoints/gold_full/transformer/ckpt-10


In [None]:
# train the selected model and keep the value returned by train() in `losses`
print("Training ", model.title)
losses = train(
    model,
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    lr=0.001,
    sample_interval=10,
    manager=manager,
    pp=pp,
)

In [17]:
# Show predicted vs. true labels for one training sentence.
# rand_idx=None presumably lets sample_output pick the sentence at random —
# confirm in pipeline.visualization.
sample_output(
    model,
    X_train,
    y_train,
    pp=pp,
    rand_idx=None,
)

Sentence #:  5301
Word            Pred : (True)
He             :O     (O)
underwent      :O     (O)
a              :O     (O)
bone           :O     (O)
scan           :O     (O)
in             :O     (O)
11             :B-DATE (B-DATE)
/              :I-DATE (I-DATE)
75             :I-DATE (I-DATE)
showing        :O     (O)
worrisome      :O     (O)
uptake         :O     (O)
at             :O     (O)
L              :O     (O)
5              :O     (O)
.              :O     (O)


In [18]:
# test model
# NOTE(review): this import sits apart from the notebook's other model/pipeline
# imports; consider moving it next to them.
from pipeline.test import test_to_i2d2

# Run the model over the test dataframe. Judging by the recorded output, this
# loads a checkpoint first and then iterates over 494 items — presumably one
# per test document; confirm in pipeline.test.
test_to_i2d2(model,df_test, pp, checkpoint, manager)

0%|          | 0/494 [00:00<?, ?it/s]c:\Users\abdul\Desktop\de-identification
Loading checkpoint...
100%|██████████| 494/494 [00:54<00:00,  9.01it/s]
