In [23]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

from datasets.loading import DataLoader

# References
- [Classify text with BERT (TensorFlow tutorial): loading models from TensorFlow Hub](https://www.tensorflow.org/text/tutorials/classify_text_with_bert#loading_models_from_tensorflow_hub)

# Global configuration

In [34]:
# Number of examples grouped into each batch of the tf.data pipeline.
BATCH_SIZE = 32
# Single seed shared by every random number generator used in this notebook.
SEED = 42

# Apply the seed immediately: layers created later (e.g. the randomly
# initialised classifier head) otherwise produce different values on every
# Restart & Run All.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Data processing

#### Loading & merging

In [19]:
# Instantiate the project DataLoader and run its two preparation steps.
# NOTE(review): presumably load() reads the raw files and merge() joins them
# into a single frame; confirm in datasets.loading.
dl = DataLoader()
dl.load()
dl.merge()

# Bare last expression so the notebook renders the `train` attribute.
dl.train

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history,annotation_length
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,1
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...,1
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...,1
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...,2
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...,1
...,...,...,...,...,...,...,...,...,...
14295,95333_912,9,95333,912,[],[],Family-history-of-migraines,Stephanie madden is a 20 year old woman compla...,0
14296,95333_913,9,95333,913,[],[],Female,Stephanie madden is a 20 year old woman compla...,0
14297,95333_914,9,95333,914,[photobia],[274 282],Photophobia,Stephanie madden is a 20 year old woman compla...,1
14298,95333_915,9,95333,915,[no sick contacts],[421 437],No-known-illness-contacts,Stephanie madden is a 20 year old woman compla...,1


#### Transform to TensorFlow

In [49]:
# Length (via len) of every `pn_history` entry of the training frame.
all_history_lenght = dl.train['pn_history'].apply(len)

# The longest entry fixes the width of the label vectors built later on.
max_text_lenght = np.max(all_history_lenght)
mean_history_length = np.mean(all_history_lenght)
std_history_length = np.std(all_history_lenght)

print('Max history lenght : {}'.format(max_text_lenght))
print('Mean history lenght: {:.3f} +/- {:.3f}'.format(mean_history_length, std_history_length))

Max history lenght : 950
Mean history lenght: 816.795 +/- 135.128


In [50]:
def create_label(df_location, df_annotation_length, max_lenght):
    """Build a binary character-level mask for every row of annotations.

    Each row of the result has ``max_lenght`` entries; entry ``i`` is 1 when
    offset ``i`` lies inside one of the row's annotated spans and 0 otherwise.

    Rows are paired by position, not by index label, so a pandas Series whose
    index is not ``0..n-1`` (for example after filtering or a train/validation
    split) is handled exactly like a plain list.

    Args:
        df_location: Sequence with one entry per row. Each entry is an
            iterable of location strings such as ``'70 91'`` or
            ``'70 91;176 183'``: ``;`` separates spans and whitespace
            separates the start offset from the exclusive end offset.
        df_annotation_length: Sequence aligned with ``df_location``; rows
            whose value is 0 keep an all-zero mask.
        max_lenght: Width of each mask. Spans reaching past it are truncated
            by the slice assignment.

    Returns:
        A ``tf.constant`` of shape ``(len(df_location), max_lenght)`` holding
        0/1 values as float64.

    Raises:
        ValueError: If the two sequences do not have the same length.
    """
    n_rows = len(df_location)
    if len(df_annotation_length) != n_rows:
        raise ValueError(
            'df_location and df_annotation_length must have the same length '
            '({} != {})'.format(n_rows, len(df_annotation_length))
        )

    # Start from an all-zero mask; only annotated offsets are switched to 1.
    label = np.zeros(shape=(n_rows, max_lenght))

    # Iterate over the values themselves: `series[row_idx]` is a label lookup
    # and fails (or hits the wrong row) when the index is not 0..n-1.
    rows = zip(df_location, df_annotation_length)
    for row_idx, (locations, annotation_count) in enumerate(rows):

        # Skip rows without any annotation; their mask stays all zeros.
        if annotation_count == 0:
            continue

        for location in locations:
            # A single location string may hold several ';'-separated spans.
            for segment in location.split(';'):
                bounds = segment.split()
                # Ignore blank segments (empty string or trailing ';').
                if not bounds:
                    continue
                start_idx, end_idx = int(bounds[0]), int(bounds[1])
                label[row_idx, start_idx:end_idx] = 1
    return tf.constant(label)

In [51]:
# Inputs: the raw `pn_history` strings. Targets: the masks from create_label,
# as wide as the longest history.
history_tensor = tf.constant(dl.train['pn_history'])
label_tensor = create_label(dl.train['location'], dl.train['annotation_length'], max_text_lenght)

# One chained pipeline: slice into (text, label) pairs, batch, cache the
# batched elements, then prefetch with an auto-tuned buffer.
train_data = (
    tf.data.Dataset.from_tensor_slices((history_tensor, label_tensor))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

Test it

In [52]:
# Pull a single batch and show its first (text, label) pair as a sanity check.
for text_batch, label_batch in train_data.take(1):
    first_history = text_batch.numpy()[0]
    first_location = label_batch.numpy()[0]
    print(f'history : {first_history}')
    print(f'location: {first_location}')


history : b'HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of "heart beating/pounding out of my chest." 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\nPMHx: none\r\nRx: uses friends adderrall\r\nFHx: mom with "thyroid disease," dad with recent heart attcak\r\nAll: none\r\nImmunizations: up to date\r\nSHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms'
location: [0. 0. 0. 0. 0. 0. 0. 0. 0

# Modelling

#### Settings

Choose a model

In [2]:
# TF Hub handle of the encoder; per the handle name this is the uncased
# English small BERT variant L-4 / H-512 / A-8.
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"
# TF Hub handle of the bert_en_uncased text preprocessing model.
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

# Echo both handles so the selected pair is recorded in the cell output.
# NOTE: despite the "auto-selected" wording, the preprocessing handle is
# hard-coded above.
print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


#### Preprocess model

Load it

In [3]:
# Wrap the TF Hub preprocessing model as a Keras layer; it is called directly
# on a list of raw strings in the next cell.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

Test it

In [4]:
# Single sample sentence used to smoke-test the preprocessing layer (and,
# further down, the encoder and the classifier).
text_test = ['this is such an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)

# The layer returns a dict of tensors; show its keys, the shape of the word
# ids and the first 12 positions of each tensor for the sample sentence.
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_word_ids', 'input_type_ids', 'input_mask']
Shape      : (1, 128)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


#### Bert model

Load it

In [17]:
# Wrap the TF Hub encoder as a Keras layer; it consumes the dict produced by
# the preprocessing layer (see the test cell below).
bert_model = hub.KerasLayer(tfhub_handle_encoder)

Test it

In [6]:
# Run the encoder on the preprocessed sample sentence.
bert_results = bert_model(text_preprocessed)

# Inspect the two outputs read here: `pooled_output` and `sequence_output`.
# For each, print the shape and the first 12 entries of the first example.
print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.7626287   0.99280983 -0.18611823  0.3667384   0.15233725  0.6550442
  0.9681154  -0.948627    0.0021617  -0.9877732   0.0684269  -0.97630596]
Sequence Outputs Shape:(1, 128, 512)
Sequence Outputs Values:[[-0.28946355  0.34321266  0.33231547 ...  0.2130081   0.71020746
  -0.05771154]
 [-0.28742057  0.31980976 -0.23018584 ...  0.5845509  -0.21329743
   0.7269206 ]
 [-0.6615704   0.68876845 -0.87433034 ...  0.10877228 -0.2617323
   0.47855324]
 ...
 [-0.22561148 -0.28925583 -0.07064399 ...  0.47566003  0.83277017
   0.4002542 ]
 [-0.2982427  -0.2747311  -0.05450499 ...  0.48849723  1.0955354
   0.18163374]
 [-0.44378164  0.0093075   0.07223718 ...  0.17290066  1.1833248
   0.07898001]]


#### Custom model

Build it

In [7]:
def build_classifier_model():
    """Assemble the end-to-end Keras model: raw text in, one logit out.

    The graph chains the TF Hub preprocessing layer, the trainable TF Hub
    BERT encoder, a dropout layer on the encoder's ``pooled_output`` and a
    single-unit dense head without activation.

    Returns:
        A ``tf.keras.Model`` mapping a batch of strings (input named
        ``'text'``) to the raw output of the ``'classifier'`` layer.
    """
    # Scalar string input: one raw text per example.
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Text -> encoder input dict.
    tokenized = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')(raw_text)

    # Encoder weights are fine-tuned together with the head (trainable=True).
    bert_encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    pooled = bert_encoder(tokenized)['pooled_output']

    # Dropout, then a linear head; no activation so the model emits logits.
    regularized = tf.keras.layers.Dropout(0.1)(pooled)
    logits = tf.keras.layers.Dense(1, activation=None, name='classifier')(regularized)
    return tf.keras.Model(raw_text, logits)

Test it

In [8]:
# Build the model and run it once on the sample sentence as a smoke test.
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(text_test))
# The head has no activation, so apply a sigmoid to view the raw output as a
# value in (0, 1).
print(tf.sigmoid(bert_raw_result))

tf.Tensor([[0.8607125]], shape=(1, 1), dtype=float32)


Display it

In [11]:
# Print the layer-by-layer summary of the assembled model.
classifier_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

#### Model compilation

In [16]:
# Hyper-parameters of the fine-tuning run.
epochs = 5
init_lr = 3e-5
warmup_fraction = 0.1

# Schedule length: number of batches in the training set times the epochs;
# the first `warmup_fraction` of those steps is used as warm-up.
steps_per_epoch = tf.data.experimental.cardinality(train_data).numpy()
num_train_steps = epochs * steps_per_epoch
num_warmup_steps = int(warmup_fraction * num_train_steps)

# AdamW optimizer from the Model Garden helper imported at the top.
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

NameError: name 'train_ds' is not defined

Compiling BERT

In [13]:
# The classifier head has no activation, so both the loss and the metric must
# interpret the model output as logits.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# BinaryAccuracy thresholds the raw predictions. On logits the p = 0.5
# decision boundary sits at 0.0; the default threshold of 0.5 would only count
# predictions with probability above ~0.62 as positive.
metrics = tf.metrics.BinaryAccuracy(threshold=0.0)

In [14]:
# Attach the optimizer, loss and metric defined in the previous cells; those
# cells must have run successfully first.
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

NameError: name 'optimizer' is not defined

#### Training