In [174]:
# System imports.
import time
import numpy as np
from tqdm import tqdm

# TensorFlow imports.
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

from datasets.loading import DataLoader

# References
https://www.tensorflow.org/text/tutorials/classify_text_with_bert#loading_models_from_tensorflow_hub

# Global configuration

In [45]:
BATCH_SIZE = 4
SEED = 42

# Data processing

#### Loading & merging

In [3]:
dl = DataLoader()
dl.load()
dl.merge()

dl.train

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history,annotation_length
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,1
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...,1
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...,1
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...,2
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...,1
...,...,...,...,...,...,...,...,...,...
14295,95333_912,9,95333,912,[],[],Family-history-of-migraines,Stephanie madden is a 20 year old woman compla...,0
14296,95333_913,9,95333,913,[],[],Female,Stephanie madden is a 20 year old woman compla...,0
14297,95333_914,9,95333,914,[photobia],[274 282],Photophobia,Stephanie madden is a 20 year old woman compla...,1
14298,95333_915,9,95333,915,[no sick contacts],[421 437],No-known-illness-contacts,Stephanie madden is a 20 year old woman compla...,1


#### Transform to TensorFlow

In [177]:
def create_label(df_annotation, df_history, df_annotation_length,
                 tfhub_handle_preprocess, seq_length=500):
    """Densify location using tokenization.

    Parameters
    ----------
    df_annotation : ndarray
        Target of the task. Can be none, one or more.

    df_history : ndarray
        Text from which to extract annotation(s).

    df_annotation_length : ndarray
        Number of annotations per history.

    seq_length : int
        Maximum number of words in an tokenized history.

    Returns
    -------
    label : Tensor
        Contains 0 if word is not part of the annotation and 1 otherwise.

    Notes
    -----
    About ~7mn of elapsed time. Can increase the third for loop from ~100mn to
    ~10ms using tf.math functions. WIP.
    ```python
    out_2_sum = tf.math.reduce_sum(tf.cast(tf.concat([
        out_2['input_word_ids'] == tf.slice(out_1_extracted, [0, idx], [1, 1])
        for idx in range(out_1_lenght-2)
    ], axis=0), tf.int8), axis=0)
    [...]
    ```

    References
    ----------
    The `seq_length` variable is by default set to 500 according to `this
    notebook <https://www.kaggle.com/yasufuminakama/
    nbme-deberta-base-baseline-train?scriptVersionId=87264998&cellId=25>` where
    the max lenght was found to be 466.
    """

    # Load processing function
    preprocessor = hub.load(tfhub_handle_preprocess)

    # Step 1: tokenize batches of text inputs.
    text_inputs = [tf.keras.layers.Input(shape=(), dtype=tf.string)]
    tokenize = hub.KerasLayer(preprocessor.tokenize)

    # Step 3: pack input sequences for the Transformer encoder.
    seq_length = 456  # Your choice here.
    bert_pack_inputs = hub.KerasLayer(
        preprocessor.bert_pack_inputs,
        arguments=dict(seq_length=seq_length)  # Optional argument.
    )

    # Fill label with 0
    label = np.zeros(shape=(len(df_annotation), max_lenght))

    # Loop over row in locations vector
    for row_idx in tqdm(range(len(df_annotation))):

        # Stop if there is no annotoration
        if df_annotation_length[row_idx] == 0:
            continue

        # Loop over location in the current row
        history = df_history[row_idx]
        for annotation in df_annotation[row_idx]:
            out_1 = bert_pack_inputs([
                tokenize(segment) for segment in [tf.constant([annotation])]
            ])
            out_1_lenght = tf.math.reduce_sum(out_1['input_mask'])
            out_1_extracted = tf.slice(
                out_1['input_word_ids'], [0, 1], [1, out_1_lenght-2]
            )

            out_2 = bert_pack_inputs([
                tokenize(segment) for segment in [tf.constant([history])]
            ])
            out_2_lenght = tf.math.reduce_sum(out_2['input_mask'])

            # This can be improved
            start_idx, idx_annotation = 0, 0
            for idx_history in range(out_2_lenght):
                word = tf.slice(
                    out_2['input_word_ids'], [0, idx_history], [1, 1]
                )
                if word == tf.slice(out_1_extracted, [0, idx_annotation], [1, 1]):
                    if idx_annotation == 0:
                        start_idx = idx_history
                    idx_annotation += 1
                    if idx_annotation == out_1_lenght.numpy()-2:
                        break
                else:
                    idx_annotation = 0
            label[row_idx][start_idx:start_idx+idx_annotation] = 1

    return tf.constant(label)

100%|██████████| 14300/14300 [07:19<00:00, 32.57it/s]


In [10]:
train_data = tf.data.Dataset.from_tensor_slices((
    tf.constant(dl.train['pn_history']),
    create_label(
        dl.train['annotation'], dl.train['pn_history'],
        dl.train['annotation_length']
    )
))
train_data = train_data.batch(BATCH_SIZE)
train_data = train_data.cache()
train_data = train_data.prefetch(buffer_size=tf.data.AUTOTUNE)

Test it

In [31]:
for text_batch, label_batch in train_data.take(1):
    print(f'history : {text_batch.numpy()[0]}')
    print(bert_preprocess_model(text_batch), end='\n\n')
    print(bert_pack_inputs(text_batch), end='\n\n')
    print(f'location: {label_batch.numpy()[0]}')


history : b'HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of "heart beating/pounding out of my chest." 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\nPMHx: none\r\nRx: uses friends adderrall\r\nFHx: mom with "thyroid disease," dad with recent heart attcak\r\nAll: none\r\nImmunizations: up to date\r\nSHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms'
{'input_word_ids': <tf.Tensor: shape

ValueError: Exception encountered when calling layer "keras_layer_6" (type KerasLayer).

Could not find matching concrete function to call loaded from the SavedModel. Got:
  Positional arguments (2 total):
    * Tensor("inputs:0", shape=(32,), dtype=string)
    * 128
  Keyword arguments: {}

 Expected these arguments to match one of the following 4 option(s):

Option 1:
  Positional arguments (2 total):
    * [RaggedTensorSpec(TensorShape([None, None]), tf.int32, 1, tf.int64)]
    * TensorSpec(shape=(), dtype=tf.int32, name='seq_length')
  Keyword arguments: {}

Option 2:
  Positional arguments (2 total):
    * [RaggedTensorSpec(TensorShape([None, None]), tf.int32, 1, tf.int64), RaggedTensorSpec(TensorShape([None, None]), tf.int32, 1, tf.int64)]
    * TensorSpec(shape=(), dtype=tf.int32, name='seq_length')
  Keyword arguments: {}

Option 3:
  Positional arguments (2 total):
    * [RaggedTensorSpec(TensorShape([None, None, None]), tf.int32, 2, tf.int64), RaggedTensorSpec(TensorShape([None, None, None]), tf.int32, 2, tf.int64)]
    * TensorSpec(shape=(), dtype=tf.int32, name='seq_length')
  Keyword arguments: {}

Option 4:
  Positional arguments (2 total):
    * [RaggedTensorSpec(TensorShape([None, None, None]), tf.int32, 2, tf.int64)]
    * TensorSpec(shape=(), dtype=tf.int32, name='seq_length')
  Keyword arguments: {}

Call arguments received:
  • inputs=tf.Tensor(shape=(32,), dtype=string)
  • training=None

# Modelling

#### Settings

Choose a model

In [6]:
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


#### Preprocess model

Load it

In [7]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

Test it

In [8]:
text_test = ['this is such an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_word_ids', 'input_mask', 'input_type_ids']
Shape      : (1, 128)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


#### Bert model

Load it

In [17]:
bert_model = hub.KerasLayer(tfhub_handle_encoder)

Test it

In [6]:
bert_results = bert_model(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.7626287   0.99280983 -0.18611823  0.3667384   0.15233725  0.6550442
  0.9681154  -0.948627    0.0021617  -0.9877732   0.0684269  -0.97630596]
Sequence Outputs Shape:(1, 128, 512)
Sequence Outputs Values:[[-0.28946355  0.34321266  0.33231547 ...  0.2130081   0.71020746
  -0.05771154]
 [-0.28742057  0.31980976 -0.23018584 ...  0.5845509  -0.21329743
   0.7269206 ]
 [-0.6615704   0.68876845 -0.87433034 ...  0.10877228 -0.2617323
   0.47855324]
 ...
 [-0.22561148 -0.28925583 -0.07064399 ...  0.47566003  0.83277017
   0.4002542 ]
 [-0.2982427  -0.2747311  -0.05450499 ...  0.48849723  1.0955354
   0.18163374]
 [-0.44378164  0.0093075   0.07223718 ...  0.17290066  1.1833248
   0.07898001]]


#### Custom model

Build it

In [7]:
def build_classifier_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

Test it

In [8]:
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(text_test))
print(tf.sigmoid(bert_raw_result))

tf.Tensor([[0.8607125]], shape=(1, 1), dtype=float32)


Display it

In [11]:
classifier_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

#### Model compilation

In [16]:
epochs = 5
steps_per_epoch = tf.data.experimental.cardinality(train_data).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw'
)

NameError: name 'train_ds' is not defined

Compiling BERT

In [13]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

In [14]:
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

NameError: name 'optimizer' is not defined

#### Training