In [24]:
# System imports.
import os
import time
import random
import itertools
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupShuffleSplit

# TensorFlow imports.
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text # To use TensorFlow Hub this import is mandatory
from tensorflow.keras.mixed_precision import experimental as mixed_precision
from official.nlp import optimization  # to create AdamW optimizer

from datasets.loading import TrainLoader, TestLoader

# References
https://www.tensorflow.org/text/tutorials/classify_text_with_bert#loading_models_from_tensorflow_hub

# Global configuration

In [25]:
# Number of `pn_history` texts per batch of the test dataset.
BATCH_SIZE = 4
# Seed shared by every random number generator seeded in this notebook.
SEED = 42
# NOTE(review): EPOCHS is not referenced by the inference cells shown in this
# notebook — presumably a leftover from the training notebook; confirm.
EPOCHS = 1
# Sequence length handed to the BERT input packer; also the number of units
# of the final `classifier` Dense layer.
SEQ_LENGTH = 512
# Default probability cut-off used by `get_result`.
THRESHOLD = 0.5

# TF Hub handles of the BERT encoder and of its matching text preprocessor.
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/2"
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


Set reproducibility

In [3]:
# Seed Python's `random`, NumPy and TensorFlow with the shared SEED.
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so
# assigning it here does not change hash randomisation in the already-running
# kernel — confirm whether it has to be exported before the kernel is launched.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

Detect hardware

In [4]:
# Detect hardware, set appropriate distribution strategy (CPU/GPU/TPU)

physical_CPU_devices = tf.config.list_physical_devices('CPU')
physical_GPU_devices = tf.config.list_physical_devices('GPU')
physical_TPU_devices = tf.config.list_physical_devices('TPU')

try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set. On Kaggle this is always the case.
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('[INFO] Running on TPU ', TPU.master())
except Exception:
    # Any resolver failure is treated as "no TPU available" and we fall back to
    # GPU/CPU. `Exception` (rather than a bare `except:`) keeps the fallback
    # but lets KeyboardInterrupt / SystemExit propagate.
    TPU = None
    if physical_GPU_devices:
        print('[INFO] Running on GPU')
    else:
        print('[INFO] Running on CPU')


if TPU:
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    # `tf.distribute.experimental.TPUStrategy` is deprecated in favour of the
    # stable `tf.distribute.TPUStrategy`.
    strategy = tf.distribute.TPUStrategy(TPU)
    # enable XLA optimizations
    tf.config.optimizer.set_jit(True)
else:
    # default distribution strategy in Tensorflow.
    # Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()


REPLICAS = strategy.num_replicas_in_sync
print(f'[INFO] REPLICAS: {REPLICAS}')

# Set the precision policy: bfloat16 mixed precision on TPU, plain float32
# everywhere else.
mixed_precision.set_policy('mixed_bfloat16' if TPU else 'float32')

print('[INFO] Compute dtype: {}'.format(
    mixed_precision.global_policy().compute_dtype
))
print('[INFO] Variable dtype: {}'.format(
    mixed_precision.global_policy().variable_dtype
))

[INFO] Running on GPU
[INFO] REPLICAS: 1
[INFO] Compute dtype: float32
[INFO] Variable dtype: float32


# Data processing

#### Loading & merging

In [5]:
# Instantiate the project's test-data loader (imported from datasets.loading).
dl = TestLoader()
# NOTE(review): load() presumably reads the raw test tables and merge() joins
# them into `dl.data` — confirm in datasets/loading.py.
dl.load()
dl.merge()

# Display the resulting table.
dl.data

Unnamed: 0,id,case_num,pn_num,feature_num,feature_text,pn_history
0,00016_000,0,16,0,Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,Lightheaded,HPI: 17yo M presents with palpitations. Patien...


#### Transform to TensorFlow

In [18]:
print('[INFO] test data...')
# Build the inference pipeline as one chain: one-element tuples holding the
# `pn_history` text, grouped into batches, then cached and prefetched.
test_data = (
    tf.data.Dataset
    .from_tensor_slices((tf.constant(dl.data['pn_history'].to_numpy()),))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

[INFO] test data...


# Modelling

#### Preprocess model

Load it

In [8]:
# Load the TF Hub preprocessing SavedModel so that its `tokenize` and
# `bert_pack_inputs` sub-functions can be wired up individually; this is what
# lets the packed sequence length be set to SEQ_LENGTH.
preprocessor = hub.load(tfhub_handle_preprocess)

# Step 1: tokenize batches of text inputs.
# A single string input segment is used (the list has exactly one entry).
text_inputs = [tf.keras.layers.Input(shape=(), dtype=tf.string)]
tokenize = hub.KerasLayer(preprocessor.tokenize)
tokenized_inputs = [tokenize(segment) for segment in text_inputs]

# Step 3: pack input sequences for the Transformer encoder.
# NOTE(review): there is no "Step 2" here — the tokenized inputs are passed to
# the packer unchanged; the numbering presumably follows the TF Hub recipe
# referenced at the top of the notebook.
bert_pack_inputs = hub.KerasLayer(
    preprocessor.bert_pack_inputs,
    arguments=dict(seq_length=SEQ_LENGTH)  # Optional argument.
)
encoder_inputs = bert_pack_inputs(tokenized_inputs)

# Keras model mapping raw strings to the packed encoder inputs.
bert_preprocess_model_512 = tf.keras.Model(text_inputs, encoder_inputs)


In [9]:
# Stock preprocessing layer loaded directly from the TF Hub handle.
# NOTE(review): in the cells shown it is only referenced from commented-out
# code; the classifier uses `bert_preprocess_model_512` instead.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

Test it

In [10]:
# Smoke test: run one sentence through the SEQ_LENGTH-aware preprocessing
# model and inspect the first 12 positions of each returned tensor.
text_test = ['this is such an amazing movie!']
# text_preprocessed = bert_preprocess_model(text_test)
text_preprocessed = bert_preprocess_model_512(tf.constant(text_test))

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_word_ids', 'input_type_ids', 'input_mask']
Shape      : (1, 512)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


#### BERT model

Load it

In [11]:
# Stand-alone copy of the BERT encoder, used only for the smoke test below;
# `build_classifier_model` creates its own encoder layer from the same handle.
bert_model = hub.KerasLayer(tfhub_handle_encoder)

Test it

In [12]:
# Smoke test: feed the preprocessed sample sentence to the encoder and print
# the shapes and leading values of its pooled and per-position outputs.
bert_results = bert_model(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.7626282   0.9928099  -0.18611862  0.3667383   0.15233758  0.655044
  0.9681154  -0.94862705  0.0021616  -0.9877732   0.06842764 -0.97630596]
Sequence Outputs Shape:(1, 512, 512)
Sequence Outputs Values:[[-0.28946292  0.34321183  0.33231512 ...  0.21300802  0.7102092
  -0.05771042]
 [-0.28741995  0.31980985 -0.23018652 ...  0.5845511  -0.21329862
   0.72692007]
 [-0.6615692   0.68876815 -0.8743301  ...  0.1087728  -0.26173076
   0.47855455]
 ...
 [-0.22561137 -0.2892573  -0.07064426 ...  0.47566032  0.8327724
   0.40025347]
 [-0.2982421  -0.27473164 -0.05450544 ...  0.4884972   1.0955367
   0.18163365]
 [-0.4437818   0.00930662  0.07223704 ...  0.17290089  1.1833239
   0.07897975]]


#### Custom model

Build it

In [13]:
def build_classifier_model():
    """Assemble the end-to-end text classifier.

    The graph is: string input named ``text`` -> ``bert_preprocess_model_512``
    -> trainable TF Hub BERT encoder (``BERT_encoder``) -> pooled output ->
    Dropout(0.1) -> Dense layer ``classifier`` with ``SEQ_LENGTH`` units and
    no activation.

    Returns:
        tf.keras.Model: maps a batch of strings to ``SEQ_LENGTH`` raw
        (un-activated) outputs per example.
    """
    notes = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # The SEQ_LENGTH-aware preprocessing model is used here instead of the
    # stock `hub.KerasLayer(tfhub_handle_preprocess)` layer.
    packed = bert_preprocess_model_512(notes)

    bert_encoder = hub.KerasLayer(
        tfhub_handle_encoder,
        trainable=True,
        name='BERT_encoder',
    )
    pooled = bert_encoder(packed)['pooled_output']

    regularised = tf.keras.layers.Dropout(0.1)(pooled)
    head = tf.keras.layers.Dense(SEQ_LENGTH, activation=None, name='classifier')
    return tf.keras.Model(inputs=notes, outputs=head(regularised))

Load it

In [15]:
# Re-create the architecture, then restore the weights stored under
# 'output/weights/model_epoch_1'.
# NOTE(review): presumably the checkpoint written by the training notebook
# after its first epoch — confirm where this file comes from.
classifier_model = build_classifier_model()
classifier_model.load_weights('output/weights/model_epoch_1')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2174dc01460>

Display it

In [16]:
# Print the layer-by-layer structure of the restored classifier.
classifier_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 model (Functional)             {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 512),                                                      
                                 'input_type_ids':                                                
                                (None, 512),                                                      
                                 'input_mask': (Non                                               
                                e, 512)}                                                    

In [39]:
def get_result(predictions, threshold=None):
    """Turn per-position scores into ``"start end;start end"`` span strings.

    For every row of ``predictions`` the positions whose score is
    ``>= threshold`` are collected (1-based), consecutive positions are merged
    into runs, and each run is rendered as ``"<first> <last>"``. Runs are
    joined with ``";"``. A row with no position above the threshold yields an
    empty string.

    Args:
        predictions: iterable of 1-D score vectors (a 2-D array, or a list of
            arrays / plain sequences).
        threshold: inclusive cut-off. ``None`` (the default) means "use the
            module-level ``THRESHOLD``", looked up at call time so that
            re-running the configuration cell takes effect without having to
            re-run this definition.

    Returns:
        list[str]: one span string per row of ``predictions``.
    """
    if threshold is None:
        threshold = THRESHOLD

    results = []
    for pred in predictions:
        # 1-based positions whose score reaches the threshold (ascending).
        positions = np.nonzero(np.asarray(pred) >= threshold)[0] + 1
        if positions.size == 0:
            results.append("")
            continue
        # A new run starts wherever two neighbouring positions are not
        # consecutive.
        run_starts = np.nonzero(np.diff(positions) != 1)[0] + 1
        runs = np.split(positions, run_starts)
        results.append(";".join(f"{run[0]} {run[-1]}" for run in runs))
    return results

In [40]:
# Raw model outputs for every batch of `test_data`.
pred = classifier_model.predict(test_data)
# The classifier head has no activation, so a sigmoid is applied here before
# thresholding. Note that `pred` is rebound from raw outputs to sigmoid values.
pred = tf.sigmoid(pred).numpy()
# NOTE(review): the literal 0.01 overrides the configured THRESHOLD (0.5) —
# confirm whether this low cut-off is intentional or a debugging leftover.
get_result(pred, 0.01)

['2 108;113 114;118 120;125 127;130 130;135 135;142 142;144 145;148 148;174 174;193 193;196 196;204 204;210 210;380 380',
 '2 108;113 114;118 120;125 127;130 130;135 135;142 142;144 145;148 148;174 174;193 193;196 196;204 204;210 210;380 380',
 '2 108;113 114;118 120;125 127;130 130;135 135;142 142;144 145;148 148;174 174;193 193;196 196;204 204;210 210;380 380',
 '2 108;113 114;118 120;125 127;130 130;135 135;142 142;144 145;148 148;174 174;193 193;196 196;204 204;210 210;380 380',
 '2 108;113 114;118 120;125 127;130 130;135 135;142 142;144 145;148 148;174 174;193 193;196 196;204 204;210 210;380 380']

#### Submission

In [None]:
# Build the submission table before writing it: `submission` was never defined
# in this notebook, so the export raised NameError on a fresh run.
# `pred` holds one row per row of `dl.data` (the test dataset is built from
# `dl.data['pn_history']` without shuffling), so the span strings line up with
# the ids.
# NOTE(review): spans use get_result's default THRESHOLD — confirm that this is
# the cut-off wanted for the final submission.
submission = dl.data[['id']].assign(location=get_result(pred))
submission[['id', 'location']].to_csv('submission.csv', index=False)