In [1]:
import sys
sys.path.append("..")

In [2]:
import os
import pandas as pd
import numpy as np

from src.models import models
from src.tools import utils
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_addons as tfa
from transformers import AutoTokenizer, AutoConfig, TFAutoModel

2022-04-12 18:52:46.796623: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-12 18:52:46.796694: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
  from .autonotebook import tqdm as notebook_tqdm


## Constants

In [3]:
# Data storage
data_folder = "/data"
models_folder = os.path.join(data_folder, "models")

raw_folder = os.path.join(data_folder, "raw")

patient_notes_file_path = os.path.join(raw_folder, "patient_notes.csv")
features_file_path = os.path.join(raw_folder, "features.csv")
train_file_path = os.path.join(raw_folder, "train.csv")
test_file_path = os.path.join(raw_folder, "test.csv")

dataset_columns = [
    "pn_history",
    "feature_text",
    "annotation_length",
    "location",
]

# Model params
model_name = "microsoft/deberta-base"
tokenizer_path = os.path.join(models_folder, f"{model_name}_tokenizer")
batch_size = 8
autotune = tf.data.AUTOTUNE
epochs = 20
model_checkpoint = "model.h5"
sequence_length = 512
learning_rate = 2e-5
clip_norm = 1000

## Extract

In [4]:
patient_notes_df = pd.read_csv(patient_notes_file_path)
features_df = pd.read_csv(features_file_path)

train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

### Prepare tokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(tokenizer_path)

config = AutoConfig.from_pretrained(model_name)
config.save_pretrained(tokenizer_path)

In [6]:
# model = models.create_model(model_name, sequence_length)
# model.summary()

## Transform

In [10]:
train_df = pd.merge(
    train_df, features_df, on=["feature_num", "case_num"], how="left")
train_df = pd.merge(
    train_df, patient_notes_df, on=["pn_num", "case_num"], how="left")

train_df["annotation_length"] = train_df.annotation.str.len()

train_df.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history,annotation_length
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,32
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...,29
2,00016_002,0,16,2,['chest pressure'],['203 217'],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...,18
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...,36
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],Lightheaded,HPI: 17yo M presents with palpitations. Patien...,40


### Dataset tokenitzation

In [11]:
utils.decode_location(train_df.location)

0                  [(696, 724)]
1                  [(668, 693)]
2                  [(203, 217)]
3        [(70, 91), (176, 183)]
4                  [(222, 258)]
                  ...          
14295                        []
14296                        []
14297              [(274, 282)]
14298              [(421, 437)]
14299              [(314, 330)]
Name: location, Length: 14300, dtype: object

In [12]:
def create_inputs_OLD(
    pn_history: str, feature_text: str, tokenizer, max_length=512, 
    padding="max_length", add_special_tokens = True):
    
    tokens = tokenizer(
        pn_history, 
        feature_text, 
        max_length=max_length, 
        padding=padding, 
        add_special_tokens=add_special_tokens
    )
    
    input_ids = np.array(tokens["input_ids"])
    attention_mask = np.array(tokens["attention_mask"])
    
    return input_ids, attention_mask


def decode_location_OLD(location: str): # -> List[Tuple[int]]:
    """
    This function decodes ['ab cd ...'] format of location annotations
    from dataset and return list of tuples of locations
    """
    location = location.replace("[", '')
    location = location.replace("]", '')
    location = location.replace("'", '')
    location = location.replace(",", '')
    location = location.replace(";", ' ')
    location = location.split(" ")
    if list(filter(None, location)) == []:
        return []
    
    location = list(map(int, location))
    location_tuple_list = []
    
    for i in range(0, len(location), 2):
        x1 = location[i]
        x2 = location[i+1]
        location_tuple_list.append((x1, x2))
    
    return location_tuple_list


# https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train
def create_labels(pn_history, annotation_length, location_list):
    """
    This function creates labels with are vectors of zeros (no entity)
    and ones (entity)
    """
    tokenized = tokenizer(
        pn_history,
        add_special_tokens=True,
        max_length=sequence_length,
        padding="max_length",
        return_offsets_mapping=True
    )
    
    offset_mapping = tokenized["offset_mapping"]
    label = np.zeros(len(offset_mapping))
    if annotation_length != 0:
        locations = decode_location(location_list)
        for location in locations:
            start_idx, end_idx = -1, -1
            start, end = location
            for idx in range(len(offset_mapping)):
                if (start_idx == -1) & (start < offset_mapping[idx][0]):
                    start_idx = idx - 1
                if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                    end_idx = idx + 1
            if start_idx == -1:
                start_idx = end_idx
            if (start_idx != -1) & (end_idx != -1):
                label[start_idx:end_idx] = 1
            
    return np.array(label)


def get_dataset_generator(df: pd.DataFrame):
    zipped = zip(
        df["pn_history"].values,
        df["feature_text"].values, 
        df["feature_text"].values,
        df["location"].values,
    )
    
    for pn_history, feature_text, annotation_length, location in zipped:
        inputs, masks = create_inputs(pn_history, feature_text, tokenizer)
        labels = create_labels(pn_history, annotation_length, location)
        
        yield (inputs, masks), labels
        
        
def get_dataloader(dataset_generator) -> tf.data.Dataset:
    dataloader = tf.data.Dataset.from_generator(
        dataset_generator,
        output_signature=(
            (
                tf.TensorSpec(shape=(sequence_length,), dtype=tf.dtypes.int32, name="inputs"),
                tf.TensorSpec(shape=(sequence_length,), dtype=tf.dtypes.int32, name="attention_masks"),
            ),
            tf.TensorSpec(shape=(sequence_length,), dtype=tf.dtypes.int32, name="labels"),
        )
    )

    dataloader = dataloader.batch(batch_size)
    return dataloader.prefetch(autotune)

### Split into train and test

In [None]:
train_df, val_df = train_test_split(
    train_df[dataset_columns], test_size=.2)

### Training

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    min_delta=1e-5, 
    patience=4, 
    verbose=1,
    mode='auto', 
    restore_best_weights=True
)

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', 
    factor=1e-5, 
    patience=2, 
    mode='auto', 
    min_delta=0.001
)

In [None]:
class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.f1 = tfa.metrics.F1Score(num_classes=2, average='micro', threshold=0.50)

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.reshape(y_true, (-1, sequence_length))
        y_pred = tf.reshape(y_pred, (-1, sequence_length))
        self.f1.update_state(y_true, y_pred)
        
    def reset_state(self):
        self.f1.reset_state()
    
    def result(self):
        return self.f1.result()

In [None]:
metrics = [
    F1Score(), 
    tf.keras.metrics.Recall(thresholds=[0.5]), 
    tf.keras.metrics.Precision(thresholds=[0.5])
]

callbacks = [reduce_lr, early_stopping]
optimizer = tf.keras.optimizers.Adam(learning_rate, clipnorm=clip_norm)
loss = tf.keras.losses.BinaryCrossentropy(reduction="none")

In [None]:
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics
)

history = model.fit(
    get_dataloader(lambda: get_dataset_generator(train_df)), 
    epochs=2,
    validation_data=get_dataloader(lambda: get_dataset_generator(val_df)),
    callbacks=callbacks,
)