# Introduction

Named Entity Recognition(NER) is the process of identifying named entities in text. Some examples of named entities are: 'Person', 'Time', 'Location', 'Organization'. NER is essentially a token classification task where every token is classified into one or more predetermined categories.

In this exercise, I will train a simple Transformer based model to perform NER. I will use data from the Kaggle competition: 'NBME - Score Clinical Patient Notes'

In [1]:
import pandas as pd
import os
import re
import numpy as np
import tensorflow as tf
import string
from tensorflow import keras
from tensorflow.keras import layers
from datasets import load_dataset
from collections import Counter
from acquire import prep_and_split_data
from acquire import basic_clean_v2
from acquire import basic_clean_features
from conlleval import evaluate  

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization


I will use the transformer implementation from this fantastic [example](https://keras.io/examples/nlp/text_classification_with_transformer/)

# Define a TransformerBlock layer:

In [2]:
class TransformerBlock(layers.Layer):
  def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
    super(TransformerBlock, self).__init__()
    self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.ffn = keras.Sequential(
        [layers.Dense(ff_dim, activation='relu'), layers.Dense(embed_dim)]
    )
    self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = layers.Dropout(rate)
    self.dropout2 = layers.Dropout(rate)

  def call(self, inputs, training):
    attn_output = self.att(inputs, inputs)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(inputs + attn_output)
    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output, training=training)
    return self.layernorm2(out1 + ffn_output)

# Define a TokenAndPositionEmbedding layer:

I will use the `TokenAndPositionEmbedding` layer from [ner transformer](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/ner_transformers.ipynb#scrollTo=YY0STTK9sKP3):

In [3]:
class TokenAndPositionEmbedding(layers.Layer):
  def __init__(self, maxlen, vocab_size, embed_dim):
    super(TokenAndPositionEmbedding, self).__init__()
    self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
    self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

  def call(self, inputs):
    maxlen = tf.shape(inputs)[-1]
    positions = tf.range(start=0, limit=maxlen, delta=1)
    position_embeddings = self.pos_emb(positions)
    token_embeddings = self.token_emb(inputs)
    return token_embeddings + position_embeddings

# Build the NER model class as a `keras.Model` subclass

In [31]:
class NERModel(keras.Model):
    def __init__(
        self, num_tags, vocab_size, maxlen=170, embed_dim=170, num_heads=3, ff_dim=170
    ):
        super(NERModel, self).__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation='relu')
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags)

    def call(self, inputs, training=False):
        x = self.embedding_layer(inputs)
        x = self.transformer_block(x)
        x = self.dropout1(x, training=training)
        x = self.ff(x)
        x = self.dropout2(x, training=training)
        x = self.ff_final(x)
        return x
    

# Load the data from 'features.csv' and 'patient_notes.csv'.

In [5]:
train, validate, test = prep_and_split_data()

Number of rows in training set: 37931
Number of rows in validation set: 2108
Number of rows in test set: 2107


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


# Here I will manually create a word embedding layer.

### Using the Embedding layer

In [6]:
# Embed a batch of 7000 words with ovservations of length 163.
embedding_layer = tf.keras.layers.Embedding(7000, 163)

### Text preprocessing

Find a reasonable number for the vocab size and sequence length.

In [7]:
unique_words = pd.Series(' '.join(train.basic_clean_v2).split()).unique()

In [8]:
word_counts = pd.Series(' '.join(train.basic_clean_v2).split()).value_counts()

In [9]:
word_counts[word_counts >= 10]

,          429737
.          372297
:          178467
and        126653
no         114796
            ...  
values         10
wee            10
pots           10
(when          10
familiy        10
Length: 7215, dtype: int64

There are roughly 7000 vocabulary words.

Here I will define the dataset preprocessing steps. I will initialize a TextVectorization layer with my desired parameters to vectorize the student notes.

In [10]:
# Create a custom standardization function.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')


# Vocabulary size and number of words in a sequence.
vocab_size = 7000
sequence_length = 163

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Note that the layer uses the custom standardization defined above.
# Set maximum_sequence length as all samples are not of the same length.
vectorize_layer = TextVectorization(
    standardize = None,
    max_tokens = vocab_size,
    output_mode = 'int',
    output_sequence_length = sequence_length)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
text = [x for x in train.basic_clean_v2]
text_ds = tf.data.Dataset.from_tensor_slices(text)
vectorize_layer.adapt(text_ds)

2022-02-27 18:05:16.397205: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
vectorize_layer(train.basic_clean_v2)

<tf.Tensor: shape=(37931, 163), dtype=int64, numpy=
array([[ 323,   46,   74, ..., 1421,    2,   81],
       [2488,  188,    7, ...,  130,   39,    2],
       [  41,   15,   11, ...,    0,    0,    0],
       ...,
       [1029,    3,  148, ...,   56,    9,  281],
       [ 226,   46,  116, ...,    6,  372,   94],
       [ 323,   46,   74, ...,    2,  357,    3]])>

In [12]:
encoded_notes = vectorize_layer(text).numpy()

In [13]:
text_split = []
for note in text:
    text_split.append(note.split())

### Export the data to a tab-separated file format which will be easy to read as a `tf.data.Dataset` object.

In [14]:
def export_to_file(export_file_path, data):
    with open(export_file_path, "w") as f:
        for i in np.arange(len(data)):
            ner_tags = encoded_notes[i]
            tokens = text_split[i]
            f.write(
                str(len(tokens))
                + "\t"
                + "\t".join(tokens)
                + "\t"
                + "\t".join(map(str, ner_tags))
                + "\n"
            )

# export_to_file("./data/train.txt", train)

### Maybe set the targets as the 'ner_labels'
Make a single list fo all unique targets.

In [15]:
list_of_targets = []
for list_of_ailments in train.targets:
    for ailment in list_of_ailments:
        list_of_targets.append(ailment)

In [16]:
list_of_targets = pd.Series(list_of_targets).unique()

In [17]:
list_of_targets[:5]

array(['increased appetite', 'son died 3 weeks ago', 'female',
       'auditory hallucination once', 'tossing and turning'], dtype=object)

# Make the NER label lookup table

NER labels are usually provided in IOB, IOB2 or IOBES formats. Checkout this link for more information: [Wikipedia]("https://en.wikipedia.org/wiki/Inside–outside–beginning_(tagging")

I will start the label numbering from 1 since 0 is reserved for padding. 

In [18]:
def make_tag_lookup_table():
    iob_labels = ['B', 'I']
    ner_labels = list_of_targets
    all_labels = [(label1, label2) for label2 in ner_labels for label1 in iob_labels]
    all_labels = ['-'.join([a,b]) for a, b in all_labels]
    all_labels = ['[PAD]', 'O'] + all_labels
    return dict(zip(range(0, len(all_labels) + 1), all_labels))

mapping = make_tag_lookup_table()
print(mapping)


{0: '[PAD]', 1: 'O', 2: 'B-increased appetite', 3: 'I-increased appetite', 4: 'B-son died 3 weeks ago', 5: 'I-son died 3 weeks ago', 6: 'B-female', 7: 'I-female', 8: 'B-auditory hallucination once', 9: 'I-auditory hallucination once', 10: 'B-tossing and turning', 11: 'I-tossing and turning', 12: 'B-67 year', 13: 'I-67 year', 14: 'B-difficulty falling asleep', 15: 'I-difficulty falling asleep', 16: 'B-hallucinations after taking ambien', 17: 'I-hallucinations after taking ambien', 18: 'B-duration 3 weeks', 19: 'I-duration 3 weeks', 20: 'B-unsuccessful napping', 21: 'I-unsuccessful napping', 22: 'B-sleeping medication ineffective', 23: 'I-sleeping medication ineffective', 24: 'B-diminished energy or feeling drained', 25: 'I-diminished energy or feeling drained', 26: 'B-loss of interest', 27: 'I-loss of interest', 28: 'B-visual hallucination once', 29: 'I-visual hallucination once', 30: 'B-fhx of depression or family history of depression', 31: 'I-fhx of depression or family history of de

### Get  a list of all tokens in the training dataset. This will be used to create the vocabulary.

In [19]:
all_tokens = text_split
all_tokens_array = np.array(' '.join(pd.Series(text)).split())

counter = Counter(all_tokens_array)
print(len(counter))

num_tags = len(mapping)
print(num_tags)
vocab_size = 7000

vocabulary = [token for token, count in counter.most_common(vocab_size - 2)]

# The StringLook class will convert tokens to token IDs
lookup_layer = keras.layers.StringLookup(
    vocabulary=vocabulary
)

54380
264


In [20]:
pd.Series(all_tokens_array)

0               67
1               yo
2                f
3          present
4             with
            ...   
6224087       etoh
6224088     drinks
6224089       2-3x
6224090         wk
6224091          .
Length: 6224092, dtype: object

In [21]:
vocabulary[:10]

[',', '.', ':', 'and', 'no', 'with', 'she', 'of', 'the', 'a']

## Create 2 new `Dataset` objects from the training and validation data

In [22]:
train_data = tf.data.TextLineDataset('./data/train.txt')

In [23]:
train_data

<TextLineDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

### Print out one line to make sure it looks good. The first record in the line is the number of tokens. After that I will have all the tokens followed by all the ner tags.

In [24]:
list(train_data.as_numpy_iterator())[1]

b'186\t20yof\tpresenting\twith\tha\t.\tshe\tsaid\tthe\tha\tstarted\tyesterday\tand\thas\tbeen\ta\tconstant\tdull\tpain\tthat\tis\tworsening\t.\tshe\tsays\tthe\tha\tinvolved\ther\twhole\thead\t.\tshe\thas\ttried\ttaking\tibuprofen\t,\ttylenol\t,\tand\tresting\tfor\tthe\tha\tand\tthese\thave\tnot\tbeen\thelpful\t.\tshe\tsays\tthe\tha\tis\tworse\twith\twalking\tand\tbending\tover\t.\tshe\tsays\tshe\thas\thad\tnausea\tand\thas\tvomited\t3x\t.\tshe\tis\thaving\tphotophobia\tbut\tno\tphonophobia\t.\tshe\tsays\tshe\thas\ta\trunny\tnose\tbut\tdenies\tany\teye\ttearing\tor\tredness\t.\tshe\tsaid\tshe\tfelt\twarm\tbut\tdid\tnot\tactually\ttake\ther\ttemperature\t.\tshe\tdenies\tany\tvisual\tchanges\t,\tcp\t,\tsob\t.\tshe\tdenies\tany\trecent\tillnesses\tor\thead\ttrauma\t.\tshe\thas\tnever\thad\tthis\thappen\tbefore\t.\tros\t:\tnegative\texcept\thpi\tpmh\t:\tnone\tmeds\t:\tocp\tnkda\tfh\t:\tmom-\tmigraines\tsh\t:\tno\ttobacco\tuse\t,\tetoh\t2-3\tweek\t,\tsmokes\tmarijuana\t3-4\ttimes\tweek\t,\ts

### I will use the following map function to transform the data in the dataset:

In [35]:
def map_record_to_training_data(record):
    record = tf.strings.split(record, sep='\t')
    length = tf.strings.to_number(record[0], out_type=tf.int32)
    tokens = record[1 : length + 1]
    tags = record[length + 1 :]
    tags = tf.strings.to_number(tags, out_type=tf.int64)
    tags += 1
    return tokens, tags

def lowercase_and_convert_to_ids(tokens):
    tokens = tf.strings.lower(tokens)
    return lookup_layer(tokens)

# I use `padded_batch` here because each record in the dataset has a different length.
batch_size = 169
train_dataset = (
    train_data.map(map_record_to_training_data)
    .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))
    .padded_batch(batch_size)
)

ner_model = NERModel(num_tags, vocab_size, embed_dim=170, num_heads = 3, ff_dim = 170)

In [36]:
train_dataset

<PaddedBatchDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.int64, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

### I will use a custom loss function that will ignore the loss from padded tokens.

In [37]:
class CustomNonPaddingTokenLoss(keras.losses.Loss):
    def __init__(self, name='custom_ner_loss'):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=keras.losses.Reduction.NONE
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

loss = CustomNonPaddingTokenLoss()


## Compile and fit the model

In [38]:
ner_model.compile(optimizer='adam', loss = loss)
ner_model.fit(train_dataset, epochs = 10)

def tokenize_and_convert_to_ids(text):
    tokens = text.split()
    return lowercase_and_convert_to_ids(tokens)

# Sample inference using the trained model
sample_input = tokenize_and_convert_to_ids(
    '67 yo f present with trouble sleeping .  pt states that it began 3 weeks ago and describes as difficulty falling asleep ,  waking up early ,  and tossing and turning at night .  never had this problem before ,  tried taking a friends ambien ,  and it did not help .  pts son died on aug 17 in an mva .  pt states that she has a sad mood ,  loss of interest in activites ,  decreased energy level ,  difficulty sleeping ,  increased appetite ,  and hallucinations of seeing son ,  and hearing people next door .     ros :  negative except for above  allergies :  none  meds hctz 25 mg qd ,  lisinopril 20 mg qd  pmh :  htn (15 years) ,  in remission for breast cancer (10 years) ,  ruptured appendicitis  pshx :  lumpectomy ,  laperatomy in 20s  fh :  father with htn ,  hypercholesterolemia ,  died of stroke ,  mother with dm  sh :  married (45 years) ,  drinks etoh 2-3 times per week (-) cage ,  good support system ,  denies tobacco and recreational drug use'
)
sample_input = tf.reshape(sample_input, shape=[1, -1])
print(sample_input)

output = ner_model.predict(sample_input)
prediction = np.argmax(output, axis=-1)[0]
prediction = [mapping[i] for i in prediction]

print(prediction)

Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node 'ner_model_2/token_and_position_embedding_2/embedding_6/embedding_lookup' defined at (most recent call last):
    File "/usr/local/anaconda3/lib/python3.8/runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/usr/local/anaconda3/lib/python3.8/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "/usr/local/anaconda3/lib/python3.8/site-packages/traitlets/config/application.py", line 845, in launch_instance
      app.start()
    File "/usr/local/anaconda3/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 677, in start
      self.io_loop.start()
    File "/usr/local/anaconda3/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/usr/local/anaconda3/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
      self._run_once()
    File "/usr/local/anaconda3/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
      handle._run()
    File "/usr/local/anaconda3/lib/python3.8/asyncio/events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 457, in dispatch_queue
      await self.process_one()
    File "/usr/local/anaconda3/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 446, in process_one
      await dispatch(*args)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 353, in dispatch_shell
      await result
    File "/usr/local/anaconda3/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 648, in execute_request
      reply_content = await reply_content
    File "/usr/local/anaconda3/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 353, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2901, in run_cell
      result = self._run_cell(
    File "/usr/local/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2947, in _run_cell
      return runner(coro)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3172, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/usr/local/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3364, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "/usr/local/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/jl/s3ptdwdx55v01d2g2wrs7vdc0000gn/T/ipykernel_7779/3145190542.py", line 2, in <module>
      ner_model.fit(train_dataset, epochs = 10)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1384, in fit
      tmp_logs = self.train_function(iterator)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1021, in train_function
      return step_function(self, iterator)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1010, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1000, in run_step
      outputs = model.train_step(data)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 859, in train_step
      y_pred = self(x, training=True)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/engine/base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/var/folders/jl/s3ptdwdx55v01d2g2wrs7vdc0000gn/T/ipykernel_7779/832956628.py", line 14, in call
      x = self.embedding_layer(inputs)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/engine/base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/var/folders/jl/s3ptdwdx55v01d2g2wrs7vdc0000gn/T/ipykernel_7779/3882870019.py", line 10, in call
      position_embeddings = self.pos_emb(positions)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/engine/base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/layers/embeddings.py", line 197, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'ner_model_2/token_and_position_embedding_2/embedding_6/embedding_lookup'
indices[170] = 170 is not in [0, 170)
	 [[{{node ner_model_2/token_and_position_embedding_2/embedding_6/embedding_lookup}}]] [Op:__inference_train_function_123030]