In [48]:
# Handle depencies 

In [49]:
!pip3 install --upgrade pip
!pip3 install -r requirements.txt



# Named Entity Recognition
@author: Abdullahi S. Adamu

In [50]:
import os
import shutil
import pandas as pd
from glob import glob
import seaborn as sns
from matplotlib import pylab as plt
import tensorflow as tf
# depenciy handling
if tf.__version__ != '2.3.0':
    !pip install tensorflow-gpu==2.3.0
    
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization 

In [51]:
tf.get_logger().setLevel('INFO')

In [52]:
print(f'tensorflow version :{tf.__version__}')

tensorflow version :2.3.0


## Read Classes as Look up Table

In [53]:
def read_classes_as_tf_lookup_table(filepath='./storage/classes.txt'):
    """
    Reads classes text file and creates as tensorflow lookup table 
    
    params:
    - filepath - path to class lookup table
    
    returns:
    - table (tf.lookup.StaticHashTable) - lookup table for classes
    
    """
    init = tf.lookup.TextFileInitializer(filename=filepath,
                            key_dtype=tf.string, key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
                            value_dtype=tf.int64, value_index=tf.lookup.TextFileIndex.LINE_NUMBER)
    # initialise  lookup table
    table = tf.lookup.StaticHashTable(initializer=init, default_value=-1, name='class_lookup')
    return table


In [54]:
# read classes as lookup
tf_class_lookup = read_classes_as_tf_lookup_table()

In [55]:
# verify lookup table works as expected
assert tf_class_lookup.lookup(tf.constant('Building')).numpy() == 6 

In [56]:
# load dataset
dataset = pd.read_csv('./storage/data.csv')

In [57]:
dataset.head()

Unnamed: 0,Class,Name
0,1,E. D. Abbott Ltd
1,1,Schwan-Stabilo
2,1,Q-workshop
3,1,Marvell Software Solutions Israel
4,1,Bergan Mercy Medical Center


## Preprocessing 

In [58]:
# define stopwords from exploratory analysis
stopwords = [')', '(', '.', ':', '.']

# date check function 
is_date =  lambda token:  token.isnumeric() and len(token) == 4 and int(token) < 2999
# is numeric token 
is_other_numeric = lambda token: token.isnumeric() and len(token) != 4 

def preprocessor(token):
    """
    preprocess tokens to reudce dimensionality of the 
    vocabulary as explained in our exploratory analysis
    
    params:
        token (string) - token from text
    returns:
        processed token (string) - processed token
    """
    if is_date(token):
        return '<DATE>'
    elif is_other_numeric:
        return '<NUMBER>'
    else:
        return token.lower().strip()

In [59]:
from sklearn.model_selection import train_test_split

output_dir='./ner'
train_file_path = os.path.join(output_dir, 'train.csv')
val_file_path = os.path.join(output_dir, 'val.csv')
test_file_path = os.path.join(output_dir, 'test.csv')

def train_val_test_split(dataset, target_var, save=True, output_dir='./ner'):
    """performs train, val and test split on the given dataset"""
    
    # Train-Test Split
    train_val_df, test_df, _, _ = train_test_split(dataset,dataset[target_var], test_size=0.1)

    # Train-Val Split
    train_df, val_df,_,_ = train_test_split(train_val_df, train_val_df[target_var], test_size=0.2)
    
    
    if not os.path.exists(output_dir) and save:
        os.mkdir(output_dir)
    
    # show sizes for train test and 
    print(f'train size: {len(train_df)}')
    print(f'val size: {len(val_df)}')
    print(f'test size: {len(test_df)}')
    
    if save:
        test_df.to_csv(test_file_path)
        train_df.to_csv(train_file_path)
        val_df.to_csv(val_file_path)

In [60]:
# run train, val and test split and store to an output dir for reproducibility
train_val_test_split(dataset, 'Class')

train size: 395126
val size: 98782
test size: 54879


## Read Tensorflow Datasets

In [61]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
batch_size=64
target_class = 'Class'

In [62]:
# read train dataset and batch 
train_batch = tf.data.experimental.make_csv_dataset(
    train_file_path, batch_size=batch_size,
    label_name=target_class).cache().prefetch(buffer_size=AUTOTUNE)



In [63]:
def test_tf_dataset(tf_dataset):
    for feature_batch, label_batch in tf_dataset.take(1):
        assert len(label_batch) == batch_size
        assert max(label_batch) <= 14
        
test_tf_dataset(train_batch)
    
        

## Using Small BERT for NER

Here we are going to use a smaller transformer architecture (SmallBERT), which enables us to finetune the model alot of faster compared to the full architecture (i.e. BERT) which has more transformer blocks, hencer more hyperparameters.

In [64]:
tf.get_logger().setLevel('INFO')
# bert model map
model_map = {
    "small_bert/bert_en_uncased_L-4_H-512_A-8": "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"
}

# bert preprocessor map
preprocssor_map = {
    "small_bert/bert_en_uncased_L-4_H-512_A-8": "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1"
}

class BERTForNER(tf.keras.Model):
    """
    Define our BERForNER Model
    """
    def __init__(self, bert_architecture='small_bert/bert_en_uncased_L-4_H-512_A-8', num_classes=14):
        super(BERTForNER, self).__init__()
        self.bert_architecture = bert_architecture
        self.preprocessor = self.init_preprocessor(self.bert_architecture)
        self.bert_model = self.init_bert_model(self.bert_architecture)
        self.input_layer = tf.keras.layers.Input(shape=(None,), dtype=tf.string, name='sentences')
        self.mlp_l1 = tf.keras.layers.Dense(10, activation='swish', name='mlp_hidden_layer_1')
        self.mlp_dropout = tf.keras.layers.Dropout(0.1)
        self.mlp_l2 = tf.keras.layers.Dense(10, activation='swish', name='mlp_hidden_layer_2')
        self.class_prob = tf.keras.layers.Dense(num_classes, activation='softmax', name='class_prob')
        
        
    def init_preprocessor(self, bert_architecture):
        tf_hub_handle = preprocssor_map.get(bert_architecture, None)
        if tf_hub_handle:
            return hub.KerasLayer(tf_hub_handle, name='preprocessing')
        else:
            raise Exception(f"Could not find {bert_architecture} in preprocossor map")
    
    def init_bert_model(self, bert_architecture):
        tf_hub_handle = preprocssor_map.get(bert_architecture, None)
        if tf_hub_handle:
            return hub.KerasLayer(tf_hub_handle, trainable=True, name=f'{bert_architecture}')
        else:
            raise Exception(f"Could not find {bert_architecture} in model map")
      
    def call(self, inputs):
        inputs = self.input_layer(inputs)
        encoder_inputs = self.preprocessor(self.input_layer)
        bert_pooled_output = self.bert_model(encoder_inputs)['pooled_output']
        out = self.mlp_l1(bert_pooled_output)
        out = self.mlp_dropout(out)
        mlp_out = self.mlp_l2(out)
        class_prob = self.class_prob(mlp_out)
        
        return class_prob
    
    
def build_classifier_model(bert_architecture='small_bert/bert_en_uncased_L-4_H-512_A-8', num_classes=14):
    text_input = tf.keras.layers.Input(shape=(None,), dtype=tf.string, name='sentences')
    preprocessing_layer = hub.KerasLayer(preprocssor_map.get(bert_architecture), name='preprocessor')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(model_map.get(bert_architecture), trainable=True, name='SmallBERT')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    
    return tf.keras.Model(text_input, net)

In [65]:
small_bert_ner = build_classifier_model()









ValueError: in user code:

    /usr/local/lib/python3.6/dist-packages/tensorflow_hub/keras_layer.py:235 call  *
        result = smart_cond.smart_cond(training,
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/saved_model/load.py:509 _call_attribute  **
        return instance.__call__(*args, **kwargs)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py:780 __call__
        result = self._call(*args, **kwds)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py:823 _call
        self._initialize(args, kwds, add_initializers_to=initializers)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py:697 _initialize
        *args, **kwds))
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py:2855 _get_concrete_function_internal_garbage_collected
        graph_function, _, _ = self._maybe_define_function(args, kwargs)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py:3213 _maybe_define_function
        graph_function = self._create_graph_function(args, kwargs)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py:3075 _create_graph_function
        capture_by_value=self._capture_by_value),
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py:986 func_graph_from_py_func
        func_outputs = python_func(*func_args, **func_kwargs)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py:600 wrapped_fn
        return weak_wrapped_fn().__wrapped__(*args, **kwds)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/saved_model/function_deserialization.py:257 restored_function_body
        "\n\n".join(signature_descriptions)))

    ValueError: Could not find matching function to call loaded from the SavedModel. Got:
      Positional arguments (3 total):
        * Tensor("inputs:0", shape=(None, None), dtype=string)
        * False
        * None
      Keyword arguments: {}
    
    Expected these arguments to match one of the following 4 option(s):
    
    Option 1:
      Positional arguments (3 total):
        * TensorSpec(shape=(None,), dtype=tf.string, name='sentences')
        * False
        * None
      Keyword arguments: {}
    
    Option 2:
      Positional arguments (3 total):
        * TensorSpec(shape=(None,), dtype=tf.string, name='inputs')
        * True
        * None
      Keyword arguments: {}
    
    Option 3:
      Positional arguments (3 total):
        * TensorSpec(shape=(None,), dtype=tf.string, name='inputs')
        * False
        * None
      Keyword arguments: {}
    
    Option 4:
      Positional arguments (3 total):
        * TensorSpec(shape=(None,), dtype=tf.string, name='sentences')
        * True
        * None
      Keyword arguments: {}


In [21]:
#small_bert_ner.summary()

In [22]:
#small_bert_ner(tf.constant("Hello there"))

In [23]:
# small_bert_ner.compile(optimizer='Adam',
#                          loss='categorical_cross_entropy',
#                          metrics=['categorical_cross_entropy', 'accuracy'])

## Visualise our Model

In [None]:
# small_bert_ner.fit(train_batch)

In [None]:
# tf.keras.utils.plot_model(small_bert_ner)