In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.backend import sparse_categorical_crossentropy
import tensorflow_hub as hub
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
import re

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

W0803 06:55:51.877785 140067398080320 deprecation_wrapper.py:119] From /home/brianmusisi/anaconda3/lib/python3.7/site-packages/bert/optimization.py:87: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [2]:
with open('ner_dataset/final_entity_sentences.csv', 'r') as f:
  entity_sentences = f.readlines()
  
with open('ner_dataset/final_sentences.csv', 'r') as f:
  sentences = f.readlines()

In [3]:
# Tensorflow hub path to BERT module
bert_url = "https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1"

#Maximum token length
max_length = 30

#### Create Tokenizer

In [4]:
def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(bert_url)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
      
    return tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()

W0803 06:56:35.891704 140067398080320 deprecation_wrapper.py:119] From /home/brianmusisi/anaconda3/lib/python3.7/site-packages/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



#### Functions that convert to features

In [5]:
def convert_example_to_features(example, max_len, tokenizer):
  
  example_tokens = tokenizer.tokenize(example)
  
  if len(example_tokens)> max_len -2:
    example_tokens = example_tokens[: (max_len -2)]
   
  tokens = ['[CLS]']
  tokens = tokens + example_tokens + ['[SEP]']
  tokens=  tokens + ['[PAD]'] * (max_len - len(tokens))
  
  segment_ids = [0 for token in tokens]
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  input_masks = [1] * (len(example_tokens) + 1) + [0] * (max_len - len(example_tokens) -1)
  
  return input_ids, input_masks, segment_ids


def convert_example_list_to_features(tokenizer, examples, max_len):
  input_ids = []
  input_masks = []
  segment_ids = []
  labels = []
  
  for example in examples:
    input_id, input_mask, segment_id = convert_example_to_features(example, max_len, tokenizer)
    input_ids.append(input_id)
    input_masks.append(input_mask)
    segment_ids.append(segment_id)
  
  return np.array(input_ids), np.array(input_masks), np.array(segment_ids)

In [6]:
f = convert_example_list_to_features(tokenizer, sentences[4], 30)

In [7]:
f[0][0]

array([101, 157, 102,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0])

In [43]:
def create_ner_tokens(entity_sentence, max_len):
  ner_tokens = ['[nerCLS]']
  entity_sentence = entity_sentence.replace('OTHER', 'O')
  entity_sentence = re.sub('INVESTOR|STARTUP', 'B-org', entity_sentence)
  tokens = entity_sentence.split()
  
  if len(tokens) > max_len -2:
    tokens = tokens[: (max_len - 2)]
    
  ner_tokens = ner_tokens + tokens + ['[nerSEP]']
  ner_tokens = ner_tokens + ['[nerPAD]'] * (max_len - len(ner_tokens))
  return ner_tokens

def get_tokens_for_list(entity_sentences, max_len):
  ner_tokens_list = []
  for sentence in entity_sentences:
    ner_tokens = create_ner_tokens(sentence, max_len)
    ner_tokens_list.append(ner_tokens)
  
  return np.array(ner_tokens_list)

In [44]:
get_tokens_for_list(['OTHER OTHER STARTUP'], 10)

array([['[nerCLS]', 'O', 'O', 'B-org', '[nerSEP]', '[nerPAD]',
        '[nerPAD]', '[nerPAD]', '[nerPAD]', '[nerPAD]']], dtype='<U8')

### Convert Data into format BERT can use

In [45]:
max_len = 30

ner_tokens = get_tokens_for_list(entity_sentences, max_len)
ner_tokens[2]

array(['[nerCLS]', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-org',
       'O', 'O', 'O', 'O', '[nerSEP]', '[nerPAD]', '[nerPAD]', '[nerPAD]',
       '[nerPAD]', '[nerPAD]', '[nerPAD]', '[nerPAD]', '[nerPAD]',
       '[nerPAD]', '[nerPAD]', '[nerPAD]', '[nerPAD]', '[nerPAD]',
       '[nerPAD]'], dtype='<U8')

In [46]:
ner_id_dict = {'STARTUP':0, 'INVESTOR': 1, 'O': 2, '[nerCLS]': 3, '[nerSEP]': 4, '[nerPAD]':5 }

In [47]:
new_ner_id_dict = {'B-org':0, 'O': 1, '[nerCLS]': 2, '[nerSEP]': 3, '[nerPAD]':4 }

In [49]:
ner_labels =[]

for sent in ner_tokens:
  sent_ids = []
  for token in sent:
    sent_ids.append(new_ner_id_dict[token])
  ner_labels.append(sent_ids)

In [13]:
(input_ids, input_masks, segment_ids) = convert_example_list_to_features(tokenizer, sentences, max_len)

In [50]:
 ner_labels = np.array(ner_labels)

In [59]:
def custom_loss(y_true, y_pred):
    """
    calculate loss function explicitly, filtering out 'extra inserted labels'
    
    y_true: Shape: (batch x (max_length + 1) )
    y_pred: predictions. Shape: (batch x x (max_length + 1) x num_distinct_ner_tokens ) 
    
    returns:  cost
    """

    #get labels and predictions
    
    y_label = tf.reshape(tf.layers.Flatten()(tf.cast(y_true, tf.int32)),[-1])
    
    mask = (y_label < 2)   # This mask is used to remove all tokens that do not correspond to the original base text.

    y_label_masked = tf.boolean_mask(y_label, mask)  # mask the labels
    
    y_flat_pred = tf.reshape(tf.layers.Flatten()(tf.cast(y_pred, tf.float32)),[-1, 5])
    
    y_flat_pred_masked = tf.boolean_mask(y_flat_pred, mask) # mask the predictions
    
    return tf.reduce_mean(sparse_categorical_crossentropy(y_label_masked, y_flat_pred_masked,from_logits=False ))


def custom_acc_orig_tokens(y_true, y_pred):
    """
    calculate loss dfunction filtering out also the newly inserted labels
    
    y_true: Shape: (batch x (max_length) )
    y_pred: predictions. Shape: (batch x x (max_length + 1) x num_distinct_ner_tokens ) 
    
    returns: accuracy
    """

    #get labels and predictions
    
    y_label = tf.reshape(tf.layers.Flatten()(tf.cast(y_true, tf.int64)),[-1])
    
    mask = (y_label < 2)
    y_label_masked = tf.boolean_mask(y_label, mask)
    
    y_predicted = tf.math.argmax(input = tf.reshape(tf.layers.Flatten()(tf.cast(y_pred, tf.float64)),\
                                                    [-1, 5]), axis=1)
    
    y_predicted_masked = tf.boolean_mask(y_predicted, mask)

    return tf.reduce_mean(tf.cast(tf.equal(y_predicted_masked,y_label_masked) , dtype=tf.float64))
  

def custom_acc_orig_non_other_tokens(y_true, y_pred):
  """
  calculate loss dfunction explicitly filtering out also the 'Other'- labels

  y_true: Shape: (batch x (max_length) )
  y_pred: predictions. Shape: (batch x x (max_length + 1) x num_distinct_ner_tokens ) 

  returns: accuracy
  """

  #get labels and predictions

  y_label = tf.reshape(tf.layers.Flatten()(tf.cast(y_true, tf.int64)),[-1])

  mask = (y_label < 1)
  y_label_masked = tf.boolean_mask(y_label, mask)

  y_predicted = tf.math.argmax(input = tf.reshape(tf.layers.Flatten()(tf.cast(y_pred, tf.float64)),\
                                                  [-1, 5]), axis=1)

  y_predicted_masked = tf.boolean_mask(y_predicted, mask)

  return tf.reduce_mean(tf.cast(tf.equal(y_predicted_masked,y_label_masked) , dtype=tf.float64))

#### Split data into datasets for training, validation and testing

In [51]:
train_size = round(0.8 * len(input_ids))

dev_end = train_size + round(0.1 * train_size)

In [52]:
shuffle = np.random.permutation(len(input_ids))

input_ids = input_ids[shuffle]
input_masks = input_masks[shuffle]
segment_ids = segment_ids[shuffle]
ner_labels = segment_ids[shuffle]

In [53]:
train_input_ids, train_input_masks, train_segment_ids =   input_ids[:train_size], input_masks[:train_size], segment_ids[:train_size]
dev_input_ids, dev_input_masks, dev_segment_ids =   input_ids[train_size:dev_end], input_masks[train_size:dev_end], segment_ids[train_size:dev_end]
test_input_ids, test_input_masks, test_segment_ids =   input_ids[dev_end:], input_masks[dev_end:], segment_ids[dev_end:]

In [54]:
train_labels = ner_labels[:train_size]
dev_labels = ner_labels[train_size:dev_end]
test_labels = ner_labels[dev_end:]

### Creating the model

In [20]:

def initialize_session(sess):
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)

In [21]:
class BertLayer(tf.keras.layers.Layer):
    """
    Create BERT layer, following https://towardsdatascience.com/bert-in-keras-with-tensorflow-hub-76bcbc9417b
    init:  initialize layer. Specify various parameters regarding output types and dimensions. Very important is
           to set the number of trainable layers.
    build: build the layer based on parameters
    call:  call the BERT layer within a model
    """
    
    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="sequence",
        bert_url="https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_url = bert_url

        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(
            self.bert_url, trainable=self.trainable, name=f"{self.name}_module"
        )

        # Remove unused layers
        trainable_vars = self.bert.variables
        trainable_vars = [
                var
                for var in trainable_vars
                if not "/cls/" in var.name and not "/pooler/" in var.name
            ]
        trainable_layers = []


        # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]

        mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)

        return result

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

In [22]:
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

In [60]:
def ner_model(max_input_length, train_layers):
    """
    Implementation of NER model
    
    variables:
        max_input_length: number of tokens (max_length + 1)
        train_layers: number of layers to be retrained
        optimizer: optimizer to be used
    
    returns: model
    """
    
    in_id = tf.keras.layers.Input(shape=(max_length,), name="input_ids")
    in_mask = tf.keras.layers.Input(shape=(max_length,), name="input_masks")
    in_segment = tf.keras.layers.Input(shape=(max_length,), name="segment_ids")
    
    
    bert_inputs = [in_id, in_mask, in_segment]
    
    bert_sequence = BertLayer(n_fine_tune_layers=train_layers)(bert_inputs)
    
    print(bert_sequence)
    
    dense = tf.keras.layers.Dense(256, activation='relu', name='dense')(bert_sequence)
    
    dense = tf.keras.layers.Dropout(rate=0.1)(dense)
    
    pred = tf.keras.layers.Dense(21, activation='softmax', name='ner')(dense)
     
    print('pred: ', pred)
    
    ## Prepare for multipe loss functions, although not used here
    
    losses = {
        "ner": custom_loss,
        }
    lossWeights = {"ner": 1.0
                  }
    
    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    
    #model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[custom_acc_orig_tokens])

    model.compile(loss=losses, optimizer='adam', metrics=[custom_acc_orig_tokens, 
                                                         custom_acc_orig_non_other_tokens])
    
    
    model.summary()
    
    return model

In [61]:
model = ner_model(max_len, train_layers=4)

Tensor("bert_layer_4/bert_layer_4_module_apply_tokens/bert/encoder/Reshape_13:0", shape=(?, ?, 768), dtype=float32)
pred:  Tensor("ner_4/truediv:0", shape=(?, ?, 21), dtype=float32)
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 30)]         0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 30)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 30)]         0                                            
__________________________________________________________________________________________________
bert_laye

In [62]:
sess = tf.Session()
initialize_session(sess)

In [63]:
history = model.fit(
    [train_input_ids, train_input_masks, train_segment_ids], 
    {"ner": train_labels },
    validation_data=([dev_input_ids, dev_input_masks, dev_segment_ids], {"ner": dev_labels }),
    epochs=10,
    batch_size=128
    #callbacks=[tensorboard]
)

Train on 9226 samples, validate on 923 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
