<a href="https://colab.research.google.com/github/ameasure/colab_tutorials/blob/master/BERT_with_TF_Hub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment setup: install the bert-tensorflow package (imported later as
# `bert`) and upgrade pandas to the latest available release.
!pip install bert-tensorflow
!pip install --upgrade pandas
# Download the dataset; --no-clobber skips the download when msha.xlsx is
# already present in the working directory.
!wget --no-clobber 'https://github.com/ameasure/autocoding-class/raw/master/msha.xlsx'

Requirement already up-to-date: pandas in /usr/local/lib/python3.6/dist-packages (0.24.2)
File ‘msha.xlsx’ already there; not retrieving.



In [2]:
import pandas as pd

# Fixed seed so the sampled rows (and everything trained on them) are the
# same on every Restart & Run All.
SEED = 42
SAMPLE_SIZE = 1000

df = pd.read_excel('msha.xlsx')
# Vectorised year extraction; pd.to_datetime is a no-op when the column is
# already datetime64 and converts it when the dates were read as objects.
df['ACCIDENT_YEAR'] = pd.to_datetime(df['ACCIDENT_DT']).dt.year
# Split by time: sample the training rows from 2010-2011 accidents and the
# validation rows from 2012 accidents. Sampling before copying avoids
# duplicating the full yearly subset just to throw most of it away.
df_train = (df[df['ACCIDENT_YEAR'].isin([2010, 2011])]
            .sample(SAMPLE_SIZE, random_state=SEED)
            .copy())
df_valid = (df[df['ACCIDENT_YEAR'] == 2012]
            .sample(SAMPLE_SIZE, random_state=SEED)
            .copy())
print('training rows:', len(df_train))
print('validation rows:', len(df_valid))

training rows: 1000
validation rows: 1000


Encode the labels as integer class ids.

In [0]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on every body-part value in the full dataset, then add an
# integer LABEL column to both the training and the validation frame.
labeler = LabelEncoder()
labeler.fit(df['INJ_BODY_PART'])
for split_frame in (df_train, df_valid):
  split_frame['LABEL'] = labeler.transform(split_frame['INJ_BODY_PART'])

In [4]:
import tensorflow as tf
import keras
from keras import backend as K
import tensorflow_hub as hub
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

Using TensorFlow backend.
W0613 22:13:07.845517 139730755979136 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/optimization.py:87: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [0]:
# preprocess data, convert each row into a bert "InputExample" object
def bert_preprocess(row, axis=None):
  """Wrap one DataFrame row in a BERT ``InputExample``.

  The row's NARRATIVE is used as the single input text (there is no second
  segment) and its INJ_BODY_PART value is used as the label. ``axis`` is
  accepted but never read.
  """
  example_fields = dict(
      guid=None,
      text_a=row['NARRATIVE'],
      text_b=None,
      label=row['INJ_BODY_PART'],
  )
  return bert.run_classifier.InputExample(**example_fields)

# Row-wise conversion of both splits into Series of InputExample objects.
processed_train, processed_valid = (
    split_frame.apply(bert_preprocess, axis=1)
    for split_frame in (df_train, df_valid)
)

In [0]:
# TF Hub handle of the BERT module used by this notebook.
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

# Ask the module, inside a fresh graph, for the settings its tokenizer needs:
# the vocabulary file and the do_lower_case flag.
with tf.Graph().as_default():
  bert_module = hub.Module(BERT_MODEL_HUB)
  tokenization_info = bert_module(signature='tokenization_info', as_dict=True)
  with tf.Session() as sess:
    # Evaluate both tensors in a single run call.
    vocab_file, do_lower_case = sess.run(
        [tokenization_info['vocab_file'], tokenization_info['do_lower_case']]
    )

In [7]:
# Build the tokenizer from the vocabulary and casing flag fetched from the module.
tokenizer_settings = dict(vocab_file=vocab_file, do_lower_case=do_lower_case)
tokenizer = bert.tokenization.FullTokenizer(**tokenizer_settings)

W0613 22:13:11.776346 139730755979136 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



In [8]:
# Show how a sample narrative is split into tokens.
sample_narrative = 'EE was loading a Gabion Grizzly when he was struck by falling debris'
tokenizer.tokenize(sample_narrative)

['ee',
 'was',
 'loading',
 'a',
 'ga',
 '##bio',
 '##n',
 'gr',
 '##izzly',
 'when',
 'he',
 'was',
 'struck',
 'by',
 'falling',
 'debris']

In [9]:
# Now we'll convert our inputs to the numeric representation that BERT expects,
# a list of feature objects. Each feature object has 4 attributes:
# input_ids = a list of vocabulary ids, one per token of our narrative
# input_mask = a list of 1/0s: 1 for real tokens, 0 for the padding added to
#              reach MAX_SEQ_LENGTH (it is NOT the masked-language-model mask)
# segment_ids = a list of 1/0s indicating which sequence each token belongs to (for multi-segment tasks)
# label_id = the position of this example's label in LABELS
MAX_SEQ_LENGTH = 128
# NOTE: unique() keeps first-appearance order, so these ids are not the same
# numbering as the (sorted) LabelEncoder ids stored in the LABEL column.
LABELS = df['INJ_BODY_PART'].unique()
# Same conversion for both splits, train first and then validation.
train_features, valid_features = (
    bert.run_classifier.convert_examples_to_features(examples,
                                                     LABELS,
                                                     MAX_SEQ_LENGTH,
                                                     tokenizer)
    for examples in (processed_train, processed_valid)
)

def _split_features(features):
  """Turn a list of feature objects into four parallel lists.

  Returns (input_ids, input_mask, segment_ids, label_ids), each holding one
  entry per feature object, in the same order as ``features``.
  """
  input_ids = [feature.input_ids for feature in features]
  input_mask = [feature.input_mask for feature in features]
  segment_ids = [feature.segment_ids for feature in features]
  label_ids = [feature.label_id for feature in features]
  return input_ids, input_mask, segment_ids, label_ids


# One helper call per split instead of two copy-pasted loops.
(train_input_ids, train_input_mask,
 train_segment_ids, train_label_id) = _split_features(train_features)

(valid_input_ids, valid_input_mask,
 valid_segment_ids, valid_label_id) = _split_features(valid_features)

W0613 22:13:11.922515 139730755979136 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/run_classifier.py:774: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.



In [0]:
# Create the Keras layer which will hold our BERT model
class BertLayer(keras.layers.Layer):
    """Keras layer wrapping the TF Hub BERT module named by ``BERT_MODEL_HUB``.

    The layer takes ``[input_ids, input_mask, segment_ids]`` and returns the
    module's ``pooled_output``. Module variables whose name contains
    ``/cls/`` are never fine-tuned; of the remaining ones, only the last
    ``n_fine_tune_layers`` are registered as trainable and all others are
    registered as non-trainable.

    Args:
        n_fine_tune_layers: how many trailing module *variables* (not
            transformer blocks) to fine-tune. Zero or a negative value
            freezes the whole module.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, n_fine_tune_layers=10, **kwargs):
        # Initialise the base layer first; it sets ``trainable`` from kwargs
        # (default True), so no separate assignment is needed here.
        super(BertLayer, self).__init__(**kwargs)
        self.n_fine_tune_layers = n_fine_tune_layers
        self.output_size = 768

    def build(self, input_shape):
        """Load the hub module and split its variables into trainable/frozen."""
        self.bert = hub.Module(
            BERT_MODEL_HUB,
            trainable=self.trainable,
            name="{}_module".format(self.name)
        )
        module_vars = self.bert.variables

        # Remove unused layers
        candidates = [var for var in module_vars if "/cls/" not in var.name]

        # Select how many variables to fine tune. A plain ``candidates[-n:]``
        # with n == 0 is ``candidates[0:]`` (everything), which would unfreeze
        # the whole module instead of none of it, so guard that case.
        n_tuned = self.n_fine_tune_layers
        fine_tuned = candidates[-n_tuned:] if n_tuned > 0 else []

        # Add to trainable weights
        self._trainable_weights.extend(fine_tuned)

        # Add non-trainable weights: every module variable that was not
        # selected above (set lookup by name instead of a list scan per var).
        fine_tuned_names = {var.name for var in self._trainable_weights}
        self._non_trainable_weights.extend(
            var for var in module_vars if var.name not in fine_tuned_names
        )

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run the module's "tokens" signature and return ``pooled_output``.

        ``inputs`` is a sequence of three tensors (input ids, input mask,
        segment ids); each is cast to int32 before being passed to the module.
        """
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
            "pooled_output"
        ]
        return result

    def compute_output_shape(self, input_shape):
        """Return ``(batch, output_size)`` using the first input's batch entry."""
        return (input_shape[0], self.output_size)

    def get_config(self):
        """Include ``n_fine_tune_layers`` so a saved layer can be re-created."""
        config = super(BertLayer, self).get_config()
        config["n_fine_tune_layers"] = self.n_fine_tune_layers
        return config

In [0]:
def create_model():
  """Build and compile the BERT classifier.

  Three MAX_SEQ_LENGTH-long inputs feed a BertLayer (fine-tuning one module
  variable), followed by a 256-unit ReLU layer and a softmax over
  len(LABELS) classes. The model is compiled with Adam and sparse
  categorical cross-entropy, tracking accuracy.
  """
  # One input tensor per BERT feature, in the order BertLayer unpacks them
  input_names = ("input_ids", "input_masks", "segment_ids")
  bert_inputs = [
      keras.layers.Input(shape=(MAX_SEQ_LENGTH,), name=input_name)
      for input_name in input_names
  ]

  # Pooled sentence representation from the custom layer
  pooled = BertLayer(n_fine_tune_layers=1)(bert_inputs)

  # Classification head
  hidden = keras.layers.Dense(256, activation='relu')(pooled)
  probabilities = keras.layers.Dense(len(LABELS), activation='softmax')(hidden)

  classifier = keras.models.Model(inputs=bert_inputs, outputs=probabilities)
  classifier.compile(
      optimizer='adam',
      loss='sparse_categorical_crossentropy',
      metrics=['accuracy'],
  )
  return classifier

# Group the three BERT feature lists for each split
train_inputs = [train_input_ids, train_input_mask, train_segment_ids]
valid_inputs = [valid_input_ids, valid_input_mask, valid_segment_ids]

# Build and train the model in a fresh graph, with a session registered
# as the one Keras uses.
with tf.Graph().as_default():
  sess = tf.Session()
  K.set_session(sess)
  model = create_model()
  model.fit(
      x=train_inputs,
      y=train_label_id,
      batch_size=32,
      epochs=10,
      validation_data=(valid_inputs, valid_label_id),
  )

W0613 22:13:13.812247 139730755979136 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0613 22:13:13.818176 139730755979136 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0613 22:13:16.702840 139730755979136 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0613 22:13:16.750687 139730755979136 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3341: The name tf.log is deprecated. Please use tf.math.log instead.

W0613 22:13:16.890019 139730755979136 deprecation.py:323] From /usr/

Train on 1000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10