In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
import os
from sklearn.model_selection import train_test_split
import numpy as np
import tf_metrics

In addition to the standard libraries we imported above, we'll need to install BERT's python package.

In [2]:
!pip install bert-tensorflow

[33mYou are using pip version 19.0.2, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

Below, we'll set an output directory location to store our model output and checkpoints. This can be a local directory, in which case you'd set OUTPUT_DIR to the name of the directory you'd like to create. If you're running this code in Google's hosted Colab, the directory won't persist after the Colab session ends.

Alternatively, if you're a GCP user, you can store output in a GCP bucket. To do that, set a directory name in OUTPUT_DIR and the name of the GCP bucket in the BUCKET field.

Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist).

In [4]:
# Set the output directory for saving model file
# Optionally, set a GCP bucket location

OUTPUT_DIR = "/Users/ellieking/Documents/content-similarity/BERT/bert_experiment_output"#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = False #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = False #@param {type:"boolean"}
BUCKET = 'BUCKET_NAME' #@param {type:"string"}

if USE_BUCKET:
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
    # Doesn't matter if the directory didn't exist
    pass
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


***** Model output directory: /Users/ellieking/Documents/content-similarity/BERT/bert_experiment_output *****


#Data

In [5]:
OUTPUT_DIR

'/Users/ellieking/Documents/content-similarity/BERT/bert_experiment_output'

First, let's download the dataset, hosted by Stanford. The code below, which downloads, extracts, and imports the IMDB Large Movie Review Dataset, is borrowed from [this Tensorflow tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub).

In [6]:
labelled = pd.read_csv('/Users/ellieking/Documents/govuk-taxonomy-supervised-learning/data/2019-02-11/labelled.csv.gz', 
                       compression='gzip', 
                       low_memory=False)

In [7]:
business = labelled[labelled['level1taxon']=='Business and industry'].copy()

In [8]:
business.shape

(52715, 19)

In [9]:
business = business.assign(level=np.where(business.level5taxon.notnull(), 5, 0))
business.loc[business['level4taxon'].notnull() & business['level5taxon'].isnull(
), 'level'] = 4
business.loc[business['level3taxon'].notnull() & business['level4taxon'].isnull(
), 'level'] = 3
business.loc[business['level2taxon'].notnull() & business['level3taxon'].isnull(
), 'level'] = 2
business.loc[business['level1taxon'].notnull() & business['level2taxon'].isnull(
), 'level'] = 1


In [10]:
deep = business[business['level']>2]

In [11]:
deep.shape

(21363, 20)

In [23]:
deep.to_csv('/Users/ellieking/Documents/govuk-taxonomy-supervised-learning/data/2019-02-11/business_deep.csv.gz', 
            compression='gzip', 
            index=False)

In [12]:
train, test = train_test_split(deep, test_size=0.33, random_state=42, stratify=deep['level'])

To keep training fast, we'll take a sample of 5000 train and test examples, respectively.

In [13]:
train.columns

Index(['base_path', 'content_id', 'description', 'document_type',
       'first_published_at', 'locale', 'primary_publishing_organisation',
       'publishing_app', 'title', 'body', 'combined_text', 'taxon_id',
       'taxon_base_path', 'taxon_name', 'level1taxon', 'level2taxon',
       'level3taxon', 'level4taxon', 'level5taxon', 'level'],
      dtype='object')

For us, our input data is the 'sentence' column and our label is the 'polarity' column (0, 1 for negative and positive, respecitvely)

In [14]:
DATA_COLUMN = 'combined_text'
LABEL_COLUMN = 'taxon_id'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = list(deep.taxon_id.unique())

In [15]:
len(label_list)

81

In [17]:
train.taxon_id.nunique()

80

In [19]:
train.content_id.nunique()

9413

In [20]:
train.shape

(14313, 20)

In [18]:
test.taxon_id.nunique()

81

#Data Preprocessing
We'll need to transform our data into a format BERT understands. This involves two steps. First, we create  `InputExample`'s using the constructor provided in the BERT library.

- `text_a` is the text we want to classify, which in this case, is the `Request` field in our Dataframe. 
- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.
- `label` is the label for our example, i.e. True, False

In [15]:
# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):


1. Lowercase our text (if we're using a BERT lowercase model)
2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
4. Map our words to indexes using a vocab file that BERT provides
5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))

Happily, we don't have to worry about most of these details.




To start, we'll need to load a vocabulary file and lowercasing information directly from the BERT tf hub module:

In [16]:
# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Get the vocab file and casing info from the Hub module."""
  with tf.Graph().as_default():
    bert_module = hub.Module(BERT_MODEL_HUB)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
      
  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()

INFO:tensorflow:Using /var/folders/jy/47p744c95hz67738zkn74rwr0002j9/T/tfhub_modules to cache modules.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Great--we just learned that the BERT model we're using expects lowercase data (that's what stored in tokenization_info["do_lower_case"]) and we also loaded BERT's vocab file. We also created a tokenizer, which breaks words into word pieces:

Using our tokenizer, we'll call `run_classifier.convert_examples_to_features` on our InputExamples to convert them into features BERT understands.

In [None]:
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128
# Convert our train and test features to InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

In [None]:
train_features[0].label_id

In [None]:
train_features[0].input_ids

In [None]:
train_features[0].input_mask

#Creating a model

Now that we've prepared our data, let's focus on building a model. `create_model` does just this below. First, it loads the BERT tf hub module again (this time to extract the computation graph). Next, it creates a single new layer that will be trained to adapt BERT to our sentiment task (i.e. classifying whether a movie review is positive or negative). This strategy of using a mostly trained model is called [fine-tuning](http://wiki.fast.ai/index.php/Fine_tuning).

For sequence-level classification tasks, BERT fine-tuning is straightforward. 

To obtain a fixed-dimensional pooled representation of the input sequence, we take the final hidden state (ie the output of the transformer) for the first token in the input, which by construction corresponds to the special [CLS] word embedding. We denote this vector as $C\in \mathbb{R}^{H}$, where $H$ is the hidden size. `output layer`

The only new parameters added during fine-tuning are for a classification layer $W \in \mathbb{R}^{K x H}$, where $K$ is the number of classification labels.`output_weights`

The label probabilities $P \in \mathbb{R}^{K}$ are computed with a standard softmax, $P = softmax(CW)^{T}$ `log_probs`

All of the parameters of BERT and $W$ `output_weights` are fine-tuned jointly to maximise the log-probability of the correct label. 

The follwoing hyperparameter vlaues were found to work well across tasks:
- *Batch size*: 16, 32 
- *Learning rate (Adam)*: 5e-3, 3e-5, 2e-5
- *Number of epochs*: 3, 4

In [180]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
  """Creates a classification model."""

  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  # Use "sequence_outputs" for token-level output.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # Create our own layer to fine-tune.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting
    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # Convert labels into one-hot encoding
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
    one_hot_predicted_labels = tf.one_hot(predicted_labels, depth=num_labels, dtype=tf.float32)

    # If we're predicting, we want predicted labels and the probabiltiies.
    if is_predicting:
      return (predicted_labels, log_probs, one_hot_predicted_labels, one_hot_labels)

    # If we're train/eval, compute loss between predicted and actual label
    loss = tf.losses.sparse_softmax_cross_entropy(labels, log_probs)
#     per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
#     loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs, one_hot_predicted_labels, one_hot_labels)


Next we'll wrap our model function in a `model_fn_builder` function that adapts our model to work for training, evaluation, and prediction.

In [182]:
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns `model_fn` closure for TPUEstimator."""
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)
    
    # TRAIN and EVAL
    if not is_predicting:

      (loss, predicted_labels, log_probs, one_hot_predicted_labels, one_hot_labels) = create_model(
        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)

      # Calculate evaluation metrics. 
      def metric_fn(one_hot_labels, one_hot_predicted_labels):

        pos_indices = 1   
        average = 'micro'

        # Tuple of (value, update_op)
        precision = tf_metrics.precision(
            one_hot_labels, one_hot_predicted_labels, num_labels, pos_indices, average=average)
        recall = tf_metrics.recall(
            one_hot_labels, one_hot_predicted_labels, num_labels, pos_indices, average=average)
        f2 = tf_metrics.fbeta(
            one_hot_labels, one_hot_predicted_labels, num_labels, pos_indices, average=average, beta=2)
        f1 = tf_metrics.f1(
            one_hot_labels, one_hot_predicted_labels, num_labels, pos_indices, average=average)

        return {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'f2': f2
        }

      eval_metrics = metric_fn(label_ids, predicted_labels)

      if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(mode=mode,
          loss=loss,
          train_op=train_op)
      else:
          return tf.estimator.EstimatorSpec(mode=mode,
            loss=loss,
            eval_metric_ops=eval_metrics)
    else:
      (predicted_labels, log_probs, one_hot_predicted_labels, one_hot_labels) = create_model(
        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      predictions = {
          'probabilities': log_probs,
          'labels': predicted_labels,
          'y_true': one_hot_labels,
          'y_pred': one_hot_predicted_labels,

      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

  # Return the actual model function in the closure
  return model_fn


In [20]:
# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

In [21]:
# Compute # train and warmup steps from batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [22]:
# Specify outpit directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [183]:
model_fn = model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})


INFO:tensorflow:Using config: {'_model_dir': '/Users/ellieking/Documents/content-similarity/BERT/bert_experiment_output', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1722289b0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


Next we create an input builder function that takes our training feature set (`train_features`) and produces a generator. This is a pretty standard design pattern for working with Tensorflow [Estimators](https://www.tensorflow.org/guide/estimators).

In [27]:
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

Now we train our model! For me, using a Colab notebook running on Google's GPUs, my training time was about 14 minutes.

In [24]:
# print(f'Beginning Training!')
# current_time = datetime.now()
# estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
# print("Training took time ", datetime.now() - current_time)

Now let's use our test data to see how well our model did:

In [25]:
test_input_fn = run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

In [None]:
estimator.evaluate(input_fn=test_input_fn, steps=None)

In [163]:
test_preds = estimator.predict(input_fn=test_input_fn)

In [164]:
pred_list = list(test_preds)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/ellieking/Documents/content-similarity/BERT/bert_experiment_output/model.ckpt-1341
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [167]:
pred_list[0]

{'probabilities': array([-1.8557117, -1.5610144, -1.6814368, -1.6570656, -1.6684177,
        -5.813551 , -7.3384275, -8.2665825, -8.073975 , -7.9055824,
        -7.539958 , -7.947551 , -7.5277023, -7.6875324, -7.2341404,
        -3.296257 , -6.9909315, -7.6236315, -8.402364 , -7.7072306,
        -6.946128 , -7.2227106, -8.401003 , -7.058935 , -7.529442 ,
        -8.596278 , -7.952729 , -7.92295  , -7.641367 , -7.811639 ,
        -8.181353 , -7.3375435, -7.3382263, -8.124301 , -8.546824 ,
        -7.9741344, -7.9750624, -7.7907763, -7.91045  , -7.8591003,
        -8.067461 , -8.626943 , -7.9372845, -8.285039 , -8.051117 ,
        -7.2252483, -8.675693 , -7.947386 , -7.9224377, -8.4383745,
        -8.046499 , -8.909634 , -7.9269447, -8.507811 , -7.201808 ,
        -8.185943 , -8.106163 , -8.790627 , -7.893656 , -7.669444 ,
        -7.9067764, -7.7766495, -8.172624 , -7.9639587, -7.692419 ,
        -8.278391 , -8.043953 , -8.630077 , -8.189078 , -8.236936 ,
        -7.811632 , -8.580658 ,

In [171]:
y_true = np.array([pred['y_true'] for pred in pred_list])
y_pred = np.array([pred['y_pred'] for pred in pred_list])

In [170]:
y_true.shape

(7050, 81)

In [172]:
y_pred.shape

(7050, 81)

In [178]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_true, y_pred, beta=1.0,  pos_label=1)

  'precision', 'predicted', average, warn_for)


(array([0.2139738 , 0.26589595, 0.20867769, 0.16358464, 0.16831683,
        0.41263941, 0.58730159, 0.45283019, 0.5       , 0.5       ,
        0.47826087, 0.56880734, 0.41153846, 0.6971831 , 0.62215909,
        0.83870968, 0.41314554, 0.88888889, 1.        , 0.76785714,
        0.78888889, 0.46218487, 0.        , 0.90353698, 0.56756757,
        0.73333333, 0.83950617, 0.52884615, 0.91304348, 0.60606061,
        0.87878788, 0.7       , 0.65957447, 0.68627451, 0.33898305,
        0.3       , 0.        , 0.31147541, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.63888889, 0.        , 0.375     , 0.        , 1.        ,
        0.84      , 0.83333333, 0.92307692, 0.        , 1.        ,
        0.        , 0.        , 0.83870968, 0.        , 0.        ,
        0.        , 0.86153846, 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [56]:
np.argmax(next(test_preds)['probabilities'], axis=-1)

16