In [43]:
#!/usr/bin/env python
# coding: utf-8

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.contrib.tensorboard.plugins import projector

import numpy as np
import os
import pandas as pd

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from datetime import datetime
from sklearn.model_selection import train_test_split

In [10]:
# ### Set-up and get data

# TF-Hub handle of the pre-trained BERT module used for both tokenization and
# fine-tuning (uncased, L-12 / H-768 / A-12 per the handle name).
module_url = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

# Directory that holds the input data; read from the environment so that no
# machine-specific path is hard-coded in the notebook.
DATADIR = os.getenv("DATADIR")
if DATADIR is None:
    # Fail early with an actionable message instead of the opaque TypeError
    # that os.path.join(None, ...) would raise further down.
    raise EnvironmentError(
        "The DATADIR environment variable is not set; point it at the "
        "directory containing labelled.csv.gz")

print("read in labelled")
labelled = pd.read_csv(os.path.join(DATADIR, 'labelled.csv.gz'), compression='gzip', low_memory=False)

read in labelled


In [23]:
# Hold out 33% of the labelled rows for evaluation. The split is stratified on
# the 'level1taxon' column and seeded (random_state=42) so it is repeatable.
train, test = train_test_split(labelled, test_size=0.33, random_state=42, stratify=labelled['level1taxon'])

In [24]:
# Sanity check: (rows, columns) of the training split.
train.shape

(204821, 19)

In [25]:
# Sanity check: (rows, columns) of the held-out test split.
test.shape

(100882, 19)

In [26]:
# Names of the columns that hold the input text and the target label.
DATA_COLUMN = 'combined_text'
LABEL_COLUMN = 'level1taxon'

# label_list is the list of distinct labels found in the label column
# (in other settings this could be True/False, 0/1 or 'dog'/'cat').
label_list = list(labelled[LABEL_COLUMN].unique())

In [27]:
# Use the InputExample class from BERT's run_classifier code to create examples from the data
def _row_to_input_example(row):
    """Wrap one DataFrame row as a single-sentence BERT `InputExample`."""
    return bert.run_classifier.InputExample(
        guid=None,  # Globally unique ID for bookkeeping, unused in this example
        text_a=row[DATA_COLUMN],
        text_b=None,  # no second text segment is supplied
        label=row[LABEL_COLUMN])


# The same row-wise conversion is applied to both splits.
train_InputExamples = train.apply(_row_to_input_example, axis=1)

test_InputExamples = test.apply(_row_to_input_example, axis=1)

In [28]:
def create_tokenizer_from_hub_module():
    """Build a BERT `FullTokenizer` configured from the hub module at `module_url`.

    The module's "tokenization_info" signature is evaluated in a throwaway
    graph and session to obtain the vocabulary file and the lower-casing flag,
    which are then handed to the tokenizer.
    """
    with tf.Graph().as_default():
        module = hub.Module(module_url)
        info = module(signature="tokenization_info", as_dict=True)
        fetches = [info["vocab_file"], info["do_lower_case"]]
        with tf.Session() as session:
            vocab_path, lower_case = session.run(fetches)

    return bert.tokenization.FullTokenizer(
        vocab_file=vocab_path, do_lower_case=lower_case)


tokenizer = create_tokenizer_from_hub_module()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [29]:
# Maximum sequence length, in tokens, handed to the feature converter.
MAX_SEQ_LENGTH = 512

# Convert our train and test examples to the InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(
    train_InputExamples,
    label_list,
    MAX_SEQ_LENGTH,
    tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(
    test_InputExamples,
    label_list,
    MAX_SEQ_LENGTH,
    tokenizer)

INFO:tensorflow:Writing example 0 of 204821
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] di ##t round ##table boost ##s uk business opportunities in pakistan international trade minister greg hands hosted a trade round ##table to help uk business secure contracts for chinese investment in pakistan . china is supporting $ 51 billion of infrastructure development in pakistan as part of the ‘ china pakistan economic corridor ’ ( cp ##ec ) to develop key infrastructure projects like roads railways and power stations which will modern ##ise pakistan ’ s economy and boost access to trade . it is part of china ’ s broader ‘ belt and road initiative ’ to replicate the ancient silk road trade routes with modern trading relationships and investments across asia the middle east and into europe . greg hands met yesterday ( 4 april ) with leading uk businesses policy experts and senior representatives of the chinese and pakistani governments as the uk is 

In [5]:
# ### Prepare data
# NOTE(review): `clean_content` is not defined in any cell visible in this
# notebook, so this cell looks like it relies on leftover kernel state - confirm.
corpus = clean_content['combined_text'].tolist()

# Keep only the first TEXT_LENGTH whitespace-separated words of each text.
TEXT_LENGTH = 512
short_corpus = [" ".join(text.split()[:TEXT_LENGTH]) for text in corpus]

In [45]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
    """Build the classification graph: BERT hub module plus a softmax layer.

    Args:
        is_predicting: True when the graph is built for inference. In that
            case no loss is computed and dropout is skipped.
        input_ids: fed to the hub module's "tokens" signature.
        input_mask: fed to the hub module's "tokens" signature.
        segment_ids: fed to the hub module's "tokens" signature.
        labels: integer class ids; one-hot encoded against `num_labels` to
            compute the loss (not used when `is_predicting` is true).
        num_labels: number of target classes.

    Returns:
        `(predicted_labels, log_probs)` when `is_predicting` is true,
        otherwise `(loss, predicted_labels, log_probs)`.
    """

    bert_module = hub.Module(module_url, trainable=True)
    bert_inputs = dict(
        input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids)
    bert_outputs = bert_module(
        inputs=bert_inputs, signature="tokens", as_dict=True)

    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_outputs" for token-level output.
    output_layer = bert_outputs["pooled_output"]

    hidden_size = output_layer.shape[-1].value

    # Our own classification layer on top of the pooled output; its weights
    # are created here rather than loaded from the hub module.
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):

        # Dropout helps prevent overfitting, but randomly zeroing activations
        # at inference time would make predictions noisy, so skip it there.
        # NOTE(review): evaluation still runs with dropout because this
        # function can only tell "predicting" apart from "train or eval".
        if not is_predicting:
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # argmax over the class axis already yields one id per example.
        # No tf.squeeze here: it would collapse a batch of one to a scalar
        # and drop the batch dimension.
        predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)

        # If we're predicting, we want predicted labels and the log-probabilities.
        if is_predicting:
            return (predicted_labels, log_probs)

        # Convert labels into one-hot encoding
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        # If we're train/eval, compute the cross-entropy between the
        # predicted distribution and the actual label.
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, predicted_labels, log_probs)


In [46]:
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
    """Return a `model_fn` closure for `tf.estimator.Estimator`.

    Args:
        num_labels: number of target classes, forwarded to `create_model`.
        learning_rate: forwarded to `bert.optimization.create_optimizer`.
        num_train_steps: forwarded to `bert.optimization.create_optimizer`.
        num_warmup_steps: forwarded to `bert.optimization.create_optimizer`.

    Returns:
        A function `model_fn(features, labels, mode, params)` that returns an
        `EstimatorSpec` for TRAIN, EVAL or PREDICT mode.
    """

    def metric_fn(label_ids, predicted_labels):
        """Build the evaluation metric ops appropriate for `num_labels`."""
        metrics = {
            "eval_accuracy": tf.metrics.accuracy(label_ids, predicted_labels),
        }
        if num_labels == 2:
            # These metrics are only defined for binary 0/1 labels. Feeding
            # them multi-class ids is wrong, and tf.metrics.auc range-checks
            # its predictions to [0, 1], so it would fail at run time.
            metrics.update({
                "f1_score": tf.contrib.metrics.f1_score(label_ids,
                                                        predicted_labels),
                "auc": tf.metrics.auc(label_ids, predicted_labels),
                "precision": tf.metrics.precision(label_ids,
                                                  predicted_labels),
                "recall": tf.metrics.recall(label_ids, predicted_labels),
                "true_positives": tf.metrics.true_positives(
                    label_ids, predicted_labels),
                "true_negatives": tf.metrics.true_negatives(
                    label_ids, predicted_labels),
                "false_positives": tf.metrics.false_positives(
                    label_ids, predicted_labels),
                "false_negatives": tf.metrics.false_negatives(
                    label_ids, predicted_labels),
            })
        else:
            # Multi-class case: report a class-balanced accuracy instead.
            metrics["mean_per_class_accuracy"] = (
                tf.metrics.mean_per_class_accuracy(
                    label_ids, predicted_labels, num_labels))
        return metrics

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` handed to the Estimator."""

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        # Labels travel inside `features`; the `labels` argument is unused.
        label_ids = features["label_ids"]

        is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

        if is_predicting:
            (predicted_labels, log_probs) = create_model(
                is_predicting, input_ids, input_mask, segment_ids, label_ids,
                num_labels)

            # NOTE: 'probabilities' holds log-probabilities (log-softmax
            # output); the key name is kept so existing consumers keep working.
            predictions = {
                'probabilities': log_probs,
                'labels': predicted_labels
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        # TRAIN and EVAL
        (loss, predicted_labels, log_probs) = create_model(
            is_predicting, input_ids, input_mask, segment_ids, label_ids,
            num_labels)

        if mode == tf.estimator.ModeKeys.TRAIN:
            # The optimizer is only needed (and only built) when training.
            train_op = bert.optimization.create_optimizer(
                loss,
                learning_rate,
                num_train_steps,
                num_warmup_steps,
                use_tpu=False)
            return tf.estimator.EstimatorSpec(
                mode=mode, loss=loss, train_op=train_op)

        # EVAL: calculate evaluation metrics.
        eval_metrics = metric_fn(label_ids, predicted_labels)
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, eval_metric_ops=eval_metrics)

    # Return the actual model function in the closure
    return model_fn


In [74]:
# Training hyperparameters; the train and warmup step counts are derived from these.
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
# Expressed as a fraction of the total number of training steps.
WARMUP_PROPORTION = 0.1
# Model configs: how often (in steps) to write checkpoints and summaries.
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

In [75]:
# Derive the number of training and warmup steps from the batch size.
steps_per_epoch = len(train_features) / BATCH_SIZE
num_train_steps = int(steps_per_epoch * NUM_TRAIN_EPOCHS)
# Warmup covers the first WARMUP_PROPORTION of all training steps.
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [76]:
# Specify the output directory and how often (in steps) summaries and
# checkpoints are saved. Note that the model directory is DATADIR itself, so
# checkpoints are written alongside the input data.
run_config = tf.estimator.RunConfig(
    model_dir=DATADIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [77]:
# Build the model function for our label set and training schedule.
model_fn = model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

# Wrap it in an Estimator. `params` is handed to model_fn, which does not read
# it; presumably the input_fn built by bert.run_classifier consumes
# "batch_size" - TODO confirm.
estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})


INFO:tensorflow:Using config: {'_model_dir': '/Users/ellieking/Documents/govuk-taxonomy-supervised-learning/data/2019-02-11', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1f6c797f0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [80]:
# Create an input function for training. drop_remainder would be True when
# using TPUs; it is False here (the optimizer is also created with use_tpu=False).
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

In [81]:
# Fine-tune the model and report the wall-clock time the run took.
print('Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
elapsed = datetime.now() - current_time
print("Training took time ", elapsed)

Beginning Training!
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/ellieking/Documents/govuk-taxonomy-supervised-learning/data/2019-02-11/model.ckpt-0
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /Users/ellieking/Documents/govuk-taxonomy-supervised-learning/data/2019-02-11/model.ckpt.
INFO:tensorflow:loss = 3.12635, step = 1


KeyboardInterrupt: 

In [68]:
# Show the kernel's current working directory (relative paths resolve against it).
!pwd

/Users/ellieking/Documents/content-similarity/BERT


In [73]:
# Debugging aid: list the tensors stored in a checkpoint file.
# NOTE(review): "model.ckpt-0" is a relative path, so it is resolved against the
# working directory, while the estimator writes its checkpoints to
# model_dir=DATADIR - confirm this is the checkpoint you mean to inspect.
from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file
print_tensors_in_checkpoint_file(file_name="model.ckpt-0", tensor_name='', all_tensors=False)

global_step (DT_INT64) []
module/bert/embeddings/LayerNorm/beta (DT_FLOAT) [768]
module/bert/embeddings/LayerNorm/beta/adam_m (DT_FLOAT) [768]
module/bert/embeddings/LayerNorm/beta/adam_v (DT_FLOAT) [768]
module/bert/embeddings/LayerNorm/gamma (DT_FLOAT) [768]
module/bert/embeddings/LayerNorm/gamma/adam_m (DT_FLOAT) [768]
module/bert/embeddings/LayerNorm/gamma/adam_v (DT_FLOAT) [768]
module/bert/embeddings/position_embeddings (DT_FLOAT) [512,768]
module/bert/embeddings/position_embeddings/adam_m (DT_FLOAT) [512,768]
module/bert/embeddings/position_embeddings/adam_v (DT_FLOAT) [512,768]
module/bert/embeddings/token_type_embeddings (DT_FLOAT) [2,768]
module/bert/embeddings/token_type_embeddings/adam_m (DT_FLOAT) [2,768]
module/bert/embeddings/token_type_embeddings/adam_v (DT_FLOAT) [2,768]
module/bert/embeddings/word_embeddings (DT_FLOAT) [30522,768]
module/bert/embeddings/word_embeddings/adam_m (DT_FLOAT) [30522,768]
module/bert/embeddings/word_embeddings/adam_v (DT_FLOAT) [30522,768]
m

In [None]:
# Input function over the held-out features, built for evaluation
# (is_training=False) and keeping the final partial batch.
test_input_fn = bert.run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

In [None]:
# Evaluate on the held-out test features; steps=None places no step limit on the evaluation.
estimator.evaluate(input_fn=test_input_fn, steps=None)

In [None]:
#     session.run(embed(corpus))


# ### Save out embedding vectors
# NOTE(review): `embedded_sentences` is not defined in any cell visible in this
# notebook; this cell looks like a leftover from another workflow - confirm.
print("save out embeddings")
embeddings_path = 'embedded_clean_content' + os.path.basename(DATADIR) + '.npy'
np.save(embeddings_path, embedded_sentences)