In [1]:
import pandas as pd
import tensorflow.compat.v1 as tf
from tensorflow.compat.v1 import keras
import tensorflow_hub as hub
from datetime import datetime
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

import os, re

# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
  data = {}
  data["sentence"] = []
  data["sentiment"] = []
  for file_path in os.listdir(directory):
    with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
      data["sentence"].append(f.read())
      data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
  return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
  dataset = tf.keras.utils.get_file(
      fname="aclImdb.tar.gz", 
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
      extract=True)
  
  train_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                       "aclImdb", "train"))
  test_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                      "aclImdb", "test"))
  
  return train_df, test_df

BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
def create_tokenizer_from_hub_module():
  """Get the vocab file and casing info from the Hub module."""
  with tf.Graph().as_default():
    bert_module = hub.Module(BERT_MODEL_HUB)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
      
  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)





In [2]:
train, test = download_and_load_datasets()
train = train.sample(5000)
test = test.sample(5000)
DATA_COLUMN = 'sentence'
LABEL_COLUMN = 'polarity'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]
# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, text_a = x[DATA_COLUMN], text_b = None, label = x[LABEL_COLUMN]), axis = 1)

test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None,text_a = x[DATA_COLUMN], text_b = None, label = x[LABEL_COLUMN]), axis = 1)

MAX_SEQ_LENGTH = 128

In [29]:
# Model

BERT_EMBEDDING_DIM = 128
NUM_LABELS = 2

def bert_model(features, labels, mode, params):
    batch_size = params['batch_size']

    input_ids = features['input_ids']
    input_mask = features['input_mask']
    segment_ids = features['segment_ids']
    labels_one_hot = tf.one_hot(labels, depth=2, on_value=1.0, off_value=0.0, dtype=tf.float32)
    bert_module = hub.Module(
        "https://tfhub.dev/google/small_bert/bert_uncased_L-2_H-128_A-2/1",
        trainable=True, tags={"train"})
    bert_inputs = dict(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids)
    bert_outputs = bert_module(bert_inputs, signature="tokens", as_dict=True)
    #pooled_output = bert_outputs["pooled_output"]
    sequence_output = tf.stop_gradient(bert_outputs["sequence_output"])

    feature_extracting_projection = tf.get_variable(
        "feature_extracting_projection",
        [BERT_EMBEDDING_DIM, BERT_EMBEDDING_DIM],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    logit_layer = tf.get_variable(
        "logit_layer",
        [BERT_EMBEDDING_DIM, NUM_LABELS],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    with tf.variable_scope('loss'):
        def _extract_single_embedding_vector(embedding_vector):
            embedding_vector = tf.expand_dims(embedding_vector, axis=0)
            res = tf.matmul(embedding_vector, feature_extracting_projection)
            res = tf.matmul(res, logit_layer)
            return tf.squeeze(res)

        def _extract_sequence_of_embeddings(embedding_seq):
            return tf.map_fn(_extract_single_embedding_vector, embedding_seq, infer_shape=False)

        logits = tf.map_fn(_extract_sequence_of_embeddings, sequence_output, infer_shape=False)
        logits = tf.ensure_shape(logits, [batch_size, MAX_SEQ_LENGTH, NUM_LABELS])
        softmax = tf.nn.softmax(logits, axis=2)
        per_timestep_loss = -tf.reduce_sum(softmax * labels_one_hot, axis=2)
        batch_loss = tf.reduce_mean(per_timestep_loss)

        predicted_labels = tf.argmax(softmax, axis=-1)

    def _metric_fn(ground_truth_labels, preds):
        accuracy = tf.metrics.accuracy(ground_truth_labels, preds)

        is_ing_mask = tf.equal(ground_truth_labels, 1)
        is_ing_preds = tf.boolean_mask(preds, is_ing_mask)
        is_ing_labels = tf.boolean_mask(ground_truth_labels, is_ing_mask)
        is_ing_accuracy = tf.metrics.accuracy(is_ing_labels, is_ing_preds)

        is_not_ing_mask = tf.equal(ground_truth_labels, 0)
        is_not_ing_preds = tf.boolean_mask(preds, is_not_ing_mask)
        is_not_ing_labels = tf.boolean_mask(ground_truth_labels, is_not_ing_mask)
        is_not_ing_accuracy = tf.metrics.accuracy(is_not_ing_labels, is_not_ing_preds)

        return {
            'accuracy': accuracy,
            'is_ing_accuracy': is_ing_accuracy,
            'is_not_ing_accuracy': is_not_ing_accuracy,
        }

    if mode == tf.estimator.ModeKeys.TRAIN:
        opt = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.9, beta2=0.999, epsilon=1e-08)
        train_op = opt.minimize(batch_loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=batch_loss, train_op=train_op)
    elif mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=batch_loss, eval_metric_ops=_metric_fn(labels, predicted_labels))
    elif mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=predicted_labels)


In [4]:
import os

def vocab_file():
    vocab_file = os.path.join(os.path.abspath(''), 'small_bert_bert_uncased_L-2_H-128_A-2_1/assets/vocab.txt')
    return vocab_file

def read_vocab():
    f = open(vocab_file(), 'rb')
    words = f.readlines()
    words = [word.rstrip() for word in words]
    return words

def has_ing():
    vocab = read_vocab()
    return [s.endswith('ing') for s in vocab]

class IngLabelFunction(object):

    def __init__(self):
        super(IngLabelFunction, self).__init__()
        self.lookup = tf.constant(has_ing())

    def label(self, ids):
        res = tf.gather(self.lookup, ids)
        res = tf.cast(res, dtype=tf.int32)
        return res



In [6]:
# Input
def make_input_fn(examples):

    def input_fn():
        tokenizer = create_tokenizer_from_hub_module()
        feats = bert.run_classifier.convert_examples_to_features(examples, label_list, MAX_SEQ_LENGTH, tokenizer)
        vanilla_fn = bert.run_classifier.input_fn_builder(
            features=feats,
            seq_length=MAX_SEQ_LENGTH,
            is_training=True,
            drop_remainder=False
        )
        params = {
            "batch_size": 5
        }        
        ds = vanilla_fn(params)
    
        labeler = IngLabelFunction()
        def _add_labels(record):
            ids = record['input_ids']
            labels = labeler.label(ids)
            return record, labels

        ds = ds.map(_add_labels)
        return ds

    return input_fn

# tf.reset_default_graph()
# tf.disable_v2_behavior()

# features = train_InputExamples[:5]
# ds = make_input_fn(features)()._dataset
# it = tf.data.make_one_shot_iterator(ds)
# with tf.Session() as sess:
#     ds_value = it.get_next()
#     model_output = bert_model(ds_value[0], ds_value[1], tf.estimator.ModeKeys.EVAL, {'batch_size': 5})
#     print(model_output)


In [26]:
# Train
BATCH_SIZE=5

run_config = tf.estimator.RunConfig(
    model_dir=os.path.join(os.path.abspath(''), 'ckpts/classification_128/')
)

estimator = tf.estimator.Estimator(
    model_fn=bert_model,
    config=run_config,
    params= {
        'batch_size': BATCH_SIZE
    }
)
estimator.train(input_fn=make_input_fn(train_InputExamples), max_steps=10000)

212, step = 201 (8.353 sec)
INFO:tensorflow:loss = -0.9081212, step = 201 (8.353 sec)
INFO:tensorflow:global_step/sec: 11.6001
INFO:tensorflow:global_step/sec: 11.6001
INFO:tensorflow:loss = -0.9371659, step = 301 (8.620 sec)
INFO:tensorflow:loss = -0.9371659, step = 301 (8.620 sec)
INFO:tensorflow:global_step/sec: 11.5264
INFO:tensorflow:global_step/sec: 11.5264
INFO:tensorflow:loss = -0.9370707, step = 401 (8.676 sec)
INFO:tensorflow:loss = -0.9370707, step = 401 (8.676 sec)
INFO:tensorflow:global_step/sec: 11.8141
INFO:tensorflow:global_step/sec: 11.8141
INFO:tensorflow:loss = -0.96170646, step = 501 (8.464 sec)
INFO:tensorflow:loss = -0.96170646, step = 501 (8.464 sec)
INFO:tensorflow:global_step/sec: 11.3092
INFO:tensorflow:global_step/sec: 11.3092
INFO:tensorflow:loss = -0.96058834, step = 601 (8.843 sec)
INFO:tensorflow:loss = -0.96058834, step = 601 (8.843 sec)
INFO:tensorflow:global_step/sec: 12.3816
INFO:tensorflow:global_step/sec: 12.3816
INFO:tensorflow:loss = -0.9644394, s

<tensorflow_estimator.python.estimator.estimator.Estimator at 0x147e42550>

In [27]:
estimator.evaluate(input_fn=make_input_fn(test_InputExamples), steps=1000)

2008 2081 1996 2034 3185 1037 4569 2307 4536 2003 2999 2007 1037 2062 6517 1998 2061 19908 2143 1012 1999 2023 3185 1010 1047 4948 2243 10229 1037 2307 3066 1997 8220 2012 2116 2500 1005 11727 1012 2009 3138 2185 2172 2051 2008 2071 2022 2985 2012 4526 1037 2062 22249 2143 1012 1026 7987 1013 1028 1026 7987 1013 1028 1047 4948 2243 1005 1055 2047 14100 9144 2007 2048 3441 1024 1061 2480 2863 5651 2005 3477 5963 1998 2028 5796 1012 4743 4381 8069 2000 4154 1047 4948 2243 1005 102
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

{'accuracy': 0.9769422,
 'global_step': 10000,
 'is_ing_accuracy': 0.0,
 'is_not_ing_accuracy': 1.0,
 'loss': -0.97691745}