In [1]:
from __future__ import print_function

import io
import os
import re
import sys
from datetime import datetime

import numpy as np
import pandas as pd
import tensorflow.compat.v1 as tf
from tensorflow.compat.v1 import keras
import tensorflow_hub as hub

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
  """Read every `<digits>_<digits>.txt` file in `directory` into a DataFrame.

  Args:
    directory: path of a directory to list; each matching file is read in
      full through `tf.gfile.GFile`.

  Returns:
    A DataFrame with a `sentence` column (file contents) and a `sentiment`
    column (the second digit group of the file name, kept as a string).
    Entries whose names do not match the pattern are skipped.
  """
  # Raw string: "\d" and "\." are not valid escapes in a normal literal.
  name_pattern = re.compile(r"\d+_(\d+)\.txt")
  data = {"sentence": [], "sentiment": []}
  for file_name in os.listdir(directory):
    match = name_pattern.match(file_name)
    if match is None:
      # Unrelated entry; skip it instead of failing on `None.group(1)`.
      continue
    with tf.gfile.GFile(os.path.join(directory, file_name), "r") as f:
      data["sentence"].append(f.read())
    data["sentiment"].append(match.group(1))
  return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
  """Load the `pos` and `neg` sub-directories of `directory` as one frame.

  Rows from `pos` get polarity 1 and rows from `neg` get polarity 0; the
  concatenated frame is shuffled with `sample(frac=1)` and re-indexed.
  """
  labelled_frames = []
  for subdir, polarity in (("pos", 1), ("neg", 0)):
    frame = load_directory_data(os.path.join(directory, subdir))
    frame["polarity"] = polarity
    labelled_frames.append(frame)
  shuffled = pd.concat(labelled_frames).sample(frac=1)
  return shuffled.reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
  """Fetch and extract the aclImdb archive, then load its train/test splits.

  Args:
    force_download: accepted but not used by this function.

  Returns:
    A `(train_df, test_df)` tuple built by `load_dataset` from the
    `aclImdb/train` and `aclImdb/test` directories that sit next to the
    downloaded archive.
  """
  archive_path = tf.keras.utils.get_file(
      fname="aclImdb.tar.gz",
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
      extract=True)

  # The archive is extracted alongside the downloaded file.
  extracted_root = os.path.join(os.path.dirname(archive_path), "aclImdb")
  train_df = load_dataset(os.path.join(extracted_root, "train"))
  test_df = load_dataset(os.path.join(extracted_root, "test"))
  return train_df, test_df

BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
def create_tokenizer_from_hub_module():
  """Build a FullTokenizer from the Hub module's `tokenization_info` signature.

  The module is instantiated in a throwaway graph so that resolving the vocab
  path and casing flag does not add ops to the caller's default graph.
  """
  with tf.Graph().as_default():
    module = hub.Module(BERT_MODEL_HUB)
    info = module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    with tf.Session() as sess:
      vocab_path, lower_case = sess.run(fetches)

  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_path, do_lower_case=lower_case)

def debug(pass_through, tensor):
    """Return `pass_through` unchanged, printing `tensor` whenever it is evaluated.

    Args:
        pass_through: tensor to forward; the result is a `tf.identity` of it.
        tensor: value handed to `tf.print`, written to `sys.stdout`.

    Returns:
        An identity of `pass_through` that carries a control dependency on the
        print op, so the print runs each time the returned tensor is computed.
    """
    # `sys` comes from the notebook's import cell; without that import this
    # line raises NameError as soon as the helper is called.
    print_op = tf.print(tensor, output_stream=sys.stdout)
    with tf.control_dependencies([print_op]):
        pass_through = tf.identity(pass_through)
    return pass_through






In [17]:
# Constants

# Side length of the square projection matrix in bert_model; also the last
# dimension asserted on the extracted features there.
BERT_EMBEDDING_DIM = 128
# Sequence length passed to BERT feature conversion and asserted on the
# input id / mask / segment tensors.
MAX_SEQ_LENGTH = 128
# Batch size handed to the input pipeline and to the Estimator's params.
BATCH_SIZE = 5
# Number of token positions sampled per example by bert_model's _sample.
NUM_PAIRS_PER_FPROP = 10


In [3]:
# Download Data

DATA_COLUMN = 'sentence'
LABEL_COLUMN = 'polarity'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]


def _to_input_examples(frame):
    """Wrap every row of `frame` in a single-sentence BERT InputExample."""
    def _row_to_example(row):
        return bert.run_classifier.InputExample(
            guid=None,
            text_a=row[DATA_COLUMN],
            text_b=None,
            label=row[LABEL_COLUMN])
    return frame.apply(_row_to_example, axis=1)


train, test = download_and_load_datasets()
train = train.sample(10000)
test = test.sample(5000)
# The validation rows are drawn from the already-sampled test frame.
validation = test.sample(50)

# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = _to_input_examples(train)
test_InputExamples = _to_input_examples(test)
validation_InputExamples = _to_input_examples(validation)



In [13]:
# Model

import os

def vocab_file():
    """Return the absolute path of the small-BERT vocab file under the working directory."""
    working_dir = os.path.abspath('')
    return os.path.join(working_dir, 'small_bert_bert_uncased_L-2_H-128_A-2_1/assets/vocab.txt')

def read_vocab(path=None):
    """Read a vocabulary file into a list of text tokens, one per line.

    Args:
        path: optional path of the vocab file; defaults to `vocab_file()`.

    Returns:
        A list with exactly one entry per line of the file (so list indices
        line up with line numbers), each decoded as UTF-8 with trailing ASCII
        whitespace removed.
    """
    if path is None:
        path = vocab_file()
    # Decode to text so `endswith('ing')` and `len` operate on characters
    # rather than bytes. newline='\n' keeps the original line splitting
    # (only on "\n"), and the explicit strip set matches what bytes.rstrip()
    # removed before.
    with io.open(path, 'r', encoding='utf-8', newline='\n') as f:
        return [line.rstrip(' \t\n\r\x0b\x0c') for line in f]

def has_ing():
    """Return one boolean per vocab entry: True where the entry ends with 'ing'."""
    flags = []
    for entry in read_vocab():
        flags.append(entry.endswith('ing'))
    return flags

def word_length():
    """Return `len()` of every vocab entry, in vocab order."""
    return list(map(len, read_vocab()))

class WordLengthDistanceFunction(object):
    """Distance between two token ids: absolute difference of their vocab-entry lengths."""

    def __init__(self):
        super(WordLengthDistanceFunction, self).__init__()
        # Constant lookup table, indexed by token id, built from word_length().
        self.lookup = tf.constant(word_length())

    def distance(self, ids_1, ids_2):
        """Return |len(ids_2) - len(ids_1)| element-wise as float32."""
        first_lengths = tf.gather(self.lookup, ids_1)
        second_lengths = tf.gather(self.lookup, ids_2)
        length_gap = tf.math.abs(second_lengths - first_lengths)
        return tf.cast(length_gap, dtype=tf.float32)

class IngDistanceFunction(object):
    """Distance between two token ids: 1.0 when exactly one of them ends with 'ing', else 0.0."""

    def __init__(self):
        super(IngDistanceFunction, self).__init__()
        # Constant boolean table, indexed by token id, built from has_ing().
        self.lookup = tf.constant(has_ing())

    def is_ing(self, ids):
        """Look up the 'ends with ing' flag for each id in `ids`."""
        return tf.gather(self.lookup, ids)

    def distance(self, ids_1, ids_2):
        """Return the element-wise XOR of the two flag tensors, cast to float32."""
        first_flags = tf.gather(self.lookup, ids_1)
        second_flags = tf.gather(self.lookup, ids_2)
        flags_differ = tf.logical_xor(first_flags, second_flags)
        return tf.cast(flags_differ, dtype=tf.float32)

def bert_model(features, labels, mode, params):
    """Estimator model_fn: learn a projection of BERT token embeddings whose
    pairwise Euclidean distances track a word-level distance.

    Two copies of the small-BERT Hub module (scopes `bert_1` / `bert_2`) embed
    the two input sequences; gradients are stopped at their outputs. A single
    shared projection matrix is applied to every token embedding, token
    positions are sampled from each sequence, and the loss is the mean absolute
    difference between the word-length distance of the sampled ids and the
    Euclidean distance of their projected features.

    Args:
        features: dict with `input_ids_N`, `input_mask_N`, `segment_ids_N`
            for N in {1, 2}.
        labels: unused.
        mode: a `tf.estimator.ModeKeys` value.
        params: dict; `params['batch_size']` is used for static shape checks.

    Returns:
        A `tf.estimator.EstimatorSpec` for the requested mode.

    Raises:
        ValueError: if `mode` is not TRAIN, EVAL or PREDICT.
    """
    batch_size = params['batch_size']

    def _run_bert(input_ids, input_mask, segment_ids):
        bert_module = hub.Module(
            "https://tfhub.dev/google/small_bert/bert_uncased_L-2_H-128_A-2/1",
            trainable=True, tags={"train"})
        bert_inputs = dict(
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids)
        bert_outputs = bert_module(bert_inputs, signature="tokens", as_dict=True)
        #pooled_output = bert_outputs["pooled_output"]
        # No gradient flows back into BERT from anything downstream.
        sequence_output = tf.stop_gradient(bert_outputs["sequence_output"])
        return sequence_output

    with tf.variable_scope('bert_1'):
        bert_embs_1 = _run_bert(features['input_ids_1'], features['input_mask_1'], features['segment_ids_1'])
    with tf.variable_scope('bert_2'):
        bert_embs_2 = _run_bert(features['input_ids_2'], features['input_mask_2'], features['segment_ids_2'])

    # Shared by both sequences: the only variable created outside the Hub modules.
    feature_extracting_projection = tf.get_variable(
        "feature_extracting_projection",
        [BERT_EMBEDDING_DIM, BERT_EMBEDDING_DIM],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    def _apply_extractor(bert_embs):
        def _extract_single_embedding_vector(embedding_vector):
            embedding_vector = tf.expand_dims(embedding_vector, axis=0)
            res = tf.matmul(embedding_vector, feature_extracting_projection)
            return tf.squeeze(res)

        def _extract_sequence_of_embeddings(embedding_seq):
            return tf.map_fn(_extract_single_embedding_vector, embedding_seq, infer_shape=False)

        logits = tf.map_fn(_extract_sequence_of_embeddings, bert_embs, infer_shape=False)
        logits = tf.ensure_shape(logits, [batch_size, MAX_SEQ_LENGTH, BERT_EMBEDDING_DIM])
        return logits

    with tf.variable_scope('extract_1'):
        extracted_features_1 = _apply_extractor(bert_embs_1)
    with tf.variable_scope('extract_2'):
        extracted_features_2 = _apply_extractor(bert_embs_2)

    def _sample(ids, mask, feats):
        def _get_lowest_index_with_zero(x):
            is_zero = tf.where(tf.equal(x, 0))
            is_zero = tf.squeeze(is_zero)
            is_zero = tf.cast(is_zero, dtype=tf.int32)
            # No zero in the mask: fall back to the full length of x.
            earliest_zero = tf.cond(
                tf.equal(tf.size(is_zero), 0),
                lambda: tf.squeeze(tf.shape(x)),
                lambda: tf.reduce_min(is_zero))
            return earliest_zero
        # [batch_size], where each element is the first index whose mask is 0
        # (the exclusive upper bound for sampling), or the sequence length.
        example_is_ended = tf.map_fn(_get_lowest_index_with_zero, mask, infer_shape=False)

        def _sample_indexes_for_example(is_ended):
            inx = tf.random.uniform([NUM_PAIRS_PER_FPROP], minval=0, maxval=is_ended, dtype=tf.int32)
            return inx

        # [batch_size, NUM_PAIRS_PER_FPROP], sampled indexes
        indexes = tf.map_fn(_sample_indexes_for_example, example_is_ended, infer_shape=False)

        def _take(inputs):
            p, ind = inputs
            return tf.gather(p, ind), ind

        # [batch_size, NUM_PAIRS_PER_FPROP], sampled ids and extracted features
        sampled_ids, _ = tf.map_fn(_take, (ids, indexes), infer_shape=False)
        sampled_ids = tf.ensure_shape(sampled_ids, [batch_size, NUM_PAIRS_PER_FPROP])
        sampled_feats, _ = tf.map_fn(_take, (feats, indexes), infer_shape=False)
        sampled_feats = tf.ensure_shape(sampled_feats, [batch_size, NUM_PAIRS_PER_FPROP, BERT_EMBEDDING_DIM])

        return sampled_ids, sampled_feats

    with tf.variable_scope('sample_ids_and_feats_1'):
        sampled_ids_1, sampled_feats_1 = _sample(features['input_ids_1'], features['input_mask_1'], extracted_features_1)
    with tf.variable_scope('sample_ids_and_feats_2'):
        sampled_ids_2, sampled_feats_2 = _sample(features['input_ids_2'], features['input_mask_2'], extracted_features_2)

    with tf.variable_scope('loss'):
        def _pointwise_euclidean_distance(a, b):
            difference = tf.math.abs(a - b)
            squared_difference = tf.math.square(difference)
            sum_of_squared_difference = tf.reduce_sum(squared_difference, axis=-1)
            euclidean_distance = tf.math.sqrt(sum_of_squared_difference)
            return euclidean_distance

        distance_function = WordLengthDistanceFunction()
        true_distance = distance_function.distance(sampled_ids_1, sampled_ids_2)
        feats_distance = _pointwise_euclidean_distance(sampled_feats_1, sampled_feats_2)

        loss_matrix = tf.math.abs(true_distance - feats_distance)
        loss = tf.reduce_mean(loss_matrix)

    with tf.variable_scope('metrics'):
        # The 'ing' flags need their own lookup: the training target above is a
        # WordLengthDistanceFunction, which has no is_ing() method.
        ing_lookup = IngDistanceFunction()
        ings_1 = ing_lookup.is_ing(sampled_ids_1)
        ings_2 = ing_lookup.is_ing(sampled_ids_2)

        both_ing = tf.logical_and(ings_1, ings_2)
        either_ing = tf.logical_xor(ings_1, ings_2)
        neither_ing = tf.logical_not(tf.logical_or(ings_1, ings_2))

        def _avg_feats_distance(feats_distance_matrix, indexes):
            # Mean of the distances selected by the boolean mask `indexes`;
            # 0.0 when the mask selects nothing.
            indexes_as_float = tf.cast(indexes, dtype=tf.float32)
            num_matches = tf.reduce_sum(indexes_as_float)
            matching_distances = tf.math.multiply(feats_distance_matrix, indexes_as_float)
            sum_distances = tf.reduce_sum(matching_distances)
            return tf.cond(
                tf.equal(num_matches, 0),
                lambda: 0.0,
                lambda: (sum_distances / num_matches)
            )

        both_dists = _avg_feats_distance(feats_distance, both_ing)
        either_dists = _avg_feats_distance(feats_distance, either_ing)
        neither_dists = _avg_feats_distance(feats_distance, neither_ing)

    if mode == tf.estimator.ModeKeys.TRAIN:
        opt = tf.train.AdamOptimizer(learning_rate=0.00001, beta1=0.9, beta2=0.999, epsilon=1e-08)
        train_op = opt.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
    elif mode == tf.estimator.ModeKeys.EVAL:
        # A model_fn must return an EstimatorSpec in every mode; report the
        # per-batch average feature distances as streaming means.
        # Note: a batch with no matching pair contributes 0.0 to its mean.
        eval_metric_ops = {
            'both_dists': tf.metrics.mean(both_dists),
            'either_dists': tf.metrics.mean(either_dists),
            'neither_dists': tf.metrics.mean(neither_dists),
        }
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=eval_metric_ops)
    elif mode == tf.estimator.ModeKeys.PREDICT:
        # In prediction mode, we emit sampled features for the entirety of the first input sequence
        res = {
            'ids': features['input_ids_1'],
            'masks': features['input_mask_1'],
            'extracted_features': extracted_features_1,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=res)

    raise ValueError('Unsupported mode: {}'.format(mode))


In [14]:
# Input
def make_input_fn(examples, is_training=True):
    """Build an Estimator input_fn that yields *pairs* of BERT feature batches.

    Args:
        examples: iterable of BERT InputExamples to convert with the notebook's
            `label_list` and `MAX_SEQ_LENGTH`.
        is_training: forwarded to BERT's `input_fn_builder`. Defaults to True,
            which is what this function always used before the parameter
            existed. NOTE(review): in the BERT reference implementation True
            means "repeat forever and shuffle"; pass False for a single
            ordered pass when evaluating or predicting — confirm against the
            installed `bert` package.

    Returns:
        A zero-argument `input_fn` whose dataset elements are dicts with
        `input_ids_N`, `input_mask_N` and `segment_ids_N` (N in {1, 2}), each
        of static shape [BATCH_SIZE, MAX_SEQ_LENGTH].
    """

    def input_fn():
        tokenizer = create_tokenizer_from_hub_module()
        feats = bert.run_classifier.convert_examples_to_features(examples, label_list, MAX_SEQ_LENGTH, tokenizer)
        # Partial batches are dropped at both batching stages: the
        # ensure_shape assertions below only hold for full batches.
        vanilla_fn = bert.run_classifier.input_fn_builder(
            features=feats,
            seq_length=MAX_SEQ_LENGTH,
            is_training=is_training,
            drop_remainder=True
        )
        params = {
            "batch_size": BATCH_SIZE
        }
        ds = vanilla_fn(params)
        #ds = ds.shuffle(buffer_size=1000)
        # Group consecutive batches two at a time; each pair becomes the
        # "_1" and "_2" inputs of the model.
        ds = ds.batch(2, drop_remainder=True)

        def _unstack(record):
            stacked_input_ids = tf.ensure_shape(record['input_ids'], [2, BATCH_SIZE, MAX_SEQ_LENGTH])
            input_ids_1, input_ids_2 = tf.unstack(stacked_input_ids, axis=0)

            stacked_input_mask = tf.ensure_shape(record['input_mask'], [2, BATCH_SIZE, MAX_SEQ_LENGTH])
            input_mask_1, input_mask_2 = tf.unstack(stacked_input_mask, axis=0)

            stacked_segment_ids = tf.ensure_shape(record['segment_ids'], [2, BATCH_SIZE, MAX_SEQ_LENGTH])
            segment_ids_1, segment_ids_2 = tf.unstack(stacked_segment_ids, axis=0)

            return {
                'input_ids_1': input_ids_1,
                'input_ids_2': input_ids_2,
                'input_mask_1': input_mask_1,
                'input_mask_2': input_mask_2,
                'segment_ids_1': segment_ids_1,
                'segment_ids_2': segment_ids_2
            }
        ds = ds.map(_unstack)

        return ds
    return input_fn

# tf.reset_default_graph()
# tf.disable_v2_behavior()

# features = train_InputExamples[:(BATCH_SIZE * 5)]
# ds = make_input_fn(features)()._dataset
# it = tf.data.make_one_shot_iterator(ds)
# with tf.Session() as sess:
#     feats = it.get_next()
#     model_output = bert_model(feats, None, tf.estimator.ModeKeys.TRAIN, {'batch_size': BATCH_SIZE})
#     sess.run(tf.global_variables_initializer())
#     print(model_output)
#     n, s = sess.run([model_output, tf.shape(model_output)])
#     print(n)


In [15]:
# Checkpoints go to ckpts/6/ under the working directory: one every 500
# steps, keeping at most 100 of them.
run_config = tf.estimator.RunConfig(
    save_checkpoints_steps=500,
    keep_checkpoint_max=100,
    model_dir=os.path.join(os.path.abspath(''), 'ckpts/6/'),
)

# bert_model reads params['batch_size'] for its static shape assertions.
estimator = tf.estimator.Estimator(
    model_fn=bert_model,
    params=dict(batch_size=BATCH_SIZE),
    config=run_config,
)

INFO:tensorflow:Using config: {'_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 100, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x141bcda50>, '_model_dir': '/Users/cpeyser/Desktop/bert/ckpts/6/', '_protocol': None, '_save_checkpoints_steps': 500, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_save_summary_steps': 100, '_device_fn': None, '_session_creation_timeout_secs': 7200, '_experimental_distribute': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_experimental_max_worker_delay_secs': None, '_evaluation_master': '', '_eval_distribute': None, '_global_id_in_cluster': 0, '_master': ''}
INFO:tensorflow:Using config: {'_save_checkpoints_secs': None, '_session_co

In [16]:
# Train with max_steps=500 on input pairs built from validation_InputExamples.
# NOTE(review): train_InputExamples is built earlier but is not used here —
# confirm that training on the 50-row validation sample is intentional.
estimator.train(input_fn=make_input_fn(validation_InputExamples), max_steps=500)

1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

<tensorflow_estimator.python.estimator.estimator.Estimator at 0x14469eed0>

In [None]:
# Checkpoint to evaluate, resolved relative to the current working directory.
# NOTE(review): this points at ckpts/3 while the RunConfig's model_dir is
# ckpts/6 — confirm which run is meant.
CKPT=os.path.join(os.path.abspath(''), 'ckpts/3/model.ckpt-1000')
# Evaluate that checkpoint for 100 steps on pairs built from test_InputExamples.
estimator.evaluate(input_fn=make_input_fn(test_InputExamples), steps=100, checkpoint_path=CKPT)

In [None]:
# Checkpoint to predict from, resolved relative to the current working directory.
# NOTE(review): this points at ckpts/3 while the RunConfig's model_dir is
# ckpts/6 — confirm which run is meant.
CKPT=os.path.join(os.path.abspath(''), 'ckpts/3/model.ckpt-1000')
# The result is consumed by iteration in the next cell, which stops after 100 items.
predictions = estimator.predict(input_fn=make_input_fn(test_InputExamples), checkpoint_path=CKPT)


In [None]:
# Pull at most 100 items out of `predictions`, stopping as soon as the
# hundredth one has been stored.
preds = []
for count, prediction in enumerate(predictions, 1):
    preds.append(prediction)
    if count >= 100:
        break

vocab = read_vocab()


def _lookup(id):
    """Return the vocab entry stored at index `id`."""
    entry = vocab[id]
    return entry


def _ends_ing(id):
    """Return True when the vocab entry at index `id` ends with 'ing'."""
    entry = vocab[id]
    return entry.endswith('ing')


# Element-wise versions of the two helpers, usable on whole id arrays.
v_lookup = np.vectorize(_lookup)
v_ends_ing = np.vectorize(_ends_ing)


def _add_words_to_prediction(p):
    """Add 'words' and 'ends_with_ing' entries to prediction dict `p` (in place) and return it."""
    token_ids = p['ids']
    p['words'] = v_lookup(token_ids)
    p['ends_with_ing'] = v_ends_ing(token_ids)
    return p


preds_with_words = list(map(_add_words_to_prediction, preds))


In [None]:
import matplotlib.pyplot as plt

# Inspect a single prediction: split its per-token feature vectors by
# whether the token ends with 'ing' and plot the two groups separately.
pred_0 = preds_with_words[13]
feats = pred_0['extracted_features']
ends_with_ing = pred_0['ends_with_ing']

feats_no_ing = [vec for vec, flagged in zip(feats, ends_with_ing) if not flagged]
feats_ing = [vec for vec, flagged in zip(feats, ends_with_ing) if flagged]

plt.figure()
# Top panel: tokens without 'ing' (blue); bottom panel: tokens with 'ing' (red).
for panel, points, marker in ((211, feats_no_ing, 'bo'), (212, feats_ing, 'ro')):
    plt.subplot(panel)
    plt.plot(points, np.ones_like(points), marker)
    plt.axis([-0.05, 0.02, 0.9, 1.1])

plt.show()

 