# Bert Classification

In [8]:
from datetime import datetime
import os

import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

In [2]:
data_dir = "data"

In [45]:
tf.logging.set_verbosity(tf.logging.WARN)

## Load Data

In [3]:
imdb_data = pd.read_pickle(f"{data_dir}/imdb_data.pickle.gz")

In [5]:
sample_size = 50
seed = 11989

train = imdb_data[imdb_data.data_set.str.lower() == 'train'].sample(sample_size, random_state = seed)
test = imdb_data[imdb_data.data_set.str.lower() == 'test'].sample(sample_size, random_state = seed*2)

## Setup Bert

In [35]:
class BertClassifier:
    
    DEFAULT_CONFIG = {
        'bert_url': 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1',
        'output_dir': 'data/output',
        'data_column': None,
        'label_column': None,
        'max_seq_length': 128,
        'label_list': [0, 1]
    }
    
    def __init__(self, **config):
        self.config = BertClassifier.DEFAULT_CONFIG.copy()
        self.config.update(config)
        
        # Make sure the output directory exists.
        if not os.path.isdir(self.config['output_dir']):
            os.makedirs(self.config['output_dir'])
        
        self._tokenizer = None
        
    def test(self, test):
        pass
        
    def train(self, train):
        train_features = self._feature_extractor(train)

        label_list = [0, 1]
        
        #compute train steps and warmup steps from batch size
        BATCH_SIZE = 32
        LEARNING_RATE = 2e-5
        NUM_TRAIN_EPOCHS = 3.0
        # Warmup is a period of time where the learning rate is small and gradually increases--usually helps training.
        WARMUP_PROPORTION = 0.1

        SAVE_CHECKPOINTS_STEPS = 500
        SAVE_SUMMARY_STEPS = 100
        # Compute # train and warmup steps from batch size

        num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
        num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
        num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
        num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

        #output directory and number of checkpoint steps to to save
        run_config = tf.estimator.RunConfig(
            model_dir=self.config['output_dir'],
            save_summary_steps=SAVE_SUMMARY_STEPS,
            save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

        #created model function
        model_fn = self._model_fn_builder(
            num_labels=len(label_list),
            learning_rate=LEARNING_RATE,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps)

        estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            config=run_config,
            params={"batch_size": BATCH_SIZE})

        #create an input function for training; drop_remainder=True for TPUs
        train_input_fn = bert.run_classifier.input_fn_builder(
            features=train_features,
            seq_length=self.config['max_seq_length'],
            is_training=True,
            drop_remainder=False)

        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
        
        self.estimator = estimator
        return self.estimator
        
    def tokenizer(self):
        if self._tokenizer is None:
            with tf.Graph().as_default():
                bert_module = hub.Module(self.config['bert_url'])
                tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
                with tf.Session() as sess:
                    vocab_file, do_lower_case = sess.run(
                        [tokenization_info["vocab_file"],
                        tokenization_info["do_lower_case"]])

            self._tokenizer = bert.tokenization.FullTokenizer(
                vocab_file=vocab_file, do_lower_case=do_lower_case)

        return self._tokenizer
    
            
    def _feature_extractor(self, df):
        inputs = df.apply(lambda row: bert.run_classifier.InputExample(
            guid=None,
            text_a = row[self.config['data_column']],
            text_b = None,
            label = row[self.config['label_column']]), axis=1)
            
        tokenizer = self.tokenizer()
        label_list = [0, 1]
        
        
        print(f"{label_list} -> {self.config['max_seq_length']}")
        
        return bert.run_classifier.convert_examples_to_features(
            inputs,
            label_list,
            self.config['max_seq_length'],
            tokenizer)
    
    def _create_model(self, is_predicting, input_ids, input_mask, segment_ids, labels, num_labels):
        bert_module = hub.Module(self.config['bert_url'], trainable=True)
        bert_inputs = dict(
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids)
        bert_outputs = bert_module(
            inputs=bert_inputs,
            signature="tokens",
            as_dict=True)

        # Use "pooled_output" for classification tasks on an entire sentence.
        # Use "sequence_outputs" for token-level output.
        output_layer = bert_outputs["pooled_output"]

        hidden_size = output_layer.shape[-1].value

        # Create our own layer to tune for politeness data.
        output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))

        output_bias = tf.get_variable(
            "output_bias", [num_labels], initializer=tf.zeros_initializer())

        with tf.variable_scope("loss"):

            # Dropout helps prevent overfitting
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            # Convert labels into one-hot encoding
            one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

            predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
            # If we're predicting, we want predicted labels and the probabiltiies.
            if is_predicting:
                return (predicted_labels, log_probs)

            # If we're train/eval, compute loss between predicted and actual label
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
            loss = tf.reduce_mean(per_example_loss)
            return (loss, predicted_labels, log_probs)

    # model_fn_builder actually creates our model function
    # using the passed parameters for num_labels, learning_rate, etc.
    def _model_fn_builder(self, num_labels, learning_rate, num_train_steps, num_warmup_steps):
        
        # Calculate evaluation metrics. 
        def metric_fn(label_ids, predicted_labels):
            accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
            f1_score = tf.contrib.metrics.f1_score(
                label_ids,
                predicted_labels)
            auc = tf.metrics.auc(
                label_ids,
                predicted_labels)
            recall = tf.metrics.recall(
                label_ids,
                predicted_labels)
            precision = tf.metrics.precision(
                label_ids,
                predicted_labels) 
            true_pos = tf.metrics.true_positives(
                label_ids,
                predicted_labels)
            true_neg = tf.metrics.true_negatives(
                label_ids,
                predicted_labels)   
            false_pos = tf.metrics.false_positives(
                label_ids,
                predicted_labels)  
            false_neg = tf.metrics.false_negatives(
                label_ids,
                predicted_labels)
            return {
                "eval_accuracy": accuracy,
                "f1_score": f1_score,
                "auc": auc,
                "precision": precision,
                "recall": recall,
                "true_positives": true_pos,
                "true_negatives": true_neg,
                "false_positives": false_pos,
                "false_negatives": false_neg
            }
               
        def create_model_fn_predicting(features, labels, mode, params):
            input_ids = features["input_ids"]
            input_mask = features["input_mask"]
            segment_ids = features["segment_ids"]
            label_ids = features["label_ids"]

            (predicted_labels, log_probs) = self._create_model(
                    True, input_ids, input_mask, segment_ids, label_ids, num_labels)

            predictions = {
                'probabilities': log_probs,
                'labels': predicted_labels
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)
        
        def create_model_fn_not_predicting(features, labels, mode, params):
            input_ids = features["input_ids"]
            input_mask = features["input_mask"]
            segment_ids = features["segment_ids"]
            label_ids = features["label_ids"]
            
            (loss, predicted_labels, log_probs) = self._create_model(
                False, input_ids, input_mask, segment_ids, label_ids, num_labels)

            train_op = bert.optimization.create_optimizer(
                loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
            eval_metrics = metric_fn(label_ids, predicted_labels)

            if mode == tf.estimator.ModeKeys.TRAIN:
                return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
            else:
                return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics)

        def model_fn(features, labels, mode, params):
            if mode == tf.estimator.ModeKeys.PREDICT:
                return create_model_fn_predicting(features, labels, mode, params)
            else:
                return create_model_fn_not_predicting(features, labels, mode, params)
        
        
        # Return the actual model function in the closure
        return model_fn

In [46]:
bc = BertClassifier(data_column='sentence', label_column='polarity')
bc.train(train)
bc.test(test)

[0, 1] -> 128
0:00:03.372587


In [37]:
bc.estimator

<tensorflow_estimator.python.estimator.estimator.Estimator at 0x1c56a802f98>

In [None]:
# This is a path to an uncased (all lowercase) version of BERT
bert_model_url = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def bert_create_tokenizer(bert_model_url):
    with tf.Graph().as_default():
        bert_module = hub.Module(bert_model_url)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"]])
      
    return bert.tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = bert_create_tokenizer(bert_model_url)

In [None]:
def BERT_Classification(train, test, DATA_COLUMN, LABEL_COLUMN, OUTPUT_DIR):
    
    bert_model_hub="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1" #need to write it as a function later
    
    train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

    test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)
    
   
    tokenizer = bert_create_tokenizer(bert_model_url)

    MAX_SEQ_LENGTH = 128
    # Convert our train and test features to InputFeatures that BERT understands.
    train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
    test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

    #compute train steps and warmup steps from batch size
    BATCH_SIZE = 32
    LEARNING_RATE = 2e-5
    NUM_TRAIN_EPOCHS = 3.0
    # Warmup is a period of time where the learning rate is small and gradually increases--usually helps training.
    WARMUP_PROPORTION = 0.1

    SAVE_CHECKPOINTS_STEPS = 500
    SAVE_SUMMARY_STEPS = 100
    # Compute # train and warmup steps from batch size
    
    num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
    num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

    #output directory and number of checkpoint steps to to save
    run_config = tf.estimator.RunConfig(
        model_dir=OUTPUT_DIR,
        save_summary_steps=SAVE_SUMMARY_STEPS,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

    #created model function
    model_fn = model_fn_builder(
      num_labels=len(label_list),
      learning_rate=LEARNING_RATE,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps)

    estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config,
      params={"batch_size": BATCH_SIZE})
   
    #create an input function for training; drop_remainder=True for TPUs
    train_input_fn = bert.run_classifier.input_fn_builder(
        features=train_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=False)
    
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    #create an input function for testing; drop_remainder=True for TPUs
    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)

    result_dict = estimator.evaluate(input_fn=test_input_fn, steps=None)
    
    return {
        "Model_Evaluation":result_dict, 
        "Model": estimator
    }

