# IMDB movie rating classifier using BERT

This notebook is largely adapted from Google's "Predicting Movie Reviews with BERT on TF Hub" Colab notebook: https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb

## Imports and definitions

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
import bert
import os
import re
from bert import run_classifier
from bert import optimization
from bert import tokenization

In [2]:
# Directory handed to the Estimator's RunConfig as model_dir.
OUTPUT_DIR = 'output'
# DataFrame column that holds the raw review text.
DATA_COLUMN = 'sentence'
# DataFrame column that holds the binary class label.
LABEL_COLUMN = 'polarity'
# Label values: read_data assigns 1 to the 'pos' folder and 0 to the 'neg' folder.
LABEL_LIST = [0, 1]

## Fetch data

In [3]:
def read_from_folder(folder):
    """Load every review file found directly inside ``folder``.

    Review files are expected to be named ``<id>_<score>.txt``. Files whose
    name does not start with that pattern are skipped instead of raising an
    ``AttributeError`` on a failed regex match.

    Args:
        folder: Path of a local directory (it is listed with ``os.listdir``).

    Returns:
        A ``pandas.DataFrame`` with two columns:
        ``sentence`` (the file content) and ``sentiment`` (the ``<score>``
        part of the file name, kept as a string). Rows are ordered by file
        name so the result does not depend on the file system's listing order.
    """
    # Raw string: '\d' and '\.' are regex escapes, not Python string escapes.
    filename_pattern = re.compile(r'\d+_(\d+)\.txt')
    sentences = []
    sentiments = []
    for filename in sorted(os.listdir(folder)):
        match = filename_pattern.match(filename)
        if match is None:
            # Not a review file (e.g. a README or a hidden file).
            continue
        # os.listdir already limits us to local paths, so the built-in open()
        # is sufficient; decode explicitly as UTF-8 rather than relying on the
        # platform default.
        with open(os.path.join(folder, filename), 'r', encoding='utf-8') as handle:
            sentences.append(handle.read())
        sentiments.append(match.group(1))
    return pd.DataFrame({'sentence': sentences, 'sentiment': sentiments})

def read_data(folder, random_state=None):
    """Load the ``pos`` and ``neg`` sub-folders of ``folder`` into one shuffled frame.

    Args:
        folder: Directory containing a ``pos`` and a ``neg`` sub-directory,
            each readable by ``read_from_folder``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. The default ``None`` keeps the
            original unseeded behaviour.

    Returns:
        A ``pandas.DataFrame`` with the columns produced by
        ``read_from_folder`` plus ``polarity`` (1 for rows from ``pos``,
        0 for rows from ``neg``), shuffled and re-indexed from 0.
    """
    frames = []
    # Order matters only for the pre-shuffle concatenation: positives first.
    for subfolder, polarity in (('pos', 1), ('neg', 0)):
        frame = read_from_folder(os.path.join(folder, subfolder))
        frame['polarity'] = polarity
        frames.append(frame)
    # sample(frac=1) returns all rows in random order.
    shuffled = pd.concat(frames).sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

def fetch_data():
    """Fetch the IMDB review archive and load its train and test splits.

    The archive is obtained through ``tf.keras.utils.get_file`` with
    ``extract=True``; the splits are then read from the ``aclImdb/train`` and
    ``aclImdb/test`` directories located next to the returned archive path.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames as built by ``read_data``.
    """
    archive_path = tf.keras.utils.get_file(
        fname='aclImdb.tar.gz',
        origin='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
        extract=True)
    dataset_root = os.path.join(os.path.dirname(archive_path), 'aclImdb')
    # The generator is consumed left to right: train is read before test.
    train_df, test_df = (
        read_data(os.path.join(dataset_root, split)) for split in ('train', 'test'))
    return train_df, test_df

In [4]:
# Fetch the IMDB archive and load the shuffled train and test DataFrames.
train, test = fetch_data()

In [5]:
# Mean review size per split: first in characters, then in whitespace-separated words.
review_stats = (
    ("Average review length train set:", train.sentence.map(len).mean()),
    ("Average review length test set:", test.sentence.map(len).mean()),
    ("Average numer of words train set:", train.sentence.map(lambda text: len(text.split())).mean()),
    ("Average numer of words test set:", test.sentence.map(lambda text: len(text.split())).mean()),
)
for caption, value in review_stats:
    print(caption, value)

Average review length train set: 1325.06964
Average review length test set: 1293.7924
Average numer of words train set: 233.7872
Average numer of words test set: 228.52668


In [6]:
# Keep a random subset of 5000 rows per split (train is sampled before test).
# NOTE(review): the sample is unseeded, so each run selects different rows.
train, test = train.sample(5000), test.sample(5000)
for tag, frame in (("#train:", train), ("#test:", test)):
    print(tag, len(frame))

#train: 5000
#test: 5000


## Data preprocessing

In [7]:
def create_input_example(row):
    """Wrap one record in a ``bert.run_classifier.InputExample``.

    Args:
        row: Any mapping-like object indexable by ``DATA_COLUMN`` (the text)
            and ``LABEL_COLUMN`` (the label), e.g. a DataFrame row or a dict.

    Returns:
        An ``InputExample`` with the text as ``text_a``, no ``text_b``,
        no ``guid``, and the row's label.
    """
    review_text = row[DATA_COLUMN]
    review_label = row[LABEL_COLUMN]
    # Single-text classification: there is no second segment and no id.
    return bert.run_classifier.InputExample(
        guid=None, text_a=review_text, text_b=None, label=review_label)

# Build one InputExample per row (axis=1 makes apply call the function row by row).
train_exs = train.apply(create_input_example, axis=1)
test_exs = test.apply(create_input_example, axis=1)

In [8]:
# TF-Hub module used both to build the tokenizer and to create the model.
BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'

def create_tokenizer_from_hub_module():
    """Build a ``FullTokenizer`` configured from the TF-Hub BERT module.

    The module's ``tokenization_info`` signature is evaluated in a separate
    graph and session to obtain the vocabulary file and the lower-casing flag.

    Returns:
        A ``bert.tokenization.FullTokenizer`` using the module's vocabulary
        file and ``do_lower_case`` setting.
    """
    with tf.Graph().as_default():
        module = hub.Module(BERT_MODEL_HUB)
        info = module(signature='tokenization_info', as_dict=True)
        fetches = [info['vocab_file'], info['do_lower_case']]
        with tf.Session() as session:
            vocab_path, lower_case = session.run(fetches)
    return bert.tokenization.FullTokenizer(
        vocab_file=vocab_path, do_lower_case=lower_case)

# Shared tokenizer used for the feature conversion and for ad-hoc predictions.
tokenizer = create_tokenizer_from_hub_module()

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Downloading TF-Hub Module 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'.
INFO:tensorflow:Downloading https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1: 41.62MB
INFO:tensorflow:Downloading https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1: 51.62MB
INFO:tensorflow:Downloading https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1: 61.62MB
INFO:tensorflow:Downloading https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1: 81.62MB
INFO:tensorflow:Downloading https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1: 91.62MB
INFO:tensorflow:Downloading https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1: 101.62MB
INFO:tensorflow:Downloading https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1: 121.62MB
INFO:tensorflow:Downloading https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1: 141.62MB
INFO:tensorflow:Downloading https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1: 151.62MB

In [9]:
# Sanity check: show how the tokenizer splits a sample sentence.
print(tokenizer.tokenize("This here's an example of using the BERT tokenizer"))

['this', 'here', "'", 's', 'an', 'example', 'of', 'using', 'the', 'bert', 'token', '##izer']


In [10]:
# Sequence length passed to the feature conversion and to the input functions.
MAX_SEQ_LENGTH = 128
# Convert the train examples first, then the test examples, with identical settings.
train_features, test_features = (
    bert.run_classifier.convert_examples_to_features(
        examples, LABEL_LIST, MAX_SEQ_LENGTH, tokenizer)
    for examples in (train_exs, test_exs))

INFO:tensorflow:Writing example 0 of 5000
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] one of the other comment ##ers mentioned that they almost walked out . if i hadn ' t been with my wife , who wanted to stay , i would have left . it ' s a shame , too , because i think it could have been a good movie . but this is easily one of the worst adapted screenplay ##s i ' ve ever seen . it starts out nowhere and it goes nowhere ( i would say it goes nowhere fast , but it really goes nowhere slow . . . painfully slow ) . from time to time there are hints that something interesting might happen , or that there is potentially some depth underneath one of the characters , but [SEP]
INFO:tensorflow:input_ids: 101 2028 1997 1996 2060 7615 2545 3855 2008 2027 2471 2939 2041 1012 2065 1045 2910 1005 1056 2042 2007 2026 2564 1010 2040 2359 2000 2994 1010 1045 2052 2031 2187 1012 2009 1005 1055 1037 9467 1010 2205 1010 2138 1045 2228 2009 2071 2031 2042 1037

INFO:tensorflow:input_ids: 101 2023 6298 4038 3475 1005 1056 2205 2919 1012 2045 2024 2070 6057 2477 6230 2182 1998 2045 1010 1998 2045 2024 2070 2738 13432 3494 1999 2009 1012 1026 7987 1013 1028 1026 7987 1013 1028 1996 3772 1010 2174 1010 2003 5515 4509 1006 2007 1996 6453 1997 1996 13448 1007 1012 2096 2070 5019 2024 2307 4569 1010 2500 2024 3432 16436 1012 1999 3327 1010 1045 2179 1996 1000 6298 1000 2112 1997 1996 2466 3532 1012 1026 7987 1013 1028 1026 7987 1013 1028 2035 1999 2035 1010 1045 3984 2009 1005 1055 4276 3773 2065 2017 2066 2374 1998 6298 22092 1012 2009 1005 1055 2025 2428 1037 2919 3185 1010 1998 1996 4566 2106 2514 3243 2204 1012 2074 102
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 

INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:label: 1 (id = 1)
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] i remember watching this for the first time in the 80 ' s as a teen . man , i ' ve read the reviews on this trash and i find myself astonished by the voting . this movie does not deserve four stars ! ! ! this movie is not better than top ##gun . top ##gun has its own problems ; don ' t get me wrong . this m

## Creating a model

In [11]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels, num_labels):
    """Build a classifier: one dense layer on top of the TF-Hub BERT module.

    The module at ``BERT_MODEL_HUB`` is loaded as trainable and called with
    the ``tokens`` signature; its ``pooled_output`` is fed through a single
    linear layer followed by a log-softmax.

    Args:
        is_predicting: When true, no loss is built and dropout is disabled.
        input_ids, input_mask, segment_ids: Tensors passed to the BERT module
            (presumably ``[batch, seq_length]`` integer tensors — confirm
            against the input function).
        labels: Integer class ids; only used when ``is_predicting`` is false.
        num_labels: Number of output classes.

    Returns:
        ``(predicted_labels, log_probs)`` when ``is_predicting`` is true,
        otherwise ``(loss, predicted_labels, log_probs)``. ``log_probs`` are
        log-softmax values, ``loss`` is the mean cross-entropy over the batch.
    """
    bert_module = hub.Module(
        BERT_MODEL_HUB,
        trainable=True)
    bert_inputs = dict(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids)
    bert_outputs = bert_module(
        inputs=bert_inputs,
        signature='tokens',
        as_dict=True)

    output_layer = bert_outputs['pooled_output']

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        'output_weights', [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        'output_bias', [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope('loss'):
        # Dropout is a training-time regulariser; applying it while predicting
        # would make the returned predictions random. This function cannot
        # tell training from evaluation, so evaluation still uses dropout.
        if not is_predicting:
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # argmax over the class axis already yields one label per example.
        # No squeeze: it would turn a batch of one into a scalar and drop the
        # batch dimension.
        predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
        if is_predicting:
            return (predicted_labels, log_probs)

        # Labels are only needed for the loss.
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, predicted_labels, log_probs)

In [22]:
def model_fn_builder(num_labels, learning_rate, num_train_steps, num_warmup_steps):
    """Return a ``model_fn`` closure for ``tf.estimator.Estimator``.

    Args:
        num_labels: Number of output classes, forwarded to ``create_model``.
        learning_rate: Forwarded to ``bert.optimization.create_optimizer``.
        num_train_steps: Total training steps, forwarded to the optimizer.
        num_warmup_steps: Warm-up steps, forwarded to the optimizer.

    Returns:
        A function ``model_fn(features, labels, mode, params)`` that builds
        the ``EstimatorSpec`` for the requested mode. Each mode builds only
        what it needs: the optimizer for TRAIN, the metrics for EVAL, the
        predictions dict for PREDICT.
    """

    def metric_fn(label_ids, predicted_labels):
        """Build the evaluation metrics from true and predicted class ids."""
        accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
        f1_score = tf.contrib.metrics.f1_score(label_ids, predicted_labels)
        # NOTE(review): AUC is computed on hard 0/1 predictions, not on scores.
        auc = tf.metrics.auc(label_ids, predicted_labels)
        recall = tf.metrics.recall(label_ids, predicted_labels)
        precision = tf.metrics.precision(label_ids, predicted_labels)
        true_pos = tf.metrics.true_positives(label_ids, predicted_labels)
        true_neg = tf.metrics.true_negatives(label_ids, predicted_labels)
        false_pos = tf.metrics.false_positives(label_ids, predicted_labels)
        false_neg = tf.metrics.false_negatives(label_ids, predicted_labels)
        return {'eval_accuracy': accuracy,
                'f1_score': f1_score,
                'auc': auc,
                'precision': precision,
                'recall': recall,
                'true_positives': true_pos,
                'true_negatives': true_neg,
                'false_positives': false_pos,
                'false_negatives': false_neg}

    def model_fn(features, labels, mode, params):
        # Labels are read from the features dict; the `labels` and `params`
        # arguments are unused but kept because they are part of the signature.
        input_ids = features['input_ids']
        input_mask = features['input_mask']
        segment_ids = features['segment_ids']
        label_ids = features['label_ids']

        if mode == tf.estimator.ModeKeys.PREDICT:
            (predicted_labels, log_probs) = create_model(
                True, input_ids, input_mask, segment_ids, label_ids, num_labels)
            # The 'probabilities' entry actually holds log-probabilities; the
            # key is kept unchanged for existing consumers.
            predictions = {
                'probabilities': log_probs,
                'labels': predicted_labels}
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        (loss, predicted_labels, log_probs) = create_model(
            False, input_ids, input_mask, segment_ids, label_ids, num_labels)

        if mode == tf.estimator.ModeKeys.TRAIN:
            # The optimizer is only built when the model is being trained.
            train_op = bert.optimization.create_optimizer(
                loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

        # Remaining mode is EVAL: metrics are only built here.
        eval_metrics = metric_fn(label_ids, predicted_labels)
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics)

    return model_fn

In [23]:
# Examples per batch; used for the step count and passed to the Estimator as params['batch_size'].
BATCH_SIZE = 32
# Learning rate handed to bert.optimization.create_optimizer.
LEARNING_RATE = 2e-5
# Number of passes over the training features (float so fractional epochs are possible).
NUM_TRAIN_EPOCHS = 3.0
# Fraction of the training steps used as warm-up steps.
WARMUP_PROPORTION = 0.1
# Checkpoint and summary frequencies (in steps) for the RunConfig.
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

In [24]:
# Total optimisation steps: batches per epoch times epochs, truncated to an int.
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
# Steps spent in learning-rate warm-up.
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [25]:
# Estimator run configuration: output directory plus summary/checkpoint frequencies.
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [26]:
# Bind the hyperparameters into a model_fn for the Estimator.
model_fn = model_fn_builder( 
    num_labels=len(LABEL_LIST),
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps)

# model_fn itself does not read params; presumably 'batch_size' is consumed by
# the input functions built with bert.run_classifier.input_fn_builder — confirm.
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
    params={'batch_size': BATCH_SIZE}
)

INFO:tensorflow:Using config: {'_model_dir': 'output', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f387896e358>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [27]:
# Input function over the training features (is_training=True, last partial batch kept).
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

In [28]:
# Train up to num_train_steps and report the wall-clock duration.
# NOTE: if OUTPUT_DIR already holds a checkpoint at max_steps, training is
# skipped (see the recorded log output below); clear OUTPUT_DIR to retrain.
print('Starting to train...')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print('Training took time ', datetime.now() - current_time)

Starting to train...
INFO:tensorflow:Skipping training since max_steps has already saved.
Training took time  0:00:00.005710


In [29]:
# Input function over the test features (is_training=False, last partial batch kept).
test_input_fn = bert.run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

In [30]:
# Evaluate on the test features; steps=None runs until the input is exhausted.
estimator.evaluate(input_fn=test_input_fn, steps=None)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-02-17-21:35:43
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from output/model.ckpt-468
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-02-17-22:21:48
INFO:tensorflow:Saving dict for global step 468: auc = 0.86803305, eval_accuracy = 0.868, f1_score = 0.866017, false_negatives = 372.0, false_positives = 288.0, global_step = 468, loss = 0.47416788, precision = 0.8810409, recall = 0.851497, true_negatives = 2207.0, true_positives = 2133.0
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 468: output/model.ckpt-468


{'auc': 0.86803305,
 'eval_accuracy': 0.868,
 'f1_score': 0.866017,
 'false_negatives': 372.0,
 'false_positives': 288.0,
 'loss': 0.47416788,
 'precision': 0.8810409,
 'recall': 0.851497,
 'true_negatives': 2207.0,
 'true_positives': 2133.0,
 'global_step': 468}

In [31]:
def get_predictions(in_sentences):
    """Classify raw sentences with the trained estimator.

    Args:
        in_sentences: Iterable of strings. It is materialised once, so
            one-shot iterators and generators are supported as well as lists.

    Returns:
        A list with one ``(sentence, log_probs, label_name)`` tuple per input
        sentence, where ``log_probs`` is the model's 'probabilities' output
        (log-softmax values) and ``label_name`` is ``'Negative'`` or
        ``'Positive'``. Empty input returns ``[]`` without touching the model.
    """
    # Materialise first: the sentences are iterated twice (feature building
    # and the final zip), which would silently exhaust an iterator.
    sentences = list(in_sentences)
    if not sentences:
        return []

    class_names = ['Negative', 'Positive']
    # The label is a dummy value; InputExample needs one but it is not used
    # to compute the prediction.
    input_examples = [create_input_example({DATA_COLUMN: sentence, LABEL_COLUMN: 0})
                      for sentence in sentences]
    input_features = bert.run_classifier.convert_examples_to_features(
        input_examples, LABEL_LIST, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = bert.run_classifier.input_fn_builder(
        features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
    predictions = estimator.predict(predict_input_fn)
    return [(sentence, prediction['probabilities'], class_names[prediction['labels']])
            for sentence, prediction in zip(sentences, predictions)]

In [32]:
# Hand-written sample reviews used to spot-check the trained classifier.
sentences = [
    'That movie was absolutely awful',
    'The acting was a bit lacking',
    'The film was creative and surprising',
    'Absolutely fantastic'
]

In [35]:
# Run the sample sentences through the trained estimator.
predictions = get_predictions(sentences)

INFO:tensorflow:Writing example 0 of 4
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] that movie was absolutely awful [SEP]
INFO:tensorflow:input_ids: 101 2008 3185 2001 7078 9643 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [36]:
# Each entry is (sentence, log-probabilities per class, predicted label name).
predictions

[('That movie was absolutely awful',
  array([-1.9294472e-03, -6.2514567e+00], dtype=float32),
  'Negative'),
 ('The acting was a bit lacking',
  array([-3.1934001e-03, -5.7482734e+00], dtype=float32),
  'Negative'),
 ('The film was creative and surprising',
  array([-5.2045507, -0.0055067], dtype=float32),
  'Positive'),
 ('Absolutely fantastic',
  array([-4.8700013, -0.007703 ], dtype=float32),
  'Positive')]