# BERT Classification

Uses BERT to classify movie reviews. This example adds a classification layer on top of the pre-trained BERT model.

In [1]:
from datetime import datetime
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf

## Configuration

In [2]:
# Directory (relative to the notebook's working directory) that holds the
# input data; the dataset pickle is read from here in the "Load Data" section.
data_dir = "data"

In [3]:
# Raise TensorFlow's logging threshold to ERROR so that lower-severity log
# messages (INFO/WARN) are not emitted into the cell outputs.
tf.logging.set_verbosity(tf.logging.ERROR)

## Load Data

In [4]:
# Location of the pickled dataset inside the configured data directory.
imdb_pickle_path = f"{data_dir}/imdb_data.pickle.gz"

# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source. Compression is left for pandas to infer from the
# file extension.
imdb_data = pd.read_pickle(imdb_pickle_path)

In [5]:
# Number of rows drawn from each split, and the base seed that makes the
# draws repeatable.
sample_size = 100
seed = 11989

# Lower-case the split marker once so the comparison is case-insensitive
# for both the train and the test selection.
data_set_lower = imdb_data["data_set"].str.lower()

# Draw a fixed-size random sample from each split. The test draw uses a
# different seed (seed * 2) than the train draw.
train = imdb_data.loc[data_set_lower == 'train'].sample(n=sample_size, random_state=seed)
test = imdb_data.loc[data_set_lower == 'test'].sample(n=sample_size, random_state=seed * 2)

In [6]:
# Report how many rows each sampled split contains.
train_count, test_count = len(train), len(test)
print("Training size: {}".format(train_count))
print("Testing size: {}".format(test_count))

Training size: 100
Testing size: 100


## Set Up BERT

In [8]:
# Import only the name this notebook uses. A wildcard import would pull every
# public name of bert_classifier into the notebook namespace, which can
# silently shadow existing variables and hides where BertClassifier comes from.
from bert_classifier import BertClassifier

W0606 22:59:21.008330 139798227105536 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [9]:
# Create the classifier, telling it which DataFrame column holds the input
# text ('sentence') and which holds the label ('polarity'). No other options
# are passed here, so the remaining settings are whatever BertClassifier
# defaults to.
bc = BertClassifier(data_column='sentence', label_column='polarity')

In [10]:
# Display the classifier's configuration as the cell output so the settings
# used for this run are recorded alongside the results.
bc.config

{'bert_url': 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1',
 'output_dir': 'data/output',
 'data_column': 'sentence',
 'label_column': 'polarity',
 'max_seq_length': 128,
 'label_values': None,
 'batch_size': 32,
 'learning_rate': 2e-05,
 'num_train_epochs': 3.0,
 'warmup_proportion': 0.1,
 'save_checkpoints_steps': 500,
 'save_summary_steps': 100}

In [11]:
# Train the classifier on the sampled training rows and report how long the
# call took (wall-clock time, printed as a timedelta).
train_started = datetime.now()
bc.train(train)
train_elapsed = datetime.now() - train_started
print("Training time: {}".format(train_elapsed))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Training time: 0:02:20.872586


In [12]:
# Evaluate the trained classifier on the sampled test rows, timing the call.
test_started = datetime.now()
results = bc.test(test)
test_elapsed = datetime.now() - test_started
print("Test time: {}".format(test_elapsed))

# Wrap the returned metrics in a one-row DataFrame and print that row as a
# Series, which lists one metric per line.
# NOTE(review): assumes `results` is a mapping of metric name -> value;
# confirm against BertClassifier.test.
print("Results:")
metrics_row = pd.DataFrame([results]).iloc[0]
print(metrics_row)

Test time: 0:00:36.058869
Results:
auc                 0.576030
eval_accuracy       0.570000
f1_score            0.666667
false_negatives     6.000000
false_positives    37.000000
global_step         9.000000
loss                0.637070
precision           0.537500
recall              0.877551
true_negatives     14.000000
true_positives     43.000000
Name: 0, dtype: float64


In [13]:
# Run prediction over the sampled test rows and report how long the call
# took (wall-clock time, printed as a timedelta).
predict_started = datetime.now()
predictions = bc.predict(test)
predict_elapsed = datetime.now() - predict_started
print("Prediction time: {}".format(predict_elapsed))

Prediction time: 0:00:16.972973


In [14]:
# Preview the first ten predictions as a table (rich display via the cell's
# last expression).
pd.DataFrame(predictions).head(10)

Unnamed: 0,polarity,probabilities,sentence
0,1,"{0: 0.431754, 1: 0.568246}",As I sit and think about Poison for the Fairie...
1,1,"{0: 0.3931019, 1: 0.6068981}","First of all, it is interesting to note that o..."
2,1,"{0: 0.39992294, 1: 0.6000771}",This series has recently been unearthed and ex...
3,0,"{0: 0.5919361, 1: 0.4080639}","Oh, the horror, the unspeakable horror of this..."
4,1,"{0: 0.3128256, 1: 0.68717444}",Why this film was only released in 4 states is...
5,1,"{0: 0.31602004, 1: 0.68398}","The use of ""astral projection""(wandering soul)..."
6,1,"{0: 0.45369482, 1: 0.5463051}",This movie is not very bad tjough. But one can...
7,1,"{0: 0.36119476, 1: 0.63880527}",Based on its current IMDb rating as well as se...
8,1,"{0: 0.45577654, 1: 0.54422355}",That was one of the worst movies I've ever see...
9,1,"{0: 0.4143858, 1: 0.5856142}","I'd never heard of zero budget ""auteur"" Neil J..."
