# BERT Classification

Uses BERT to classify IMDb movie reviews by polarity. This example adds a classification layer on top of the pre-trained BERT model.

In [1]:
import os
import time
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

## Configuration

In [2]:
# Directory that holds the pickled IMDb data set read in the "Load Data" section.
data_dir = "data"

In [3]:
# Raise the tf.logging threshold to ERROR so lower-severity TensorFlow log
# messages are not emitted while the notebook runs.
tf.logging.set_verbosity(tf.logging.ERROR)

## Load Data

In [4]:
# Load the full IMDb frame from the configured data directory. The ".gz"
# suffix lets pandas infer gzip compression.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
imdb_path = f"{data_dir}/imdb_data.pickle.gz"
imdb_data = pd.read_pickle(imdb_path)

In [5]:
sample_size = 100
seed = 11989

# Normalise the split label once so both filters compare against the same series.
split_label = imdb_data.data_set.str.lower()

# Draw a fixed-size, reproducible sample from each split; the test split uses a
# different (derived) seed from the training split.
train = imdb_data[split_label == 'train'].sample(n=sample_size, random_state=seed)
test = imdb_data[split_label == 'test'].sample(n=sample_size, random_state=seed * 2)

In [6]:
# Confirm how many rows ended up in each sampled split.
n_train, n_test = len(train), len(test)
print("Training size: {}".format(n_train))
print("Testing size: {}".format(n_test))

Training size: 100
Testing size: 100


## Set Up BERT

In [7]:
from bert_classifier import *

W0606 21:07:26.793935 139947718018816 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [8]:
# Build the classifier, naming the DataFrame columns it should use: 'sentence'
# as the data column and 'polarity' as the label column.
bc = BertClassifier(data_column='sentence', label_column='polarity')

In [9]:
# Display the classifier's configuration so the settings used for this run are
# recorded alongside the results.
bc.config

{'bert_url': 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1',
 'output_dir': 'data/output',
 'data_column': 'sentence',
 'label_column': 'polarity',
 'max_seq_length': 128,
 'label_values': None,
 'batch_size': 32,
 'learning_rate': 2e-05,
 'num_train_epochs': 3.0,
 'warmup_proportion': 0.1,
 'save_checkpoints_steps': 500,
 'save_summary_steps': 100}

In [10]:
# Train the classifier on the training sample and report the elapsed time.
# A monotonic clock (perf_counter) is used so the measurement cannot be skewed
# by system clock adjustments; wrapping the seconds in a timedelta keeps the
# H:MM:SS.ffffff output format.
start = time.perf_counter()
bc.train(train)
et = timedelta(seconds=time.perf_counter() - start)
print("Training time: {}".format(et))

Training time: 0:00:10.393154


In [11]:
# Evaluate the classifier on the test sample and report the elapsed time.
# A monotonic clock (perf_counter) is used so the measurement cannot be skewed
# by system clock adjustments; wrapping the seconds in a timedelta keeps the
# H:MM:SS.ffffff output format.
start = time.perf_counter()
results = bc.test(test)
et = timedelta(seconds=time.perf_counter() - start)
print("Test time: {}".format(et))
print("Results:")
# Wrap the results in a one-row DataFrame and print that row as a Series so
# each metric appears on its own line.
print(pd.DataFrame([results]).iloc[0])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Test time: 0:00:35.620029
Results:
auc                 0.839536
eval_accuracy       0.840000
f1_score            0.833333
false_negatives     9.000000
false_positives     7.000000
global_step        46.000000
loss                0.330236
precision           0.851064
recall              0.816327
true_negatives     44.000000
true_positives     40.000000
Name: 0, dtype: float64


In [12]:
# Run prediction over the test sample and report the elapsed time.
# A monotonic clock (perf_counter) is used so the measurement cannot be skewed
# by system clock adjustments; wrapping the seconds in a timedelta keeps the
# H:MM:SS.ffffff output format.
start = time.perf_counter()
predictions = bc.predict(test)
et = timedelta(seconds=time.perf_counter() - start)
print("Prediction time: {}".format(et))

Prediction time: 0:00:16.487120


In [13]:
# Preview the first ten predictions as a rich table.
pd.DataFrame(predictions).head(10)

Unnamed: 0,polarity,probabilities,sentence
0,0,"{0: 0.9678986, 1: 0.032101378}",As I sit and think about Poison for the Fairie...
1,0,"{0: 0.90832955, 1: 0.091670424}","First of all, it is interesting to note that o..."
2,1,"{0: 0.32386047, 1: 0.67613953}",This series has recently been unearthed and ex...
3,0,"{0: 0.9676147, 1: 0.032385323}","Oh, the horror, the unspeakable horror of this..."
4,1,"{0: 0.118904956, 1: 0.88109505}",Why this film was only released in 4 states is...
5,0,"{0: 0.9129528, 1: 0.08704721}","The use of ""astral projection""(wandering soul)..."
6,0,"{0: 0.9807438, 1: 0.019256212}",This movie is not very bad tjough. But one can...
7,1,"{0: 0.06782116, 1: 0.9321788}",Based on its current IMDb rating as well as se...
8,0,"{0: 0.97693217, 1: 0.023067826}",That was one of the worst movies I've ever see...
9,0,"{0: 0.9652635, 1: 0.034736495}","I'd never heard of zero budget ""auteur"" Neil J..."
