# BERT Classification

In [1]:
from datetime import datetime
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf

## Configuration

In [3]:
# Directory holding the input data; a relative path, so it is resolved against
# the notebook's working directory. Used below to locate the IMDB pickle.
data_dir = "data"

In [4]:
# Only surface TensorFlow log messages at ERROR level or above.
# NOTE(review): `tf.logging` is the TensorFlow 1.x location of this API; newer
# releases expose it as `tf.compat.v1.logging` — confirm before upgrading TF.
tf.logging.set_verbosity(tf.logging.ERROR)

## Load Data

In [5]:
# Load the gzip-compressed, pickled IMDB DataFrame from the data directory.
# SECURITY: unpickling can execute arbitrary code, so only load this file if
# it comes from a trusted source.
imdb_data = pd.read_pickle(f"{data_dir}/imdb_data.pickle.gz")

In [6]:
sample_size = 500
seed = 11989


def _sample_split(data, split_name, n, random_state):
    """Draw `n` random rows from the rows of `data` tagged with `split_name`.

    Parameters
    ----------
    data : DataFrame with a string `data_set` column naming each row's split.
    split_name : lower-case split label to select (matched case-insensitively
        against `data_set`).
    n : number of rows to sample (without replacement).
    random_state : seed passed to `DataFrame.sample` for reproducibility.

    Returns
    -------
    DataFrame containing the `n` sampled rows.

    Raises
    ------
    ValueError
        If the split holds fewer than `n` rows. pandas would raise ValueError
        here anyway; this message says which split is short and by how much.
    """
    subset = data[data.data_set.str.lower() == split_name]
    if len(subset) < n:
        raise ValueError(
            f"Requested {n} '{split_name}' rows but only {len(subset)} are available"
        )
    return subset.sample(n, random_state=random_state)


# Each split keeps its own random_state (seed and seed*2), as before.
train = _sample_split(imdb_data, 'train', sample_size, seed)
test = _sample_split(imdb_data, 'test', sample_size, seed * 2)

In [7]:
# Confirm both samples have the requested number of rows.
print(f"Training size: {len(train)}")
print(f"Testing size: {len(test)}")

Training size: 500
Testing size: 500


## Set Up BERT

In [8]:
# Import only the name this notebook uses instead of a wildcard, so the origin
# of `BertClassifier` is explicit and the notebook namespace is not polluted.
from bert_classifier import BertClassifier

W0604 14:49:24.797538 13092 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [9]:
# Build the classifier, naming the DataFrame columns that hold the review text
# ('sentence') and the label ('polarity'). No other options are passed here.
bc = BertClassifier(data_column='sentence', label_column='polarity')

In [10]:
# Display the classifier's effective configuration for this run.
bc.config

{'bert_url': 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1',
 'output_dir': 'data/output',
 'data_column': 'sentence',
 'label_column': 'polarity',
 'max_seq_length': 128,
 'label_values': None,
 'batch_size': 32,
 'learning_rate': 2e-05,
 'num_train_epochs': 3.0,
 'warmup_proportion': 0.1,
 'save_checkpoints_steps': 500,
 'save_summary_steps': 100}

In [11]:
# Train on the sampled training rows and report the elapsed wall-clock time.
start = datetime.now()
bc.train(train)
et = datetime.now() - start
print(f"Training time: {et}")

Training time: 0:00:54.281849


In [12]:
# Evaluate on the sampled test rows, timing the call.
# NOTE(review): in the saved output, f1_score (0.68) does not equal
# 2*P*R/(P+R) (~0.46) for the reported precision/recall, and global_step is 4
# where the displayed config (batch_size 32, 3 epochs, 500 rows) suggests ~46 —
# confirm how bert_classifier computes these before relying on the numbers.
start = datetime.now()
results = bc.test(test)
et = datetime.now() - start
print(f"Test time: {et}", "Results:", sep="\n")
# Wrap the metrics in a one-row frame and take that row as a Series so each
# metric prints on its own line.
print(pd.DataFrame([results]).iloc[0])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Test time: 0:06:26.642179
Results:
auc                  0.543673
eval_accuracy        0.538000
f1_score             0.682477
false_negatives    159.000000
false_positives     72.000000
global_step          4.000000
loss                 0.684899
precision            0.581395
recall               0.386100
true_negatives     169.000000
true_positives     100.000000
Name: 0, dtype: float64


In [13]:
# Run prediction over the sampled test rows and report the elapsed time.
start = datetime.now()
predictions = bc.predict(test)
et = datetime.now() - start
print(f"Prediction time: {et}")

Prediction time: 0:05:48.799207


In [15]:
# Preview the first ten predictions as a table.
pd.DataFrame(predictions).head(10)

Unnamed: 0,polarity,probabilities,sentence
0,0,"{0: 0.5062175, 1: 0.49378252}",As I sit and think about Poison for the Fairie...
1,1,"{0: 0.46726444, 1: 0.5327356}","First of all, it is interesting to note that o..."
2,1,"{0: 0.4934246, 1: 0.50657547}",This series has recently been unearthed and ex...
3,0,"{0: 0.5307941, 1: 0.46920598}","Oh, the horror, the unspeakable horror of this..."
4,1,"{0: 0.43584517, 1: 0.56415486}",Why this film was only released in 4 states is...
5,1,"{0: 0.4841201, 1: 0.51587987}","The use of ""astral projection""(wandering soul)..."
6,1,"{0: 0.41334036, 1: 0.5866596}",This movie is not very bad tjough. But one can...
7,1,"{0: 0.44178122, 1: 0.5582188}",Based on its current IMDb rating as well as se...
8,1,"{0: 0.34644163, 1: 0.65355843}",That was one of the worst movies I've ever see...
9,1,"{0: 0.485607, 1: 0.51439303}","I'd never heard of zero budget ""auteur"" Neil J..."
