# BERT Classification

In [15]:
from datetime import datetime
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf

## Configuration

In [16]:
# Base directory for input data; the IMDB pickle is read from here.
data_dir = "data"

In [17]:
# Only surface TensorFlow log messages at ERROR level to keep cell output readable.
# NOTE(review): `tf.logging` is the TensorFlow 1.x location of this API; under
# TensorFlow 2.x it lives at `tf.compat.v1.logging` — confirm the pinned TF version.
tf.logging.set_verbosity(tf.logging.ERROR)

## Load Data

In [18]:
# Load the pickled IMDB data frame from the data directory.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
imdb_path = f"{data_dir}/imdb_data.pickle.gz"
imdb_data = pd.read_pickle(imdb_path)

In [19]:
# Number of rows drawn from each split, and the base random seed for sampling.
sample_size = 1000
seed = 11989

# Lower-case the split label once, then draw a fixed-size seeded sample per split.
# The test split uses a different seed (seed * 2) from the train split.
split_name = imdb_data.data_set.str.lower()
train = imdb_data[split_name == 'train'].sample(sample_size, random_state=seed)
test = imdb_data[split_name == 'test'].sample(sample_size, random_state=seed * 2)

In [20]:
# Report how many rows ended up in each sampled split.
print(f"Training size: {len(train)}")
print(f"Testing size: {len(test)}")

Training size: 1000
Testing size: 1000


## Set Up BERT

In [21]:
# Project-local module; presumably the source of `BertClassifier` used below.
# NOTE(review): the wildcard import hides which names this module contributes.
# Consider `from bert_classifier import BertClassifier` once it is confirmed
# that no other names from the module are used elsewhere in the notebook.
from bert_classifier import *

In [22]:
# Build the classifier, naming the text column and the label column it should use.
bc = BertClassifier(
    data_column='sentence',
    label_column='polarity',
)

In [23]:
# Display the classifier's configuration (the two column names set above plus
# whatever other settings the classifier exposes).
bc.config

{'bert_url': 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1',
 'output_dir': 'data/output',
 'data_column': 'sentence',
 'label_column': 'polarity',
 'max_seq_length': 128,
 'label_values': None,
 'batch_size': 32,
 'learning_rate': 2e-05,
 'num_train_epochs': 3.0,
 'warmup_proportion': 0.1,
 'save_checkpoints_steps': 500,
 'save_summary_steps': 100}

In [24]:
# Time the training call on the sampled training set.
# NOTE(review): the recorded run finished in ~9 s and the later evaluation reports
# global_step 468, which looks inconsistent with training 1000 rows from scratch.
# Possibly an existing checkpoint in the output directory was reused — confirm,
# and clear the output directory before re-running if a fresh fit is intended.
start = datetime.now()
bc.train(train)
et = datetime.now() - start  # elapsed wall-clock time as a timedelta
print(f"Training time: {et}")

Training time: 0:00:08.629046


In [25]:
# Time the evaluation call on the sampled test set, then print what it returned.
start = datetime.now()
results = bc.test(test)
et = datetime.now() - start  # elapsed wall-clock time as a timedelta
print(f"Test time: {et}")
print("Results:")
# Wrap the results in a one-row frame and print that row as a Series,
# which lists one entry per line.
results_frame = pd.DataFrame([results])
print(results_frame.iloc[0])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Test time: 0:05:22.605909
Results:
auc                  0.873093
eval_accuracy        0.873000
f1_score             0.874877
false_negatives     67.000000
false_positives     60.000000
global_step        468.000000
loss                 0.441786
precision            0.880952
recall               0.868885
true_negatives     429.000000
true_positives     444.000000
Name: 0, dtype: float64


In [26]:
# Time the prediction call on the sampled test set.
start = datetime.now()
predictions = bc.predict(test)
et = datetime.now() - start  # elapsed wall-clock time as a timedelta
print(f"Prediction time: {et}")

Prediction time: 0:05:11.126579


In [27]:
# Preview the first ten predictions as a table.
pd.DataFrame(predictions).head(10)

Unnamed: 0,polarity,probabilities,sentence
0,0,"{0: 0.7381339, 1: 0.26186618}",As I sit and think about Poison for the Fairie...
1,0,"{0: 0.8665315, 1: 0.13346854}","First of all, it is interesting to note that o..."
2,1,"{0: 0.012707621, 1: 0.9872924}",This series has recently been unearthed and ex...
3,0,"{0: 0.9968266, 1: 0.0031734183}","Oh, the horror, the unspeakable horror of this..."
4,1,"{0: 0.0076506557, 1: 0.9923494}",Why this film was only released in 4 states is...
5,0,"{0: 0.87304544, 1: 0.12695454}","The use of ""astral projection""(wandering soul)..."
6,0,"{0: 0.99695075, 1: 0.0030492153}",This movie is not very bad tjough. But one can...
7,1,"{0: 0.012144838, 1: 0.98785514}",Based on its current IMDb rating as well as se...
8,0,"{0: 0.9964149, 1: 0.003585108}",That was one of the worst movies I've ever see...
9,0,"{0: 0.9970387, 1: 0.002961272}","I'd never heard of zero budget ""auteur"" Neil J..."
