# Install & Import

In [None]:
!pip install transformers



In [None]:
# TensorFlow
import tensorflow as tf
import tensorflow_datasets as tfds

# HuggingFace NLP library
from transformers import TFBertForSequenceClassification, BertTokenizer

# Model Evaluation
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import datetime

# Import Data

In [None]:
# Load the IMDB reviews (pandas reads the zipped CSV directly) and preview the first rows.
df = pd.read_csv("IMDB Dataset.csv.zip")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Check the class balance of the raw string labels.
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [None]:
# Replace the string labels with integer category codes. Codes follow the sorted
# category order, so 'negative' becomes 0 and 'positive' becomes 1.
df['sentiment'] = df['sentiment'].astype('category').cat.codes
# Re-check the balance on the encoded labels.
df['sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; targets are the integer sentiment codes.
X = df['review']
y = df['sentiment']

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Label distribution in the training split.
y_train.value_counts()

0    20039
1    19961
Name: sentiment, dtype: int64

In [None]:
# Label distribution in the test split.
y_test.value_counts()

1    5039
0    4961
Name: sentiment, dtype: int64

In [None]:
# Bundle each split into a two-column frame: review text plus its integer label.
train = pd.DataFrame({'text': X_train, 'label': y_train})
test = pd.DataFrame({'text': X_test, 'label': y_test})

# Persist both splits without the index so each CSV holds exactly the text,label columns.
train.to_csv('imdb_review_train.csv', index=False)
test.to_csv('imdb_review_test.csv', index=False)

In [None]:
# Column spec shared by both CSV files: a string review (default "") followed by
# an int64 label (default 0).
csv_record_defaults = [tf.constant([""], dtype=tf.string), tf.constant([0], dtype=tf.int64)]

# Stream each split from disk, skipping the header row, and cap the number of
# records used (20000 for training, 5000 for testing).
ds_train = tf.data.experimental.CsvDataset(
    ['imdb_review_train.csv'],
    record_defaults=csv_record_defaults,
    header=True,
).take(20000)
ds_test = tf.data.experimental.CsvDataset(
    ['imdb_review_test.csv'],
    record_defaults=csv_record_defaults,
    header=True,
).take(5000)

# Peek at a few raw (text, label) records from the test split.
for msg, label in ds_test.take(5):
  print(msg, label)

tf.Tensor(b"I really liked this Summerslam due to the look of the arena, the curtains and just the look overall was interesting to me for some reason. Anyways, this could have been one of the best Summerslam's ever if the WWF didn't have Lex Luger in the main event against Yokozuna, now for it's time it was ok to have a huge fat man vs a strong man but I'm glad times have changed. It was a terrible main event just like every match Luger is in is terrible. Other matches on the card were Razor Ramon vs Ted Dibiase, Steiner Brothers vs Heavenly Bodies, Shawn Michaels vs Curt Hening, this was the event where Shawn named his big monster of a body guard Diesel, IRS vs 1-2-3 Kid, Bret Hart first takes on Doink then takes on Jerry Lawler and stuff with the Harts and Lawler was always very interesting, then Ludvig Borga destroyed Marty Jannetty, Undertaker took on Giant Gonzalez in another terrible match, The Smoking Gunns and Tatanka took on Bam Bam Bigelow and the Headshrinkers, and Yokozuna 

# Preprocessing

In [None]:
# Pretrained checkpoint name; both the tokenizer and the classifier are loaded from it.
MODEL_NAME = 'bert-large-cased'

In [None]:
# Load the BERT tokenizer from the MODEL_NAME checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Maximum sequence length, in tokens, that every review is padded or truncated to.
# An earlier assignment derived from the mean review length in characters was
# overwritten immediately by this value (a dead store), so it has been removed.
MAX_LENGTH = 128

In [None]:
def convert_sentence_to_features(sentence):
    """Tokenize one review into fixed-length BERT inputs.

    The text is wrapped in the model's special tokens, truncated to
    ``MAX_LENGTH`` tokens if longer, and padded up to ``MAX_LENGTH`` if shorter.

    Args:
        sentence: The raw review text to encode.

    Returns:
        The encoding produced by ``tokenizer.encode_plus``. Callers in this
        notebook read its ``'input_ids'``, ``'token_type_ids'`` and
        ``'attention_mask'`` entries.
    """
    return tokenizer.encode_plus(
        sentence,
        max_length=MAX_LENGTH,
        add_special_tokens=True,
        # `padding='max_length'` is the supported replacement for the
        # deprecated `pad_to_max_length=True` flag.
        padding='max_length',
        return_attention_mask=True,
        truncation=True
    )

In [None]:
def map_features_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Repack positional feature values into a (features dict, label) pair.

    Args:
        input_ids: Value stored under the ``'input_ids'`` key.
        attention_masks: Value stored under the ``'attention_mask'`` key.
        token_type_ids: Value stored under the ``'token_type_ids'`` key.
        label: Target value, returned unchanged as the second tuple element.

    Returns:
        A 2-tuple ``(features, label)`` where ``features`` is a dict keyed by
        ``'input_ids'``, ``'token_type_ids'`` and ``'attention_mask'``.
    """
    features = dict(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_masks,
    )
    return features, label

In [None]:
def encode_sentences(dataset):
    """Tokenize every (text, label) record of ``dataset`` into a feature dataset.

    Each record's byte string is decoded and passed through
    ``convert_sentence_to_features``. All encodings are collected in memory and
    then turned into a ``tf.data.Dataset`` whose elements are reshaped by
    ``map_features_to_dict``.

    Args:
        dataset: A dataset yielding ``(message, label)`` pairs, where
            ``message`` is a byte string once converted to numpy.

    Returns:
        A ``tf.data.Dataset`` of ``(features dict, label)`` elements.
    """
    ids_rows, type_rows, mask_rows, label_rows = [], [], [], []

    for raw_text, raw_label in tfds.as_numpy(dataset):
        encoded = convert_sentence_to_features(raw_text.decode())
        ids_rows.append(encoded['input_ids'])
        type_rows.append(encoded['token_type_ids'])
        mask_rows.append(encoded['attention_mask'])
        # Each label is wrapped in a one-element list.
        label_rows.append([raw_label])

    # Tuple order must match map_features_to_dict's positional parameters:
    # (input_ids, attention_masks, token_type_ids, label).
    sliced = tf.data.Dataset.from_tensor_slices(
        (ids_rows, mask_rows, type_rows, label_rows)
    )
    return sliced.map(map_features_to_dict)

# Encode Dataset

In [None]:
# Number of examples per batch for both the training and the test dataset.
# NOTE(review): the recorded output of the fit cell ends in ResourceExhaustedError;
# with the bert-large-cased checkpoint this batch size may exceed the available
# memory — consider lowering it (TODO confirm on the target hardware).
BATCH_SIZE = 32
# Size of the buffer used when shuffling the training examples.
SHUFFLE_BUFFER_SIZE = 10000

In [None]:
# Tokenize both splits; only the training data is shuffled before batching.
ds_train_encoded = encode_sentences(ds_train).shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
ds_test_encoded = encode_sentences(ds_test).batch(BATCH_SIZE)



In [None]:
# Inspect a single encoded test batch (features dict and labels).
for msg, label in ds_test_encoded.take(1):
  print(msg, label)

{'input_ids': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[  101,   146,  1541, ...,   117, 23402,   102],
       [  101,  1753,  1242, ...,  2749,   117,   102],
       [  101,  1109,  1273, ...,  1143,  1199,   102],
       ...,
       [  101,  2677,  4067, ...,  1122,   112,   102],
       [  101,   141,  6834, ...,  1119,  1108,   102],
       [  101,  1188,  1273, ...,   112,  1115,   102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)>}

# Train Model

In [None]:
# Training hyperparameters: number of passes over the training data, and the
# learning rate handed to the Adam optimizer.
EPOCHS = 2
LEARNING_RATE = 1e-6

In [None]:
# Load the pretrained BERT checkpoint with a 2-class sequence classification head.
model = TFBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Configure Model

In [None]:
# Adam optimizer at the configured learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, epsilon=1e-08)
# The loss is computed from raw logits (from_logits=True) against integer class labels.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Accuracy against integer labels, reported under the name "accuracy".
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")

# Compile Model

In [None]:
# Attach the optimizer, loss and metric to the model before training.
model.compile(optimizer=optimizer,
              loss=loss,
              metrics=[metric])

In [None]:
# Print the layer and parameter-count summary.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  333579264 
_________________________________________________________________
dropout_73 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  2050      
Total params: 333,581,314
Trainable params: 333,581,314
Non-trainable params: 0
_________________________________________________________________


# TensorBoard

In [None]:
# Load (or reload, if already loaded) the TensorBoard notebook extension.
%reload_ext tensorboard

In [None]:
# Timestamped run directory so the logs of separate runs are kept apart.
log_dir = 'logs/fit/' + datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,  # compute weight histograms every epoch
    profile_batch=0,   # profiling disabled
)

In [None]:
# Display the callback object's repr to confirm it was created.
tensorboard_callback

<tensorflow.python.keras.callbacks.TensorBoard at 0x7f441ee83650>

# Fit Model

In [None]:
# Train on the encoded training set and validate on the encoded test set after
# each epoch. The TensorBoard callback is passed explicitly: it was created
# earlier but never handed to fit(), so no logs were being written to log_dir.
history = model.fit(ds_train_encoded,
                    epochs=EPOCHS,
                    validation_data=ds_test_encoded,
                    callbacks=[tensorboard_callback])

Epoch 1/2
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


ResourceExhaustedError: ignored

# Evaluation

In [None]:
# Evaluate on both splits. evaluate() yields two values here (the loss and the
# single compiled accuracy metric); the loss is discarded.
_, train_acc = model.evaluate(ds_train_encoded)
_, test_acc = model.evaluate(ds_test_encoded)

# Report the accuracies side by side to compare train and test performance.
print('train acc:', train_acc)
print('test acc:', test_acc)