# Import Dependencies

In [8]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import os
import xml.etree.ElementTree as ET
import tensorflow as tf
import pandas as pd

## Classes

In [9]:
class Opinion:
    """One opinion annotation.

    The constructor stores its five arguments, unchanged and unvalidated, as
    attributes of the same names.

    Attributes:
        target: the opinion target.
        category: the category assigned to the opinion.
        polarity: the polarity assigned to the opinion.
        start: start of the target span (presumably an offset into the
            sentence text -- confirm against the code that builds Opinions).
        end: end of the target span (same caveat as ``start``).
    """

    def __init__(self, target, category, polarity, start, end):
        self.target, self.category, self.polarity = target, category, polarity
        self.start, self.end = start, end

In [10]:
class Sentence:
    """A sentence together with the opinions attached to it.

    Attributes:
        text: the sentence text, stored as given.
        opinions: the opinions passed in; the same object is kept (no copy).
    """

    def __init__(self, text, opinions):
        self.text, self.opinions = text, opinions

In [11]:
class Review:
    """A review, represented by the sentences it is made of.

    Attributes:
        sentences: the value passed to the constructor, stored as-is
            (presumably a list of ``Sentence`` objects -- confirm at call site).
    """

    def __init__(self, sentences):
        self.sentences = sentences

# Instantiate Model

In [12]:
# Load the pretrained "bert-base-uncased" encoder and its matching tokenizer.
# As the load log below reports, the 'classifier' layer is not part of the
# checkpoint and is newly initialized, so the model has to be fine-tuned
# before its predictions mean anything.
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
# Print the per-layer parameter counts as a sanity check on the loaded model.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


# Train and Test Split

In [14]:
# Both subsets are read from the same 'data/train' directory: "test" is the
# 20% validation hold-out of that directory, not a separate data set.
# The identical seed is passed to both calls so the two subsets are drawn from
# the same shuffle and therefore (presumably) do not overlap -- see the Keras
# docs for validation_split/subset.
# batch_size=30000 is far larger than the number of files, so each dataset
# delivers all of its examples in one single batch.
train = tf.keras.preprocessing.text_dataset_from_directory(
    'data/train', batch_size=30000, validation_split=0.2, 
    subset='training', seed=42)
test = tf.keras.preprocessing.text_dataset_from_directory(
    'data/train', batch_size=30000, validation_split=0.2, 
    subset='validation', seed=42)

Found 1630 files belonging to 2 classes.
Using 1304 files for training.
Found 1630 files belonging to 2 classes.
Using 326 files for validation.


### Train data to DataFrame

In [15]:
# Pull the first batch out of the training dataset; given the oversized
# batch_size used when the dataset was created, that one batch is the whole
# training subset.
for i in train.take(1):
    train_feat = i[0].numpy()  # texts
    train_lab = i[1].numpy()   # class ids

# NOTE(review): from here on `train` is a DataFrame, no longer the tf.data
# dataset, so this cell cannot be re-run without re-running the split cell.
# Stack the two arrays as rows and transpose to get one row per example.
train = pd.DataFrame([train_feat, train_lab]).T
train.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
# The texts are byte strings at this point; decode them to str.
train['DATA_COLUMN'] = train['DATA_COLUMN'].str.decode("utf-8")
train.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,"Over the years the host, Vittorio, and his cre...",1
1,I really recommend the very simple Unda (Egg) ...,1
2,Excellent spot for holiday get togethers with ...,1
3,It’s just you and your date and an occasional ...,1
4,"Place is open till late, no dress code.",1


### Test data to Dataframe

In [16]:
# Pull the first batch out of the hold-out dataset; given the oversized
# batch_size used when the dataset was created, that one batch is the whole
# hold-out subset.
for j in test.take(1):
    test_feat = j[0].numpy()  # texts
    test_lab = j[1].numpy()   # class ids

# NOTE(review): from here on `test` is a DataFrame, no longer the tf.data
# dataset, so this cell cannot be re-run without re-running the split cell.
# Stack the two arrays as rows and transpose to get one row per example.
test = pd.DataFrame([test_feat, test_lab]).T
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
# The texts are byte strings at this point; decode them to str.
test['DATA_COLUMN'] = test['DATA_COLUMN'].str.decode("utf-8")
test.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,It's great to go for a quick lunch either alon...,1
1,"They never brought us complimentary noodles, i...",0
2,I LOVE their Thai,1
3,The waiter delivered our food while holding wh...,0
4,Worth the trip from Manhattan.,1


# Input Sequences

## InputExample

Doc: https://huggingface.co/transformers/main_classes/processors.html#transformers.data.processors.utils.InputExample
A single training/test example for simple sequence classification.


## InputFeatures

Doc: https://huggingface.co/transformers/main_classes/processors.html#transformers.data.processors.utils.InputFeatures
A single set of features of data.

In [17]:
def data_to_examples(data, COLUMN, LABEL):
    """Convert each row of a DataFrame into an ``InputExample``.

    Args:
        data: DataFrame holding one example per row.
        COLUMN: name of the column whose value becomes ``text_a``.
        LABEL: name of the column whose value becomes ``label``.

    Returns:
        The result of ``data.apply(..., axis=1)``: one ``InputExample`` per
        row, with ``guid`` and ``text_b`` left as ``None``.
    """
    def row_to_example(row):
        # Single-sentence classification: there is no second text segment.
        return InputExample(
            guid=None,
            text_a=row[COLUMN],
            text_b=None,
            label=row[LABEL],
        )

    return data.apply(row_to_example, axis=1)

In [18]:
# Wrap every row of the training and hold-out DataFrames in an InputExample.
train_examples = data_to_examples(train, 'DATA_COLUMN', 'LABEL_COLUMN')
test_examples = data_to_examples(test, 'DATA_COLUMN', 'LABEL_COLUMN')

In [19]:
def examples_to_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and expose them as a ``tf.data.Dataset``.

    Args:
        examples: iterable of objects exposing ``text_a`` and ``label``.
        tokenizer: tokenizer whose ``encode_plus`` method does the encoding.
        max_length: length every sequence is padded or truncated to.

    Returns:
        An unbatched dataset of ``(inputs, label)`` pairs, where ``inputs`` is
        a dict with the int32 entries "input_ids", "attention_mask" and
        "token_type_ids", and ``label`` is an int64 scalar.
    """
    input_keys = ("input_ids", "attention_mask", "token_type_ids")

    def encode(example):
        # Doc: https://huggingface.co/transformers/internal/tokenization_utils.html#transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus
        encoded = tokenizer.encode_plus(
            example.text_a,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
            padding='max_length',
            truncation=True,
        )
        return InputFeatures(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            token_type_ids=encoded["token_type_ids"],
            label=example.label,
        )

    # Tokenization happens once, up front; the generator below only replays
    # the stored features each time the dataset is iterated.
    features = [encode(example) for example in examples]

    def gen():
        for feature in features:
            inputs = {key: getattr(feature, key) for key in input_keys}
            yield inputs, feature.label

    output_types = ({key: tf.int32 for key in input_keys}, tf.int64)
    output_shapes = (
        {key: tf.TensorShape([None]) for key in input_keys},
        tf.TensorShape([]),
    )
    return tf.data.Dataset.from_generator(gen, output_types, output_shapes)

In [20]:
# Training pipeline: shuffle through a 100-element buffer, batch by 32, then
# repeat twice -- one pass over `train_data` therefore visits the training
# examples two times.
train_data = examples_to_dataset(list(train_examples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

# Hold-out pipeline: only batched (no shuffling, no repetition).
test_data = examples_to_dataset(list(test_examples), tokenizer)
test_data = test_data.batch(32)

# Fine tuning

In [21]:
# from_logits=True: the loss is told it receives un-normalised scores
# (presumably the model emits raw logits -- softmax is only applied by hand at
# prediction time). clipnorm=1.0 caps the gradient norm during fine-tuning.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

# NOTE(review): train_data is already repeated twice, so 3 epochs amount to
# 6 passes over the training examples. Validation runs on the hold-out split.
model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported

Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f98c3b4f590>

In [22]:
# Persist the fine-tuned model under model/bert.
# exist_ok=True keeps this cell re-runnable: without it, any run after the
# first one fails with FileExistsError because the directory already exists.
os.makedirs("model/bert", exist_ok=True)
model.save_pretrained("model/bert")


In [64]:
# Load the SemEval-2016 test file. ET.parse opens and closes the file itself
# and decodes it according to the XML declaration, rather than leaving a file
# handle open and depending on the platform's default text encoding.
root = ET.parse('data/semeval16_test.xml').getroot()

# Collect the content of every <text> element located exactly four levels
# below the root (presumably root -> review -> sentences -> sentence -> text),
# in document order. Elements without any text are skipped because the
# tokenizer cannot encode None.
prediction_sentences = [
    text_node.text
    for text_node in root.findall('./*/*/*/text')
    if text_node.text is not None
]

# Tokenize all sentences as one padded batch and run a single forward pass.
tf_batch = tokenizer(prediction_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
# First element of the model output holds the per-class scores; softmax turns
# them into probabilities before picking the most likely class.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
# Index -> name mapping. NOTE(review): assumes class id 0 is the negative
# class and 1 the positive one in the training data -- confirm against the
# sub-directory names under data/train.
labels = ['Negative','Positive']
label = tf.argmax(tf_predictions, axis=1).numpy()
for sentence_text, class_index in zip(prediction_sentences, label):
    print(sentence_text, " ---- ", labels[class_index], '\n')


Yum!  ----  Positive 

Serves really good sushi.  ----  Positive 

Not the biggest portions but adequate.  ----  Negative 

Green Tea creme brulee is a must!  ----  Positive 

Don't leave the restaurant without it.  ----  Negative 

No Comparison  ----  Negative 

– I can't say enough about this place.  ----  Positive 

It has great sushi and even better service.  ----  Positive 

The entire staff was extremely accomodating and tended to my every need.  ----  Positive 

I've been to this restaurant over a dozen times with no complaints to date.  ----  Positive 

Snotty Attitude  ----  Negative 

– We were treated very rudely here one time for breakfast.  ----  Negative 

The owner is belligerent to guests that have a complaint.  ----  Negative 

Good food!  ----  Positive 

– We love breakfast food.  ----  Positive 

This is a great place to get a delicious meal.  ----  Positive 

We never had to wait more than 5 minutes.  ----  Positive 

The staff is pretty friendly.  ----  Positive 