In [1]:
# https://github.com/strongio/keras-bert/blob/master/keras-bert.py

In [2]:
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_hub as hub
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Add
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda, GRU

In [3]:
from tqdm import tqdm

In [4]:
# TF-Hub handle of the BERT module; the same URL literal is repeated as the
# BertLayer default and as `bert_path` in a later cell.
BERT_URL = 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'
# NOTE(review): `module` is not referenced by any other cell visible in this
# notebook — confirm it is still needed before keeping this load.
module = hub.Module(BERT_URL)

In [5]:
from bert.tokenization import FullTokenizer

In [6]:
def convert_single_example(tokenizer, example, max_seq_length=256):
    """Turn one example into three fixed-length BERT input lists.

    Returns ``(input_ids, input_mask, segment_ids)``, each a list of exactly
    ``max_seq_length`` integers. A `PaddingInputExample` yields three
    all-zero lists. Any other example has ``example.text_a`` tokenized, cut
    so the ``[CLS]``/``[SEP]`` markers still fit, mapped to ids by the
    tokenizer and right-padded with zeros.
    """
    if isinstance(example, PaddingInputExample):
        blank = [0] * max_seq_length
        # Three independent lists, not three references to one list.
        return list(blank), list(blank), list(blank)

    # Two positions are reserved for the [CLS] and [SEP] markers.
    budget = max_seq_length - 2
    pieces = tokenizer.tokenize(example.text_a)
    if len(pieces) > budget:
        pieces = pieces[:budget]

    tokens = ["[CLS]", *pieces, "[SEP]"]
    # Single-sequence input: every position belongs to segment 0.
    segment_ids = [0] * len(tokens)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # 1 marks a real token, 0 marks padding; only real tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Right-pad all three lists with zeros up to the fixed length.
    shortfall = max_seq_length - len(input_ids)
    if shortfall > 0:
        filler = [0] * shortfall
        input_ids.extend(filler)
        input_mask.extend(filler)
        segment_ids.extend(filler)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids

def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert every example and stack the results into three arrays.

    Each example goes through `convert_single_example` (with a tqdm progress
    bar). Returns a tuple ``(input_ids, input_masks, segment_ids)`` of NumPy
    arrays, each built from one per-example list per row.
    """
    triples = [
        convert_single_example(tokenizer, example, max_seq_length)
        for example in tqdm(examples, desc="Converting examples to features")
    ]
    # Slot 0 = ids, slot 1 = masks, slot 2 = segment ids.
    return tuple(
        np.array([triple[slot] for triple in triples]) for slot in range(3)
    )

In [7]:
def my_metric(y_true, y_pred, threshold=0.5):
    """Percentage of individual label decisions that match the targets.

    The model emits sigmoid probabilities, so comparing them to 0/1 targets
    with exact equality (as this metric used to) almost never matches.
    Predictions are therefore binarised at ``threshold`` before comparing.

    Args:
        y_true: tensor of 0/1 targets.
        y_pred: tensor of predicted probabilities, same shape as ``y_true``.
        threshold: probability above which a label counts as predicted.

    Returns:
        Scalar float32 tensor in [0, 100]: the share of (sample, label)
        entries where the thresholded prediction equals the target.
    """
    targets = tf.cast(y_true, tf.float32)
    decisions = tf.cast(tf.greater(y_pred, threshold), tf.float32)
    hits = tf.cast(tf.math.equal(targets, decisions), tf.float32)
    return 100.0 * tf.reduce_mean(hits)

In [8]:
# Build model
def build_model(max_seq_length, num_classes):
    """Assemble, compile and return the BERT classifier.

    Three inputs of shape ``(max_seq_length,)`` feed a `BertLayer` with its
    top 3 encoder layers set to fine-tune; the layer's output goes through a
    single sigmoid `Dense` layer with ``num_classes`` units. The model is
    compiled with binary cross-entropy and Adam, and its summary is printed.
    """
    bert_inputs = [
        tf.keras.layers.Input(shape=(max_seq_length,), name=input_name)
        for input_name in ("input_ids", "input_masks", "segment_ids")
    ]

    # An LSTM/Dense stack between BERT and the classifier was tried earlier
    # and is not used; the classifier sits directly on the BERT output.
    pooled = BertLayer(n_fine_tune_layers=3)(bert_inputs)
    scores = tf.keras.layers.Dense(num_classes, activation="sigmoid")(pooled)

    classifier = tf.keras.models.Model(inputs=bert_inputs, outputs=scores)
    classifier.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["categorical_accuracy", my_metric],
    )
    classifier.summary()

    return classifier

In [9]:
class BertLayer(tf.keras.layers.Layer):
    """Keras layer that wraps a TF-Hub BERT module and pools its output.

    Args:
        n_fine_tune_layers: number of encoder layers, counted down from
            ``encoder/layer_11``, whose variables are registered as trainable.
        pooling: ``"first"`` returns the module's ``pooled_output``;
            ``"mean"`` returns the mask-weighted mean of ``sequence_output``.
        bert_path: TF-Hub handle of the BERT module to load.
        output_size: width reported by `compute_output_shape`. Defaults to
            768 (the default handle names an H-768 model); set it when
            ``bert_path`` points at a module with another hidden size.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.

    Raises:
        NameError: if ``pooling`` is neither ``"first"`` nor ``"mean"``.
    """

    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="mean",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        output_size=768,
        **kwargs,
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = output_size
        self.pooling = pooling
        self.bert_path = bert_path
        if self.pooling not in ["first", "mean"]:
            raise self._pooling_error()

        super(BertLayer, self).__init__(**kwargs)

    def _pooling_error(self):
        """Build the error raised for an unsupported ``pooling`` value."""
        return NameError(
            f"Undefined pooling type (must be either first or mean, but is {self.pooling})"
        )

    def get_config(self):
        """Return a config from which the layer can be re-created.

        Every key added here is also a constructor argument, so
        ``BertLayer(**layer.get_config())`` works; ``bert_path`` is included
        so a non-default module survives the round trip.
        """
        config = super().get_config().copy()
        config.update({
            'output_size': self.output_size,
            'pooling': self.pooling,
            'n_fine_tune_layers': self.n_fine_tune_layers,
            'bert_path': self.bert_path,
            'trainable': self.trainable
        })
        return config

    def build(self, input_shape):
        """Load the hub module and split its variables into trainable/frozen."""
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name=f"{self.name}_module"
        )

        # Remove unused layers
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
            trainable_layers = ["pooler/dense"]

        elif self.pooling == "mean":
            trainable_vars = [
                var
                for var in trainable_vars
                if not "/cls/" in var.name and not "/pooler/" in var.name
            ]
            trainable_layers = []
        else:
            raise self._pooling_error()

        # Select how many layers to fine tune, starting from the top encoder
        # layer (index 11) and walking down.
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        # Everything else in the module is tracked as frozen.
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run BERT on ``[input_ids, input_mask, segment_ids]`` and pool.

        The three inputs are cast to int32 before being fed to the module's
        ``tokens`` signature.
        """
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "pooled_output"
            ]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]

            # Mean over axis 1 counting only positions where the mask is 1;
            # the epsilon guards against an all-zero mask.
            input_mask = tf.cast(input_mask, tf.float32)
            masked = result * tf.expand_dims(input_mask, axis=-1)
            pooled = tf.reduce_sum(masked, axis=1) / (
                tf.reduce_sum(input_mask, axis=1, keepdims=True) + 1e-10
            )
        else:
            raise self._pooling_error()

        return pooled

    def compute_output_shape(self, input_shape):
        """Return ``(input_shape[0], output_size)``."""
        return (input_shape[0], self.output_size)

In [10]:
# class BertLayer(tf.keras.layers.Layer):
#     def __init__(
#         self,
#         n_fine_tune_layers=10,
#         pooling="mean",
#         bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
#         **kwargs,
#     ):
#         self.n_fine_tune_layers = n_fine_tune_layers
#         self.trainable = True
#         self.output_size = 768
#         self.pooling = pooling
#         self.bert_path = bert_path
#         if self.pooling not in ["first", "mean"]:
#             raise NameError(
#                 f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
#             )

#         super(BertLayer, self).__init__(**kwargs)
        
#     def get_config(self):

#         config = super().get_config().copy()
#         config.update({
#             'output_size': self.output_size,
#             'pooling': self.pooling,
#             'n_fine_tune_layers': self.n_fine_tune_layers,
#             'trainable': self.trainable
#         })
#         return config

#     def build(self, input_shape):
#         self.bert = hub.Module(
#             self.bert_path, trainable=self.trainable, name=f"{self.name}_module"
#         )

#         # Remove unused layers
#         trainable_vars = self.bert.variables
#         if self.pooling == "first":
#             trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
#             trainable_layers = ["pooler/dense"]

#         elif self.pooling == "mean":
#             trainable_vars = [
#                 var
#                 for var in trainable_vars
#                 if not "/cls/" in var.name and not "/pooler/" in var.name
#             ]
#             trainable_layers = []
#         else:
#             raise NameError(
#                 f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
#             )

#         # Select how many layers to fine tune
#         for i in range(self.n_fine_tune_layers):
#             trainable_layers.append(f"encoder/layer_{str(11 - i)}")

#         # Update trainable vars to contain only the specified layers
#         trainable_vars = [
#             var
#             for var in trainable_vars
#             if any([l in var.name for l in trainable_layers])
#         ]

#         # Add to trainable weights
#         for var in trainable_vars:
#             self._trainable_weights.append(var)

#         for var in self.bert.variables:
#             if var not in self._trainable_weights:
#                 self._non_trainable_weights.append(var)

#         super(BertLayer, self).build(input_shape)

#     def call(self, inputs):
#         inputs = [K.cast(x, dtype="int32") for x in inputs]
#         input_ids, input_mask, segment_ids = inputs
#         bert_inputs = dict(
#             input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
#         )
#         result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
#             "sequence_output"
#         ]
#         return result

#     def compute_output_shape(self, input_shape):
#         return (input_shape[0], self.output_size)

In [11]:
def create_tokenizer_from_hub_module(bert_path):
    """Get the vocab file and casing info from the Hub module.

    The lookup runs in its own temporary graph and session, so the function
    no longer depends on a module-level ``sess`` existing beforehand and
    does not add another copy of the BERT module to the default graph.

    Args:
        bert_path: TF-Hub handle of the BERT module.

    Returns:
        A ``FullTokenizer`` built from the module's ``vocab_file`` and
        ``do_lower_case`` values.
    """
    with tf.Graph().as_default():
        bert_module = hub.Module(bert_path)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as session:
            vocab_file, do_lower_case = session.run(
                [tokenization_info["vocab_file"], tokenization_info["do_lower_case"]]
            )

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

In [12]:
def convert_text_to_examples(texts):
    """Wrap each entry of ``texts`` in an `InputExample`.

    Every entry is treated as a sequence of strings and joined with single
    spaces to form ``text_a``; ``guid`` and ``text_b`` are left as ``None``.
    """
    return [
        InputExample(guid=None, text_a=" ".join(pieces), text_b=None)
        for pieces in texts
    ]

In [13]:
class InputExample:
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None):
        """Store the fields of one example.

        Args:
          guid: Unique id for the example.
          text_a: string. The untokenized text of the first sequence. For
            single-sequence tasks this is the only text needed.
          text_b: (Optional) string. The untokenized text of the second
            sequence; only needed for sequence-pair tasks.
        """
        self.guid, self.text_a, self.text_b = guid, text_a, text_b

In [14]:
class PaddingInputExample:
    """Placeholder example used to fill a batch up to a fixed size.

    When running eval/predict on a TPU the number of examples must be a
    multiple of the batch size, because the TPU requires a fixed batch size.
    Dropping the last batch instead would lose part of the output. A
    dedicated class is used rather than `None` because treating `None` as
    padding batches could cause silent errors.
    """


In [15]:
def initialize_vars(sess):
    """Run the local, global and table initializers in ``sess``, in that
    order, then register ``sess`` as the Keras backend session."""
    initializer_factories = (
        tf.local_variables_initializer,
        tf.global_variables_initializer,
        tf.tables_initializer,
    )
    for make_init_op in initializer_factories:
        sess.run(make_init_op())
    K.set_session(sess)

In [16]:
# Load the training CSV; later cells read its 'Review' column and the label columns.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere unless this is replaced by a configurable data directory.
data = pd.read_csv(r'C:\Users\Anna\Files\SentimentAnalysis\ugam\train.csv')

In [17]:
# Inspect the (rows, columns) shape of the loaded frame.
data.shape

(6136, 14)

In [18]:
# Hold out 10% of the rows as a dev set. The fixed seed keeps the split, and
# everything derived from it, identical across Restart & Run All.
train, dev = train_test_split(data, test_size=0.1, random_state=42)

In [19]:
# Inspect the (rows, columns) shape of the training split.
train.shape

(5522, 14)

In [20]:
# One limit used in two places below: the per-review word cap applied when
# building the *_text arrays, and the max_seq_length passed to
# convert_examples_to_features / build_model.
max_seq_length = 150

In [21]:
def _clip_reviews(frame):
    """Return the frame's 'Review' texts, each cut to its first
    ``max_seq_length`` whitespace-separated words, as an (N, 1) object array."""
    clipped = [
        " ".join(review.split()[:max_seq_length])
        for review in frame['Review'].tolist()
    ]
    return np.array(clipped, dtype=object)[:, np.newaxis]


train_text = _clip_reviews(train)
dev_text = _clip_reviews(dev)

In [22]:
# Target columns, in the order used for the model outputs.
label_columns = [
    'Components', 'Delivery and Customer Support',
    'Design and Aesthetics', 'Dimensions', 'Features', 'Functionality',
    'Installation', 'Material', 'Price', 'Quality', 'Usability',
    'Polarity',
]
num_labels = len(label_columns)
train_labels = train[label_columns].to_numpy()
dev_labels = dev[label_columns].to_numpy()

In [23]:
# Create the TF1 session. Later cells rely on this module-level `sess`
# (for example, it is passed to initialize_vars).
sess = tf.Session()







In [24]:
# Build the BERT tokenizer from the vocab/casing info of this Hub module.
# `bert_path` repeats the URL already stored in BERT_URL above.
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
tokenizer = create_tokenizer_from_hub_module(bert_path)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore








In [25]:
# Wrap the truncated review texts of both splits in InputExample objects.
train_examples, dev_examples = (
    convert_text_to_examples(texts) for texts in (train_text, dev_text)
)

In [26]:
# Tokenize both splits into fixed-length id / mask / segment arrays.
train_input_ids, train_input_masks, train_segment_ids = convert_examples_to_features(
    tokenizer, train_examples, max_seq_length=max_seq_length
)
dev_input_ids, dev_input_masks, dev_segment_ids = convert_examples_to_features(
    tokenizer, dev_examples, max_seq_length=max_seq_length
)

Converting examples to features: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 5522/5522 [00:02<00:00, 2604.98it/s]
Converting examples to features: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 614/614 [00:00<00:00, 2355.93it/s]


In [27]:
# Sanity check: one row per training example, max_seq_length columns.
train_input_ids.shape

(5522, 150)

In [28]:
# Sanity check: one row per training example, one column per label.
train_labels.shape

(5522, 12)

In [29]:
# Stop once the dev loss has not improved for 3 epochs and roll back to the
# best weights. Monitoring 'val_loss' (model.fit below is given
# validation_data) ties stopping and weight restoration to held-out
# performance; monitoring the training 'loss' would keep the weights that
# best fit the training set instead.
my_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)

In [30]:
# Build and compile the classifier: inputs of length max_seq_length, one
# sigmoid output per label column. build_model also prints the summary.
model = build_model(max_seq_length, num_labels)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 150)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 150)]        0                                            
__________________________________________________________________________________________________
bert_layer (BertLayer)          (None, 768)          110104890   input_ids[0][0]                  
                                                                 input_masks[0][0]            

In [31]:
# Run the variable/table initializers in `sess` and register it with Keras.
# NOTE(review): presumably this must come after build_model so the
# initializers cover the variables the model created — confirm.
initialize_vars(sess)

























In [32]:
# Train for at most 20 epochs in batches of 32, evaluating on the dev split
# after each epoch; my_callback may end training early.
train_inputs = [train_input_ids, train_input_masks, train_segment_ids]
dev_inputs = [dev_input_ids, dev_input_masks, dev_segment_ids]
model.fit(
    train_inputs,
    train_labels,
    validation_data=(dev_inputs, dev_labels),
    epochs=20,
    batch_size=32,
    callbacks=[my_callback],
)

Train on 5522 samples, validate on 614 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


<tensorflow.python.keras.callbacks.History at 0x1d89cbb0d88>

In [33]:
# Persist the trained weights only (not the architecture), so reloading
# requires rebuilding the model with build_model first.
# NOTE(review): absolute, machine-specific output path.
model.save_weights(r'C:\Users\Anna\Files\SentimentAnalysis\ugam\models\bert2')

In [34]:
# Load the test reviews and apply the same word-level truncation and (N, 1)
# object-array layout used for the train/dev texts.
test = pd.read_csv(r'C:\Users\Anna\Files\SentimentAnalysis\ugam\test.csv')
test_text = np.array(
    [" ".join(review.split()[:max_seq_length]) for review in test['Review'].tolist()],
    dtype=object,
)[:, np.newaxis]

In [35]:
# Wrap the truncated test texts in InputExample objects.
test_examples = convert_text_to_examples(test_text)

In [36]:
# Tokenize the test split into fixed-length id / mask / segment arrays.
test_input_ids, test_input_masks, test_segment_ids = convert_examples_to_features(
    tokenizer, test_examples, max_seq_length=max_seq_length
)

Converting examples to features: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 2631/2631 [00:01<00:00, 2456.19it/s]


In [37]:
# Per-label sigmoid scores for every test review.
test_inputs = [test_input_ids, test_input_masks, test_segment_ids]
pred = model.predict(test_inputs, batch_size=32)

In [38]:
# Save the raw per-label scores. The column names are passed as a flat list:
# a nested list ([[...]]) makes pandas build a one-level MultiIndex, so
# df['Price'] would return a DataFrame instead of a Series. The CSV header
# written below is the same either way.
df = pd.DataFrame(
    data=pred,
    columns=[
        'Components', 'Delivery and Customer Support',
        'Design and Aesthetics', 'Dimensions', 'Features', 'Functionality',
        'Installation', 'Material', 'Price', 'Quality', 'Usability',
        'Polarity',
    ],
)
df.to_csv(r'C:\Users\Anna\Files\SentimentAnalysis\ugam\results\bert3_predictions.csv', index=False)

In [39]:
# Binarise the predicted scores at a 0.5 cut-off: 1 where pred > 0.5, else 0.
pred_binary = np.where(pred > 0.5, 1, 0)

In [40]:
# Label the binarised predictions. Column names are a flat list so the frame
# gets an ordinary Index rather than an accidental one-level MultiIndex.
results = pd.DataFrame(
    data=pred_binary,
    columns=[
        'Components', 'Delivery and Customer Support',
        'Design and Aesthetics', 'Dimensions', 'Features', 'Functionality',
        'Installation', 'Material', 'Price', 'Quality', 'Usability',
        'Polarity',
    ],
)

In [41]:
# Write the current `results` frame (binarised predictions) without the index column.
# NOTE(review): absolute, machine-specific output path.
results.to_csv(r'C:\Users\Anna\Files\SentimentAnalysis\ugam\results\res_bert3_05.csv', index = False)

In [42]:
# Same export with a 0.3 cut-off. Column names are a flat list so the frame
# gets an ordinary Index rather than an accidental one-level MultiIndex.
pred_binary = np.where(pred > 0.3, 1, 0)
results = pd.DataFrame(
    data=pred_binary,
    columns=[
        'Components', 'Delivery and Customer Support',
        'Design and Aesthetics', 'Dimensions', 'Features', 'Functionality',
        'Installation', 'Material', 'Price', 'Quality', 'Usability',
        'Polarity',
    ],
)
results.to_csv(r'C:\Users\Anna\Files\SentimentAnalysis\ugam\results\res_bert3_03.csv', index=False)

In [43]:
# Same export with a 0.05 cut-off. Column names are a flat list so the frame
# gets an ordinary Index rather than an accidental one-level MultiIndex.
# The bare `len(pred_binary)` that used to sit mid-cell was dropped: it was
# not the cell's last expression, so its value was never displayed.
pred_binary = np.where(pred > 0.05, 1, 0)
results = pd.DataFrame(
    data=pred_binary,
    columns=[
        'Components', 'Delivery and Customer Support',
        'Design and Aesthetics', 'Dimensions', 'Features', 'Functionality',
        'Installation', 'Material', 'Price', 'Quality', 'Usability',
        'Polarity',
    ],
)
results.to_csv(r'C:\Users\Anna\Files\SentimentAnalysis\ugam\results\res_bert3_005.csv', index=False)

In [44]:
# Number of rows in the current pred_binary array (one per test review).
len(pred_binary)


2631