# BERT Sentiment Classifier with Tensorflow

In [158]:
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

from sklearn.model_selection import train_test_split

tf.get_logger().setLevel('ERROR')

In [4]:
%load_ext watermark

In [159]:
%watermark -p tensorflow,tensorflow_hub,tensorflow_text,sklearn

tensorflow     : 2.13.1
tensorflow_hub : 0.16.1
tensorflow_text: 2.13.0
sklearn        : 1.3.2



In [138]:
# Data downloaded from https://www.kaggle.com/datasets/farisdurrani/sentimentsearch

df=pd.concat([
    pd.read_csv("../data/farisdurrani/twitter_filtered.csv"),
    pd.read_csv("../data/farisdurrani/facebook_filtered.csv")
])

In [149]:
len(df)

821081

In [150]:
df.head()

Unnamed: 0,platform,bodyText,sentiment,date,country,Target
0,Twitter,@Kenichan I dived many times for the ball. Man...,0.4939,2009-04-06,,1.0
1,Twitter,"@nationwideclass no, it's not behaving at all....",-0.4939,2009-04-06,,-1.0
2,Twitter,Need a hug,0.4767,2009-04-06,,1.0
3,Twitter,@LOLTrish hey long time no see! Yes.. Rains a...,0.6208,2009-04-06,,1.0
4,Twitter,@Tatiana_K nope they didn't have it,0.0,2009-04-06,,0.0


In [151]:
df = df.dropna(subset=['sentiment'], axis=0)

In [154]:
df['Target'] = df['sentiment'].apply(lambda x: 1 if x==0 else np.sign(x)+1).astype(int)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Target'] = df['sentiment'].apply(lambda x: 1 if x==0 else np.sign(x)+1).astype(int)


Unnamed: 0,platform,bodyText,sentiment,date,country,Target
0,Twitter,@Kenichan I dived many times for the ball. Man...,0.4939,2009-04-06,,2
1,Twitter,"@nationwideclass no, it's not behaving at all....",-0.4939,2009-04-06,,0
2,Twitter,Need a hug,0.4767,2009-04-06,,2
3,Twitter,@LOLTrish hey long time no see! Yes.. Rains a...,0.6208,2009-04-06,,2
4,Twitter,@Tatiana_K nope they didn't have it,0.0,2009-04-06,,1


In [155]:
df.Target.value_counts(normalize=True)

Target
2    0.476433
1    0.262097
0    0.261471
Name: proportion, dtype: float64

In [188]:
df = df.sample(frac=0.5)

In [189]:
X_train, _X, y_train, _y = train_test_split(df['bodyText'], df['Target'], stratify=df['Target'], test_size=0.2)
X_val, X_test, y_val, y_test = train_test_split(_X, _y, stratify=_y, test_size=0.5)

In [190]:
print(f"Train : ({X_train.shape, y_train.shape}) samples")
print(f"Val : ({X_val.shape, y_val.shape}) samples")
print(f"Test : ({X_test.shape, y_test.shape}) samples")

Train : (((328355,), (328355,))) samples
Val : (((41044,), (41044,))) samples
Test : (((41045,), (41045,))) samples


In [191]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).cache().prefetch(buffer_size=tf.data.AUTOTUNE)
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).cache().prefetch(buffer_size=tf.data.AUTOTUNE)

In [192]:
for x, y in train_ds.batch(4).take(1):
    print(x, y)

tf.Tensor(
[b'just back from a gig!, imadethis and Furlo rawked! met up with some old friends too &gt;.&lt; thanks for all the nice comments '
 b'@nerdist YAY FOR @COLINMELOY '
 b"Wow am I tired..was up talking to my bffl Daisha til 12ish..she was talking some sense into me..Maybe I'll eat some cake to wake up "
 b'has lost her voice, '], shape=(4,), dtype=string) tf.Tensor([2 2 2 0], shape=(4,), dtype=int64)


2024-05-10 16:46:28.695647: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [193]:
y_test.value_counts()

Target
2    19561
1    10786
0    10698
Name: count, dtype: int64

In [32]:
BERT_MODEL_NAME = 'small_bert/bert_en_uncased_L-2_H-128_A-2'
TOKENIZER_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
BERT_MODEL_URL = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1'
tokenizer = hub.KerasLayer(TOKENIZER_URL)
bert_model = hub.KerasLayer(BERT_MODEL_URL)

In [33]:
text_test = ['sometimes i wish i was a panda']
text_preprocessed = tokenizer(text_test)

In [34]:
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :10]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :10]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :10]}')

Keys       : ['input_mask', 'input_type_ids', 'input_word_ids']
Shape      : (1, 128)
Word Ids   : [  101  2823  1045  4299  1045  2001  1037 25462   102     0]
Input Mask : [1 1 1 1 1 1 1 1 1 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0]


In [35]:
bert_model(text_preprocessed)

{'default': <tf.Tensor: shape=(1, 128), dtype=float32, numpy=
 array([[-0.9999753 ,  0.16285095, -0.9991155 ,  0.94176614, -0.99974805,
          0.06312796, -0.998636  ,  0.4763149 ,  0.13516821, -0.02416325,
         -0.6557846 , -0.04298883, -0.1316844 ,  1.        , -0.88311327,
         -0.8912707 ,  0.90224254,  0.00613371, -0.8089084 ,  0.99620104,
          0.9636094 ,  0.08133331,  0.99200433,  0.95425904, -0.999996  ,
          0.05177542, -0.9996473 ,  0.9649326 ,  0.9903885 ,  0.07173721,
          0.10204275,  0.09355631, -0.97235924, -0.15794337,  0.7986553 ,
          0.9996173 , -0.6733732 , -0.155746  ,  0.8948879 , -0.99969214,
          0.7239788 ,  0.9898049 , -0.9991481 ,  0.9894511 , -0.9999779 ,
         -0.22439677, -0.99981123,  0.99636346,  0.9813393 ,  0.9832557 ,
          0.9895193 , -0.43293884,  0.0314689 ,  0.9930125 ,  0.99891675,
          0.999489  , -0.9907511 , -0.68675447,  0.5443378 , -0.77976435,
         -0.05668417,  0.29758197, -0.84936285,  0

In [253]:
class SentimentBERT(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.tokenizer = hub.KerasLayer(TOKENIZER_URL, name='tokenizer')
        self.bert_model = hub.KerasLayer(BERT_MODEL_URL, trainable=True, name='bert_model')
        self.dropout = tf.keras.layers.Dropout(0.1)
        self.final = tf.keras.layers.Dense(3, activation=None)
        
    def call(self, inputs):
        x = self.tokenizer(inputs)
        x = self.bert_model(x)
        x = self.dropout(x['pooled_output'])
        out = self.final(x)
        return out

In [254]:
classifier = SentimentBERT()

In [255]:
bert_raw_result = classifier(tf.constant(text_test))
print(tf.keras.activations.softmax(bert_raw_result))

tf.Tensor([[0.14317669 0.43381873 0.42300454]], shape=(1, 3), dtype=float32)


In [256]:
bert_raw_result = classifier(x)
print(tf.keras.activations.softmax(bert_raw_result))

tf.Tensor(
[[0.2786571  0.41875377 0.30258903]
 [0.28298354 0.28624907 0.43076733]
 [0.11631086 0.41331226 0.47037688]
 [0.11895362 0.3087614  0.572285  ]], shape=(4, 3), dtype=float32)


In [None]:
tf.keras.utils.plot_model(classifier)

In [257]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.metrics.SparseCategoricalAccuracy()
optimizer = tf.keras.optimizers.Adam()

In [258]:
classifier.compile(optimizer=optimizer, loss=loss_fn, metrics=metric)

In [259]:
trainable_params = np.sum([np.prod(v.get_shape()) for v in classifier.trainable_weights])
non_trainable_params = np.sum([np.prod(v.get_shape()) for v in classifier.non_trainable_weights])
total_params = trainable_params + non_trainable_params
    
print(f"Trainable params : {trainable_params}")
print(f"Total params : {total_params}")
print(f"% : {trainable_params/total_params*100:.4f}")

Trainable params : 4386307
Total params : 4386308.0
% : 100.0000


In [260]:
print(f'Training model with {BERT_MODEL_NAME}')
history = classifier.fit(
    x=X_train.values,
    y=y_train.values,
    validation_data=(X_val.values, y_val.values),
    epochs=2,
    batch_size=512)

Training model with small_bert/bert_en_uncased_L-2_H-128_A-2
Epoch 1/2
Epoch 2/2


In [262]:
classifier.save("bert_classifier.tf")

In [263]:
reverse_labels = {v: k for k, v in sentiment_to_target.items()}

In [264]:
def sentiment_score(comment):
    inp = tf.data.Dataset.from_tensors(comment).batch(1)
    for x in inp.take(1):
        print(x)
    bert_raw_result = classifier(x)
    y = tf.keras.activations.softmax(bert_raw_result)
    print(y)
    for n, x in enumerate(y[0]):
        print(f"{reverse_labels[n]}: {100*x:.2f}%")

In [265]:
bert_raw_result = classifier(tf.constant(text_test))
print(tf.keras.activations.softmax(bert_raw_result))

tf.Tensor([[7.5321941e-04 1.0718697e-03 9.9817491e-01]], shape=(1, 3), dtype=float32)


In [266]:
sentiment_score("I hate watching this")

tf.Tensor([b'I hate watching this'], shape=(1,), dtype=string)
tf.Tensor([[0.997603   0.00139782 0.00099914]], shape=(1, 3), dtype=float32)
Negative: 99.76%
Neutral: 0.14%
Positive: 0.10%


In [267]:
sentiment_score("I really love this ring, it's so beautiful !")

tf.Tensor([b"I really love this ring, it's so beautiful !"], shape=(1,), dtype=string)
tf.Tensor([[4.4591504e-04 1.4268260e-03 9.9812728e-01]], shape=(1, 3), dtype=float32)
Negative: 0.04%
Neutral: 0.14%
Positive: 99.81%


In [268]:
sentiment_score("This place is a scam, i highly disrecommend")

tf.Tensor([b'This place is a scam, i highly disrecommend'], shape=(1,), dtype=string)
tf.Tensor([[0.68084687 0.03579305 0.28336   ]], shape=(1, 3), dtype=float32)
Negative: 68.08%
Neutral: 3.58%
Positive: 28.34%


In [269]:
sentiment_score("I don't know what to say")

tf.Tensor([b"I don't know what to say"], shape=(1,), dtype=string)
tf.Tensor([[7.085762e-04 9.987644e-01 5.271097e-04]], shape=(1, 3), dtype=float32)
Negative: 0.07%
Neutral: 99.88%
Positive: 0.05%


```py

# %%writefile ../app/   msrc/models/tf_bert.py
import pandas as pd
import numpy as np
import argparse
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import logging

from sklearn.model_selection import train_test_split

logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', level=logging.DEBUG)
logging.getLogger().setLevel(logging.INFO)

BERT_MODEL_NAME = 'small_bert/bert_en_uncased_L-2_H-128_A-2'
TOKENIZER_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
BERT_MODEL_URL = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1'


class SentimentBERT(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.tokenizer = hub.KerasLayer(TOKENIZER_URL, name='tokenizer')
        self.bert_model = hub.KerasLayer(BERT_MODEL_URL, trainable=True, name='bert_model')
        self.dropout = tf.keras.layers.Dropout(0.1)
        self.final = tf.keras.layers.Dense(3, activation=None)
        
    def call(self, inputs):
        x = self.tokenizer(inputs)
        x = self.bert_model(x)
        x = self.dropout(x['pooled_output'])
        out = self.final(x)
        return out
    
    
def train_and_evaluate(**params):
    
    epochs = int(params.get('epochs'))
    batch_size = int(params.get('batch_size'))
    learning_rate = float(params.get('learning_rate'))
    
    df=pd.concat([
        pd.read_csv("data/farisdurrani/twitter_filtered.csv"),
        pd.read_csv("data/farisdurrani/facebook_filtered.csv")
    ])
    df = df.dropna(subset=['sentiment'], axis=0)
    df['Target'] = df['sentiment'].apply(lambda x: 1 if x==0 else np.sign(x)+1).astype(int)

    X_train, _X, y_train, _y = train_test_split(df['bodyText'], df['Target'], stratify=df['Target'], test_size=0.2)
    X_val, X_test, y_val, y_test = train_test_split(_X, _y, stratify=_y, test_size=0.5)
    
    logging.info(f"Train : ({X_train.shape, y_train.shape}) samples")
    logging.info(f"Val : ({X_val.shape, y_val.shape}) samples")
    logging.info(f"Test : ({X_test.shape, y_test.shape}) samples")
    
    classifier = SentimentBERT()
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.metrics.SparseCategoricalAccuracy()
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    
    classifier.compile(optimizer=optimizer, loss=loss_fn, metrics=metric)
    
    stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', 
        min_delta=0, 
        patience=10, 
        verbose=2, 
        mode='min',
        baseline=None, 
        restore_best_weights=True
    )

    logging.info(f'Training model with {BERT_MODEL_NAME}')
    if args.dry_run:
        logging.info("Dry run mode")
        epochs = 1
        steps_per_epoch = 1
    else:
        steps_per_epoch = None
        
    history = classifier.fit(
        x=X_train.values,
        y=y_train.values,
        validation_data=(X_val.values, y_val.values),
        epochs=epochs,
        batch_size=batch_size,
        steps_per_epoch=steps_per_epoch,
        callbacks=[stopping])
    
    if args.dry_run:
        # If dry run, we do not run the evaluation
        return None
    
    res = classifier.evaluate(
        x=X_test.values, 
        y=y_test.values,
        verbose=2
    )
    
    metrics = {
        'train_acc': history.history['accuracy'],
        'val_acc': history.history['val_accuracy'],
        'test_acc': res[-1],
    }
    logging.info(metrics)
    
    # save model and architecture to single file
    if params.get('job_dir') is None:
        logging.warning("No job dir provided, model will not be saved")
    else:
        logging.info("Saving model to {} ".format(params.get('job_dir')))
        classifier.save(params.get('job_dir'))
    logging.info("Bye bye")
    
    
if __name__ == '__main__':
    # Create arguments here
    parser = argparse.ArgumentParser()
    parser.add_argument('--job-dir', required=True)
    parser.add_argument('--epochs', type=float, default=2)
    parser.add_argument('--batch-size', type=float, default=1024)
    parser.add_argument('--learning-rate', type=float, default=0.01)
    parser.add_argument('--dry-run', action="store_true")

    # Parse them
    args, _ = parser.parse_known_args()

    # Execute training
    train_and_evaluate(
        job_dir=args.job_dir,
        batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        epochs=args.epochs
    )
```

In [273]:
%%writefile ../app/src/models/tensorflow_bert/requirements.txt

tensorflow==2.13.1
tensorflow_hub==0.16.1
tensorflow_text==2.13.0
scikit-learn==1.3.2

Writing ../app/src/models/tensorflow_bert/requirements.txt


In [276]:
PROJECT_ID = ...
IMAGE_NAME=f'bert_tf_sentiment'
IMAGE_TAG='latest'
IMAGE_URI='eu.gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, IMAGE_TAG)

!gcloud builds submit --tag $IMAGE_URI ../app/src/Dockerfile

Reauthentication required.
[1;31mERROR:[0m (gcloud.builds.submit) There was a problem refreshing your current auth tokens: Reauthentication failed. Please run `gcloud auth login` to complete reauthentication with SAML.
Please run:

  $ gcloud auth login

to obtain new credentials.

If you have already logged in with a different account, run:

  $ gcloud config set account ACCOUNT

to select an already authenticated account to use.
