In [None]:
# Install "transformer" library
!pip install transformers

In [None]:
# Import Libraries:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from transformers import BertTokenizer, TFBertForSequenceClassification

In [None]:
# Fetch the pre-trained BERT checkpoint: classifier model first, then its tokenizer
model_name = 'bert-base-uncased'
bert_model = TFBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # two output classes (the labels below are encoded as 0 / 1)
)
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# Print the Keras summary of the loaded classification model
bert_model.summary()

In [None]:
# Read the spam/ham e-mail CSV into a DataFrame and preview its first rows
data = pd.read_csv(filepath_or_buffer="/content/sample_data/email.csv")
data.head(n=5)

In [None]:
# Pull the "Message" column out as a plain Python list of texts
email_texts = data["Message"].tolist()

In [None]:
# Preview only the first few messages; printing the whole list would flood
# the cell output with the entire corpus.
print(email_texts[:5])

In [None]:
# Encode the target as a NumPy array: "spam" -> 1, anything else (ham) -> 0
labels = (data["Category"] == "spam").to_numpy(dtype="int64")

In [None]:
# Inspect the encoded label array (1 = spam, 0 = ham)
print(labels)

In [None]:
# Tokenize every message, padding and truncating to at most max_length tokens,
# and return TensorFlow tensors
max_length = 128
encoded_inputs = tokenizer(
    email_texts,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors='tf',
)

# Report the shape of each encoded tensor, each followed by a blank line
for field in ("input_ids", "token_type_ids", "attention_mask"):
    print(getattr(encoded_inputs, field).shape)
    print()

In [None]:
# Split data into training and validation sets.
# Rows are shuffled with a fixed seed before splitting: a plain head/tail
# slice would make the validation set depend on the row order of the CSV
# (e.g. if it happens to be grouped by class), and the seed keeps the split
# reproducible across runs.
train_ratio = 0.8
split_seed = 42
num_samples = len(email_texts)
num_train_samples = int(num_samples * train_ratio)

shuffled_idx = np.random.default_rng(split_seed).permutation(num_samples)
train_idx = shuffled_idx[:num_train_samples]
test_idx = shuffled_idx[num_train_samples:]

# tf.gather selects rows by index from each encoded tensor
train_inputs = {key: tf.gather(val, train_idx) for key, val in encoded_inputs.items()}
test_inputs = {key: tf.gather(val, test_idx) for key, val in encoded_inputs.items()}
train_labels = labels[train_idx]
test_labels = labels[test_idx]

In [None]:
# Configure training: accuracy metric, cross-entropy on raw logits, Adam optimizer
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

bert_model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[metric],
)

In [None]:
# Look up the GPU device name; the training cell places model.fit on it.
import tensorflow as tf
device_name = tf.test.gpu_device_name()
# The former `with tf.device(device_name): pass` block executed nothing, so it
# is dropped. Instead, report what was found so a missing GPU is not silent.
if device_name:
  print(f"Using GPU device: {device_name}")
else:
  print("No GPU device found; training will use the default device and may be slow.")

In [None]:
# Fit the model on the training split, validating on the held-out split
epochs = 20
batch_size = 64

with tf.device(device_name):
    history = bert_model.fit(
        train_inputs,
        train_labels,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(test_inputs, test_labels),
    )

In [None]:
# Print the recorded training / validation loss and accuracy curves
for key in ('loss', 'accuracy', 'val_loss', 'val_accuracy'):
    print(history.history[key])

In [None]:
# Plot function
def plotHistory(val,nm):
  """Draw a small line plot of ``val`` and display it.

  val: sequence of values to plot on the y-axis.
  nm:  name used as the y-axis label (via ``str``) and as the title.
  """
  _, ax = plt.subplots(figsize=(5,2))
  ax.plot(val)
  ax.set_xlabel("Count of Iteration")
  ax.set_ylabel(str(nm))
  ax.set_title(nm)
  plt.show()

In [None]:
# Training loss curve
plotHistory(val=history.history['loss'], nm='loss')

In [None]:
# Training accuracy curve
plotHistory(val=history.history['accuracy'], nm='accuracy')

In [None]:
# Validation loss curve
plotHistory(val=history.history['val_loss'], nm='val_loss')

In [None]:
# Validation accuracy curve
plotHistory(val=history.history['val_accuracy'], nm='val_accuracy')

In [None]:
# Run the fine-tuned model on the held-out inputs
pred_lebel = bert_model.predict(x=test_inputs)

In [None]:
# Show the raw object returned by bert_model.predict
print(pred_lebel)

In [None]:
# The logits attribute is the part of the prediction used for the class labels below
pred_lebel.logits

In [None]:
# Convert logits to class labels by taking the index of the largest logit per row
predicted_labels = np.asarray(pred_lebel.logits).argmax(axis=-1)
# Show the shape of the predicted labels
print(predicted_labels.shape)

In [None]:
# Show the predicted labels
print(predicted_labels)

In [None]:
# Import the evaluation helpers and show the raw confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
confusion_matrix(y_true=test_labels, y_pred=predicted_labels)

In [None]:
# Plot the confusion matrix.
# ConfusionMatrixDisplay.plot() opens its own figure unless it is given an
# Axes, so a separate plt.figure(...) call only produced a stray empty figure
# and its figsize never applied. Create the Axes explicitly and pass it in.
cm = confusion_matrix(test_labels, predicted_labels, labels=[0,1])
# Tick labels follow the encoding used for `labels`: 0 = ham, 1 = spam
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["ham", "spam"])
fig, ax = plt.subplots(figsize=(4,4))
disp.plot(ax=ax)
ax.set_title("Confusion matrix (validation split)")
plt.show()

In [None]:
# Overall accuracy of the predictions on the held-out split
accuracy_score(y_true=test_labels, y_pred=predicted_labels)