#Notebook Setup

In [302]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as k
import tensorflow_hub as hub
import keras_nlp
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

##Load cleaned dataset

In [303]:
df = pd.read_csv('/content/data_clean.csv')
df.head()

Unnamed: 0,label,text
0,0,jurong point crazy available bugis n great wor...
1,0,ok lar Joking wif u oni
2,1,free entry 2 wkly comp win FA Cup final tkts 2...
3,0,u dun early hor u c
4,0,nah I think usf live


##Dataset Preparation

In [304]:
# Dataset Preparation
BATCH_SIZE = 32
NUM_WORDS = 7000
EPOCHS = 5
SEQUENCE_LENGTH = 100
PRESET = "distil_bert_base_en_uncased"

##Build training and validation set

In [306]:
df['text'] = df['text'].astype(str)
x_train, x_val, y_train, y_val = train_test_split(df['text'], df['label'], test_size=0.2, stratify=df['label'])

##Load BERT lightweight version model

In [307]:
# Load DistilBERT Preprocessor and Backbone
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(PRESET, sequence_length=SEQUENCE_LENGTH)
backbone = keras_nlp.models.DistilBertBackbone.from_preset(PRESET)

##Preprocessing

In [308]:
train_data = tf.data.Dataset.from_tensor_slices((x_train.to_numpy(), y_train.to_numpy())).batch(BATCH_SIZE)
val_data = tf.data.Dataset.from_tensor_slices((x_val.to_numpy(), y_val.to_numpy())).batch(BATCH_SIZE)

In [309]:
def build_distilbert_model():
    inputs = k.layers.Input(shape=(), dtype=tf.string, name="text")
    x = preprocessor(inputs)
    x = backbone(x)
    x = k.layers.GlobalAveragePooling1D()(x)
    x = k.layers.Dropout(0.1)(x)
    outputs = k.layers.Dense(1, activation='sigmoid')(x)
    return k.Model(inputs, outputs)

distilbert_model = build_distilbert_model()

In [310]:
OPTIMIZER = k.optimizers.Adam(learning_rate=2e-5)
distilbert_model.compile(
    optimizer=OPTIMIZER,
    loss=k.losses.BinaryCrossentropy(),
    metrics=[k.metrics.BinaryAccuracy(), k.metrics.Recall(), k.metrics.Precision()],
    )

In [311]:
distilbert_model.summary()

In [313]:
def scheduler(epoch, lr):
    lr = lr * np.exp(-0.1)
    return lr
lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [314]:
history = distilbert_model.fit(train_data, validation_data=val_data, epochs=EPOCHS, callbacks=[lr])

Epoch 1/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1000s[0m 7s/step - binary_accuracy: 0.9106 - loss: 0.1883 - precision_21: 0.6989 - recall_21: 0.5729 - val_binary_accuracy: 0.9928 - val_loss: 0.0217 - val_precision_21: 0.9796 - val_recall_21: 0.9664 - learning_rate: 1.8097e-05
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m976s[0m 7s/step - binary_accuracy: 0.9905 - loss: 0.0344 - precision_21: 0.9680 - recall_21: 0.9582 - val_binary_accuracy: 0.9946 - val_loss: 0.0161 - val_precision_21: 1.0000 - val_recall_21: 0.9597 - learning_rate: 1.6375e-05
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m972s[0m 7s/step - binary_accuracy: 0.9960 - loss: 0.0158 - precision_21: 0.9940 - recall_21: 0.9745 - val_binary_accuracy: 0.9946 - val_loss: 0.0225 - val_precision_21: 1.0000 - val_recall_21: 0.9597 - learning_rate: 1.4816e-05
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m973s[0m 7s/step - binary_accuracy: 

In [316]:
distilbert_model.evaluate(val_data)

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 2s/step - binary_accuracy: 0.9962 - loss: 0.0151 - precision_21: 1.0000 - recall_21: 0.9704


[0.014553407207131386, 0.9964125752449036, 0.9731543660163879, 1.0]

In [319]:
# Model Evaluation
# Evaluation
def evaluate_model(model, x_test, y_test):
    y_pred = distilbert_model.predict(x_test)
    y_pred = np.where(y_pred < 0.5, 0, 1)
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

evaluate_model(distilbert_model, x_val, y_val)

ValueError: Invalid dtype: object

Let's redefine a preprocessing function to test the model

In [102]:
import re
import spacy
nlp = spacy.load("en_core_web_sm")
from spacy.lang.en.stop_words import STOP_WORDS

In [112]:
def preprocess_text(text):
  text = re.sub(r'[^a-zA-Z0-9 ]','',text.strip())
  return ' '.join([token.lemma_ for token in nlp(text) if (token.lemma_ not in STOP_WORDS) & (token.text not in STOP_WORDS)])

In [113]:
# Single Message Prediction
def predict_spam(model, text):
    processed_text = tokenizer.texts_to_sequences([preprocess_text(text)])
    padded_text = k.preprocessing.sequence.pad_sequences(processed_text, maxlen=100, padding='post')
    pred = bert_model.predict(padded_text)[0][0]
    return "Spam" if pred > 0.5 else "Ham"

In [114]:
# SPAM Message
sample_text = "Congratulations! You’ve won a $500 Amazon gift card. Claim it here "
print("Prediction:", predict_spam(bert_model, sample_text))

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'

In [106]:
# NOT a SPAM Message
sample_text = "The dog is in the garden. "
print("Prediction:", predict_spam(model, sample_text))

NameError: name 'model' is not defined