In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import pandas as pd

In [3]:
# Download the ham/spam message dataset and save it locally as spam.csv.
!wget -O spam.csv https://raw.githubusercontent.com/codebasics/deep-learning-keras-tf-tutorial/master/47_BERT_text_classification/spam.csv

--2024-01-10 13:24:01--  https://raw.githubusercontent.com/codebasics/deep-learning-keras-tf-tutorial/master/47_BERT_text_classification/spam.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 480130 (469K) [text/plain]
Saving to: ‘spam.csv’


2024-01-10 13:24:01 (34.0 MB/s) - ‘spam.csv’ saved [480130/480130]



In [12]:
# Load the downloaded CSV into a DataFrame and preview the first rows.
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Model Building (BERT preprocessor and encoder)

In [22]:
# TF-Hub handles for the BERT text preprocessor and the BERT encoder.
PREPROCESSOR_HANDLE = "https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/1"
ENCODER_HANDLE = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-12-h-128-a-2/versions/2"

# Turns raw strings into the input structure the encoder consumes.
preprocessor = hub.KerasLayer(handle=PREPROCESSOR_HANDLE)
# Encoder whose `pooled_output` is used as the sentence embedding later on.
encoder = hub.KerasLayer(handle=ENCODER_HANDLE, trainable=True)

## Data Preprocessing and cleaning

In [24]:
# Binary target: 0 when Category is 'ham', 1 for any other value.
df['spam'] = df['Category'].ne('ham').astype('int64')

In [25]:
# Check that the new `spam` column was added with the expected 0/1 values.
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [26]:
# Count messages per category to see how balanced the classes are.
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

There is a class imbalance in our dataset (4,825 ham vs. 747 spam); let's handle it using SMOTE.

But first, we must convert each message into a sentence embedding, because SMOTE works on numeric feature vectors rather than raw text.

In [29]:
def sentence_embedding(sentences):
  """Embed a batch of sentences with the BERT preprocessor + encoder.

  Args:
    sentences: A batch of raw text strings accepted by `preprocessor`
      (the notebook passes a pandas Series and a Python list).

  Returns:
    The encoder's `pooled_output`: one embedding vector per input sentence.
  """
  bert_inputs = preprocessor(sentences)
  return encoder(bert_inputs)['pooled_output']

Now we can use the `pooled_output` embeddings as the **features (X)** and the `spam` column as the **labels (y)**.

In [34]:
# Labels: the 0/1 spam flag. Features: one pooled BERT embedding per message.
y = df['spam']
X = sentence_embedding(df['Message'])

Now, we can apply SMOTE to balance the data

In [38]:
from imblearn.over_sampling import SMOTE

# Oversample only the minority class (spam) so both classes end up the same size.
# random_state is fixed so the synthetic samples, and every number computed from
# them afterwards, are reproducible on a fresh "Restart & Run All".
# NOTE(review): resampling happens before the train/test split further down, so
# synthetic points can land in the test set. Resampling only the training
# portion would give a more honest evaluation.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_sm, y_sm = smote.fit_resample(X, y)

In [39]:
# Confirm that both classes now have the same number of samples.
y_sm.value_counts()

0    4825
1    4825
Name: spam, dtype: int64

## Splitting the Data into Train and Test Sets

In [41]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled data for testing; the fixed seed keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_sm,
    y_sm,
    test_size=0.2,
    random_state=42,
)

## Model Building (classifier on top of the embeddings)

In [42]:
# Check the feature width; it must match the `input_shape` of the first Dense layer.
X_train.shape

(7720, 128)

In [43]:
# Check that there is exactly one label per training row.
y_train.shape

(7720,)

In [47]:
# Small feed-forward classifier over the 128-d embeddings:
# 8 ReLU units -> dropout(0.1) -> 1 sigmoid unit (probability of spam).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(8, input_shape=(128,), activation='relu'))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output and 0/1 labels.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [49]:
# Train the classifier for 10 epochs on the training split.
model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f38440df9a0>

In [50]:
# Score the held-out split; returns [loss, accuracy].
# NOTE(review): SMOTE was applied before the train/test split, so this test set
# contains synthetic samples and the reported accuracy is likely optimistic.
model.evaluate(X_test, y_test)



[0.10582133382558823, 0.9606217741966248]

In [53]:
def predict_spam(sentence):
  """Return the classifier's spam probability for a single message.

  Args:
    sentence: One text string to classify.

  Returns:
    The output of `model.predict`: an array of shape (1, 1) holding the
    sigmoid score, where values closer to 1 mean "spam".
  """
  # Reuse the same embedding helper that produced the training features so the
  # training-time and inference-time featurization cannot drift apart.
  embedding = sentence_embedding([sentence])

  return model.predict(embedding)

In [55]:
# Smoke-test the classifier on an obviously spammy message.
predict_spam('''
  OMG, DOWNLOAD FREE ROBUX USING THIS LINK BELOW
  WWW.FREE-ROBUX.COM
''')



array([[0.9734588]], dtype=float32)

In [61]:
import pickle

# Serialize the trained classifier to model.pkl.
# NOTE(review): only the Dense classifier is stored here; the BERT preprocessor
# and encoder are separate objects and are still needed to embed text at
# inference time. Keras also provides `model.save(...)` as its native
# persistence API -- consider it if pickle compatibility becomes a problem.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)