In [1]:
import pandas as pd

In [14]:
df = pd.read_csv('datasets/spam.csv', encoding='latin-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [15]:
df=df[['v1','v2']]

In [6]:
df.shape

(5572, 2)

In [7]:
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [16]:
df = df.rename(columns={'v1': 'label', 'v2': 'message'})

In [17]:
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [19]:
x=df['message']
y=df['label']

In [22]:
x=x.str.lower().str.replace(r'[^\w\s]','',regex = True)

In [23]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 42)

# using TF-IDF (Term Frequency-Inverse Document Frequency):

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

In [33]:
x_train_tfidf=tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

x_train_ann = x_train_tfidf.toarray()
x_test_ann = x_test_tfidf.toarray()
y_train_ann = y_train.map({'ham':0, 'spam':1}).values
y_test_ann = y_test.map({'ham':0, 'spam':1}).values

In [41]:
from keras.layers import Dense,Dropout
from keras.models import Sequential
import keras_tuner as kt
from tensorflow.keras.callbacks import EarlyStopping


# tuning hyperparameter

In [58]:
def build_model(hp):
    model = Sequential()

    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(Dense(
            units=hp.Int('units_' + str(i), min_value=32, max_value=128, step=32),
            activation='relu'))
        model.add(Dropout(hp.Float('dropout_' + str(i), 0.2, 0.5, step=0.1)))

    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'])
    return model

In [62]:
tuner = kt.RandomSearch(build_model,
                        objective='val_accuracy',
                        max_trials=10,
                        directory='models',
                        project_name='spam1',)

In [63]:
tuner.search(x_train_ann, y_train_ann, epochs=5,validation_data=(x_test_ann, y_test_ann))

Trial 10 Complete [00h 00m 06s]
val_accuracy: 0.9847533702850342

Best val_accuracy So Far: 0.9847533702850342
Total elapsed time: 00h 01m 28s


In [65]:
tuned_model = tuner.get_best_models(num_models=1)[0]


# training model

In [68]:
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

tuned_model.fit(
    x_train_ann, y_train_ann,
    epochs=100,
    initial_epoch=5,
    validation_data=(x_test_ann, y_test_ann),
    class_weight={0: 1.0, 1: 6.0},
    callbacks=[early_stop])


Epoch 6/100
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 1.0000 - loss: 7.8281e-08 - val_accuracy: 0.9865 - val_loss: 0.2938
Epoch 7/100
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 1.0000 - loss: 3.7844e-08 - val_accuracy: 0.9865 - val_loss: 0.2942
Epoch 8/100
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 1.0000 - loss: 9.0827e-08 - val_accuracy: 0.9857 - val_loss: 0.2954
Epoch 9/100
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 1.0000 - loss: 3.1324e-08 - val_accuracy: 0.9857 - val_loss: 0.2958


<keras.src.callbacks.history.History at 0x1eafda6f0e0>

# evaluation

In [69]:
from sklearn.metrics import confusion_matrix, classification_report

# Convert probabilities to binary 0 or 1
y_pred = (tuned_model.predict(x_test_ann) > 0.5).astype("int32")

print("--- Confusion Matrix ---")
print(confusion_matrix(y_test_ann, y_pred))
print("\n--- Classification Report ---")
print(classification_report(y_test_ann, y_pred))

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
--- Confusion Matrix ---
[[961   4]
 [ 11 139]]

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       965
           1       0.97      0.93      0.95       150

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



# saving model

In [73]:
import pickle
import joblib

tuned_model.save('spam_ann_model.keras')

joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']