In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed


# --- Load the dataset, encode the labels, and create the train/test split ---
# Build the frame in a single chain: assigning a column into a frame that was
# produced by selecting a column subset is the pattern pandas may flag with
# SettingWithCopyWarning, and a chain keeps this step re-runnable from the
# raw CSV.
data = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .rename(columns={"v1": "label", "v2": "text"})
    .loc[:, ["label", "text"]]
    # Binary target: ham -> 0, spam -> 1. Any other value maps to NaN.
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
)

# Fail here with a clear message instead of letting NaN labels surface later
# as an opaque error during model fitting.
unmapped_labels = data['label'].isna()
if unmapped_labels.any():
    raise ValueError(
        f"{int(unmapped_labels.sum())} row(s) in spam.csv have a label other than 'ham' or 'spam'"
    )

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
)


# --- Baseline: linear SVM on TF-IDF features ---
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vocabulary.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train the classifier, then predict labels for the held-out messages.
svm_model = SVC(probability=True, kernel='linear')
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)

# Report overall accuracy followed by the per-class breakdown.
accuracy_svm = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.2f}")
print("\nSVM Classification Report:\n")
print(svm_report)


# --- CNN on padded token-id sequences ---
max_words = 5000  # vocabulary cap shared by the tokenizer and the embedding table
max_len = 100     # every message is padded/truncated to this many token ids

# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling are repeatable, matching the fixed random_state used for the
# train/test split. (Bit-exact results on GPU are not guaranteed by this alone.)
set_random_seed(42)

# Fit the tokenizer on the training texts only; words outside its vocabulary
# are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad at the end of each sequence so all inputs have length max_len.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Embedding -> 1D convolution -> global max pool -> small dense head with a
# single sigmoid unit for the binary spam/ham decision.
# The deprecated `input_length` argument is intentionally omitted: the input
# length is inferred from the data on the first call to fit().
cnn_model = Sequential([
    Embedding(max_words, 128),
    Conv1D(64, 3, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 20% of the training data is held back by Keras for per-epoch validation.
cnn_model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Final evaluation on the untouched test split.
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test_pad, y_test)
print(f"CNN Accuracy: {cnn_accuracy:.2f}")


# --- Classify one hand-written example with both trained models ---
sample_email = ["Congratulations! You've won a $1,000 Walmart gift card. Click here to claim your prize."]

# Apply the same preprocessing used at training time: the fitted tokenizer and
# padding for the CNN, the fitted TF-IDF vectorizer for the SVM.
sample_seq = tokenizer.texts_to_sequences(sample_email)
sample_pad = pad_sequences(sample_seq, maxlen=max_len, padding='post')
cnn_prediction = cnn_model.predict(sample_pad)
svm_prediction = svm_model.predict(vectorizer.transform(sample_email))

# The CNN ends in a single sigmoid unit, so predict() yields one row per input
# with one column. Index the scalar element before thresholding: calling int()
# on a 1-element array is deprecated in recent NumPy releases.
print("CNN Prediction (1 = Spam, 0 = Ham):", int(cnn_prediction[0, 0] > 0.5))
print("SVM Prediction (1 = Spam, 0 = Ham):", svm_prediction[0])


SVM Accuracy: 0.98

SVM Classification Report:

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115





Epoch 1/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 27ms/step - accuracy: 0.8469 - loss: 0.4654 - val_accuracy: 0.9686 - val_loss: 0.1485
Epoch 2/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.9802 - loss: 0.0891 - val_accuracy: 0.9843 - val_loss: 0.0621
Epoch 3/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.9952 - loss: 0.0201 - val_accuracy: 0.9843 - val_loss: 0.0631
Epoch 4/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.9979 - loss: 0.0071 - val_accuracy: 0.9854 - val_loss: 0.0593
Epoch 5/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 1.0000 - loss: 0.0022 - val_accuracy: 0.9865 - val_loss: 0.0621
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9834 - loss: 0.0486
CNN Accuracy: 0.98
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21