# SMS Spam Detection Project
This notebook presents a complete pipeline to classify SMS messages as spam or ham using both Machine Learning and Deep Learning approaches.

## 1. Dataset Overview & Preprocessing
We begin by loading the labeled SMS Spam Collection dataset and converting it into a format suitable for model training.

# SMS Spam Detection
This notebook includes complete code to classify SMS messages using Machine Learning and Deep Learning models.

## 2. Machine Learning Models
We apply traditional machine learning models (Random Forest, XGBoost, LightGBM) using TF-IDF vectorized text data.

In [1]:
# Data Loading and Preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_csv('../data/SMSSpamCollection.csv', encoding='latin-1')[['label', 'text']]
df['label_num'] = df.label.map({'ham':0, 'spam':1})

X = df['text']
y = df['label_num']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

## 3. Deep Learning Models
We implement deep learning models (ANN, LSTM, CNN) to capture sequential features of text data.

In [2]:
# Machine Learning Models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

models_ml = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

for name, model in models_ml.items():
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)
    print(f"\nResults for {name}:")
    print(classification_report(y_test, preds))


Results for Random Forest:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Results for XGBoost:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       0.98      0.80      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115

[LightGBM] [Info] Number of positive: 597, number of negative: 3860
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003592 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5646
[LightGBM] [Info] Number of data

In [3]:
# Deep Learning Models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 1000
max_len = 150
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)

# ANN Model
model_ann = Sequential([
    Embedding(max_words, 32, input_length=max_len),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_ann.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_ann.fit(X_train_seq, y_train, epochs=3, validation_split=0.1)
print("\nANN Evaluation:")
model_ann.evaluate(X_test_seq, y_test)

# LSTM Model
model_lstm = Sequential([
    Embedding(max_words, 32, input_length=max_len),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.fit(X_train_seq, y_train, epochs=3, validation_split=0.1)
print("\nLSTM Evaluation:")
model_lstm.evaluate(X_test_seq, y_test)

# CNN Model
model_cnn = Sequential([
    Embedding(max_words, 32, input_length=max_len),
    Conv1D(32, 3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(1, activation='sigmoid')
])

model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn.fit(X_train_seq, y_train, epochs=3, validation_split=0.1)
print("\nCNN Evaluation:")
model_cnn.evaluate(X_test_seq, y_test)

Epoch 1/3




[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8373 - loss: 0.3625 - val_accuracy: 0.9552 - val_loss: 0.1722
Epoch 2/3
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9776 - loss: 0.1288 - val_accuracy: 0.9709 - val_loss: 0.1439
Epoch 3/3
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9905 - loss: 0.1035 - val_accuracy: 0.9709 - val_loss: 0.1435

ANN Evaluation:
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9837 - loss: 0.1180
Epoch 1/3
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 41ms/step - accuracy: 0.8460 - loss: 0.4084 - val_accuracy: 0.9641 - val_loss: 0.1343
Epoch 2/3
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 40ms/step - accuracy: 0.9841 - loss: 0.0676 - val_accuracy: 0.9709 - val_loss: 0.0746
Epoch 3/3
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 61m

[0.06402628123760223, 0.9811659455299377]

## 4. Final Testing on Unlabeled Dataset
We apply all trained models to an external unlabeled dataset provided for evaluation and compare their predictions.

In [4]:
# Load professor's dataset
test_df = pd.read_csv('../data/spam_texts.csv', encoding='latin-1')
test_texts = test_df['text']

# Vectorize and tokenize
test_vec = vectorizer.transform(test_texts)
test_seq = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=150)

# Predictions from all models
test_df['pred_rf'] = models_ml['Random Forest'].predict(test_vec)
test_df['pred_xgb'] = models_ml['XGBoost'].predict(test_vec)
test_df['pred_lgb'] = models_ml['LightGBM'].predict(test_vec)
test_df['pred_ann'] = (model_ann.predict(test_seq) > 0.5).astype(int)
test_df['pred_lstm'] = (model_lstm.predict(test_seq) > 0.5).astype(int)
test_df['pred_cnn'] = (model_cnn.predict(test_seq) > 0.5).astype(int)

# Format predictions to 'ham' or 'spam'
for col in ['pred_rf', 'pred_xgb', 'pred_lgb', 'pred_ann', 'pred_lstm', 'pred_cnn']:
    test_df[col] = test_df[col].astype(int).map({0: 'ham', 1: 'spam'})

# Display comparison
comparison = test_df[['text', 'pred_rf', 'pred_xgb', 'pred_lgb', 'pred_ann', 'pred_lstm', 'pred_cnn']]
print(comparison)

# Optional export
comparison.to_csv('../report/final_model_comparison.csv', index=False)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 297ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step
                                                 text pred_rf pred_xgb  \
0   CREDITED: Rs.75 wallet money. Use it to order ...     ham      ham   
1   Shoe styles paired with minimal looks, perfect...     ham      ham   
2   Kejani Cleaning Services offers comprehensive,...     ham      ham   
3   Carrefour Fridays month is on!! Crazy deals ev...     ham     spam   
4   Keep up with MTN Broadband! Visit https://apps...     ham      ham   
5   Get clientele HELP Cover today. Debi check and...     ham      ham   
6   Do you like your friend's signature? Reply wit...     ham      ham   
7   Get 2.5GB + 100 Telkom Mins +2 Bob/ Min to oth...     ham      ham   
8   Enjoy more talktime when you recharge your Air...     ham      ham   
9   25% Discount - Get Ultra 50GB from My