# SMS Spam Detection
This notebook includes complete code to classify SMS messages using Machine Learning and Deep Learning models.

In [None]:
# Data Loading and Preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_csv('../data/SMSSpamdCollection.csv', encoding='latin-1')[['label', 'text']]
df['label_num'] = df.label.map({'ham':0, 'spam':1})

X = df['text']
y = df['label_num']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [2]:
# Machine Learning Models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

models_ml = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

for name, model in models_ml.items():
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)
    print(f"\nResults for {name}:")
    print(classification_report(y_test, preds))


Results for Random Forest:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Results for XGBoost:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       0.98      0.80      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115

[LightGBM] [Info] Number of positive: 597, number of negative: 3860
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003191 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[L

In [3]:
# Deep Learning Models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 1000
max_len = 150
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)

# ANN Model
model_ann = Sequential([
    Embedding(max_words, 32, input_length=max_len),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_ann.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_ann.fit(X_train_seq, y_train, epochs=3, validation_split=0.1)
print("\nANN Evaluation:")
model_ann.evaluate(X_test_seq, y_test)

# LSTM Model
model_lstm = Sequential([
    Embedding(max_words, 32, input_length=max_len),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.fit(X_train_seq, y_train, epochs=3, validation_split=0.1)
print("\nLSTM Evaluation:")
model_lstm.evaluate(X_test_seq, y_test)

# CNN Model
model_cnn = Sequential([
    Embedding(max_words, 32, input_length=max_len),
    Conv1D(32, 3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(1, activation='sigmoid')
])

model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn.fit(X_train_seq, y_train, epochs=3, validation_split=0.1)
print("\nCNN Evaluation:")
model_cnn.evaluate(X_test_seq, y_test)



Epoch 1/3
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8441 - loss: 0.3537 - val_accuracy: 0.9574 - val_loss: 0.1679
Epoch 2/3
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9774 - loss: 0.1331 - val_accuracy: 0.9664 - val_loss: 0.1412
Epoch 3/3
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9879 - loss: 0.1017 - val_accuracy: 0.9686 - val_loss: 0.1453

ANN Evaluation:
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9803 - loss: 0.1199
Epoch 1/3
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 45ms/step - accuracy: 0.8674 - loss: 0.3945 - val_accuracy: 0.9641 - val_loss: 0.1218
Epoch 2/3
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 91ms/step - accuracy: 0.9830 - loss: 0.0697 - val_accuracy: 0.9731 - val_loss: 0.0975
Epoch 3/3
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

[0.06408586353063583, 0.9811659455299377]

In [4]:
# 📘 Final Testing on Professor's Dataset
import pandas as pd

# Load professor's dataset
test_df = pd.read_csv('../data/spam_texts.csv', encoding='latin-1')

# Vectorize using the same trained TF-IDF vectorizer
test_texts = test_df['text']
test_vec = vectorizer.transform(test_texts)

# Predict using trained Random Forest model
test_preds = models_ml['Random Forest'].predict(test_vec)

# Add predictions to dataframe
test_df['predicted_label'] = test_preds
test_df['predicted_label'] = test_df['predicted_label'].map({0: 'ham', 1: 'spam'})

# Show results
print(test_df[['text', 'predicted_label']])

# Optional: export results for use in your report
test_df[['text', 'predicted_label']].to_csv('../report/test_predictions.csv', index=False)


                                                 text predicted_label
0   CREDITED: Rs.75 wallet money. Use it to order ...             ham
1   Shoe styles paired with minimal looks, perfect...             ham
2   Kejani Cleaning Services offers comprehensive,...             ham
3   Carrefour Fridays month is on!! Crazy deals ev...             ham
4   Keep up with MTN Broadband! Visit https://apps...             ham
5   Get clientele HELP Cover today. Debi check and...             ham
6   Do you like your friend's signature? Reply wit...             ham
7   Get 2.5GB + 100 Telkom Mins +2 Bob/ Min to oth...             ham
8   Enjoy more talktime when you recharge your Air...             ham
9   25% Discount - Get Ultra 50GB from MyTelenor A...             ham
10  Discount of Rs 100 is now YOURS! Get 126 Chann...             ham
11  Ride & save with 25% off 5 GO or GO Awfar ride...             ham
12  Study in UK, USA, Canada, Australia, Malaysia,...            spam
13  Ramzan Mubarak! 