# In [4]:
# SPAM SMS DETECTION PROJECT

# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Download NLTK Data
# 'stopwords' feeds the stop-word filter; 'punkt' / 'punkt_tab' hold the
# tokenizer models behind nltk.word_tokenize(). Recent NLTK releases load
# 'punkt_tab' while older ones load 'punkt', so both are requested to keep
# tokenization working on either. nltk.download() reports a failed download
# through its return value rather than raising, so an extra request is harmless.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Start-of-run banner. The phone emoji is stored as a real Unicode character;
# writing its UTF-8 bytes as Windows-1252 text prints garbage instead.
print("=" * 70)
print("📱 SPAM SMS DETECTION USING MACHINE LEARNING")
print("=" * 70)

# 1. Load Dataset
# Only two columns of the CSV are kept: 'v1' (renamed to 'label') and
# 'v2' (renamed to 'message'). Everything else in the file is discarded.
path = "/kaggle/input/sms-spam-collection-dataset/spam.csv"
data = (
    pd.read_csv(path, encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Quick sanity check of what was loaded.
print("\nDataset Loaded Successfully!")
print(f"Shape: {data.shape}")
print(data.head())

# 2. Data Cleaning
# Drop fully duplicated rows in place, then add a numeric target column:
# 'ham' -> 0 and 'spam' -> 1 (any other label value would map to NaN).
data.drop_duplicates(inplace=True)
label_to_num = {'ham': 0, 'spam': 1}
data['label_num'] = data['label'].map(label_to_num)

# Show how many messages fall into each class.
print("\nLabel Distribution:")
print(data['label'].value_counts())

# 3. Text Preprocessing
# Module-level NLP resources used by clean_text(): NLTK's English stop-word
# list held as a set (membership is tested once per token) and a Porter stemmer.
stop_words = {*stopwords.words('english')}
stemmer = PorterStemmer()

# Built once at import time so clean_text() does not rebuild them per message.
_URL_PATTERN = re.compile(r'http\S+|www\S+')  # 'http\S+' already matches https URLs
_DIGITS_PATTERN = re.compile(r'\d+')
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one SMS message into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, remove URLs, remove digit runs, strip ASCII
    punctuation, tokenize with NLTK, discard stop words and tokens of one or
    two characters, and Porter-stem what is left.

    Args:
        text: The raw message. Missing values (None / NaN) are treated as an
            empty message; any other non-string value is converted with str().

    Returns:
        The cleaned message, or '' when nothing survives the filtering.

    Note:
        Punctuation is stripped before the stop-word check, so a contraction
        such as "don't" reaches the check as "dont" and is only dropped if
        that apostrophe-free form is itself in ``stop_words``.
    """
    if not isinstance(text, str):
        # pandas represents an empty cell as NaN (a float); calling .lower()
        # on it would raise AttributeError.
        if pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    text = _DIGITS_PATTERN.sub('', text)
    text = text.translate(_STRIP_PUNCTUATION)
    if not text.strip():
        # Nothing left to tokenize; same result the tokenizer path would give.
        return ''

    tokens = nltk.word_tokenize(text)
    kept = [
        stemmer.stem(token)
        for token in tokens
        if token not in stop_words and len(token) > 2
    ]
    return ' '.join(kept)

print("\nCleaning messages...")
# Run clean_text() on every raw message; the result goes into its own column
# so the original text stays available.
data['cleaned_message'] = data['message'].apply(clean_text)

# The check mark is stored as a real Unicode character; writing its UTF-8
# bytes as Windows-1252 text prints garbage instead.
print("✅ Text Cleaning Done!")

# 4-5. Train-Test Split, then TF-IDF Vectorization
# The split is done on the cleaned text BEFORE the vectorizer is fitted, so
# the vocabulary and IDF weights are learned from the training messages only.
# Fitting TF-IDF on the full corpus first would leak test-set statistics into
# the features and make the reported scores optimistic.
y = data['label_num']
msgs_train, msgs_test, y_train, y_test = train_test_split(
    data['cleaned_message'], y, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=3000)
X_train = vectorizer.fit_transform(msgs_train)  # learn vocabulary + IDF here only
X_test = vectorizer.transform(msgs_test)        # reuse the training vocabulary

# Full feature matrix under the training vocabulary; kept so `X` is still
# defined for any code that expects it.
X = vectorizer.transform(data['cleaned_message'])

print(f"TF-IDF Shape: {X.shape}")
print("\nData Split Complete!")
print(f"Train: {X_train.shape}, Test: {X_test.shape}")

# 6. Model Training
# Multinomial Naive Bayes fitted on the training features and labels.
model = MultinomialNB()
model.fit(X_train, y_train)

# 7. Predictions
# Score the held-out split: overall accuracy plus a per-class report.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# The chart emoji is stored as a real Unicode character; writing its UTF-8
# bytes as Windows-1252 text prints garbage instead.
print("\n📊 Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))

# 8. Confusion Matrix
# Draw the test-set confusion matrix as an annotated heatmap (y axis: true
# label, x axis: predicted label), save it as a PNG, then display it.
class_names = ['Ham', 'Spam']
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
fig.tight_layout()
fig.savefig("confusion_matrix.png", dpi=300)
plt.show()

# 9. Test on Custom Input
# Two hand-written messages, classified with the trained model.
sample_msgs = [
    "Congratulations! You won a free iPhone. Click here to claim!",
    "Hey, are we still meeting for lunch today?"
]
# New text goes through the same clean_text() step and the already-fitted
# vectorizer (transform, not fit_transform) so its features line up with the
# ones the model was trained on.
sample_cleaned = [clean_text(msg) for msg in sample_msgs]
sample_features = vectorizer.transform(sample_cleaned)
sample_preds = model.predict(sample_features)

for msg, pred in zip(sample_msgs, sample_preds):
    print(f"\nMessage: {msg}")
    # Label 1 means spam, 0 means ham. The emoji are stored as real Unicode
    # characters; writing their UTF-8 bytes as Windows-1252 text prints garbage.
    print("Prediction:", "🚨 SPAM" if pred == 1 else "✅ HAM")

# End-of-run summary. The party-popper emoji is stored as a real Unicode
# character; writing its UTF-8 bytes as Windows-1252 text prints garbage.
print("\n🎉 Training Complete! Model Ready for Use.")
print("Output File: confusion_matrix.png")
print("=" * 70)


# KeyboardInterrupt:  (leftover output, apparently from an interrupted run; not part of the script)