# In [None]:
# --- SPAM EMAIL DETECTION MODEL ---
# Task 4: Machine Learning Model Implementation
# Works flawlessly in VS Code with Jupyter extension

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')

print("🚀 Loading dataset...")

# Two 20 Newsgroups categories stand in for the two email classes
# (hardware posts play the role of "spam-like", baseball posts of "legit").
categories = ['comp.sys.ibm.pc.hardware', 'rec.sport.baseball']

# Strip headers, signature footers and quoted replies. Left in place, this
# newsgroup metadata lets the classifier key on sender/organisation strings
# instead of the message body, which inflates the reported scores (see the
# scikit-learn notes on the 20 Newsgroups dataset).
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=('headers', 'footers', 'quotes'),
)

X = data.data
# Integer labels index into data.target_names:
# 0 = comp (spam-like), 1 = rec (legit)
y = data.target

print(f"✅ Dataset loaded: {len(X)} emails")

# Hold out 20% for evaluation; stratify so both splits keep the same class
# balance, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("🔍 Vectorizing text features...")

# Learn the TF-IDF vocabulary from the training split only, then reuse that
# fitted vocabulary to encode the held-out split.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print("🧠 Training Logistic Regression model...")

# fit() returns the fitted estimator, so construction and training chain.
model = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_vec, y_train
)

print("📊 Evaluating model...")

# Score the model on the held-out split.
y_pred = model.predict(X_test_vec)
acc = accuracy_score(y_test, y_pred)

print(f"\n🎯 Accuracy: {acc:.4f}")
print("\n📋 Classification Report:")

# Display names are listed in label order: 0 -> 'Spam-like', 1 -> 'Legit'.
report = classification_report(
    y_test,
    y_pred,
    target_names=['Spam-like', 'Legit'],
)
print(report)

# Optional: Show top predictive words for "Spam-like"
print("\n🔍 Top 10 words predictive of 'Spam-like' class:")
feature_names = vectorizer.get_feature_names_out()

# For a binary LogisticRegression, coef_ has a single row holding the weights
# for classes_[1] (label 1, 'Legit'). Positive weights push a document toward
# 'Legit'; negative weights push it toward label 0, 'Spam-like'.
coef = model.coef_[0]

# argsort is ascending, so the first ten indices are the most negative
# coefficients, i.e. the strongest 'Spam-like' indicators, strongest first.
top_indices = np.argsort(coef)[:10]
for idx in top_indices:
    # Print the negated coefficient so a larger number means stronger
    # evidence for 'Spam-like'.
    print(f"  {feature_names[idx]}: {-coef[idx]:.3f}")

print("\n✅ Model implementation complete!")