In [18]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [19]:
# === Step 1: Generate Synthetic Balanced Dataset ===
# Ten hand-written subject lines per class; the dataset cell repeats them to build the frame.

# Promotional / phishing-style subjects (labelled "spam").
spam_subjects = [
    "Congratulations, you've won a prize!",
    "Limited time offer, claim now!",
    "Get cash instantly $$$",
    "Earn money fast!!!",
    "FREE subscription trial",
    "Click here to claim your reward",
    "URGENT: Update your account now",
    "You are selected to win iPhone",
    "This is your last chance!!!",
    "WIN a trip to Bahamas today",
]

# Everyday workplace / personal subjects (labelled "ham").
ham_subjects = [
    "Meeting rescheduled for tomorrow",
    "Lunch at 1 PM?",
    "Reminder: Project deadline approaching",
    "Let's catch up soon",
    "Monthly report attached",
    "Your invoice is ready",
    "Update on your leave request",
    "Team call at 3 PM",
    "Notes from today’s session",
    "Can we reschedule the appointment?",
]

In [20]:
# Create the balanced dataset: every subject line is repeated REPEATS_PER_SUBJECT times.
# With the 10 spam + 10 ham subjects defined in the notebook this yields 100 rows (50/50).
# NOTE: the repetition means the frame contains exact duplicate rows, so any row-level
# train/test split will see the same subject on both sides.
REPEATS_PER_SUBJECT = 5

_spam_rows = spam_subjects * REPEATS_PER_SUBJECT
_ham_rows = ham_subjects * REPEATS_PER_SUBJECT

df = pd.DataFrame({
    "Subject": _spam_rows + _ham_rows,
    # Label counts are derived from the rows themselves so they cannot drift out of
    # sync with the subject lists (previously hard-coded as 50 / 50).
    "Spam/Ham": ["spam"] * len(_spam_rows) + ["ham"] * len(_ham_rows),
})


In [21]:
# === Step 2: Feature Engineering ===
# Lower-case tokens/phrases whose presence in a subject is treated as a spam signal.
spammy_keywords = ['free', '$$$', 'win', 'cash', 'offer', 'congratulations', 'urgent', 'limited time', 'earn', 'reward']

def add_features(df):
    """Return a copy of ``df`` with hand-crafted features derived from ``Subject``.

    The input frame is left untouched (the previous implementation wrote the new
    columns, and the string-coerced ``Subject``, into the caller's frame).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Subject`` column; values are coerced with ``astype(str)``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Subject`` coerced to str plus these columns:
        ``subject_length``       number of characters in the subject
        ``num_uppercase_words``  whitespace-separated tokens for which ``str.isupper()`` is true
        ``num_exclamations``     number of ``'!'`` characters
        ``percent_uppercase``    uppercase characters / (length + 1); the +1 avoids
                                 division by zero on an empty subject
        ``has_spammy_words``     1 if any entry of ``spammy_keywords`` occurs in the
                                 lower-cased subject, else 0

    Notes
    -----
    Keyword matching is a plain substring test, so e.g. ``'earn'`` also matches
    "learn" and ``'win'`` matches "following".
    """
    subjects = df['Subject'].astype(str)
    # assign() builds a new frame, so re-running on an already-featured frame simply
    # recomputes the same columns without touching the caller's object.
    return df.assign(
        Subject=subjects,
        subject_length=subjects.str.len(),
        num_uppercase_words=subjects.apply(
            lambda text: sum(1 for token in text.split() if token.isupper())
        ),
        num_exclamations=subjects.str.count('!'),
        percent_uppercase=subjects.apply(
            lambda text: sum(1 for ch in text if ch.isupper()) / (len(text) + 1)
        ),
        has_spammy_words=subjects.apply(
            lambda text: int(any(keyword in text.lower() for keyword in spammy_keywords))
        ),
    )

# Rebind df to the frame returned by add_features (Subject plus the engineered columns).
df = add_features(df)


In [22]:
# === Step 3: Prepare Train/Test Sets ===
X = df[['Subject', 'subject_length', 'num_uppercase_words', 'num_exclamations', 'percent_uppercase', 'has_spammy_words']]
y = df['Spam/Ham'].map({'ham': 0, 'spam': 1})

# Series.map() silently produces NaN for labels missing from the mapping; fail loudly
# instead of handing NaN targets to the classifier.
if y.isna().any():
    _unknown = sorted(df.loc[y.isna(), 'Spam/Ham'].astype(str).unique())
    raise ValueError(f"Unexpected labels in 'Spam/Ham': {_unknown}")

# The dataset repeats every subject several times. A row-level split would place copies
# of the same subject in both train and test, so the evaluation would only measure
# memorisation. Split on *distinct* subjects instead (still stratified by class, same
# test fraction and seed), then route every row to the side its subject landed on.
# Rows keep their original order within each partition.
_subject_labels = y.groupby(X['Subject']).first()
_train_subjects, _test_subjects = train_test_split(
    _subject_labels.index.to_numpy(),
    stratify=_subject_labels.to_numpy(),
    test_size=0.2,
    random_state=42,
)

_is_test = X['Subject'].isin(_test_subjects)
X_train, X_test = X[~_is_test], X[_is_test]
y_train, y_test = y[~_is_test], y[_is_test]


In [23]:
# Classifier used as the final pipeline step; seed fixed for reproducibility.
regression = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

In [24]:
# Scaler for the numeric feature columns; it is handed to the ColumnTransformer in Step 4.
# NOTE(review): ColumnTransformer is documented to fit clones of its transformers, so this
# instance presumably stays unfitted after pipeline.fit — confirm before persisting it.
scaler = StandardScaler()

In [25]:
# === Step 4: Build Pipeline ===
# Columns routed to the scaler; the raw 'Subject' text goes to TF-IDF instead.
numeric_features = [
    'subject_length',
    'num_uppercase_words',
    'num_exclamations',
    'percent_uppercase',
    'has_spammy_words',
]

# Two branches: scaled numeric features and a TF-IDF bag of words capped at 2000 terms.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_features),
        ('text', TfidfVectorizer(max_features=2000), 'Subject'),
    ]
)

# Preprocessing followed by the classifier, fitted and applied as one estimator.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regression', regression),
    ]
)

In [26]:
# === Step 5: Train Model ===
# Fits the preprocessing and classifier steps on the training split only.
pipeline.fit(X_train, y_train)


In [27]:
# === Step 6: Evaluate ===
# Score the held-out split and print per-class precision / recall / F1.
y_pred = pipeline.predict(X_test)
print("\n📊 Classification Report:\n")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=['Ham', 'Spam'],
    )
)



📊 Classification Report:

              precision    recall  f1-score   support

         Ham       1.00      1.00      1.00        10
        Spam       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



In [28]:
# === Step 7: Universal Subject Prediction ===
def predict_subject(subject):
    """Classify a single subject line with the trained pipeline and print the verdict.

    Features are produced by ``add_features`` — the same function used to build the
    training frame — rather than a hand-maintained copy of its formulas, so inference
    cannot drift from training. As in training, the subject is coerced to ``str``,
    so non-string input no longer raises.

    Parameters
    ----------
    subject : str
        Subject line to classify.

    Returns
    -------
    None
        The subject and its SPAM/HAM prediction are printed.
    """
    sample = add_features(pd.DataFrame({'Subject': [subject]}))
    pred = pipeline.predict(sample)[0]  # 1 -> spam, 0 -> ham (see label mapping in Step 3)
    print(f"\n📩 Subject: {subject}")
    print(f"🤖 Prediction: {'SPAM' if pred else 'HAM'}")


In [29]:
# === Step 8: Universal Test Inputs ===
# Smoke-test the classifier on a mix of spam-looking and ordinary subjects.
for example_subject in (
    "FREE cash offer!!!",
    "Time Table For Exam",
    "$$$ Earn money easily",
    "Team meeting today",
    "URGENT: Verify your ID",
    "Reminder: fee payment due",
    "$$$",
):
    predict_subject(example_subject)


📩 Subject: FREE cash offer!!!
🤖 Prediction: SPAM

📩 Subject: Time Table For Exam
🤖 Prediction: HAM

📩 Subject: $$$ Earn money easily
🤖 Prediction: SPAM

📩 Subject: Team meeting today
🤖 Prediction: HAM

📩 Subject: URGENT: Verify your ID
🤖 Prediction: SPAM

📩 Subject: Reminder: fee payment due
🤖 Prediction: HAM

📩 Subject: $$$
🤖 Prediction: SPAM


In [30]:
# Optional: User input loop
# while True:
#     text = input("Enter subject (or 'exit'): ")
#     if text.lower() == 'exit':
#         break
#     predict_subject(text)


In [31]:
import joblib

# Persist the fitted pipeline (preprocessing + classifier) as a single artifact.
joblib.dump(value=pipeline, filename='spam_classifier_model.pkl')
print("✅ Model saved to 'spam_classifier_model.pkl'")


✅ Model saved to 'spam_classifier_model.pkl'


In [33]:
# Persist the *fitted* scaler. ColumnTransformer fits clones of the transformers it is
# given, so the module-level `scaler` object is never fitted; the fitted instance lives
# inside the trained pipeline as the 'num' transformer of the 'preprocessor' step.
fitted_scaler = pipeline.named_steps['preprocessor'].named_transformers_['num']
joblib.dump(fitted_scaler, 'spam_scaler_classifier.pkl')
print("✅ Model saved to 'spam_scaler_classifier.pkl'")

✅ Model saved to 'spam_scaler_classifier.pkl'


In [34]:
# Offer the saved artifacts as browser downloads when running on Google Colab.
# The google.colab package only exists inside Colab, so guard the import: on any other
# kernel the files are already in the working directory and the cell must not fail.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('spam_classifier_model.pkl')
    files.download('spam_scaler_classifier.pkl')
else:
    print("google.colab not available; artifacts remain in the current working directory.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>