In [47]:
# Imports
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [48]:
# Load data
# Read the spam/ham CSV from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='enron_spam_data.csv')

In [49]:
# Feature engineering
# Clean the subject once and derive every feature from that same text. Missing
# subjects become the empty string instead of being stringified by astype(str),
# and the conversion is no longer repeated for each feature.
subject_text = df['Subject'].fillna('').astype(str)

# Number of characters in the subject.
df['subject_length'] = subject_text.str.len()
# Number of whitespace-separated tokens written entirely in upper case.
df['num_uppercase_words'] = subject_text.apply(lambda x: sum(1 for w in x.split() if w.isupper()))
# Number of '!' characters.
df['num_exclamations'] = subject_text.str.count('!')
# Share of upper-case characters; the +1 in the denominator avoids a division
# by zero for empty subjects.
df['percent_uppercase'] = subject_text.apply(lambda x: sum(1 for c in x if c.isupper()) / (len(x) + 1))

# Feature matrix: same columns and order as before, but the text column is the
# cleaned one, because TfidfVectorizer refuses NaN documents.
X = df[['Subject', 'subject_length', 'num_uppercase_words', 'num_exclamations', 'percent_uppercase']].assign(Subject=subject_text)

# Target: 0 = ham, 1 = spam. Any other label would map to NaN, so fail loudly
# here rather than later inside the split or the classifier.
y = df['Spam/Ham'].map({'ham': 0, 'spam': 1})
if y.isna().any():
    raise ValueError("Column 'Spam/Ham' contains labels other than 'ham' and 'spam'")


In [50]:
# Split
# Hold out 20% of the rows for testing, keeping the class ratio of y in both
# parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [51]:
# Random forest with 100 trees and a fixed seed for repeatable training.
classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

In [52]:
# Standardising transformer with default settings (same as StandardScaler()).
scaler = StandardScaler(with_mean=True, with_std=True)

In [53]:
# Pipeline: scale numeric, vectorize text
numeric_features = [
    'subject_length',
    'num_uppercase_words',
    'num_exclamations',
    'percent_uppercase',
]

# The numeric columns go through the scaler. The 'Subject' column is selected
# by a plain string name and goes to a TF-IDF vectoriser capped at 2000 terms.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_features),
        ('text', TfidfVectorizer(max_features=2000), 'Subject'),
    ]
)

# Preprocessing followed by the classifier, fitted and applied as one estimator.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ]
)


In [54]:
# Train
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X=X_train, y=y_train)

In [55]:
# Evaluate on test set
# NOTE(review): the recorded run shows 1.00 for every metric; it may be worth
# checking for identical subjects shared by the train and test splits before
# trusting that score.
y_pred = pipeline.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3309
           1       1.00      1.00      1.00      3435

    accuracy                           1.00      6744
   macro avg       1.00      1.00      1.00      6744
weighted avg       1.00      1.00      1.00      6744



In [56]:
# Prediction function using pipeline (must be after training)
def predict_subject(subject):
    """Classify one e-mail subject with the fitted ``pipeline`` and print the result.

    Builds a single-row DataFrame with the raw subject and the four numeric
    features the pipeline was trained on, then prints the subject and the
    predicted label.

    Parameters
    ----------
    subject : str or None
        Subject line to classify. ``None``/NaN (for example a missing cell
        read from the DataFrame) is treated as an empty subject; any other
        non-string value is converted with ``str()``.

    Returns
    -------
    None
        The result is only printed ('SPAM' for a truthy prediction, 'HAM'
        otherwise).
    """
    # Coerce to text first: len(), split() and count() below all need a str,
    # and a missing subject would otherwise raise before any prediction.
    if pd.api.types.is_scalar(subject) and pd.isna(subject):
        text = ''
    else:
        text = str(subject)

    features = {
        'Subject': text,
        'subject_length': len(text),
        'num_uppercase_words': sum(1 for w in text.split() if w.isupper()),
        'num_exclamations': text.count('!'),
        # +1 in the denominator avoids a division by zero for empty subjects.
        'percent_uppercase': sum(1 for c in text if c.isupper()) / (len(text) + 1)
    }
    sample = pd.DataFrame([features])
    pred = pipeline.predict(sample)[0]
    print(f"\n📩 Subject: {text}")
    print(f"🤖 Predicted: {'SPAM' if pred else 'HAM'}")

In [57]:
# 🔍 Example: Check prediction for an email from the dataset (change index as needed)
predict_subject(df.loc[19, 'Subject'])  # Check prediction for the row labelled 19 (the 20th row when df has its default 0-based index)


📩 Subject: ua 4 - meter 1441 for 11 / 97 - falfurrias
🤖 Predicted: HAM


In [58]:
# Persist the whole fitted pipeline as well. The classifier alone cannot score
# new subjects: it expects the scaled numeric columns plus the TF-IDF features
# produced by the fitted preprocessor, and that vocabulary is saved nowhere else.
joblib.dump(pipeline, 'spam_pipeline.joblib')
# The stand-alone classifier file is still written under its original name.
joblib.dump(classifier, 'spam_classifier.joblib')

['spam_classifier.joblib']

In [59]:
# ColumnTransformer fits clones of the transformers it is given, so the
# module-level `scaler` is still unfitted after pipeline.fit(). Save the fitted
# copy held by the pipeline's preprocessor instead, under the same filename.
fitted_scaler = pipeline.named_steps['preprocessor'].named_transformers_['num']
joblib.dump(fitted_scaler, 'spam_scaler.joblib')

['spam_scaler.joblib']

In [60]:
from google.colab import files

# Ask Colab to download each saved artifact, in the same order as before.
for artifact_name in ('spam_classifier.joblib', 'spam_scaler.joblib'):
    files.download(artifact_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>