# Week 4 - Day 3 Assignment

Dataset: Spam email classifier (Kaggle)

Tasks:
- Train Random Forest model
- Compare F1 score with a baseline model
- Visualize feature importance
- Optional: Train XGBoost and compare results


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Locate and load the Kaggle CSV.
# Candidate file names are checked in order; edit this list if your download
# uses a different name.
candidate_paths = [
    "email_spam.csv",  # common Kaggle file name
    "spam.csv",        # fallback name used in other assignments
]

# Use the first candidate that exists in the working directory.
csv_path = next((p for p in candidate_paths if os.path.exists(p)), None)
if csv_path is None:
    raise FileNotFoundError(
        "Dataset not found. Place the Kaggle CSV in this folder and update candidate_paths."
    )

# Some Kaggle spam CSVs (notably the v1/v2 "spam.csv") are not valid UTF-8, so
# retry with latin-1 (which can decode any byte) instead of failing outright.
try:
    raw_df = pd.read_csv(csv_path)
except UnicodeDecodeError:
    raw_df = pd.read_csv(csv_path, encoding="latin-1")

print("Loaded:", csv_path)
print("Columns:", list(raw_df.columns))
raw_df.head()

In [None]:
# Identify label and text columns (handles common Kaggle variants)
label_candidates = ["label", "category", "type", "spam", "v1", "class"]
text_candidates = ["text", "message", "email", "v2", "content", "body"]

# Lower-cased, stripped header -> original header, used as a fallback so that
# headers such as "Category" / "Message" are recognised as well.
_normalized_columns = {}
for _col in raw_df.columns:
    _normalized_columns.setdefault(str(_col).strip().lower(), _col)


def _pick_column(candidates):
    """Return the raw_df column matching one of *candidates*, or None.

    Exact names are tried first (in candidate order); only if none matches is
    a case/whitespace-insensitive match attempted.
    """
    for name in candidates:
        if name in raw_df.columns:
            return name
    for name in candidates:
        if name in _normalized_columns:
            return _normalized_columns[name]
    return None


label_col = _pick_column(label_candidates)
text_col = _pick_column(text_candidates)

if label_col is None or text_col is None:
    raise ValueError(
        f"Could not infer label/text columns. Found columns: {list(raw_df.columns)}"
    )

# Clean and prepare.
# Missing values are dropped *before* the text is cast to str; casting first
# would turn NaN into the literal string "nan" and keep those rows.
_df = raw_df[[label_col, text_col]].dropna().copy()
_df.columns = ["label", "text"]
_df["text"] = _df["text"].astype(str)

# Map labels to binary (spam=1, ham=0)
label_map = {
    "spam": 1,
    "ham": 0,
    "not spam": 0,
    "non-spam": 0,
    "legit": 0,
}

# Translate known textual labels regardless of the exact string dtype pandas
# chose. Unmapped values (e.g. "1"/"0" stored as text) are kept as-is so that
# pd.to_numeric below can still parse them.
if not pd.api.types.is_numeric_dtype(_df["label"]):
    _label_text = _df["label"].astype(str).str.strip().str.lower()
    _df["label"] = [label_map.get(value, value) for value in _label_text]

# Coerce to numeric; anything unrecognised becomes NaN and is dropped.
_df["label"] = pd.to_numeric(_df["label"], errors="coerce")
_df = _df.dropna(subset=["label"]).copy()
_df["label"] = _df["label"].astype(int)

# Fail here with a clear message rather than later inside the train/test split.
if _df.empty:
    raise ValueError(
        f"No rows with a recognisable spam/ham label remain after cleaning column {label_col!r}."
    )

print(_df["label"].value_counts())
_df.head()

In [None]:
# Hold out 20% of the rows for evaluation, stratified on the label so both
# splits keep the same class proportions.
texts = _df["text"]
labels = _df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Baseline model: TF-IDF features fed into Logistic Regression.
baseline_steps = [
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000)),
    ("clf", LogisticRegression(max_iter=1000)),
]
baseline = Pipeline(baseline_steps)
baseline.fit(X_train, y_train)

# Score the baseline on the held-out split.
baseline_preds = baseline.predict(X_test)
baseline_f1 = f1_score(y_test, baseline_preds)
baseline_acc = accuracy_score(y_test, baseline_preds)

print(f"Baseline (LogReg) F1: {baseline_f1:.4f}")
print(f"Baseline (LogReg) Acc: {baseline_acc:.4f}")

In [None]:
# Random Forest model: TF-IDF features (English stop words removed, at most
# 5000 terms) fed into a 200-tree forest with balanced class weights.
forest = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,  # use all available cores
    class_weight="balanced",
)
rf_steps = [
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000)),
    ("clf", forest),
]
rf = Pipeline(rf_steps)
rf.fit(X_train, y_train)

# Score the forest on the held-out split.
rf_preds = rf.predict(X_test)
rf_f1 = f1_score(y_test, rf_preds)
rf_acc = accuracy_score(y_test, rf_preds)

print(f"Random Forest F1: {rf_f1:.4f}")
print(f"Random Forest Acc: {rf_acc:.4f}")

In [None]:
# Side-by-side table of the test-set metrics, one row per model.
comparison = pd.DataFrame(
    [
        ("Logistic Regression", baseline_f1, baseline_acc),
        ("Random Forest", rf_f1, rf_acc),
    ],
    columns=["Model", "F1", "Accuracy"],
)
comparison

In [None]:
# Feature importance from Random Forest.
# Pull the fitted vectorizer and classifier back out of the pipeline.
vectorizer = rf.named_steps["tfidf"]
rf_model = rf.named_steps["clf"]

feature_names = vectorizer.get_feature_names_out()
importances = rf_model.feature_importances_

# argsort is ascending, so the last 20 positions hold the 20 largest
# importances; barh then draws the largest bar at the top.
indices = np.argsort(importances)[-20:]
positions = range(len(indices))

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(positions, importances[indices], color="teal")
ax.set_yticks(positions)
ax.set_yticklabels(feature_names[indices])
ax.set_title("Top 20 Feature Importances (Random Forest)")
ax.set_xlabel("Importance")
fig.tight_layout()
plt.show()

In [None]:
# Optional: XGBoost model.
# The whole cell is best-effort: any failure inside the try (import or
# training) is printed instead of raised.
try:
    from xgboost import XGBClassifier

    xgb_steps = [
        ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000)),
        (
            "clf",
            XGBClassifier(
                n_estimators=300,
                learning_rate=0.1,
                max_depth=6,
                subsample=0.9,
                colsample_bytree=0.9,
                random_state=42,
                eval_metric="logloss",
            ),
        ),
    ]
    xgb = Pipeline(xgb_steps)
    xgb.fit(X_train, y_train)

    # Score the booster on the held-out split.
    xgb_preds = xgb.predict(X_test)
    xgb_f1 = f1_score(y_test, xgb_preds)
    xgb_acc = accuracy_score(y_test, xgb_preds)

    print(f"XGBoost F1: {xgb_f1:.4f}")
    print(f"XGBoost Acc: {xgb_acc:.4f}")
except Exception as e:
    print("XGBoost not run:", e)