# Week 4 - Day 4 Assignment

Dataset: House prices or spam email classifier

Tasks:
- 5-fold or 10-fold CV for Random Forest and Logistic Regression
- Compare performance metrics
- Document insights on model stability and generalization


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Locate and load the spam dataset.
#
# Candidate locations are tried in order and the first one that exists wins.
# Relative paths are built with os.path.join so the lookup works on POSIX
# systems as well as Windows (hard-coded "\\" separators only resolve on
# Windows). Update this list if your CSV has a different name or location.
candidate_paths = [
    "email_spam.csv",  # Kaggle file name
    "spam.csv",        # fallback (used in other assignments)
    os.path.join("..", "day-3-assignment", "email_spam.csv"),
    os.path.join("..", "day-3-assignment", "spam.csv"),
    os.path.join("..", "..", "week-3", "day-2-assignment", "spam.csv"),
]

csv_path = next((p for p in candidate_paths if os.path.exists(p)), None)
if csv_path is None:
    raise FileNotFoundError(
        "Dataset not found. Place the Kaggle CSV in this folder or update candidate_paths."
    )

try:
    raw_df = pd.read_csv(csv_path)
except UnicodeDecodeError:
    # The file is not valid UTF-8 (pandas' default). latin-1 can decode any
    # byte sequence, so retry with it instead of failing on the first bad byte.
    raw_df = pd.read_csv(csv_path, encoding="latin-1")

print("Loaded:", csv_path)
print("Columns:", list(raw_df.columns))
raw_df.head()

In [None]:
# Identify the label and text columns of `raw_df`, then normalise them into
# `_df`: a two-column frame with an integer `label` (spam=1, ham=0) and a
# string `text`. Rows with a missing text or an unrecognised label are dropped.
label_candidates = ["label", "category", "type", "spam", "v1", "class"]
text_candidates = ["text", "message", "email", "v2", "content", "body"]


def _find_column(columns, candidates):
    """Return the first column matching one of *candidates*, or None.

    An exact match is preferred; otherwise names are compared
    case-insensitively with surrounding whitespace ignored, so headers such
    as "Category" or " Message " are still recognised.
    """
    exact = next((c for c in candidates if c in columns), None)
    if exact is not None:
        return exact
    normalized = {}
    for col in columns:
        # setdefault keeps the first column when two names normalise alike.
        normalized.setdefault(str(col).strip().lower(), col)
    return next((normalized[c] for c in candidates if c in normalized), None)


label_col = _find_column(raw_df.columns, label_candidates)
text_col = _find_column(raw_df.columns, text_candidates)

if label_col is None or text_col is None:
    raise ValueError(
        f"Could not infer label/text columns. Found columns: {list(raw_df.columns)}"
    )

# Clean and prepare
_df = raw_df[[label_col, text_col]].copy()
_df.columns = ["label", "text"]

# Drop missing values BEFORE casting text to str: astype(str) would turn NaN
# into the literal string "nan", which dropna can no longer detect.
_df = _df.dropna(subset=["label", "text"]).copy()
_df["text"] = _df["text"].astype(str)

# Map labels to binary (spam=1, ham=0)
label_map = {
    "spam": 1,
    "ham": 0,
    "not spam": 0,
    "non-spam": 0,
    "legit": 0,
}

if not pd.api.types.is_numeric_dtype(_df["label"]):
    # Covers object and pandas string dtypes alike. Values the map does not
    # know (e.g. the strings "0"/"1") are passed through unchanged so the
    # numeric coercion below can still parse them.
    _normalized_labels = _df["label"].astype(str).str.strip().str.lower()
    _df["label"] = _normalized_labels.map(lambda value: label_map.get(value, value))

# Coerce to numeric; anything that is neither mapped nor numeric becomes NaN
# and is dropped.
_df["label"] = pd.to_numeric(_df["label"], errors="coerce")
_df = _df.dropna(subset=["label"]).copy()
_df["label"] = _df["label"].astype(int)

if _df.empty:
    raise ValueError(
        f"No usable rows left after cleaning columns {label_col!r}/{text_col!r}; "
        "check that the label values are spam/ham style strings or numeric."
    )

print(_df["label"].value_counts())
_df.head()

In [None]:
# Stratified K-fold cross-validation: compare Logistic Regression and
# Random Forest on TF-IDF features and summarise mean/std per metric.
X, y = _df["text"], _df["label"]


def _text_pipeline(classifier):
    """Chain a fresh TF-IDF vectorizer with *classifier*."""
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    return Pipeline([("tfidf", vectorizer), ("clf", classifier)])


# Pipelines (each one owns its own vectorizer instance)
log_reg = _text_pipeline(LogisticRegression(max_iter=1000))
rf = _text_pipeline(
    RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        n_jobs=-1,
        class_weight="balanced",
    )
)

# Scorer key -> label used in the summary table, in display order.
_metric_labels = (
    ("f1", "F1"),
    ("accuracy", "Accuracy"),
    ("precision", "Precision"),
    ("recall", "Recall"),
)
scoring = {metric: metric for metric, _ in _metric_labels}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Both models are evaluated with identical folds, scorers and parallelism.
_cv_options = {"cv": skf, "scoring": scoring, "n_jobs": -1}
log_cv = cross_validate(log_reg, X, y, **_cv_options)
rf_cv = cross_validate(rf, X, y, **_cv_options)

# One summary row per model: mean and std of every test metric across folds.
_summary_rows = []
for _model_name, _fold_scores in (
    ("Logistic Regression", log_cv),
    ("Random Forest", rf_cv),
):
    _row = {"Model": _model_name}
    for _metric, _label in _metric_labels:
        _per_fold = _fold_scores[f"test_{_metric}"]
        _row[f"{_label} (mean)"] = _per_fold.mean()
        _row[f"{_label} (std)"] = _per_fold.std()
    _summary_rows.append(_row)

cv_results = pd.DataFrame(_summary_rows)

cv_results

## Insights (fill after running)

- **Model stability**: Use the CV standard deviation to assess stability; lower std indicates more consistent performance.
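- **Generalization**: The mean CV score estimates performance on unseen data, so a higher mean suggests better generalization; read it together with the std, since a high mean with large fold-to-fold variation is less trustworthy.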
- **Stratified sampling**: Stratification keeps spam/ham ratios consistent across folds, improving evaluation reliability.
