# Week 4 Day 2 Assignment

Datasets:
- Ames Housing Dataset (regression)
- Spam Assassin Email Classification Dataset (classification)

Tasks:
- Scale numerical features
- Retrain models
- Compare performance before vs after scaling

In [None]:
# If needed (run once):
# %pip install kagglehub pandas numpy scikit-learn

import os
import glob
import pandas as pd
import numpy as np
from IPython.display import display

In [None]:
# --- Fetch the Ames Housing dataset from Kaggle and read it into `ames_df` ---
import kagglehub

AMES_HANDLE = "prevek18/ames-housing-dataset"

try:
    # The download result is used as a directory; CSVs are searched for at any depth.
    ames_path = kagglehub.dataset_download(AMES_HANDLE)
    ames_pattern = os.path.join(ames_path, "**", "*.csv")
    csv_files = sorted(glob.glob(ames_pattern, recursive=True))
    if len(csv_files) == 0:
        raise FileNotFoundError("No CSV files found in Ames dataset.")

    print("Ames dataset files:")
    for csv_name in map(os.path.basename, csv_files):
        print("-", csv_name)

    # The alphabetically first CSV is the one that gets loaded.
    ames_csv = csv_files[0]
    ames_df = pd.read_csv(ames_csv)
    print("Selected file:", os.path.basename(ames_csv))
    print("Ames dataset loaded:", ames_df.shape)
    display(ames_df.head())
except Exception as err:
    # Best-effort cell: report the failure instead of aborting the notebook run.
    print("Ames dataset download/load failed.")
    print("Make sure Kaggle API credentials are configured.")
    print("Error:", err)

In [None]:
# Regression: compare without vs with scaling
#
# Both models share the same median imputation step so that the only
# difference between them is the StandardScaler. LinearRegression rejects NaN
# inputs, so an un-imputed baseline would fail on any missing numeric value.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

if 'ames_df' in globals():
    # Detect target column
    target_candidates = ["SalePrice", "saleprice", "Price", "price"]
    target_col = next((c for c in target_candidates if c in ames_df.columns), None)
    if target_col is None:
        raise ValueError("Could not find target column for price.")

    # Numeric predictors only; the target is dropped so it cannot leak into X.
    X = ames_df.select_dtypes(include='number').drop(columns=[target_col], errors='ignore')
    y = ames_df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # No scaling (imputation only). `lr` is fitted in place as the final
    # pipeline step, so it is still available as a fitted estimator afterwards.
    lr = LinearRegression()
    unscaled_model = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("model", lr),
    ])
    unscaled_model.fit(X_train, y_train)
    preds = unscaled_model.predict(X_test)
    print("No scaling - MAE:", mean_absolute_error(y_test, preds))
    print("No scaling - R2:", r2_score(y_test, preds))

    # With scaling + imputation
    scaled_model = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("model", LinearRegression()),
    ])
    scaled_model.fit(X_train, y_train)
    scaled_preds = scaled_model.predict(X_test)
    print("With scaling - MAE:", mean_absolute_error(y_test, scaled_preds))
    print("With scaling - R2:", r2_score(y_test, scaled_preds))
else:
    # Make the skipped comparison visible instead of silently printing nothing.
    print("Ames dataset is not loaded; run the Ames download cell first.")

In [None]:
# --- Fetch the Spam Assassin dataset from Kaggle and read it into `spam_df` ---
# Uses the `kagglehub` module imported by the Ames download cell.
SPAM_HANDLE = "ganiyuolalekan/spam-assassin-email-classification-dataset"

try:
    # The download result is used as a directory; CSVs are searched for at any depth.
    spam_path = kagglehub.dataset_download(SPAM_HANDLE)
    spam_pattern = os.path.join(spam_path, "**", "*.csv")
    spam_files = sorted(glob.glob(spam_pattern, recursive=True))
    if len(spam_files) == 0:
        raise FileNotFoundError("No CSV files found in spam dataset.")

    print("Spam dataset files:")
    for csv_name in map(os.path.basename, spam_files):
        print("-", csv_name)

    # The alphabetically first CSV is the one that gets loaded.
    spam_csv = spam_files[0]
    spam_df = pd.read_csv(spam_csv)
    print("Selected file:", os.path.basename(spam_csv))
    print("Spam dataset loaded:", spam_df.shape)
    display(spam_df.head())
except Exception as err:
    # Best-effort cell: report the failure instead of aborting the notebook run.
    print("Spam dataset download/load failed.")
    print("Make sure Kaggle API credentials are configured.")
    print("Error:", err)

In [None]:
# Classification: compare without vs with scaling
#
# Both models share the same median imputation step so that the only
# difference between them is the StandardScaler. If the dataset has no numeric
# feature columns (e.g. it only holds raw email text plus a label), word-count
# features are derived from the text so there is something to scale.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

# Vocabulary cap for the text fallback; keeps the dense count matrix small.
MAX_TEXT_FEATURES = 1000

if 'spam_df' in globals():
    # Detect label column
    label_candidates = ["label", "class", "spam", "target", "Category", "Prediction"]
    label_col = next((c for c in label_candidates if c in spam_df.columns), None)
    if label_col is None:
        # No known label name: fall back to the last column.
        label_col = spam_df.columns[-1]

    features = spam_df.drop(columns=[label_col])
    y = spam_df[label_col].astype(str)

    # Use numeric features only (word counts) for scaling comparison
    X = features.select_dtypes(include='number')

    # Fallback: without numeric columns the models would be fitted on an empty
    # feature matrix and fail, so join each row's text into a single document.
    use_text_counts = X.shape[1] == 0
    if use_text_counts:
        if features.shape[1] == 0:
            raise ValueError("Spam dataset has no feature columns besides the label.")
        text_frame = features.astype(object).where(features.notna(), "").astype(str)
        X = text_frame.apply(" ".join, axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    if use_text_counts:
        # Learn the vocabulary from the training split only so the test split
        # does not influence the features; densify because StandardScaler's
        # default mean-centering does not accept sparse input.
        vectorizer = CountVectorizer(max_features=MAX_TEXT_FEATURES)
        X_train = vectorizer.fit_transform(X_train).toarray()
        X_test = vectorizer.transform(X_test).toarray()

    # No scaling (imputation only). `clf` is fitted in place as the final
    # pipeline step, so it is still available as a fitted estimator afterwards.
    clf = LogisticRegression(max_iter=1000)
    unscaled_clf = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("model", clf),
    ])
    unscaled_clf.fit(X_train, y_train)
    preds = unscaled_clf.predict(X_test)
    print("No scaling - Accuracy:", accuracy_score(y_test, preds))

    # With scaling + imputation
    scaled_clf = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=1000)),
    ])
    scaled_clf.fit(X_train, y_train)
    scaled_preds = scaled_clf.predict(X_test)
    print("With scaling - Accuracy:", accuracy_score(y_test, scaled_preds))
else:
    # Make the skipped comparison visible instead of silently printing nothing.
    print("Spam dataset is not loaded; run the spam download cell first.")