
# The End-to-End Data Science Workflow — Notebook 2/4
## Feature Engineering & Preprocessing Pipeline 🛠️

**Goal.** Engineer features, handle heterogeneous data, and build a leak-proof preprocessing pipeline using `scikit-learn` primitives.

**Libraries used (with roles):**
- `numpy`, `pandas`: data wrangling; `numpy` also seeds the global random state so runs are reproducible.
- `scikit-learn`: `Pipeline`, `ColumnTransformer`, `SimpleImputer`, `StandardScaler`,
  `OneHotEncoder`, `OrdinalEncoder`, `train_test_split`.
- `joblib`: persist fitted preprocessors/artifacts.
- `matplotlib`: quick diagnostic plots.


In [1]:

# ====== Imports, Paths, Seed ======
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.utils.validation import check_is_fitted

# Reproducibility: one seed constant, applied to NumPy's global RNG and reused
# by the train/test split further down.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Input data and output artifacts live in sibling folders of the working directory.
DATA_DIR = Path("data")
ARTIFACTS_DIR = Path("artifacts")
DATA_DIR.mkdir(exist_ok=True)
ARTIFACTS_DIR.mkdir(exist_ok=True)

# Files read / written by this notebook.
CLEAN_DATA_PATH = DATA_DIR.joinpath("loan_default_clean_base.csv")
PREPROCESSOR_PATH = ARTIFACTS_DIR.joinpath("preprocessor.joblib")
FEATURE_NAMES_PATH = ARTIFACTS_DIR.joinpath("feature_names.csv")

# Default figure size for any plots.
plt.rcParams["figure.figsize"] = (7, 4.5)


In [2]:

# ====== Load Data & Split BEFORE Any Fitting ======
# Fail fast with an actionable message when the base dataset is missing.
if not CLEAN_DATA_PATH.exists():
    raise FileNotFoundError("Expected base dataset not found. Run Notebook 1 first (or ensure the CSV exists).")
df = pd.read_csv(CLEAN_DATA_PATH, parse_dates=["application_date"])

# Separate predictors from the binary target.
X = df.drop(columns=["default"])
y = df["default"].astype(int)

# CRITICAL: Split first to avoid leakage. Stratifying on y keeps the positive
# rate the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_SEED,
)
print("Train:", X_train.shape, " Test:", X_test.shape, " Pos rate train:", y_train.mean().round(3))


Train: (4800, 15)  Test: (1200, 15)  Pos rate train: 0.103


In [3]:

# ====== Feature Engineering Transformer ======
# BEST PRACTICE: Place all transformations inside sklearn-compatible components.
class FeatureGenerator(BaseEstimator, TransformerMixin):
    """Derive loan features and bucket rare categorical levels.

    The transformer is stateful: ``fit`` learns, per categorical column, which
    levels occur with a relative frequency below ``rare_cutoff``; ``transform``
    replaces those levels with ``"__RARE__"`` and adds the engineered columns
    ``dti``, ``app_month``, ``app_dayofweek``, ``app_year``,
    ``rate_term_interaction`` and ``amount_income_ratio``.

    Parameters
    ----------
    clip_dti : float, default=10.0
        Upper bound applied to the debt-to-income ratio (lower bound is 0).
    rare_cutoff : float, default=0.01
        Levels whose share of rows in the fitted data is strictly below this
        value are treated as rare.
    cat_cols : sequence of str or None, default=None
        Columns scanned for rare levels. ``None`` uses ``DEFAULT_CAT_COLS``.

    Attributes
    ----------
    rare_maps_ : dict[str, set]
        Set by ``fit``. Maps each column that has at least one rare level to
        the set of those levels. Columns without rare levels are absent.
    """

    # Columns scanned for rare levels when ``cat_cols`` is not supplied.
    DEFAULT_CAT_COLS = ("state", "home_ownership", "purpose", "applicant_gender")

    def __init__(self, clip_dti=10.0, rare_cutoff=0.01, cat_cols=None):
        # sklearn convention: __init__ only stores hyper-parameters verbatim.
        # Learned state (``rare_maps_``) is created exclusively in fit(), so an
        # unfitted instance is detectable by check_is_fitted.
        self.clip_dti = clip_dti
        self.rare_cutoff = rare_cutoff
        self.cat_cols = cat_cols

    def fit(self, X, y=None):
        """Learn the rare levels of each categorical column from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column listed in ``cat_cols`` (or the defaults).
        y : ignored
            Present for sklearn API compatibility.

        Returns
        -------
        self
        """
        cat_cols = self.DEFAULT_CAT_COLS if self.cat_cols is None else self.cat_cols
        rare_maps = {}
        for c in cat_cols:
            # dropna=False keeps NaN in the frequency table.
            # NOTE(review): NaN can therefore itself be flagged as rare and would
            # then be bucketed in transform() rather than imputed downstream —
            # confirm this is intended.
            vc = X[c].value_counts(normalize=True, dropna=False)
            rare = set(vc[vc < self.rare_cutoff].index.tolist())
            if rare:
                rare_maps[c] = rare
        self.rare_maps_ = rare_maps
        return self

    def transform(self, X):
        """Return a copy of ``X`` with engineered columns and rare levels bucketed.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``total_debt``, ``income``, ``application_date``
            (datetime-like, accessed through ``.dt``), ``interest_rate``,
            ``term_months``, ``loan_amount`` and every column in ``rare_maps_``.

        Returns
        -------
        pandas.DataFrame
            The input is not modified.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        """
        check_is_fitted(self, "rare_maps_")
        X_ = X.copy()

        # Debt-to-income: division by zero yields +/-inf, which is turned into
        # NaN; the ratio is then clipped to [0, clip_dti].
        X_["dti"] = (X_["total_debt"] / X_["income"]).replace([np.inf, -np.inf], np.nan)
        X_["dti"] = X_["dti"].clip(0, self.clip_dti)

        # Date-derived
        X_["app_month"] = X_["application_date"].dt.month
        X_["app_dayofweek"] = X_["application_date"].dt.dayofweek
        X_["app_year"] = X_["application_date"].dt.year

        # Interaction terms; the income denominator is floored at 1.0 so the
        # ratio never divides by zero or by a value below one.
        X_["rate_term_interaction"] = X_["interest_rate"] * X_["term_months"]
        X_["amount_income_ratio"] = X_["loan_amount"] / np.maximum(X_["income"], 1.0)

        # Rare-category bucketing to reduce dimensionality. Going through
        # object dtype first allows assigning a label that is not an existing level.
        for c, rare_set in self.rare_maps_.items():
            X_[c] = X_[c].astype("object")
            X_.loc[X_[c].isin(rare_set), c] = "__RARE__"
            X_[c] = X_[c].astype("category")
        return X_


In [4]:

# ====== Column Definitions ======
# Education levels from lowest to highest; this order defines the ordinal encoding.
ordinal_edu = [
    "High School",
    "Bachelor",
    "Master",
    "PhD",
]

# Numeric inputs: raw columns first, then the columns added by FeatureGenerator.
numeric_features = [
    "age",
    "income",
    "loan_amount",
    "term_months",
    "interest_rate",
    "credit_score",
    "employment_years",
    "total_debt",
] + [
    "dti",
    "app_month",
    "app_dayofweek",
    "app_year",
    "rate_term_interaction",
    "amount_income_ratio",
]

# Unordered categoricals (one-hot encoded) and ordered categoricals (ordinal encoded).
categorical_nominal = [
    "state",
    "home_ownership",
    "purpose",
    "applicant_gender",
]
categorical_ordinal = ["education_level"]


In [5]:

# ====== ColumnTransformer with Robust Defaults ======
# Numeric columns: fill gaps with the training median, then standardize.
num_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("sc", StandardScaler()),
])

# Ordinal column: fill gaps with the training mode, then map levels to 0..3 in
# the order given by `ordinal_edu`. A level outside that list is encoded as -1
# instead of raising, so the persisted preprocessor does not crash on unseen
# values (the one-hot branch below is equally tolerant).
ord_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ord", OrdinalEncoder(
        categories=[ordinal_edu],
        dtype=float,
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )),
])

# Nominal columns: fill gaps with the training mode, then one-hot encode.
# Levels unseen during fit produce an all-zero row for that column.
nom_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Output column order follows the order of this list: num, ord, nom.
# Any column not listed is dropped.
pre = ColumnTransformer([
    ("num", num_pipe, numeric_features),
    ("ord", ord_pipe, categorical_ordinal),
    ("nom", nom_pipe, categorical_nominal),
], remainder="drop")


In [6]:

# ====== Full Preprocessor Pipeline ======
# Chain feature engineering with the column-wise preprocessing so both are
# fitted, applied and persisted as a single object.
preprocess = Pipeline(
    steps=[
        ("feat", FeatureGenerator(clip_dti=10.0, rare_cutoff=0.01)),
        ("pre", pre),
    ]
)

# Fit on TRAIN ONLY; the test split is transformed with the statistics
# learned from the training split.
Xtr = preprocess.fit_transform(X_train, y_train)
Xte = preprocess.transform(X_test)
print("Transformed (train,test):", Xtr.shape, Xte.shape)


Transformed (train,test): (4800, 36) (1200, 36)


In [7]:

# ====== Introspect Feature Names ======
def feature_names_from_preprocessor(preprocess_pipeline):
    """Return the output column names of the fitted preprocessing pipeline.

    Column lists are read from the fitted ColumnTransformer itself rather than
    from notebook globals, so the names always match what was actually fitted.

    Parameters
    ----------
    preprocess_pipeline : fitted Pipeline
        Must have a ``"pre"`` step that is a fitted ColumnTransformer with
        transformers named ``"num"``, ``"ord"`` and ``"nom"``; the ``"nom"``
        transformer must be a pipeline with a fitted ``"ohe"`` step.

    Returns
    -------
    list of str
        Numeric names, then ordinal names, then one ``"<column>__<level>"``
        entry per one-hot level, in that order.
    """
    def _as_list(cols):
        # A single column may have been given as a bare string.
        return [cols] if isinstance(cols, str) else list(cols)

    pre = preprocess_pipeline.named_steps["pre"]
    # transformers_ holds (name, fitted_transformer, columns) triples.
    fitted_cols = {name: cols for name, _, cols in pre.transformers_}
    num_cols = _as_list(fitted_cols["num"])
    ord_cols = _as_list(fitted_cols["ord"])
    nom_cols = _as_list(fitted_cols["nom"])

    # categories_ is aligned with the columns the encoder was fitted on.
    ohe = pre.named_transformers_["nom"].named_steps["ohe"]
    ohe_names = [
        f"{col}__{cat}"
        for col, cats in zip(nom_cols, ohe.categories_)
        for cat in cats
    ]
    return num_cols + ord_cols + ohe_names

feat_names = feature_names_from_preprocessor(preprocess)

# Guard: there must be exactly one name per column of the transformed matrix,
# otherwise the saved names would mislabel features downstream.
assert len(feat_names) == Xtr.shape[1], (
    f"feature name count ({len(feat_names)}) != transformed width ({Xtr.shape[1]})"
)

print("Total feature count:", len(feat_names))
print("Sample of names:", feat_names[:20])

# Save for downstream inspection
pd.Series(feat_names, name="feature_name").to_csv(FEATURE_NAMES_PATH, index=False)


Total feature count: 36
Sample of names: ['age', 'income', 'loan_amount', 'term_months', 'interest_rate', 'credit_score', 'employment_years', 'total_debt', 'dti', 'app_month', 'app_dayofweek', 'app_year', 'rate_term_interaction', 'amount_income_ratio', 'education_level', 'state__S01', 'state__S02', 'state__S03', 'state__S04', 'state__S05']


In [8]:

# ====== Save Preprocessor Artifact ======
# Persist the fitted pipeline. It contains the FeatureGenerator class defined in
# this notebook, so whoever loads the file must have that same class
# defined/importable before calling joblib.load.
joblib.dump(value=preprocess, filename=PREPROCESSOR_PATH)
print("Saved preprocessor to:", PREPROCESSOR_PATH)


Saved preprocessor to: artifacts/preprocessor.joblib


In [9]:

# ====== Sanity Check: Standardized Numeric Preview ======
# Verify that scaling worked (mean≈0, std≈1) for a few numeric features.
# The "num" transformer comes first in the ColumnTransformer, so a feature's
# position in `numeric_features` is also its column position in Xtr.
std_preview_cols = ["age","income","loan_amount","interest_rate"]
idx = [numeric_features.index(c) for c in std_preview_cols]
arr = Xtr[:, idx]
preview_means = arr.mean(axis=0)
preview_stds = arr.std(axis=0)
print("Means ~ 0:", np.round(preview_means, 3))
print("Stds ~ 1:", np.round(preview_stds, 3))

# Fail loudly instead of relying on someone eyeballing the printout.
assert np.allclose(preview_means, 0.0, atol=1e-6), "standardized means deviate from 0"
assert np.allclose(preview_stds, 1.0, atol=1e-6), "standardized stds deviate from 1"


Means ~ 0: [ 0. -0. -0. -0.]
Stds ~ 1: [1. 1. 1. 1.]
