# Notebook 03 — Modeling & Export

**Goal:** Train baseline models (LogReg, RF), tune the RF, evaluate, and export the best pipeline.  
**Input:** `../data/processed/hr_attrition_ready.parquet`  
**Outputs:** `../artifacts/v1/rf_pipeline.joblib`, `../artifacts/v1/features.json`  


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from pathlib import Path

# Load the prepared dataset produced by the earlier notebooks.
df = pd.read_parquet("../data/processed/hr_attrition_ready.parquet")

# Columns routed to the numeric branch of the preprocessor.
NUM = [
    "Age",
    "MonthlyIncome",
    "DistanceFromHome",
    "TotalWorkingYears",
    "YearsAtCompany",
    "NumCompaniesWorked",
    "PercentSalaryHike",
]
# Columns routed to the categorical (one-hot) branch of the preprocessor.
CAT = [
    "OverTime",
    "JobRole",
    "MaritalStatus",
    "BusinessTravel",
    "Department",
    "EducationField",
    "Gender",
    "JobLevel",
]

# Label column and feature matrix (numeric columns first, then categorical).
y = df["target"]
X = df[NUM + CAT]

# 80/20 hold-out split, stratified on the label and seeded so the split is
# reproducible across runs.
Xtr, Xte, ytr, yte = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

def preproc(num, cat):
    """Build a fresh, unfitted preprocessing ColumnTransformer.

    Args:
        num: Column names sent through the numeric branch
            (median imputation followed by standard scaling).
        cat: Column names sent through the categorical branch
            (most-frequent imputation followed by one-hot encoding that
            ignores categories not seen during fit).

    Returns:
        A new ``ColumnTransformer`` with two transformers, ``"num"`` and
        ``"cat"``, in that order.
    """
    numeric_branch = Pipeline(steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ])
    categorical_branch = Pipeline(steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])
    return ColumnTransformer(transformers=[
        ("num", numeric_branch, num),
        ("cat", categorical_branch, cat),
    ])

# Baseline pipelines. Each one calls preproc() separately, so the two models
# never share a fitted preprocessor instance.
logreg = Pipeline(steps=[
    ("pre", preproc(NUM, CAT)),
    ("clf", LogisticRegression(max_iter=1000)),
])
rf = Pipeline(steps=[
    ("pre", preproc(NUM, CAT)),
    ("clf", RandomForestClassifier(random_state=42)),
])

# Fit each baseline on the training split and report hold-out metrics.
for name, model in (("LogReg", logreg), ("RF", rf)):
    model.fit(Xtr, ytr)

    # Hard labels feed the report and confusion matrix; the second
    # predict_proba column (index 1) is the score used for ROC-AUC.
    preds = model.predict(Xte)
    probs = model.predict_proba(Xte)[:, 1]

    report = classification_report(yte, preds)
    auc = roc_auc_score(yte, probs)
    cm = confusion_matrix(yte, preds)

    print(f"\n=== {name} ===")
    print(report)
    print("ROC-AUC:", auc)
    print("Confusion matrix:\n", cm)


=== LogReg ===
              precision    recall  f1-score   support

           0       0.87      0.97      0.91       247
           1       0.56      0.21      0.31        47

    accuracy                           0.85       294
   macro avg       0.71      0.59      0.61       294
weighted avg       0.82      0.85      0.82       294

ROC-AUC: 0.7768972349039539
Confusion matrix:
 [[239   8]
 [ 37  10]]

=== RF ===
              precision    recall  f1-score   support

           0       0.86      0.96      0.91       247
           1       0.44      0.17      0.25        47

    accuracy                           0.83       294
   macro avg       0.65      0.56      0.58       294
weighted avg       0.79      0.83      0.80       294

ROC-AUC: 0.7623395641312776
Confusion matrix:
 [[237  10]
 [ 39   8]]
