# Notebook 03 — Modeling & Export

**Goal:** Train baseline (LogReg, RF), tune RF, evaluate, and export the best pipeline.  
**Input:** `../data/processed/hr_attrition_ready.parquet`  
**Outputs:** `../artifacts/v1/rf_pipeline.joblib`, `../artifacts/v1/features.json`  


In [4]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import json
import joblib

# Import from src modules for consistency
import sys
sys.path.append(str(Path.cwd().parent))  # Add project root to path

from src.features import NUM_FEATURES, CAT_FEATURES
from src.pipeline import make_logreg_pipeline, make_rf_pipeline
from src.config import READY_PARQUET, ARTIFACTS_DIR

print("✅ All libraries and modules imported")

# Load data using config
df = pd.read_parquet(READY_PARQUET)
print(f"✅ Loaded ready data: {df.shape}")
print(f"   From: {READY_PARQUET}")

# Use features from src.features module
NUM = NUM_FEATURES
CAT = CAT_FEATURES
print(f"✅ Using {len(NUM)} numeric and {len(CAT)} categorical features")

X = df[NUM + CAT]
y = df["target"]

Xtr, Xte, ytr, yte = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Use built-in round() for floats
print("Class balance:", {
    "train": round(float(ytr.mean()), 3),
    "test":  round(float(yte.mean()), 3)
})

# Use pipeline builders from src.pipeline module
# This ensures consistent preprocessing across all notebooks and the dashboard
logreg = make_logreg_pipeline(NUM, CAT)
rf = make_rf_pipeline(NUM, CAT)

print("✅ Created pipelines using src.pipeline module")
print("   - Logistic Regression pipeline ready")
print("   - Random Forest pipeline ready")

# Quick baseline comparison on TEST
for name, model in [("LogReg", logreg), ("RF", rf)]:
    model.fit(Xtr, ytr)
    probs = model.predict_proba(Xte)[:,1]
    preds = (probs >= 0.50).astype(int)
    auc = roc_auc_score(yte, probs)
    print(f"\n=== {name} ===")
    print("ROC-AUC:", round(auc, 3))    
    print(classification_report(yte, preds))
    print("Confusion matrix:\n", confusion_matrix(yte, preds))

✅ All libraries and modules imported
✅ Loaded ready data: (1470, 36)
   From: /Users/ahmedgodah/Documents/vscode-projects/AttriSight/data/processed/hr_attrition_ready.parquet
✅ Using 7 numeric and 8 categorical features
Class balance: {'train': 0.162, 'test': 0.16}
✅ Created pipelines using src.pipeline module
   - Logistic Regression pipeline ready
   - Random Forest pipeline ready

=== LogReg ===
ROC-AUC: 0.777
              precision    recall  f1-score   support

           0       0.87      0.97      0.91       247
           1       0.56      0.21      0.31        47

    accuracy                           0.85       294
   macro avg       0.71      0.59      0.61       294
weighted avg       0.82      0.85      0.82       294

Confusion matrix:
 [[239   8]
 [ 37  10]]

=== RF ===
ROC-AUC: 0.764
              precision    recall  f1-score   support

           0       0.86      0.95      0.90       247
           1       0.43      0.19      0.26        47

    accuracy           

In [None]:
# Grid search for RF (AUC, 5-fold). Heavy but thorough.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

param_grid = {
    "clf__n_estimators": [100, 300, 500],
    "clf__max_depth": [None, 8, 16],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__max_features": ["sqrt", "log2", None],
    "clf__criterion": ["gini", "entropy", "log_loss"]
}

rf = Pipeline([("pre", preproc(NUM, CAT)),
              ("clf", RandomForestClassifier(random_state=42))])

grid = GridSearchCV(
    rf, param_grid=param_grid, scoring="roc_auc", cv=5,
     n_jobs=-1, verbose=1)
grid.fit(Xtr, ytr)

best = grid.best_estimator_
test_auc = roc_auc_score(yte, best.predict_proba(Xte)[:, 1])

print("Best params:", grid.best_params_)
print("Test ROC-AUC (best):", round(test_auc, 3))

Fitting 5 folds for each of 729 candidates, totalling 3645 fits


Traceback (most recent call last):
  File "<frozen runpy>", line 189, in _run_module_as_main
  File "<frozen runpy>", line 112, in _get_module_details
  File "/workspace/.pyenv_mirror/user/current/lib/python3.12/site-packages/joblib/__init__.py", line 120, in <module>
Traceback (most recent call last):
  File "<frozen runpy>", line 189, in _run_module_as_main
  File "<frozen runpy>", line 112, in _get_module_details
  File "/workspace/.pyenv_mirror/user/current/lib/python3.12/site-packages/joblib/__init__.py", line 115, in <module>
Traceback (most recent call last):
  File "<frozen runpy>", line 189, in _run_module_as_main
  File "<frozen runpy>", line 112, in _get_module_details
  File "/workspace/.pyenv_mirror/user/current/lib/python3.12/site-packages/joblib/__init__.py", line 120, in <module>
Traceback (most recent call last):
Traceback (most recent call last):
  File "<frozen runpy>", line 189, in _run_module_as_main
  File "<frozen runpy>", line 189, in _run_module_as_main
  File 

KeyboardInterrupt: 

In [2]:
#Smaller grid (faster). Use this instead of Cell 2 if timing matters
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

small_grid = {
    "clf__n_estimators": [150, 300],
    "clf__max_depth": [None, 12],
    "clf__min_samples_split": [2, 5],
    "clf__min_samples_leaf": [1, 2],
    "clf__max_features": ["sqrt", "log2"],
    "clf__criterion": ["gini", "entropy"],
}
rf = Pipeline([("pre", preproc(NUM, CAT)),
              ("clf", RandomForestClassifier(random_state=42))])

grid = GridSearchCV(rf, param_grid=small_grid, scoring="roc_auc", cv=3,
     n_jobs=-1, verbose=1)
grid.fit(Xtr, ytr)

best = grid.best_estimator_
test_auc = roc_auc_score(yte, best.predict_proba(Xte)[:, 1])
print("Best params:", grid.best_params_)
print("Test ROC-AUC:", roc_auc_score(yte, best.predict_proba(Xte)[:,1]))

Fitting 3 folds for each of 64 candidates, totalling 192 fits
Best params: {'clf__criterion': 'entropy', 'clf__max_depth': None, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 2, 'clf__n_estimators': 150}
Test ROC-AUC: 0.7733224222585925


In [3]:
# Save the trained pipeline and feature list
import json, joblib
from pathlib import Path

ART = Path("../artifacts/v1"); ART.mkdir(parents=True, exist_ok=True)
joblib.dump(best, ART/"rf_pipeline.joblib")
with open(ART / "features.json", "w") as f:
    json.dump(NUM + CAT, f)

print("✅ Exported →", ART/"rf_pipeline.joblib", "and", ART/"features.json")

✅ Exported → ../artifacts/v1/rf_pipeline.joblib and ../artifacts/v1/features.json
