# Credit Scoring Model Development Pipeline

Interactive notebook for step-by-step model development with the refactored pipeline.

**Steps:**
1. Config loading and data splitting
2. Constant feature elimination
3. Missing value elimination
4. IV (Information Value) filtering
5. PSI stability filtering
6. Correlation elimination
7. Forward feature selection (XGBoost)
8. Model evaluation (Train / Test / OOT quarterly)
9. Validation checks
10. Report generation

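Run the cells in order: each step cell is re-runnable, but it narrows the `feature_columns` list produced by the cells above it. You can also run the full pipeline in one call (see "Alternative: Full Pipeline Mode" near the bottom).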

In [None]:
# Notebook bootstrap: make the project importable, configure warnings and
# logging, then import the pipeline components used by the cells below.
import sys
from pathlib import Path

# Resolve project root regardless of where the notebook is run from:
# if the working directory is named "notebooks", use its parent; otherwise
# use the working directory itself.
project_root = str(Path.cwd().parent) if Path.cwd().name == "notebooks" else str(Path.cwd())
# Put the project root at the front of sys.path (once) so the `src` package
# imported further down can be found there.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

import warnings
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import logging

# Root logger at INFO with a compact "HH:MM:SS | LEVEL | message" line format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-5s | %(message)s",
    datefmt="%H:%M:%S",
)

# Project imports: these must come after the sys.path adjustment above.
from src.config.loader import load_config
from src.components import (
    DataSplitter,
    ConstantFilter,
    MissingFilter,
    IVFilter,
    PSIFilter,
    CorrelationFilter,
    ForwardFeatureSelector,
    ModelEvaluator,
)
from src.io.output_manager import OutputManager
from src.pipeline.orchestrator import PipelineOrchestrator
from src.validation.data_checks import DataValidator
from src.validation.model_checks import ModelValidator
from src.reporting.report_exporter import ReportExporter

# Echo the resolved root so a wrong working directory is easy to spot.
print(f"Project root: {project_root}")

In [None]:
# Load config from YAML (edit the YAML or use overrides here).
# The path is anchored to project_root (resolved in the setup cell) instead of
# being relative to the current working directory, so the file is also found
# when the notebook is started from the notebooks/ folder.
config_path = str(Path(project_root) / "config" / "model_development.yaml")
config = load_config(config_path)

# Override for this session (uncomment and edit as needed):
# config = load_config(config_path, overrides={
#     "splitting": {"train_end_date": "2024-06-30"},
#     "steps": {"iv": {"min_iv": 0.03}},
# })

# Echo the settings that drive the steps below as a quick sanity check.
print(f"Input:          {config.data.input_path}")
print(f"Train end date: {config.splitting.train_end_date}")
print(f"Target:         {config.data.target_column}")
print(f"IV range:       [{config.steps.iv.min_iv}, {config.steps.iv.max_iv}]")
print(f"PSI threshold:  {config.steps.psi.threshold}")
print(f"Corr threshold: {config.steps.correlation.threshold}")
print(f"AUC threshold:  {config.steps.selection.auc_threshold}")

## Data Loading and Splitting

Load the feature parquet and split into Train / Test / OOT quarters by `train_end_date`.
Test set is a stratified hold-out from the training period.

In [None]:
# Read the feature table from the parquet path given in the config.
df = pd.read_parquet(config.data.input_path)
print(f"Loaded {len(df):,} rows x {len(df.columns)} columns")

# Build the splitter from the data / splitting config sections and the global
# seed, then split the full frame.
splitter = DataSplitter(
    data_config=config.data,
    splitting_config=config.splitting,
    seed=config.reproducibility.global_seed,
)
split_result = splitter.split(df)

# Unpack the split result into the names the rest of the notebook relies on.
train_df, test_df = split_result.train, split_result.test
oot_quarters = split_result.oot_quarters
feature_columns = split_result.feature_columns

# Feature matrices and target vectors for the train and test partitions.
target = config.data.target_column
X_train, y_train = train_df[feature_columns], train_df[target]
X_test, y_test = test_df[feature_columns], test_df[target]

# Size and bad rate (mean of the target column) per partition.
print(f"\nFeatures: {len(feature_columns)}")
print(f"Train: {len(train_df):,} rows, bad rate: {y_train.mean():.2%}")
print(f"Test:  {len(test_df):,} rows, bad rate: {y_test.mean():.2%}")
for quarter in sorted(oot_quarters):
    quarter_df = oot_quarters[quarter]
    print(f"OOT {quarter}: {len(quarter_df):,} rows, bad rate: {quarter_df[target].mean():.2%}")

## Pipeline Steps

Run each elimination / selection step individually. Each step takes the current feature set,
removes features that fail the criteria, and returns a `StepResult` with details.

In [None]:
# Step 1: Constant Filter
# Drops features that have fewer than min_unique_values distinct values.

constant_filter = ConstantFilter(config.steps.constant)
constant_result = constant_filter.fit(X_train[feature_columns], y_train)
# Carry the surviving features forward to the next step.
feature_columns = constant_result.output_features

print(constant_result.summary())

# Preview at most the first 10 eliminated features and say how many were cut off.
dropped_constant = constant_result.eliminated_features
if dropped_constant:
    print(f"\nEliminated: {dropped_constant[:10]}")
    n_hidden = len(dropped_constant) - 10
    if n_hidden > 0:
        print(f"  ... and {n_hidden} more")

In [None]:
# Step 2: Missing Filter
# Drops features whose missing rate is above the configured threshold.

missing_filter = MissingFilter(config.steps.missing)
missing_result = missing_filter.fit(X_train[feature_columns], y_train)
# Carry the surviving features forward to the next step.
feature_columns = missing_result.output_features

print(missing_result.summary())

# Preview at most the first 10 eliminated features.
dropped_missing = missing_result.eliminated_features
if dropped_missing:
    print(f"\nEliminated: {dropped_missing[:10]}")

In [None]:
# Step 3: IV Filter
# Drops features whose IV is below min_iv (useless) or above max_iv (suspicious).

iv_filter = IVFilter(config.steps.iv)
iv_result = iv_filter.fit(X_train[feature_columns], y_train)
# Carry the surviving features forward to the next step.
feature_columns = iv_result.output_features

print(iv_result.summary())

# Among the features that survived this step, list the ten with the highest IV.
iv_table = iv_result.results_df
if not iv_table.empty:
    survived = iv_table["feature"].isin(feature_columns)
    kept = iv_table.loc[survived].sort_values("iv", ascending=False)
    print(f"\nTop 10 features by IV:")
    display(kept.head(10))

In [None]:
# Step 4: PSI Filter
# Drops features whose distribution is unstable (within the training data).

psi_filter = PSIFilter(config.steps.psi)
# The filter is also given the training rows' date column.
train_dates = train_df[config.data.date_column]
psi_result = psi_filter.fit(
    X_train[feature_columns],
    y_train,
    train_dates=train_dates,
)
# Carry the surviving features forward to the next step.
feature_columns = psi_result.output_features

print(psi_result.summary())

# Unlike the other steps, the full list of eliminated features is printed here.
unstable = psi_result.eliminated_features
if unstable:
    print(f"\nEliminated (unstable): {unstable}")

In [None]:
# Step 5: Correlation Filter
# Greedy removal: within each correlated pair (|r| > threshold) the lower-IV feature is dropped.

correlation_filter = CorrelationFilter(config.steps.correlation)
corr_result = correlation_filter.fit(X_train[feature_columns], y_train)
# Carry the surviving features forward to the next step.
feature_columns = corr_result.output_features

print(corr_result.summary())

# Preview at most the first 10 eliminated features and say how many were cut off.
dropped_correlated = corr_result.eliminated_features
if dropped_correlated:
    print(f"\nEliminated (correlated): {dropped_correlated[:10]}")
    n_hidden = len(dropped_correlated) - 10
    if n_hidden > 0:
        print(f"  ... and {n_hidden} more")

In [None]:
# Step 6: Forward Feature Selection
# Adds features one at a time (ordered by IV), keeping those that improve test AUC above the threshold.

selector = ForwardFeatureSelector(config.steps.selection, config.model)
selection_result = selector.fit(
    X_train[feature_columns],
    y_train,
    X_test=X_test[feature_columns],
    y_test=y_test,
)
# The selector's output is the final feature list used for the model below.
selected_features = selection_result.output_features

print(selection_result.summary())

# Numbered list of the selected features, starting at 1.
print(f"\nSelected features ({len(selected_features)}):")
for position, feature_name in enumerate(selected_features, start=1):
    print(f"  {position}. {feature_name}")

# Per-step selection table, shown only when the selector produced one.
selection_table = selection_result.results_df
if not selection_table.empty:
    print("\nSelection details:")
    display(selection_table)

In [None]:
# Step 7: Model Evaluation
# Trains the final model on the selected features and evaluates it on Train/Test/OOT.

evaluator = ModelEvaluator(config.model, config.evaluation)

# For every OOT quarter, pair the selected feature columns with the target column.
oot_inputs = {}
for quarter, quarter_df in oot_quarters.items():
    oot_inputs[quarter] = (quarter_df[selected_features], quarter_df[target])

eval_result = evaluator.fit(
    X_train[selected_features],
    y_train,
    X_test=X_test[selected_features],
    y_test=y_test,
    oot_quarters=oot_inputs,
)

print(eval_result.summary())

# Performance table, shown only when the evaluator produced one.
performance_table = eval_result.results_df
if not performance_table.empty:
    print("\nPerformance by period:")
    display(performance_table)

# Keep a handle on the trained model for later use (None if the metadata has no "model" entry).
final_model = eval_result.metadata.get("model")

## Validation Checks

Run automated checks: data validation on the loaded input data, followed by model quality checks (overfit gap, OOT degradation, score PSI, etc.).

In [None]:
# Data validation (pre-pipeline checks), run against the full loaded frame.
data_validator = DataValidator(config)
data_report = data_validator.validate(df)

print("Data Validation Results:")
for check in data_report.checks:
    # Any status other than PASS or FAIL is reported as WARN.
    outcome = check.status.value
    if outcome == "PASS":
        status_icon = "PASS"
    elif outcome == "FAIL":
        status_icon = "FAIL"
    else:
        status_icon = "WARN"
    print(f"  [{status_icon}] {check.check_name}: {check.message}")

# Critical failures are only reported here; execution is not stopped.
if data_report.has_critical_failures:
    print("\nCRITICAL: Data validation has failures. Review before proceeding.")

# Model validation (post-pipeline checks) on the evaluation result and final feature list.
model_validator = ModelValidator(config)
model_report = model_validator.validate(
    eval_result=eval_result,
    selected_features=selected_features,
)

print("\nModel Validation Results:")
for check in model_report.checks:
    # Same status mapping as for the data checks above.
    outcome = check.status.value
    if outcome == "PASS":
        status_icon = "PASS"
    elif outcome == "FAIL":
        status_icon = "FAIL"
    else:
        status_icon = "WARN"
    print(f"  [{status_icon}] {check.check_name}: {check.message}")
    # Model checks may carry a recommendation; print it when present.
    if check.recommendation:
        print(f"         Recommendation: {check.recommendation}")

In [None]:
# Save everything: persist per-step artifacts, then generate the Excel report.
output_manager = OutputManager(config)

# Snapshot the config that was used for this run.
output_manager.save_config_snapshot(config)

# Step results keyed by their artifact name, in pipeline order.
results_by_step = {
    "01_constant": constant_result,
    "02_missing": missing_result,
    "03_iv": iv_result,
    "04_psi": psi_result,
    "05_correlation": corr_result,
    "06_selection": selection_result,
    "07_evaluation": eval_result,
}
step_names = list(results_by_step)
all_step_results = list(results_by_step.values())

# Persist the results table plus the kept / eliminated feature lists of every step.
for step_name, result in zip(step_names, all_step_results):
    payload = {
        "results": result.results_df,
        "output_features": result.output_features,
        "eliminated_features": result.eliminated_features,
    }
    output_manager.save_step_results(step_name, payload)

# Generate the Excel report into the run's "reports" sub-directory.
reports_dir = output_manager.run_dir / "reports"
reporter = ReportExporter(config)
excel_path = reporter.generate(
    step_results=all_step_results,
    split_result=split_result,
    output_dir=str(reports_dir),
)

# Mark the run as successful, then write its metadata.
output_manager.mark_complete("success")
output_manager.save_run_metadata()

print(f"Run ID: {output_manager.run_id}")
print(f"Output: {output_manager.run_dir}")
print(f"Report: {excel_path}")

## Experiment Tracking

Log this run to the experiment tracker for future comparison.

In [None]:
from src.tracking import ExperimentTracker

tracker = ExperimentTracker()

# Extract metrics from evaluation result
perf_df = eval_result.results_df
metrics = {
    "n_features_selected": len(selected_features),
}

# AUC metrics need both an "auc" and a "period" column. Checking for both up
# front avoids a KeyError on the OOT lookup when "period" is absent; in that
# case only the feature count is logged.
if not perf_df.empty and {"auc", "period"} <= set(perf_df.columns):
    # Normalise period labels once: lower-cased strings, so non-string labels
    # (and NaN, which becomes "nan") can be matched without raising.
    periods = perf_df["period"].astype(str).str.lower()

    # A period containing "train" feeds train_auc; otherwise one containing
    # "test" feeds test_auc. If several rows match, the last one wins.
    for period, auc in zip(periods, perf_df["auc"]):
        if "train" in period:
            metrics["train_auc"] = auc
        elif "test" in period:
            metrics["test_auc"] = auc

    # OOT mean AUC over every row whose period contains "oot".
    oot_auc = perf_df.loc[periods.str.contains("oot", regex=False), "auc"]
    if not oot_auc.empty:
        metrics["oot_mean_auc"] = oot_auc.mean()

# Log the run; duration is the sum of the per-step durations collected in the
# save cell (all_step_results).
tracker.log_run(
    run_id=output_manager.run_id,
    config=config,
    metrics=metrics,
    duration=sum(r.duration_seconds for r in all_step_results),
    notes="Interactive notebook run",
)

print("Run logged. History:")
display(tracker.get_history())

---
## Alternative: Full Pipeline Mode

Instead of running each step manually above, run the entire pipeline in one call.
This uses the `PipelineOrchestrator` which handles step ordering, logging, and artifact saving.

In [None]:
# Full pipeline in one call (uncomment to use)

# config = load_config("config/model_development.yaml")
# df = pd.read_parquet(config.data.input_path)
#
# output_manager = OutputManager(config)
# pipeline = PipelineOrchestrator(config, output_manager)
#
# # Register steps
# pipeline.register_step(ConstantFilter(config.steps.constant))
# pipeline.register_step(MissingFilter(config.steps.missing))
# pipeline.register_step(IVFilter(config.steps.iv))
# pipeline.register_step(PSIFilter(config.steps.psi))
# pipeline.register_step(CorrelationFilter(config.steps.correlation))
# pipeline.register_step(ForwardFeatureSelector(config.steps.selection, config.model))
# pipeline.register_step(ModelEvaluator(config.model, config.evaluation))
#
# results = pipeline.run_all(df)
#
# print(results.summary())
# print(f"\nFinal features: {results.final_features}")

---
## Run Comparison

Compare multiple runs from the experiment log or output directories.

In [None]:
from src.tracking import RunComparison

comparison = RunComparison()
available_runs = comparison.list_runs()

# Report the total number of runs, then list only the last five entries.
print(f"Available runs ({len(available_runs)}):")
latest_runs = available_runs[-5:]
for listed_run in latest_runs:
    print(f"  {listed_run}")

# Compare the two most recent runs (uncomment when you have multiple runs)
# if len(available_runs) >= 2:
#     diff_df = comparison.diff_configs(available_runs[-2], available_runs[-1])
#     print("\nConfig differences:")
#     display(diff_df)
#
#     compare_df = comparison.compare(available_runs[-2:])
#     print("\nMetrics comparison:")
#     display(compare_df)