# In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, brier_score_loss
from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay

# --- 1. Load Data ---
# Synthetic classification problem: 1000 samples x 20 features, of which
# 10 are informative and 5 are redundant. Seeded so that every run
# produces the same dataset.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    random_state=42,
)

# Hold out 30% of the rows for the final evaluation; the remaining 70% is
# what the grid search and the calibration step are fit on. The split is
# seeded as well.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# --- 2. Build the Pipeline & Parameter Grid ---
# Two named steps, executed in order:
#   'scaler' -> StandardScaler standardizes the features
#   'model'  -> LogisticRegression (liblinear solver) does the classification
# Because the scaler lives inside the pipeline, it is fit together with the
# model wherever the pipeline is fit, including inside cross-validation.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(solver='liblinear')),
])

# Hyperparameters to search, addressed as '<step_name>__<parameter_name>'.
# 2 penalties x 4 values of C = 8 candidate settings.
param_grid = {
    'model__penalty': ['l1', 'l2'],
    'model__C': [0.01, 0.1, 1, 10],
}

# --- 3. Hyperparameter Tuning with GridSearchCV ---
# Try every combination in param_grid, scoring each one by accuracy under
# 5-fold cross-validation. n_jobs=-1 spreads the work across all available
# CPU cores.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

print("Starting GridSearchCV...")
search.fit(X_train, y_train)

print(f"Best parameters found: {search.best_params_}")
print(f"Best cross-validation accuracy: {search.best_score_:.4f}")

# The winning configuration as a complete pipeline (scaler + model), which is
# what the evaluation and calibration sections work with.
best_model = search.best_estimator_

# --- 4. Evaluate the Tuned Model ---
# Score the tuned pipeline on the held-out test split, which took no part in
# the grid search.
print("\n--- Tuned Model Report ---")
y_pred = best_model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

# --- 5. Calibrate the Tuned Model ---
# Logistic Regression is often well-calibrated, but this shows the process.
# We take our *best_model* and wrap it in CalibratedClassifierCV; 'cv=3'
# makes it fit the calibrator with 3-fold cross-validation on the training
# data.
#
# method='sigmoid' (Platt scaling) is used rather than 'isotonic': the
# training split holds only 700 samples, so each of the 3 calibration folds
# sees roughly 230 of them, and scikit-learn advises against isotonic
# calibration with far fewer than 1000 calibration samples because it tends
# to overfit.
calibrated_model = CalibratedClassifierCV(
    best_model,
    method='sigmoid',
    cv=3
)
calibrated_model.fit(X_train, y_train)

# Column 1 of predict_proba on the held-out test set, before and after
# calibration.
probs_uncalibrated = best_model.predict_proba(X_test)[:, 1]
probs_calibrated = calibrated_model.predict_proba(X_test)[:, 1]

# Brier Score: Measures the "goodness" of probabilities. Lower is better.
brier_uncalibrated = brier_score_loss(y_test, probs_uncalibrated)
brier_calibrated = brier_score_loss(y_test, probs_calibrated)

print("\n--- Model Calibration ---")
print(f"Brier Score (Uncalibrated): {brier_uncalibrated:.4f}")
print(f"Brier Score (Calibrated):   {brier_calibrated:.4f}")