In [None]:
"""
Temperature Scaling Practice with netcal
Learn how to apply temperature scaling to calibrate a model
"""

import os
from pathlib import Path
import numpy as np
# Compatibility shim: NumPy 2.0 removed the `np.infty` alias of `np.inf`.
# Re-create it when missing, before netcal is imported below
# (presumably netcal or one of its dependencies still reads `np.infty` —
# TODO confirm which package needs it and drop the shim once it is fixed).
if not hasattr(np, 'infty'):
    np.infty = np.inf
import matplotlib.pyplot as plt
from netcal.scaling import TemperatureScaling
from netcal.metrics import ECE
from sklearn.calibration import calibration_curve

# Notebook banner: the title framed by two 70-character rules.
print("=" * 70, "TEMPERATURE SCALING PRACTICE", "=" * 70, sep="\n")

In [None]:
# 1. Generate synthetic miscalibrated predictions
print("\n1. Generating synthetic miscalibrated predictions...")

np.random.seed(42)
n_samples = 1000

# Ground-truth binary labels
y_true = np.random.randint(0, 2, n_samples)

# Simulate P(class=1) directly. Every score is drawn from a confident range,
# either high (0.75-0.98) or low (0.02-0.25), but only about 80% of the draws
# land on the correct side of 0.5 - that gap is the miscalibration.
y_pred_proba_uncalibrated = np.zeros(n_samples)

for i, label in enumerate(y_true):
    is_correct = np.random.rand() < 0.80  # 80% accuracy
    # The score goes in the high range when a correct draw meets a positive
    # label, or when an incorrect draw meets a negative label.
    predict_high = is_correct == (label == 1)
    if predict_high:
        y_pred_proba_uncalibrated[i] = np.random.uniform(0.75, 0.98)
    else:
        y_pred_proba_uncalibrated[i] = np.random.uniform(0.02, 0.25)

# Hard class decisions at the 0.5 threshold
y_pred = (y_pred_proba_uncalibrated > 0.5).astype(int)

# Share of hard decisions that match the labels
accuracy = np.mean(y_pred == y_true)
print(f"  Accuracy: {accuracy*100:.2f}%")

# Bucket the scores to show how they pile up near 0 and 1
proba = y_pred_proba_uncalibrated
low_conf_negative = (proba < 0.2).sum()
mid_low_conf = ((proba >= 0.2) & (proba < 0.4)).sum()
mid_high_conf = ((proba > 0.6) & (proba <= 0.8)).sum()
high_conf_positive = (proba > 0.8).sum()

print(f"  Very high confidence (>0.8): {high_conf_positive}/{n_samples} ({high_conf_positive/n_samples*100:.1f}%)")
print(f"  Medium-high (0.6-0.8): {mid_high_conf}/{n_samples} ({mid_high_conf/n_samples*100:.1f}%)")
print(f"  Medium-low (0.2-0.4): {mid_low_conf}/{n_samples} ({mid_low_conf/n_samples*100:.1f}%)")
print(f"  Very low confidence (<0.2): {low_conf_negative}/{n_samples} ({low_conf_negative/n_samples*100:.1f}%)")

In [None]:
# 2. Calculate ECE before calibration (on full dataset for initial assessment)
print("\n2. Calibration BEFORE temperature scaling...")

# ECE metric configured with 10 bins; later cells reuse this instance.
ece_metric = ECE(bins=10)
ece_before_full = ece_metric.measure(y_pred_proba_uncalibrated, y_true)

print(f"  ECE (full dataset): {ece_before_full:.4f}")

# Ordered (exclusive upper bound, message) pairs - the first band that the
# score falls under wins; anything at or above the last bound is "poor".
for upper_bound, status_line in (
    (0.05, "  Status: Excellent calibration ✓"),
    (0.10, "  Status: Good calibration"),
    (0.15, "  Status: Acceptable calibration"),
):
    if ece_before_full < upper_bound:
        print(status_line)
        break
else:
    print("  Status: Poor calibration - needs fixing ✗")

In [None]:
# 3. Apply temperature scaling
print("\n3. Applying temperature scaling...")

# Positional hold-out: the leading 70% fits the calibrator, the trailing 30%
# is kept back for evaluation.
n_cal = int(0.7 * n_samples)

# netcal is handed a two-column probability matrix for the binary task:
# column 0 = P(class=0), column 1 = P(class=1).
y_proba_2d = np.stack(
    [1 - y_pred_proba_uncalibrated, y_pred_proba_uncalibrated],
    axis=1,
)

# Calibration / test partitions
y_proba_cal, y_proba_test = y_proba_2d[:n_cal], y_proba_2d[n_cal:]
y_true_cal, y_true_test = y_true[:n_cal], y_true[n_cal:]
y_pred_proba_test = y_pred_proba_uncalibrated[n_cal:]

# Baseline ECE on the same test slice, so before/after is a fair comparison
ece_before_test = ece_metric.measure(y_pred_proba_test, y_true_test)
print(f"  ECE on test set (before calibration): {ece_before_test:.4f}")

# Fit the scaler on the calibration slice only.
# The inputs are probabilities, not logits.
temperature = TemperatureScaling()
temperature.fit(y_proba_cal, y_true_cal)

# The fitted temperature may come back as an array-like exposing .item()
# or as a plain number; reduce it to a Python float either way.
raw_temperature = temperature.temperature
if hasattr(raw_temperature, 'item'):
    temp_value = raw_temperature.item()
else:
    temp_value = float(raw_temperature)
print(f"  Optimal temperature: {temp_value:.4f}")

if temp_value > 1:
    interpretation = 'Model is overconfident (T>1 softens predictions)'
else:
    interpretation = 'Model is underconfident (T<1 sharpens predictions)'
print(f"  Interpretation: {interpretation}")

# Calibrate the held-out slice. The result is handled whether it is a 1-D
# vector or a 2-D per-class matrix; in the 2-D case column 1 is kept.
y_pred_proba_calibrated_full = temperature.transform(y_proba_test)
y_pred_proba_calibrated = (
    y_pred_proba_calibrated_full
    if y_pred_proba_calibrated_full.ndim == 1
    else y_pred_proba_calibrated_full[:, 1]
)

In [None]:
# 4. Calculate ECE after calibration
print("\n4. Calibration AFTER temperature scaling...")

# Measured on the held-out test slice with the shared 10-bin ECE metric.
ece_after = ece_metric.measure(y_pred_proba_calibrated, y_true_test)

print(f"  ECE on test set (after calibration): {ece_after:.4f}")
# Use the same four status bands as the pre-calibration report in step 2,
# so a post-calibration ECE of 0.15 or more is reported as poor rather than
# being folded into "acceptable".
if ece_after < 0.05:
    print("  Status: Excellent calibration ✓")
elif ece_after < 0.10:
    print("  Status: Good calibration")
elif ece_after < 0.15:
    print("  Status: Acceptable calibration")
else:
    print("  Status: Poor calibration - needs fixing ✗")

# Both ECE values come from the test slice, so the difference is like-for-like.
improvement = ece_before_test - ece_after
# Guard the relative figure against a zero baseline.
improvement_pct = (improvement / ece_before_test) * 100 if ece_before_test > 0 else 0

print(f"\n  ECE Improvement: {improvement:.4f} ({improvement_pct:.1f}%)")
if improvement > 0:
    print("  ✓ Calibration improved!")
elif improvement == 0:
    # An exact tie is neither better nor worse.
    print("  = Calibration unchanged")
else:
    print("  ⚠ Calibration got worse (unexpected)")

In [None]:
# 5. Visualize calibration
print("\n5. Creating calibration visualizations...")

fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Reliability curves on the test slice (10 uniform-width bins each)
frac_pos_before, mean_pred_before = calibration_curve(
    y_true_test, y_pred_proba_test, n_bins=10, strategy='uniform'
)
frac_pos_after, mean_pred_after = calibration_curve(
    y_true_test, y_pred_proba_calibrated, n_bins=10, strategy='uniform'
)

# Text styling shared by all four panels
label_style = dict(fontsize=11, fontweight='bold')
title_style = dict(fontsize=12, fontweight='bold')

# Top row: reliability diagrams (before | after)
reliability_panels = (
    (axes[0, 0], mean_pred_before, frac_pos_before, 'coral', 'Model (before)',
     f'Before Temperature Scaling\nECE = {ece_before_test:.4f}'),
    (axes[0, 1], mean_pred_after, frac_pos_after, 'steelblue', 'Model (after)',
     f'After Temperature Scaling\nECE = {ece_after:.4f}'),
)
for ax, mean_pred, frac_pos, curve_color, curve_label, panel_title in reliability_panels:
    # Diagonal = perfectly calibrated reference
    ax.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Perfect calibration')
    ax.plot(mean_pred, frac_pos, 'o-', linewidth=2,
            markersize=8, color=curve_color, label=curve_label)
    ax.set_xlabel('Mean Predicted Probability', **label_style)
    ax.set_ylabel('Fraction of Positives', **label_style)
    ax.set_title(panel_title, **title_style)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])

# Bottom row: histograms of the predicted probabilities (before | after)
histogram_panels = (
    (axes[1, 0], y_pred_proba_test, 'coral', 'Confidence Distribution (Before)'),
    (axes[1, 1], y_pred_proba_calibrated, 'steelblue', 'Confidence Distribution (After)'),
)
for ax, probabilities, bar_color, panel_title in histogram_panels:
    ax.hist(probabilities, bins=20, edgecolor='black', alpha=0.7, color=bar_color)
    ax.axvline(0.5, color='black', linestyle='--', alpha=0.5, label='Decision threshold')
    ax.set_xlabel('Predicted Probability', **label_style)
    ax.set_ylabel('Frequency', **label_style)
    ax.set_title(panel_title, **title_style)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3, axis='y')

plt.suptitle('Temperature Scaling Effect on Calibration', fontsize=15, fontweight='bold')
plt.tight_layout()

# Make sure the output directory exists, then write the figure and show it
os.makedirs('outputs', exist_ok=True)
plt.savefig('outputs/netcal_practice_calibration.png', dpi=150, bbox_inches='tight')
plt.show()

print("  ✓ Saved calibration plots to outputs/netcal_practice_calibration.png")

In [None]:
# 6. Effect on confidence
print("\n6. Effect on prediction confidence...")

# NOTE: the statistics below are taken over the stored probability values in
# y_pred_proba_test / y_pred_proba_calibrated as-is, not over max(p, 1 - p).
print(f"\nBefore calibration (test set):")
print(f"  Mean confidence: {y_pred_proba_test.mean():.4f}")
print(f"  Max confidence: {y_pred_proba_test.max():.4f}")
print(f"  Min confidence: {y_pred_proba_test.min():.4f}")
print(f"  Std dev: {y_pred_proba_test.std():.4f}")

print(f"\nAfter calibration (test set):")
print(f"  Mean confidence: {y_pred_proba_calibrated.mean():.4f}")
print(f"  Max confidence: {y_pred_proba_calibrated.max():.4f}")
print(f"  Min confidence: {y_pred_proba_calibrated.min():.4f}")
print(f"  Std dev: {y_pred_proba_calibrated.std():.4f}")

# Check if accuracy is preserved - calculate both on test set
y_pred_before = (y_pred_proba_test > 0.5).astype(int)
y_pred_after = (y_pred_proba_calibrated > 0.5).astype(int)
accuracy_before = (y_pred_before == y_true_test).mean()
accuracy_after = (y_pred_after == y_true_test).mean()

# Count hard decisions that flipped across the 0.5 threshold. This catches
# changes even when flips in opposite directions cancel out in the accuracy.
num_changed = int((y_pred_before != y_pred_after).sum())
# Both accuracies are counts over the same test-set length, so exact equality
# is a sound test; no tolerance is needed (the message promises "identical").
accuracy_preserved = accuracy_after == accuracy_before

print(f"\nAccuracy comparison (both on test set):")
print(f"  Before: {accuracy_before*100:.2f}%")
print(f"  After:  {accuracy_after*100:.2f}%")
print(f"  Change: {(accuracy_after - accuracy_before)*100:.2f}%")
if accuracy_preserved:
    print("  ✓ Accuracy preserved!")
else:
    print("  ⚠ Accuracy changed (should be identical)")

# Additional diagnostic: shown whenever any prediction flipped, so the warning
# above is never printed without its explanation.
if num_changed > 0:
    print(f"\n  NOTE: Accuracy should be identical. If changed, it means the threshold (0.5)")
    print(f"        crossed for some predictions during calibration.")
    print(f"        Number of predictions that changed: {num_changed}/{len(y_pred_before)}")


In [None]:
# Closing summary: banner, numbered takeaways, completion message.
print("\n" + "=" * 70)
print("KEY TAKEAWAYS")
print("=" * 70)
for takeaway in (
    "1. Temperature scaling improves calibration (lowers ECE)",
    "2. T > 1: Model is overconfident, predictions get softened",
    "3. T < 1: Model is underconfident, predictions get sharpened",
    "4. Accuracy should be preserved (same predicted classes)",
    "5. Simple: just one parameter to optimize on calibration set",
    "6. Fast: can be done on validation set after training",
    "7. netcal expects 2D probability arrays as input",
):
    print(takeaway)
print("\n✓ Temperature scaling practice complete!")