In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


In [3]:
# Simulate a two-feature binary classification problem.
np.random.seed(0)  # fixed seed so every run draws the same samples

X = np.random.randn(400, 2)
# A sample is labelled 1 when its two features sum to more than zero, else 0.
feature_sum = X[:, 0] + X[:, 1]
y = (feature_sum > 0).astype(int)

# Hold out 25% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [5]:
# Fit a logistic regression (default hyperparameters) on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression().fit(X_train, y_train)

# predict_proba yields one column per class; with labels {0, 1} the second
# column holds the predicted probability of class 1.
test_probabilities = model.predict_proba(X_test)
y_probs = test_probabilities[:, 1]


In [7]:
# Threshold sweep: show how the confusion matrix changes with the cutoff.
thresholds = np.linspace(0.1, 0.9, 9)  # nine evenly spaced cutoffs from 0.1 to 0.9

for thresh in thresholds:
    # Predict class 1 only when its probability strictly exceeds the cutoff.
    y_pred_thresh = (y_probs > thresh).astype(int)
    # labels=[0, 1] pins the matrix to 2x2 even when a class is missing from
    # both y_test and the predictions; without it the four-way unpacking
    # below raises ValueError on a degenerate (1x1) matrix.
    cm = confusion_matrix(y_test, y_pred_thresh, labels=[0, 1])

    # Rows are true labels, columns are predictions, so the flattened
    # row-major order is TN, FP, FN, TP.
    TN, FP, FN, TP = cm.ravel()

    print(f"\nThreshold {thresh:.1f}")
    print(f"TP: {TP}, FP: {FP}, FN: {FN}, TN: {TN}")



Threshold 0.1
TP: 50, FP: 18, FN: 0, TN: 32

Threshold 0.2
TP: 50, FP: 10, FN: 0, TN: 40

Threshold 0.3
TP: 50, FP: 7, FN: 0, TN: 43

Threshold 0.4
TP: 50, FP: 1, FN: 0, TN: 49

Threshold 0.5
TP: 50, FP: 0, FN: 0, TN: 50

Threshold 0.6
TP: 47, FP: 0, FN: 3, TN: 50

Threshold 0.7
TP: 45, FP: 0, FN: 5, TN: 50

Threshold 0.8
TP: 44, FP: 0, FN: 6, TN: 50

Threshold 0.9
TP: 37, FP: 0, FN: 13, TN: 50


**Now consider a scenario in which a false negative is far more costly than a false positive (e.g., medical diagnosis).**

In [10]:
# Relative misclassification costs: one missed positive (FN) is weighted
# ten times as heavily as one false alarm (FP).
FN_cost = 10
FP_cost = 1

for thresh in thresholds:
    # Predict class 1 only when its probability strictly exceeds the cutoff.
    y_pred_thresh = (y_probs > thresh).astype(int)
    # labels=[0, 1] guarantees a 2x2 matrix so the four-way unpacking cannot
    # fail when a class is absent from both y_test and the predictions.
    TN, FP, FN, TP = confusion_matrix(
        y_test, y_pred_thresh, labels=[0, 1]
    ).ravel()

    # Correct predictions (TN, TP) carry no cost; only the errors are weighted.
    total_cost = FN * FN_cost + FP * FP_cost

    print(f"Threshold {thresh:.1f} → Cost = {total_cost}")


Threshold 0.1 → Cost = 18
Threshold 0.2 → Cost = 10
Threshold 0.3 → Cost = 7
Threshold 0.4 → Cost = 1
Threshold 0.5 → Cost = 0
Threshold 0.6 → Cost = 30
Threshold 0.7 → Cost = 50
Threshold 0.8 → Cost = 60
Threshold 0.9 → Cost = 130


In [16]:
# Find the threshold with the lowest total misclassification cost.
def misclassification_cost(thresh):
    """Return the total FN/FP cost of cutting `y_probs` at `thresh`.

    Reads the notebook-level `y_probs`, `y_test`, `FN_cost` and `FP_cost`.
    """
    y_pred_thresh = (y_probs > thresh).astype(int)
    # labels=[0, 1] guarantees a 2x2 matrix so the four-way unpacking cannot
    # fail when a class is absent from both y_test and the predictions.
    tn, fp, fn, tp = confusion_matrix(
        y_test, y_pred_thresh, labels=[0, 1]
    ).ravel()
    return fn * FN_cost + fp * FP_cost


costs = [misclassification_cost(thresh) for thresh in thresholds]

# np.argmin returns the first minimum, so ties resolve to the lowest threshold.
# NOTE(review): the threshold is selected on the test split; for a real model
# it should be tuned on a separate validation split to avoid optimistic bias.
best_thresh = thresholds[np.argmin(costs)]

print("\nOptimal Threshold:", best_thresh)




Optimal Threshold: 0.5


# Confusion Matrix Tradeoffs & Threshold Tuning

## Objective
This notebook explores how classification thresholds influence prediction outcomes and the resulting balance between false positives and false negatives.

---

## Key Concepts

**False Positives (FP)**  
Predicting the positive class when the true class is negative (a false alarm).

**False Negatives (FN)**  
Predicting the negative class when the true class is positive (a missed detection).

---

## Why Tradeoffs Matter
Different applications impose different costs:

- Medical diagnosis → FN often most costly
- Fraud detection → FP may frustrate users
- Risk systems → Balanced optimization required

---

## Key Insight
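Threshold selection directly controls classifier behavior. Optimal thresholds depend on domain-specific cost structures rather than default values. In this synthetic example the classes are linearly separable, so the default threshold of 0.5 happens to produce no errors and is also the cost-optimal choice; on noisier data, a 10:1 FN-to-FP cost ratio would typically push the optimal threshold below 0.5.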
