# Day 9: Model Evaluation & Optimization

Today we learn how to **evaluate** models properly and **tune** them for better performance!

### Topics Covered:
1. Evaluation Metrics Deep Dive
2. Confusion Matrix Analysis
3. Cross-Validation
4. Hyperparameter Tuning (GridSearchCV)
5. **Mini Project: Optimize Titanic Model**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, classification_report, roc_curve, auc)
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
# Suppress every warning so the notebook output stays uncluttered.
# NOTE(review): this also hides deprecation and convergence warnings raised
# by the libraries used below - consider a narrower filter.
warnings.filterwarnings('ignore')

# Apply one plotting theme to all figures created later in the notebook.
plt.style.use('seaborn-v0_8-whitegrid')
# Seed NumPy's global random generator so later random draws are repeatable.
np.random.seed(42)
print("Libraries loaded!")

## 1. Evaluation Metrics Deep Dive

### The Confusion Matrix Components
```
                 Predicted
              Negative  Positive
Actual  Neg     TN        FP      ← False Positive (Type I Error)
        Pos     FN        TP      ← False Negative (Type II Error)
```

In [None]:
# Understand metrics with a concrete example: derive accuracy, precision,
# recall and F1 by hand from the four confusion-matrix counts.
print(" METRICS EXPLAINED")
print("="*60)

# Simulated labels and model predictions (5 positive, 5 negative samples)
y_true = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
y_pred = np.array([1, 1, 1, 0, 0, 0, 0, 0, 1, 1])

# Count each confusion-matrix cell with a vectorized reduction; int() turns
# the NumPy scalar into a plain Python int.
TP = int(np.sum((y_true == 1) & (y_pred == 1)))  # True Positive
TN = int(np.sum((y_true == 0) & (y_pred == 0)))  # True Negative
FP = int(np.sum((y_true == 0) & (y_pred == 1)))  # False Positive
FN = int(np.sum((y_true == 1) & (y_pred == 0)))  # False Negative

print("\nConfusion Matrix Components:")
print(f"  True Positives (TP):  {TP}")
print(f"  True Negatives (TN):  {TN}")
print(f"  False Positives (FP): {FP} (Type I Error)")
print(f"  False Negatives (FN): {FN} (Type II Error)")

# Total sample count, derived from the counts rather than hard-coded so the
# printed formula stays correct if the example arrays are edited.
n_samples = TP + TN + FP + FN

# Calculate metrics manually; the guards avoid division by zero when there
# are no predicted positives / no actual positives.
accuracy = (TP + TN) / n_samples
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print("\nCalculations:")
print(f"  Accuracy  = (TP+TN)/(All) = ({TP}+{TN})/{n_samples} = {accuracy:.1%}")
print(f"  Precision = TP/(TP+FP) = {TP}/({TP}+{FP}) = {precision:.1%}")
print(f"  Recall    = TP/(TP+FN) = {TP}/({TP}+{FN}) = {recall:.1%}")
print(f"  F1 Score  = 2*(P*R)/(P+R) = {f1:.1%}")

In [None]:
# Quick reference: which metric to reach for in which situation.
# Header and rule are emitted by one call, separated by a newline.
print("\n WHEN TO USE WHICH METRIC?", "=" * 60, sep="\n")
print("""
 ACCURACY: When classes are balanced
   Example: 50% spam, 50% not spam

 PRECISION: When False Positives are costly
   Example: Email marked as spam but it's important
   Question: "Of all predicted positives, how many are correct?"

 RECALL: When False Negatives are costly  
   Example: Cancer detection - missing a case is dangerous
   Question: "Of all actual positives, how many did we catch?"

 F1 SCORE: When you need balance between Precision & Recall
   Example: Most real-world problems
""")

## 2. Confusion Matrix Visualization

In [None]:
# Build a reproducible synthetic binary-classification dataset, fit a random
# forest, and show its confusion matrix as raw counts and row-normalized rates.
np.random.seed(42)
n = 500

# Four random features; the label depends on the first two plus noise.
X = np.random.randn(n, 4)
signal = X[:, 0] + X[:, 1] * 0.5
noise = np.random.randn(n) * 0.5
y = (signal + noise > 0).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model and predict on the held-out split
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Raw counts, plus a copy where each row (actual class) is divided by its total
cm = confusion_matrix(y_test, y_pred)
cm_norm = cm.astype('float') / cm.sum(axis=1, keepdims=True)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

class_names = ['Neg', 'Pos']
panels = [
    (cm, 'd', 'Blues', 'Confusion Matrix (Counts)'),
    (cm_norm, '.2%', 'Greens', 'Confusion Matrix (Normalized)'),
]
# Both panels share the same layout; only data, number format, colormap and
# title differ.
for ax, (matrix, number_format, colormap, title) in zip(axes, panels):
    sns.heatmap(matrix, annot=True, fmt=number_format, cmap=colormap, ax=ax,
                xticklabels=class_names, yticklabels=class_names)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.tight_layout()
plt.show()

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

## 3. Cross-Validation

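**Problem:** A single train/test split can be lucky or unlucky, so the score it produces may not reflect how the model really performs.

**Solution:** K-Fold Cross-Validation - split the data into K folds, train K times (each time holding out a different fold for testing), and average the K scores.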

In [None]:
# Text diagram of 5-fold cross-validation: each fold serves as the test set
# exactly once. Header and rule are emitted by one call.
print(" K-FOLD CROSS-VALIDATION", "=" * 50, sep="\n")
print("""
5-Fold Example:

Fold 1: [TEST] [Train] [Train] [Train] [Train]
Fold 2: [Train] [TEST] [Train] [Train] [Train]
Fold 3: [Train] [Train] [TEST] [Train] [Train]
Fold 4: [Train] [Train] [Train] [TEST] [Train]
Fold 5: [Train] [Train] [Train] [Train] [TEST]

Final Score = Average of all 5 folds
""")

In [None]:
# Score three classifiers with 5-fold cross-validation on the full dataset
# and keep the per-model statistics for the plot in the next cell.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
}

print(" CROSS-VALIDATION RESULTS (5-Fold)")
print("=" * 55)

cv_results = []
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    mean_accuracy = scores.mean()
    std_accuracy = scores.std()
    cv_results.append(
        {'Model': name, 'Mean': mean_accuracy, 'Std': std_accuracy, 'Scores': scores}
    )

    fold_scores = [f'{s:.3f}' for s in scores]
    print(f"\n{name}:")
    print(f"  Fold Scores: {fold_scores}")
    # The reported spread is two standard deviations of the fold scores.
    print(f"  Mean: {mean_accuracy:.3f} (+/- {std_accuracy*2:.3f})")

In [None]:
# Bar chart of mean CV accuracy per model, with the fold standard deviation
# drawn as error bars and the mean written above each bar.
fig, ax = plt.subplots(figsize=(10, 5))

positions = np.arange(len(models))
model_names = []
means = []
stds = []
for entry in cv_results:
    model_names.append(entry['Model'])
    means.append(entry['Mean'])
    stds.append(entry['Std'])

bar_colors = ['#3498db', '#2ecc71', '#e74c3c']
bars = ax.bar(positions, means, yerr=stds, capsize=5, color=bar_colors)
ax.set_xticks(positions)
ax.set_xticklabels(model_names)
ax.set_ylabel('Accuracy')
ax.set_title('Model Comparison (5-Fold CV)', fontweight='bold')
ax.set_ylim(0.7, 1.0)

# Annotate every bar with its mean, centred horizontally just above the top.
for rect, value in zip(bars, means):
    x_center = rect.get_x() + rect.get_width()/2
    ax.text(x_center, rect.get_height() + 0.02,
            f'{value:.3f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

## 4. Hyperparameter Tuning with GridSearchCV

**Hyperparameters:** Settings we choose BEFORE training (e.g., `max_depth`, `n_estimators`)

**GridSearchCV:** Automatically tries every combination in the grid, scores each one with cross-validation, and keeps the best-scoring combination!

In [None]:
# Define the hyperparameter grid and report how much work the search will be.
import math

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10]
}

# Grid size is the product of the number of candidate values per parameter.
# Deriving it from the grid keeps the printed figures correct when the grid
# above is edited.
total_combinations = math.prod(len(values) for values in param_grid.values())
print(" GRID SEARCH")
print("="*50)
print("\nParameter Grid:")
for param, values in param_grid.items():
    print(f"  {param}: {values}")
print(f"\nTotal combinations to try: {total_combinations}")
print(f"With 5-fold CV: {total_combinations * 5} model fits!")

In [None]:
# Exhaustively search `param_grid` with 5-fold CV on the training split and
# report the winning configuration.
rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,   # run fits in parallel on all CPU cores
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("\n BEST PARAMETERS FOUND")
print("=" * 50)
print(f"Best Score: {grid_search.best_score_:.4f}")
print("Best Parameters:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

In [None]:
# Held-out comparison: random forest with default settings vs. the estimator
# selected by the grid search.
default_rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
tuned_rf = grid_search.best_estimator_

default_score = default_rf.score(X_test, y_test)
tuned_score = tuned_rf.score(X_test, y_test)
accuracy_gain = tuned_score - default_score

print("\n DEFAULT vs TUNED")
print("=" * 40)
print(f"Default RF Accuracy: {default_score:.4f}")
print(f"Tuned RF Accuracy:   {tuned_score:.4f}")
print(f"Improvement:         {accuracy_gain*100:+.2f}%")

In [None]:
# Visualize grid search results: mean CV accuracy for every
# (max_depth, n_estimators) pair, averaged over min_samples_split.
results = pd.DataFrame(grid_search.cv_results_)

# `max_depth=None` is a missing value to pandas, and pivot_table drops rows
# whose index key is missing - the unlimited-depth row would vanish from the
# heatmap. Pivot on string labels so every searched depth is shown.
depth_labels = results['param_max_depth'].map(
    lambda depth: 'None' if pd.isna(depth) else f'{depth:g}'
)

# Pivot for heatmap
pivot = results.assign(max_depth_label=depth_labels).pivot_table(
    values='mean_test_score',
    index='max_depth_label',
    columns='param_n_estimators',
    aggfunc='mean'
)
# String labels sort lexicographically ('10' before '3'); put the rows back
# in the order the depths first appear in the search results.
pivot = pivot.reindex(depth_labels.unique())

plt.figure(figsize=(8, 6))
sns.heatmap(pivot, annot=True, fmt='.3f', cmap='RdYlGn', center=pivot.values.mean())
plt.title('Grid Search Results (Accuracy)', fontweight='bold')
plt.xlabel('n_estimators')
plt.ylabel('max_depth')
plt.show()

---
## Mini Project: Optimize Titanic Model

**Goal:** Improve the Day 7 Titanic survival model using hyperparameter tuning.

In [None]:
# Generate a synthetic Titanic-style passenger table. The random draws are
# made in a fixed order after seeding, so the table is identical on every run.
np.random.seed(42)
n = 800

pclass = np.random.choice([1, 2, 3], size=n, p=[0.25, 0.25, 0.50])
sex = np.random.choice([0, 1], size=n, p=[0.35, 0.65])  # 0=female, 1=male
age = np.clip(np.random.normal(30, 15, n), 1, 80)

# Draw one candidate fare per passenger for each class (first, second, third),
# then keep the draw that matches the passenger's actual class.
first_class_fare = np.random.normal(80, 30, n)
second_class_fare = np.random.normal(30, 15, n)
third_class_fare = np.random.normal(15, 10, n)
fare = third_class_fare.copy()
is_second = pclass == 2
is_first = pclass == 1
fare[is_second] = second_class_fare[is_second]
fare[is_first] = first_class_fare[is_first]
fare = fare.clip(5, 200)

sibsp = np.random.choice([0, 1, 2, 3], size=n, p=[0.6, 0.25, 0.1, 0.05])

# Survival probability = base rate + sex adjustment + class adjustment
sex_effect = np.where(sex == 0, 0.35, -0.1)
class_effect = np.where(pclass == 1, 0.25, np.where(pclass == 2, 0.1, -0.15))
survival_prob = 0.3 + sex_effect + class_effect
survived = (np.random.random(n) < survival_prob).astype(int)

titanic = pd.DataFrame({
    'Pclass': pclass,
    'Sex': sex,
    'Age': np.round(age).astype(int),
    'Fare': fare.round(2),
    'SibSp': sibsp,
    'Survived': survived,
})

survival_rate = titanic['Survived'].mean() * 100
print(" TITANIC DATASET")
print(f"Samples: {len(titanic)} | Survived: {survival_rate:.1f}%")

In [None]:
# Split the passenger table into features/target and train/test sets, and
# build standardized copies of the feature matrices.
feature_columns = ['Pclass', 'Sex', 'Age', 'Fare', 'SibSp']
X = titanic[feature_columns]
y = titanic['Survived']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Train: {len(X_train)} | Test: {len(X_test)}")

In [None]:
# Fit each model with default settings and record its test-set accuracy as
# the baseline to beat.
print(" BASELINE MODELS (Default Parameters)")
print("=" * 50)

baseline_models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
}

baseline_scores = {}
for name, model in baseline_models.items():
    # Logistic regression is trained on the standardized features; the
    # tree-based models use the raw ones.
    use_scaled = 'Logistic' in name
    train_features = X_train_scaled if use_scaled else X_train
    test_features = X_test_scaled if use_scaled else X_test

    model.fit(train_features, y_train)
    score = model.score(test_features, y_test)
    baseline_scores[name] = score
    print(f"{name:25}: {score:.4f}")

In [None]:
# Grid-search the random forest with 5-fold CV on the training split.
# Candidates are ranked by F1 score.
print("\n TUNING RANDOM FOREST...")
print("=" * 50)

rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

print(f"Best CV Score: {rf_grid.best_score_:.4f}")
print(f"Best Params: {rf_grid.best_params_}")

In [None]:
# Grid-search the decision tree with 5-fold CV on the training split.
# Candidates are ranked by F1 score.
print("\n TUNING DECISION TREE...")
print("=" * 50)

dt_params = dict(
    max_depth=[3, 5, 7, 10, 15],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
)

dt_grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=dt_params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
dt_grid.fit(X_train, y_train)

print(f"Best CV Score: {dt_grid.best_score_:.4f}")
print(f"Best Params: {dt_grid.best_params_}")

In [None]:
# Compare test-set accuracy of the default and tuned tree-based models.
print("\n FINAL COMPARISON: BASELINE vs TUNED")
print("="*60)

# Predictions from the best estimator found by each grid search
y_pred_rf = rf_grid.best_estimator_.predict(X_test)
y_pred_dt = dt_grid.best_estimator_.predict(X_test)

results = {
    'Decision Tree (Default)': baseline_scores['Decision Tree'],
    'Decision Tree (Tuned)': accuracy_score(y_test, y_pred_dt),
    'Random Forest (Default)': baseline_scores['Random Forest'],
    'Random Forest (Tuned)': accuracy_score(y_test, y_pred_rf),
}

print(f"\n{'Model':<30} {'Accuracy':>10}")
print("-"*42)
for model, score in results.items():
    # Flag tuned rows with '*'. The marker must be exactly one character:
    # marker + space + 28-wide name matches the 30-wide 'Model' header.
    # (Both branches used to be the empty string, which flagged nothing and
    # shifted every row one column left of the header.)
    marker = '*' if 'Tuned' in model else ' '
    print(f"{marker} {model:<28} {score:>10.4f}")

# Improvement of each tuned model over its default counterpart
rf_improvement = results['Random Forest (Tuned)'] - results['Random Forest (Default)']
dt_improvement = results['Decision Tree (Tuned)'] - results['Decision Tree (Default)']
print(f"\nRandom Forest Improvement: {rf_improvement*100:+.2f}%")
print(f"Decision Tree Improvement: {dt_improvement*100:+.2f}%")

In [None]:
# Final evaluation of the tuned random forest: confusion matrix, feature
# importances and a per-class classification report on the test split.
best_model = rf_grid.best_estimator_
y_pred_final = best_model.predict(X_test)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_final)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0],
            xticklabels=['Died', 'Survived'], yticklabels=['Died', 'Survived'])
axes[0].set_title('Tuned Random Forest - Confusion Matrix', fontweight='bold')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')

# Feature Importance, sorted ascending so the largest bar is drawn at the top
importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_model.feature_importances_
}).sort_values('Importance', ascending=True)

# One colour per bar: size the palette from the number of features instead of
# a hard-coded 5, so the plot still works when the feature set changes.
bar_colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(importance)))
axes[1].barh(importance['Feature'], importance['Importance'], color=bar_colors)
axes[1].set_title('Feature Importance (Tuned RF)', fontweight='bold')

plt.tight_layout()
plt.show()

print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred_final, target_names=['Died', 'Survived']))

In [None]:
# Closing recap of the day's key points.
# The banner (rule, title, rule) is emitted by one call.
print("\n" + "=" * 60, " DAY 9 COMPLETE!", "=" * 60, sep="\n")
print("""
 KEY TAKEAWAYS:

 1. EVALUATION METRICS
    - Accuracy: Overall correctness
    - Precision: Quality of positive predictions
    - Recall: Coverage of actual positives
    - F1: Harmonic mean of P and R

 2. CROSS-VALIDATION
    - More reliable than single split
    - K-Fold: Train K times, average results
    - Detects overfitting

 3. HYPERPARAMETER TUNING
    - GridSearchCV: Try all combinations
    - Automatically uses cross-validation
    - Always tune with CV, evaluate on test set

 4. BEST PRACTICES
    - Use CV for model selection
    - Tune on validation, test on holdout
    - Choose metrics based on problem
""")
print("=" * 60, " Next: Day 10 - Capstone Project!", sep="\n")

---
## Practice Exercises

1. Try `RandomizedSearchCV` for faster tuning
2. Tune Logistic Regression (C, penalty)
3. Use `StratifiedKFold` for imbalanced data
4. Plot ROC curves for model comparison

---
**Next Up:** Day 10 - Capstone Project!