# Breast Cancer Classification – XGBoost vs Random Forest (Grid Search, Regularization)

This notebook trains **XGBoost** and **Random Forest** models to classify breast tumors as **malignant (M)** or **benign (B)** using features computed from digitized FNA images.

**Assumptions**
- Target: `diagnosis` (`B`/`M`).
- Unique ID column: `id` (dropped).
- All other features are numeric; no missing values.

We tune both models with **GridSearchCV**. Regularization (a true L2 penalty, or its closest structural equivalent) is applied via:
- **XGBoost**: `reg_lambda` (true L2 penalty).
- **Random Forest**: structural controls and pruning (`ccp_alpha`, `min_samples_leaf`, `min_samples_split`, `max_features`, optional `max_depth`), which act as regularization.

## 0. Setup & Configuration

In [None]:
# Paths & column names — update as needed
DATA_PATH = 'your_data.csv'   # <-- change to your CSV path if different
# Identifier column; it is dropped before modelling.
ID_COL = 'id'
TARGET_COL = 'diagnosis'      # 'B' or 'M'

# Seed shared by the train/test split, the CV splitters and both estimators.
RANDOM_STATE = 42
# Fraction of rows held out as the test set.
TEST_SIZE = 0.2
# Passed as `n_jobs` to GridSearchCV / cross_validate (-1 = use all processors).
N_JOBS = -1

# Primary scoring for model selection
PRIMARY_SCORING = 'f1'  # You can switch to 'recall' if that's your priority
# Number of stratified folds used by every cross-validation in this notebook.
CV_FOLDS = 5

# Libraries
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.class_weight import compute_class_weight

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt

# Never truncate columns when a DataFrame is displayed (None = no limit).
pd.set_option('display.max_columns', None)

## 1. Load Data

In [None]:
# Read the raw dataset; DATA_PATH is set in the configuration cell.
df = pd.read_csv(DATA_PATH)

# Basic checks. Raise explicitly instead of using `assert`: assertions are
# stripped when Python runs with -O, which would let a malformed file through
# and fail later with a much less helpful error.
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column {TARGET_COL!r} not found")
if ID_COL not in df.columns:
    raise ValueError(f"ID column {ID_COL!r} not found")

print(f"Loaded shape: {df.shape}")
# Last expression of the cell: renders a preview of the first rows in a notebook.
df.head()

## 2. Preprocess

In [None]:
# Drop ID
df = df.drop(columns=[ID_COL])

# Encode target: B -> 0 (benign), M -> 1 (malignant)
LABEL_TO_INT = {'B': 0, 'M': 1}
encoded_target = df[TARGET_COL].map(LABEL_TO_INT)

# Any label outside the mapping (typo, different casing, stray whitespace,
# missing value) becomes NaN. Report the offending values here instead of
# letting `astype(int)` fail with a generic "cannot convert NA" error.
unmapped = encoded_target.isna()
if unmapped.any():
    bad_labels = sorted(df.loc[unmapped, TARGET_COL].astype(str).unique())
    raise ValueError(
        f"Unexpected values in target column {TARGET_COL!r}: {bad_labels}; "
        f"expected only {sorted(LABEL_TO_INT)}"
    )
df[TARGET_COL] = encoded_target.astype(int)

# Split features/target
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Stratified split: keeps the B/M proportions the same in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)
print("Train label balance:", y_train.value_counts(normalize=True).sort_index())
print("Test  label balance:", y_test.value_counts(normalize=True).sort_index())

## 3. Helper Functions

In [None]:
def compute_scale_pos_weight(y):
    """Return the negative-to-positive class ratio for XGBoost's ``scale_pos_weight``.

    Args:
        y: Binary labels where 0 is the negative class and 1 the positive
            class. Any array-like is accepted (pandas Series, NumPy array,
            list, tuple).

    Returns:
        ``count(y == 0) / count(y == 1)`` as a float, or ``1.0`` when ``y``
        contains no positive samples (avoids a division by zero).
    """
    # Convert first so the comparisons below are element-wise even for plain
    # Python sequences, where `y == 1` would be a single bool.
    labels = np.asarray(y)
    pos = int((labels == 1).sum())
    neg = int((labels == 0).sum())
    return neg / pos if pos > 0 else 1.0

def evaluate_on_test(y_true, y_pred, label):
    """Score hard predictions against the true labels, print a report, return it.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        label: Model name shown in the printed header and stored under
            the ``'model'`` key of the result.

    Returns:
        dict with keys ``model``, ``precision``, ``recall``, ``accuracy``
        and ``f1`` (in that order).
    """
    # (result key, printed heading, scorer), listed in report order.
    report_spec = (
        ('precision', 'Precision:', precision_score),
        ('recall', 'Recall:', recall_score),
        ('accuracy', 'Accuracy:', accuracy_score),
        ('f1', 'F1 Score:', f1_score),
    )

    # Compute every metric up front so nothing is printed if a scorer fails.
    results = {'model': label}
    for key, _heading, scorer in report_spec:
        results[key] = scorer(y_true, y_pred)

    print(f"\n=== {label} – Test Metrics ===")
    for key, heading, _scorer in report_spec:
        # Headings are padded to a common width so the values line up.
        print(f"{heading:<10} {results[key]:.4f}")
    return results

def cross_validated_metrics(estimator, X, y, cv_folds=5):
    """Return mean precision, recall, accuracy and F1 over stratified CV folds.

    Args:
        estimator: Estimator handed to ``cross_validate``.
        X: Feature matrix.
        y: Target labels.
        cv_folds: Number of stratified, shuffled folds (seeded with
            ``RANDOM_STATE``).

    Returns:
        dict mapping each metric name (without the ``test_`` prefix) to the
        mean of its per-fold test scores, as a plain ``float``.
    """
    metric_names = ('precision', 'recall', 'accuracy', 'f1')
    splitter = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=RANDOM_STATE)
    fold_scores = cross_validate(
        estimator,
        X,
        y,
        scoring={name: name for name in metric_names},
        cv=splitter,
        n_jobs=N_JOBS,
        return_train_score=False,
    )

    averaged = {}
    for key, values in fold_scores.items():
        if not key.startswith('test_'):
            continue  # only the per-fold test scores are summarised
        averaged[key.replace('test_', '')] = float(np.mean(values))
    return averaged

def plot_conf_mat(y_true, y_pred, title):
    """Draw the confusion matrix of ``y_pred`` vs ``y_true`` on a new figure.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        title: Title placed above the plot.
    """
    _, axis = plt.subplots()
    matrix_display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_true, y_pred)
    )
    # Default colormap on purpose; only the colorbar is switched off.
    matrix_display.plot(ax=axis, colorbar=False)
    axis.set_title(title)
    plt.show()

## 4. XGBoost – Grid Search with L2 Regularization

In [None]:
# Class-imbalance weight (negatives / positives), computed on the training
# split only so that no test-set information leaks into the model.
scale_pos_weight = compute_scale_pos_weight(y_train)
print("Computed scale_pos_weight (train):", round(scale_pos_weight, 3))

# Base estimator. `use_label_encoder` is intentionally not passed: it is
# deprecated in recent XGBoost releases and no longer recognised by XGBoost
# 2.x, where it produces an "unused parameter" warning on every single fit of
# the grid search. The target is already encoded as 0/1, so no label encoding
# is needed.
xgb = XGBClassifier(
    random_state=RANDOM_STATE,
    eval_metric='logloss',
    tree_method='hist',
    # We'll set scale_pos_weight inside the grid as a fixed value
)

# 3*3*3*3*3*2*2 = 972 candidate settings; `reg_lambda` is the L2 penalty.
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'reg_lambda': [0.1, 1, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [scale_pos_weight],  # single value: fixed, not tuned
}

# Shared stratified, shuffled splitter, seeded for reproducibility.
cv = StratifiedKFold(random_state=RANDOM_STATE, shuffle=True, n_splits=CV_FOLDS)

# Exhaustive search over `param_grid_xgb`, ranked by PRIMARY_SCORING.
# `fit` returns the fitted search object itself, so it can be chained.
grid_xgb = GridSearchCV(
    xgb,
    param_grid_xgb,
    cv=cv,
    scoring=PRIMARY_SCORING,
    verbose=1,
    n_jobs=N_JOBS,
).fit(X_train, y_train)

# Best candidate from the search.
best_xgb = grid_xgb.best_estimator_
print("\nBest XGBoost Params:", grid_xgb.best_params_)
print("Best CV Score (primary):", grid_xgb.best_score_)

# Full metric set for the selected configuration, cross-validated on the
# training split (the same data the grid search used for selection).
xgb_cv_metrics = cross_validated_metrics(
    estimator=best_xgb, X=X_train, y=y_train, cv_folds=CV_FOLDS
)
print("\nXGBoost CV Metrics (mean over folds):", xgb_cv_metrics)

# Held-out test performance: report first, confusion matrix second.
y_pred_xgb = best_xgb.predict(X_test)
xgb_test_metrics = evaluate_on_test(y_true=y_test, y_pred=y_pred_xgb, label="XGBoost")
plot_conf_mat(y_true=y_test, y_pred=y_pred_xgb, title="XGBoost – Confusion Matrix (Test)")

## 5. Random Forest – Grid Search with Regularization & Pruning

In [None]:
# Search space: 3*3*3*3*2*3*2 = 972 candidate settings.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    ccp_alpha=[0.0, 0.001, 0.01],        # cost-complexity pruning strength
    class_weight=[None, 'balanced'],     # optional re-weighting for imbalance
)

# Base estimator; everything except the seed comes from the grid above.
rf = RandomForestClassifier(random_state=RANDOM_STATE)

# Exhaustive search over `param_grid_rf`, reusing the `cv` splitter created
# for the XGBoost search so both models are compared on identical folds.
# `fit` returns the fitted search object itself, so it can be chained.
grid_rf = GridSearchCV(
    rf,
    param_grid_rf,
    cv=cv,
    scoring=PRIMARY_SCORING,
    verbose=1,
    n_jobs=N_JOBS,
).fit(X_train, y_train)

# Best candidate from the search.
best_rf = grid_rf.best_estimator_
print("\nBest RF Params:", grid_rf.best_params_)
print("Best CV Score (primary):", grid_rf.best_score_)

# Full metric set for the selected configuration, cross-validated on the
# training split (the same data the grid search used for selection).
rf_cv_metrics = cross_validated_metrics(
    estimator=best_rf, X=X_train, y=y_train, cv_folds=CV_FOLDS
)
print("\nRandom Forest CV Metrics (mean over folds):", rf_cv_metrics)

# Held-out test performance: report first, confusion matrix second.
y_pred_rf = best_rf.predict(X_test)
rf_test_metrics = evaluate_on_test(y_true=y_test, y_pred=y_pred_rf, label="Random Forest")
plot_conf_mat(y_true=y_test, y_pred=y_pred_rf, title="Random Forest – Confusion Matrix (Test)")

## 6. Side-by-Side Comparison & Save Results

In [None]:
# One row per model, indexed by model name, columns = test metrics.
comparison = pd.DataFrame([xgb_test_metrics, rf_test_metrics]).set_index('model')
print("\n=== Test Set Comparison ===")
try:
    # Rich table rendering when running under IPython/Jupyter.
    display(comparison)
except NameError:
    # `display` is only injected by IPython; fall back to plain text so the
    # code also runs as a regular Python script.
    print(comparison)

# Save for GitHub artifact
out_path = Path("model_comparison.csv")
comparison.to_csv(out_path, index=True)
print(f"Saved comparison to: {out_path.resolve()}")

## 7. Notes & Tips

- If **recall** on malignant cases is your main priority, consider setting `PRIMARY_SCORING = 'recall'` or using a custom scorer that emphasizes recall.
- You may expand the grids (e.g., more depths, learning rates) for a more exhaustive search, at the cost of runtime.
- For very small datasets, prefer reporting **cross-validated** metrics in addition to a single train/test split.
- You can export a fitted model for later reuse with `joblib`, e.g. `import joblib; joblib.dump(best_xgb, 'model.pkl')` (use `best_rf` for the Random Forest).