# Machine Learning Boilerplate Workflow (Decision Tree Example)

## 1. Define Problem
- Goal: Predict if a tumor is malignant (0) or benign (1) based on clinical features.
- Decision Trees are intuitive "if-else" models but prone to overfitting.
- Ensembles (Random Forest, Boosting) help improve generalization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
confusion_matrix, classification_report, roc_auc_score, roc_curve
)

# Models
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import (
RandomForestClassifier, ExtraTreesClassifier,
AdaBoostClassifier, GradientBoostingClassifier,
StackingClassifier
)
from sklearn.linear_model import LogisticRegression


# Advanced libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

## 2. EDA

In [None]:
# Load the scikit-learn breast cancer dataset and wrap it in pandas objects
# so the feature names travel with the data.
data = load_breast_cancer()
X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.Series(data["target"])

# Quick sanity check: feature-matrix shape and the share of each class.
print("Dataset shape:", X.shape)
print("Target distribution:\n", y.value_counts(normalize=True))

In [None]:
# Summary statistics, one row per feature (first five features only).
print(X.describe().transpose().head())

# Class balance of the target.
sns.countplot(x=y).set_title("Target Distribution: 0=Malignant, 1=Benign")
plt.show()

# Quick correlation heatmap (to see redundancy)
sns.heatmap(X.corr(), cmap="coolwarm", cbar=False).set_title("Feature Correlations")
plt.show()

## 3. Data Preparation

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

## 4.1 Baseline Decision Tree

In [None]:
print("\n--- Decision Tree ---")
# Tree with no depth limit set, scored on the held-out test split.
dt = DecisionTreeClassifier(criterion="gini", random_state=42)
dt.fit(X_train, y_train)
print("Accuracy:", dt.score(X_test, y_test))


# Visualize tree (small depth for clarity)
small_dt = DecisionTreeClassifier(max_depth=3, random_state=42)
small_dt.fit(X_train, y_train)
plt.figure(figsize=(12,6))
# Pass plain Python lists for the names: some scikit-learn releases validate
# these two arguments strictly and reject a pandas Index or a NumPy array.
plot_tree(
    small_dt,
    feature_names=X.columns.tolist(),
    class_names=data.target_names.tolist(),
    filled=True,
)
plt.title("Decision Tree (max_depth=3)")
plt.show()

## 4.2 Random Forest

In [None]:
print("\n--- Random Forest ---")
# A forest of 100 trees; fit() returns the estimator, so build and train in one step.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
print("Accuracy:", rf.score(X_test, y_test))

## 4.3 Extra Trees

In [None]:
print("\n--- Extra Trees ---")
# Same ensemble size and seed as the random forest cell, for a like-for-like comparison.
et = ExtraTreesClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
print("Accuracy:", et.score(X_test, y_test))

## 4.4 AdaBoost

In [None]:
print("\n--- AdaBoost ---")
# 100 boosting rounds with a learning rate of 0.5.
ab = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=0.5,
    random_state=42,
).fit(X_train, y_train)
print("Accuracy:", ab.score(X_test, y_test))

## 4.5 Gradient Boosting

In [None]:
print("\n--- Gradient Boosting ---")
# 200 depth-3 trees combined with a small learning rate (0.05).
gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)
print("Accuracy:", gb.score(X_test, y_test))

## 4.6 XGBoost

In [None]:
print("\n--- XGBoost ---")
# Same tree count, depth and learning rate as the gradient boosting cell,
# plus 80% row and column subsampling per tree.
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42,
)
xgb.fit(X_train, y_train)
print("Accuracy:", xgb.score(X_test, y_test))

## 4.7 LightGBM

In [None]:
print("\n--- LightGBM ---")
# 200 boosting rounds at learning rate 0.05; all other settings are library defaults.
lgbm = LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    random_state=42,
)
lgbm.fit(X_train, y_train)
print("Accuracy:", lgbm.score(X_test, y_test))

## 4.8 CatBoost

In [None]:
print("\n--- CatBoost ---")
# verbose=0 silences CatBoost's per-iteration training log.
cat = CatBoostClassifier(
    iterations=200,
    learning_rate=0.05,
    depth=6,
    verbose=0,
    random_state=42,
)
cat.fit(X_train, y_train)
print("Accuracy:", cat.score(X_test, y_test))

## 4.9 Stacking (Ensemble of Ensembles) 

In [None]:
print("\n--- Stacking ---")
# Two base learners (random forest and XGBoost) whose predictions feed a
# logistic-regression meta-learner.
stack = StackingClassifier(
    final_estimator=LogisticRegression(max_iter=1000),
    estimators=[
        ("rf", RandomForestClassifier(n_estimators=100, random_state=42)),
        ("xgb", XGBClassifier(eval_metric="logloss", random_state=42)),
    ],
).fit(X_train, y_train)
print("Accuracy:", stack.score(X_test, y_test))

## 4.10 All Models

In [None]:
# Registry of unfitted estimators keyed by display name. Insertion order is
# the order in which they are later trained, reported and plotted.
models = {
    "Decision Tree": DecisionTreeClassifier(criterion="gini", random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostClassifier(
        n_estimators=100,
        learning_rate=0.5,
        random_state=42,
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3,
        random_state=42,
    ),
    "XGBoost": XGBClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42,
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=200,
        learning_rate=0.05,
        random_state=42,
    ),
    "CatBoost": CatBoostClassifier(
        iterations=200,
        learning_rate=0.05,
        depth=6,
        verbose=0,
        random_state=42,
    ),
    "Stacking": StackingClassifier(
        estimators=[
            ("rf", RandomForestClassifier(n_estimators=100, random_state=42)),
            ("xgb", XGBClassifier(eval_metric="logloss", random_state=42)),
        ],
        final_estimator=LogisticRegression(max_iter=1000),
    ),
}

## 5. Evaluate Model

In [None]:
# Train every estimator in `models`, print its test accuracy, and keep the
# fitted object under the same display name for the evaluation cells.
fitted_models = {}
for label, estimator in models.items():
    print(f"\n--- {label} ---")
    estimator.fit(X_train, y_train)
    test_accuracy = estimator.score(X_test, y_test)
    print(f"Accuracy: {test_accuracy:.4f}")
    # Stored only after scoring succeeded.
    fitted_models[label] = estimator

In [None]:
# Model Evaluation (Confusion Matrix, Report, ROC for all models)

# Confusion matrix and classification report for one representative model.
# This is drawn on its own figure and shown BEFORE the ROC figure is created:
# drawing it inside the ROC loop would put the heatmap on the ROC axes, and the
# plt.show() call would split the ROC curves across two figures.
cm_model_name = "Random Forest"
if cm_model_name in fitted_models:
    cm_pred = fitted_models[cm_model_name].predict(X_test)
    cm = confusion_matrix(y_test, cm_pred)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title("Confusion Matrix (Random Forest)")
    plt.show()
    print(f"Classification Report ({cm_model_name}):\n", classification_report(y_test, cm_pred))

# ROC curves: one line per fitted model, all on the same figure.
# NOTE: the axis labels, diagonal and legend are added by the next cell. With
# a notebook backend that renders figures at the end of each cell, those
# calls only reach this figure if both cells are run as one.
plt.figure(figsize=(8,6))
for name, model in fitted_models.items():
    # Column 1 of predict_proba is the predicted probability of class 1.
    y_probs = model.predict_proba(X_test)[:,1]
    fpr, tpr, _ = roc_curve(y_test, y_probs)
    auc = roc_auc_score(y_test, y_probs)
    plt.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")

In [None]:
# Finish the ROC plot on the current axes: chance diagonal, labels, legend.
plt.plot([0, 1], [0, 1], color="k", linestyle="--")
plt.gca().set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curves Across Models",
)
plt.legend()
plt.show()

In [None]:
# --- Feature Importances (example: Random Forest) ---
# Label each importance with its feature name, then chart the ten largest.
importances = pd.Series(
    fitted_models["Random Forest"].feature_importances_,
    index=X.columns,
)
importances.sort_values(ascending=False).iloc[:10].plot.barh()
plt.title("Top 10 Feature Importances (Random Forest)")
plt.show()

## 6. Overfitting & Pruning
- Trees can perfectly memorize data (overfit).
- Solution: Limit depth, min_samples, or use Cost Complexity Pruning (ccp_alpha).


In [None]:
# Candidate pruning strengths for the decision tree, computed on the training data.
path = dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas

# Refit one tree per alpha and record its accuracy on both splits.
train_scores = []
test_scores = []
for alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=42, ccp_alpha=alpha).fit(X_train, y_train)
    train_scores.append(clf.score(X_train, y_train))
    test_scores.append(clf.score(X_test, y_test))

# Accuracy versus pruning strength, train and test on the same axes.
plt.plot(ccp_alphas, train_scores, marker="o", label="Train")
plt.plot(ccp_alphas, test_scores, marker="o", label="Test")
plt.gca().set(
    xlabel="Alpha (Pruning Strength)",
    ylabel="Accuracy",
    title="Cost-Complexity Pruning Curve",
)
plt.legend()
plt.show()

## 7. Random Forest (Bagging)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree forest and report accuracy on the held-out split.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

print("Random Forest Test Accuracy:", rf.score(X_test, y_test))

# Feature importance: chart the ten features with the largest importances.
feat_importance = pd.Series(rf.feature_importances_, index=X.columns)
feat_importance.nlargest(10).plot.barh()
plt.title("Top 10 Feature Importances (Random Forest)")
plt.show()

## 8. Boosting (AdaBoost & Gradient Boosting)

In [None]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

# AdaBoost with 100 estimators; every other setting is the library default.
ada = AdaBoostClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
print("AdaBoost Test Accuracy:", ada.score(X_test, y_test))

# Gradient boosting with 100 estimators. This rebinds `gb`, replacing the
# model fitted in the earlier Gradient Boosting cell.
gb = GradientBoostingClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
print("Gradient Boosting Test Accuracy:", gb.score(X_test, y_test))

## 9. Bias-Variance Tradeoff (Learning Curve)

In [None]:
from sklearn.model_selection import learning_curve

# 5-fold cross-validated accuracy of the decision tree at increasing
# training-set sizes, computed on the training split only.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=dt,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=5,
)
# Average over the folds (axis 1) to get one value per training size.
train_mean = train_scores.mean(axis=1)
test_mean = test_scores.mean(axis=1)

plt.plot(train_sizes, train_mean, label="Training")
plt.plot(train_sizes, test_mean, label="Validation")
plt.gca().set(
    xlabel="Training Examples",
    ylabel="Accuracy",
    title="Decision Tree Bias-Variance Tradeoff",
)
plt.legend()
plt.show()

## 10. Save Model

In [None]:
# The import cell at the top of the notebook does not bring in joblib, so
# import it here; without this the dump call fails with a NameError.
import joblib

# Persist the fitted random forest to disk. The file is pickle-based, so it
# should only ever be loaded back from a trusted source.
joblib.dump(fitted_models["Random Forest"], "random_forest_breast_cancer.pkl")
print("Random Forest model saved!")