# Decision Trees in scikit-learn

This notebook focuses on **implementation** (scikit-learn) for:
- classification trees
- model evaluation (train/test vs cross-validation)
- tuning `max_depth`
- bagging and random forests
- visualizing decision boundaries and trees
- a short regression-tree example


## Setup

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.datasets import load_diabetes

# Single seed passed as `random_state` to the estimators, CV splitters and the
# train/test split in this notebook, so reruns give the same numbers.
RANDOM_STATE = 2025
# Also seed NumPy's legacy global RNG for anything that draws from `np.random` directly.
np.random.seed(RANDOM_STATE)


def fit_tabular_preprocessor(X_train: pd.DataFrame):
    """Learn imputation values and the one-hot column layout from training data.

    Everything is plain pandas; no sklearn preprocessing utilities are used.
    Columns picked up by ``select_dtypes(include=[np.number])`` are treated as
    numeric; every other column is treated as categorical.

    Parameters
    ----------
    X_train : pd.DataFrame
        Feature frame to learn from (training rows only).

    Returns
    -------
    dict
        - ``num_cols`` / ``cat_cols``: column names in each group
        - ``num_medians``: numeric column -> median (as ``float``)
        - ``cat_modes``: categorical column -> first entry of ``Series.mode``,
          or ``""`` when the column has no non-missing value
        - ``feature_names``: ordered columns of the imputed, one-hot encoded
          training frame
    """
    numeric = X_train.select_dtypes(include=[np.number]).columns.tolist()
    categorical = [col for col in X_train.columns if col not in numeric]

    medians = {col: float(X_train[col].median()) for col in numeric}

    modes = {}
    for col in categorical:
        candidates = X_train[col].mode(dropna=True)
        modes[col] = "" if len(candidates) == 0 else candidates.iloc[0]

    # Impute a copy first so the dummy columns are derived from the filled
    # categories rather than from the raw frame with missing values.
    imputed = X_train.copy()
    for fill_values in (medians, modes):
        for col, value in fill_values.items():
            imputed[col] = imputed[col].fillna(value)

    encoded = pd.get_dummies(imputed, columns=categorical, drop_first=False)

    return dict(
        num_cols=numeric,
        cat_cols=categorical,
        num_medians=medians,
        cat_modes=modes,
        feature_names=encoded.columns.tolist(),
    )


def transform_tabular(X: pd.DataFrame, preproc) -> np.ndarray:
    """Apply a fitted preprocessor and return a float design matrix.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame to transform. It is copied, never modified.
    preproc : dict
        Output of ``fit_tabular_preprocessor``; the keys read here are
        ``num_medians``, ``cat_modes``, ``cat_cols`` and ``feature_names``.

    Returns
    -------
    np.ndarray
        2-D ``float64`` array whose columns follow ``preproc["feature_names"]``.
        Dummy columns that do not occur in ``X`` (a category seen only at fit
        time, or a column absent from ``X``) are filled with 0; columns of
        ``X`` that are not in ``feature_names`` are dropped.
    """
    X_filled = X.copy()
    for c, med in preproc["num_medians"].items():
        if c in X_filled.columns:
            X_filled[c] = X_filled[c].fillna(med)
    for c, mode in preproc["cat_modes"].items():
        if c in X_filled.columns:
            X_filled[c] = X_filled[c].fillna(mode)

    # Encode only the categorical columns that are actually present, matching
    # the tolerance of the imputation loops above; asking get_dummies for a
    # missing column would raise KeyError.
    present_cat_cols = [c for c in preproc["cat_cols"] if c in X_filled.columns]
    X_dum = pd.get_dummies(X_filled, columns=present_cat_cols, drop_first=False)
    X_dum = X_dum.reindex(columns=preproc["feature_names"], fill_value=0)

    # get_dummies may emit bool columns; mixed with numeric columns, `.values`
    # would then be an object array. Force one numeric dtype explicitly.
    return X_dum.to_numpy(dtype=float)


def cv_accuracy(model_ctor, X: pd.DataFrame, y: pd.Series, cv: StratifiedKFold) -> np.ndarray:
    """Cross-validated accuracy with preprocessing refit inside every fold.

    Parameters
    ----------
    model_ctor : callable
        Zero-argument factory; called once per fold to get a fresh model
        exposing ``fit`` and ``predict``.
    X, y : pd.DataFrame, pd.Series
        Raw features and target, indexed positionally via ``.iloc``.
    cv : splitter
        Object whose ``split(X, y)`` yields ``(train_idx, test_idx)`` pairs.

    Returns
    -------
    np.ndarray
        One accuracy value per fold, in split order.
    """

    def _score_fold(fit_rows, eval_rows):
        # The preprocessor only ever sees this fold's training rows, so the
        # held-out rows cannot influence the medians, modes or dummy columns.
        X_fit, X_eval = X.iloc[fit_rows], X.iloc[eval_rows]
        y_fit, y_eval = y.iloc[fit_rows], y.iloc[eval_rows]

        fold_preproc = fit_tabular_preprocessor(X_fit)
        M_fit = transform_tabular(X_fit, fold_preproc)
        M_eval = transform_tabular(X_eval, fold_preproc)

        fold_model = model_ctor()
        fold_model.fit(M_fit, y_fit)
        return accuracy_score(y_eval, fold_model.predict(M_eval))

    return np.array([_score_fold(tr, te) for tr, te in cv.split(X, y)])


## 1) Prepare features and target (classification)
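This section expects a pandas DataFrame named `df` with an `Attrition` column to already be loaded in memory; the first cell below verifies this. We map the target `Attrition` to 0/1 (`"No"` → 0, `"Yes"` → 1), then use a small helper preprocessor that: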
- fills missing numeric values with the median
- fills missing categorical values with the mode
- one-hot encodes categoricals with `pandas.get_dummies`

(We avoid `SimpleImputer`, `Pipeline`, and `ColumnTransformer`.)


In [None]:
# --- verify input dataframe is present ---
# `df` is not created anywhere in this notebook; it must already exist in the
# kernel. Checking globals() first gives a clear message instead of a NameError
# on the lines below.
assert "df" in globals(), "Expected a pandas DataFrame named `df` already loaded in memory."

# --- basic checks ---
# The rest of the notebook relies on DataFrame methods and on the `Attrition` target column.
assert isinstance(df, pd.DataFrame), "`df` must be a pandas DataFrame."
assert "Attrition" in df.columns, "Expected target column `Attrition` in df."

# Preview the first rows.
df.head()


In [None]:
# --- split X / y ---
X = df.drop(columns=["Attrition"]).copy()
y_raw = df["Attrition"].copy()

# Map the target to 0/1.
# Dispatch on "is the column numeric?" rather than `dtype == "O"`, so labels
# stored with pandas `string` or `category` dtype also take the "No"/"Yes"
# mapping instead of crashing in `astype(int)`.
if pd.api.types.is_numeric_dtype(y_raw):
    y = y_raw.astype(int)
else:
    y = y_raw.map({"No": 0, "Yes": 1})

# Labels outside the mapping become NaN and are rejected here.
assert set(pd.unique(y)).issubset({0, 1}), "Target must be binary after mapping."

# Normalise the dtype (a mapped `category` column would otherwise stay categorical).
y = y.astype(int)

# reference: column types (reused later when choosing two numeric features to plot)
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

print(f"Rows: {len(X):,}  |  Features: {X.shape[1]:,}  |  Numeric: {len(num_cols)}  |  Categorical: {len(cat_cols)}")
X.head()


## 2) Accuracy: train/test split vs cross-validation
We compare a single train/test split to **10-fold cross-validation**. For the remainder of the notebook we report **cross-validated accuracy**.


In [None]:
# --- shared CV splitter: every later classification section reuses `cv` ---
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

# --- single hold-out estimate (stratified 75/25 split) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y
)

# Fit the preprocessor on the training part only, then apply it to both parts.
preproc_tt = fit_tabular_preprocessor(X_train)
X_train_m, X_test_m = (transform_tabular(part, preproc_tt) for part in (X_train, X_test))

tree = DecisionTreeClassifier(random_state=RANDOM_STATE).fit(X_train_m, y_train)
split_acc = accuracy_score(y_test, tree.predict(X_test_m))

# --- 10-fold estimate: same model, preprocessing refit inside each fold ---
cv_scores = cv_accuracy(
    model_ctor=lambda: DecisionTreeClassifier(random_state=RANDOM_STATE),
    X=X,
    y=y,
    cv=cv,
)
cv_acc = float(np.mean(cv_scores))

print(f"Train/Test split accuracy: {split_acc:.4f}")
print(f"10-fold CV accuracy (mean): {cv_acc:.4f}  |  (std): {np.std(cv_scores):.4f}")


## 3) Tune `max_depth` using cross-validation
We sweep `max_depth` and plot mean CV accuracy vs depth.


In [None]:
# Candidate depths 1..20; one mean CV accuracy is recorded per depth.
max_depths = list(range(1, 21))

# `depth=depth` binds the current value into each lambda's default argument,
# so the factory does not depend on the comprehension variable later on.
cv_means = [
    float(
        cv_accuracy(
            model_ctor=lambda depth=depth: DecisionTreeClassifier(
                max_depth=depth, random_state=RANDOM_STATE
            ),
            X=X,
            y=y,
            cv=cv,
        ).mean()
    )
    for depth in max_depths
]

# argmax returns the first maximum, i.e. the shallowest depth among ties.
best_idx = int(np.argmax(cv_means))
best_depth, best_acc = max_depths[best_idx], cv_means[best_idx]

print(f"Best max_depth = {best_depth} with mean CV accuracy = {best_acc:.4f}")


In [None]:
# --- accuracy curve over the depth sweep (explicit Axes interface) ---
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(max_depths, cv_means, marker="o")
ax.set_title("Decision Tree: Accuracy vs max_depth")
ax.set_ylabel("Mean 10-fold CV Accuracy")
ax.set_xlabel("max_depth")
ax.set_xticks(max_depths)  # one tick per evaluated depth
ax.grid(True)
plt.show()


## 4) Best tree vs ensembles
We reuse the best `max_depth` and compare:
- single decision tree
- bagging
- random forest

(All with the same CV setup.)


In [None]:
# Model factories keyed by report label, evaluated in this order with the same
# CV splitter. All three share the tuned `best_depth`.
ensemble_ctors = {
    # --- baseline best-depth tree ---
    "Best single tree": lambda: DecisionTreeClassifier(
        random_state=RANDOM_STATE, max_depth=best_depth
    ),
    # --- bagging ---
    "Bagging": lambda: BaggingClassifier(
        estimator=DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=best_depth),
        n_estimators=200,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    ),
    # --- random forest (single configuration) ---
    "Random forest": lambda: RandomForestClassifier(
        n_estimators=300,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        max_depth=best_depth,
    ),
}

ensemble_scores = {}
for label, ctor in ensemble_ctors.items():
    fold_scores = cv_accuracy(model_ctor=ctor, X=X, y=y, cv=cv)
    ensemble_scores[label] = fold_scores
    # Labels are left-padded to 16 characters so the "|" separators line up.
    print(f"{label:<16} | mean CV acc: {fold_scores.mean():.4f} (std {fold_scores.std():.4f})")

# Keep the per-model score arrays available under their individual names.
baseline_scores = ensemble_scores["Best single tree"]
bag_scores = ensemble_scores["Bagging"]
rf_scores = ensemble_scores["Random forest"]


## 5) Random forests: sweep `n_estimators`
We vary the number of trees (keeping `max_depth` fixed at the best depth found in Section 3) and plot mean CV accuracy.


In [None]:
# Forest sizes to compare; depth stays fixed at the tuned `best_depth`.
n_estimators_list = [10, 25, 50, 100, 200, 300, 500]

# `n_trees=n_trees` binds the current size into each lambda's default argument.
rf_means = [
    float(
        cv_accuracy(
            model_ctor=lambda n_trees=n_trees: RandomForestClassifier(
                n_estimators=n_trees,
                max_depth=best_depth,
                random_state=RANDOM_STATE,
                n_jobs=-1,
            ),
            X=X,
            y=y,
            cv=cv,
        ).mean()
    )
    for n_trees in n_estimators_list
]

# argmax returns the first maximum, i.e. the smallest forest among ties.
best_rf_idx = int(np.argmax(rf_means))
print(f"Best n_estimators = {n_estimators_list[best_rf_idx]} with mean CV acc = {rf_means[best_rf_idx]:.4f}")


In [None]:
# Accuracy curve over the forest-size sweep.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(n_estimators_list, rf_means, marker="o")
ax.set_title("Random Forest: Accuracy vs n_estimators")
ax.set_ylabel("Mean 10-fold CV Accuracy")
ax.set_xlabel("n_estimators")
ax.set_xticks(n_estimators_list)  # ticks only at the evaluated sizes
ax.grid(True)
plt.show()


## 6) Decision boundaries with 2 features (classification)
To visualize splits, we fit trees using **only two numeric features**.

We pick two features (prefer `Age` and `MonthlyIncome` if available), then plot decision boundaries
for six different `max_depth` values on a **3×2** grid of subplots.

In [None]:
# --- pick the two plotting features, favouring common IBM fields ---
preferred = ["Age", "MonthlyIncome"]
two_features = [name for name in preferred if name in num_cols and name in X.columns]

# Unless both preferred fields are available as numeric columns,
# fall back to the first two numeric columns.
two_features = two_features if len(two_features) >= 2 else num_cols[:2]

assert len(two_features) == 2, "Need two numeric features to plot decision boundaries."
print("Using features:", two_features)

# Independent copies, so the 2-D experiments cannot touch the main data.
X2 = df.loc[:, two_features].copy()
y2 = y.copy()


In [None]:
def plot_tree_decision_boundary(ax, model, X2: pd.DataFrame, y: pd.Series, title: str):
    """Draw a fitted 2-feature model's prediction regions with the data on top.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    model : fitted estimator
        Must expose ``predict`` for an ``(n, 2)`` array.
    X2 : pd.DataFrame
        Frame whose first two columns are the plotted features; the column
        names become the axis labels.
    y : pd.Series
        Values used to colour the scatter points.
    title : str
        Axes title.
    """
    first, second = X2.iloc[:, 0], X2.iloc[:, 1]

    # Evaluation grid: 300 x 300 points covering the data range padded by 0.5.
    pad, steps = 0.5, 300
    xx, yy = np.meshgrid(
        np.linspace(first.min() - pad, first.max() + pad, steps),
        np.linspace(second.min() - pad, second.max() + pad, steps),
    )

    # Predict every grid point, then fold the flat result back into grid shape.
    grid_points = np.column_stack([xx.ravel(), yy.ravel()])
    regions = model.predict(grid_points).reshape(xx.shape)

    ax.contourf(xx, yy, regions, alpha=0.25)
    ax.scatter(first, second, c=y, s=12, alpha=0.7)

    ax.set_xlabel(X2.columns[0])
    ax.set_ylabel(X2.columns[1])
    ax.set_title(title)


In [None]:
# Six depths, one per panel of the 3x2 grid.
depth_grid = [1, 2, 3, 4, 6, 10]

fig, axes = plt.subplots(3, 2, figsize=(12, 12), constrained_layout=True)
axes = axes.ravel()  # flatten so panels can be paired with depths

for ax, d in zip(axes, depth_grid):
    # Fit on the raw two-column array and draw the resulting regions.
    clf2 = DecisionTreeClassifier(max_depth=d, random_state=RANDOM_STATE).fit(X2.values, y2.values)
    plot_tree_decision_boundary(ax, clf2, X2, y2, title=f"max_depth = {d}")

plt.show()


## 7) Plot the fitted tree (2 features)
We plot a simple decision tree using the same two features as above. We keep it small for readability.

In [None]:
# Refit a shallow tree (depth 3) on the two features so the diagram stays readable.
clf2 = DecisionTreeClassifier(max_depth=3, random_state=RANDOM_STATE).fit(X2.values, y2.values)

fig, ax = plt.subplots(figsize=(10, 6))
# class_names follow the 0/1 target encoding: 0 -> "No", 1 -> "Yes".
plot_tree(
    clf2,
    ax=ax,
    feature_names=two_features,
    class_names=["No", "Yes"],
    impurity=True,
    filled=True,
)
ax.set_title("Decision Tree (2 features, max_depth=3)")
plt.show()


## 8) Short regression tree example (built-in dataset)
We use scikit-learn's **Diabetes** regression dataset.

- Evaluate regression trees with **cross-validated R²**
- Sweep `max_depth` and plot mean R²
- Visualize a 2-feature regression surface and the tree

In [None]:
# Load the built-in Diabetes data as pandas objects (features frame, target series).
diabetes = load_diabetes(as_frame=True)
Xr = diabetes.data
yr = diabetes.target

# CV for regression: plain shuffled K-fold.
cv_r = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

# Sweep depths 1..15 and record the mean cross-validated R² for each.
depths_r = list(range(1, 16))
r2_means = []

for d in depths_r:
    reg = DecisionTreeRegressor(random_state=RANDOM_STATE, max_depth=d)
    # The features are passed to the estimator as-is, so sklearn's built-in CV
    # helper is used here (it must be imported in the setup cell).
    scores = cross_val_score(reg, Xr, yr, cv=cv_r, scoring="r2")
    r2_means.append(float(scores.mean()))

# argmax returns the first maximum, i.e. the shallowest depth among ties.
best_r_idx = int(np.argmax(r2_means))
best_r_depth = depths_r[best_r_idx]

print(f"Best max_depth = {best_r_depth} with mean CV R² = {r2_means[best_r_idx]:.4f}")


In [None]:
# R² curve over the regression depth sweep.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(depths_r, r2_means, marker="o")
ax.set_title("Regression Tree (Diabetes): R² vs max_depth")
ax.set_ylabel("Mean 10-fold CV R²")
ax.set_xlabel("max_depth")
ax.set_xticks(depths_r)  # one tick per evaluated depth
ax.grid(True)
plt.show()


### Regression surface with 2 features
We visualize a regression tree trained on **two** features to see piecewise-constant predictions.

In [None]:
# pick two features for visualization (fall back to the first two columns)
feat_r = ["bmi", "bp"]
if not all(f in Xr.columns for f in feat_r):
    feat_r = Xr.columns[:2].tolist()

Xr2 = Xr[feat_r].copy()

reg2 = DecisionTreeRegressor(random_state=RANDOM_STATE, max_depth=4)
reg2.fit(Xr2.values, yr.values)

# mesh: 250 x 250 grid over the data range padded by 0.1 on each side
x_min, x_max = Xr2.iloc[:, 0].min() - 0.1, Xr2.iloc[:, 0].max() + 0.1
y_min, y_max = Xr2.iloc[:, 1].min() - 0.1, Xr2.iloc[:, 1].max() + 0.1

xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, 250),
    np.linspace(y_min, y_max, 250),
)

# predict every grid point, then fold the flat result back into grid shape
grid = np.c_[xx.ravel(), yy.ravel()]
preds = reg2.predict(grid).reshape(xx.shape)

# One colour scale for both layers, taken from the observed target range, so
# the shaded predictions and the coloured points are directly comparable.
vmin, vmax = float(yr.min()), float(yr.max())

fig, ax = plt.subplots(figsize=(8, 6))
cf = ax.contourf(xx, yy, preds, alpha=0.35, vmin=vmin, vmax=vmax)
ax.scatter(Xr2.iloc[:, 0], Xr2.iloc[:, 1], c=yr, s=12, alpha=0.7, vmin=vmin, vmax=vmax)
# The contour handle was previously captured but unused; use it for a colour key.
fig.colorbar(cf, ax=ax, label="Target (shading: predicted, points: observed)")
ax.set_xlabel(feat_r[0])
ax.set_ylabel(feat_r[1])
ax.set_title("Regression Tree Predictions (2 features, max_depth=4)")
plt.show()


### Plot the regression tree
We plot the tree trained on the same two features.

In [None]:
# Diagram of the depth-4 regression tree fitted on the two chosen features.
fig, ax = plt.subplots(figsize=(12, 6))
plot_tree(
    reg2,
    ax=ax,
    feature_names=feat_r,
    impurity=True,
    filled=True,
)
ax.set_title("Regression Tree (2 features, max_depth=4)")
plt.show()
