Fit a tree that only uses one split on one variable to the spam data.
Now apply boosting.

In [71]:
import pandas as pd

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import zero_one_loss
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Load the spam data

In [72]:
# Load the space-delimited spam file; it has no header row, so pandas
# assigns integer column labels
df = pd.read_csv('../data/spam.dat', header=None, sep=' ')

# The response variable Y sits in the last column; pull it out
# as a numpy array
Y = df.iloc[:, -1].to_numpy()

# Every remaining column (the 57 covariates) goes into the numpy array X
X = df.iloc[:, :-1].to_numpy()

# Fit a single tree
(Since there is only one split, on a randomly chosen feature, this is really a *random branch*.)

In [76]:
# Depth-1 tree whose single split considers one randomly drawn feature
# (max_features=1). Because the feature draw is random, the seed is pinned so
# the error rates printed below are reproducible on a fresh kernel, and so the
# full-data fit and the cross-validation fits all start from the same seed.
model = DecisionTreeClassifier(max_depth=1, max_features=1, random_state=0)
classifier = model.fit(X, Y)

# Misclassification rate on the same data the tree was trained on
empirical_error_rate = zero_one_loss(Y, classifier.predict(X))

# cross_val_score uses the estimator's default scorer (accuracy for a
# classifier), so 1 - mean accuracy is the 5-fold CV estimate of the error rate
true_error_rate_cv_estimate = 1 - cross_val_score(model, X, Y, cv=5).mean()

print(
    f"Empirical error rate: {empirical_error_rate:.3}\n"
    f"Cross-validation estimate of the true error rate: {true_error_rate_cv_estimate:.3}"
)

Empirical error rate: 0.394
Cross-validation estimate of the true error rate: 0.345


# Boost the tree above

In [81]:
# AdaBoost over 200 copies of the one-split tree `model`. Each base tree picks
# its split feature at random, so the ensemble is seeded to make the reported
# error rates reproducible; AdaBoost derives the seeds of its base estimators
# from its own random_state.
boosted_model = AdaBoostClassifier(model, n_estimators=200, random_state=0)
boosted_classifier = boosted_model.fit(X, Y)

# Misclassification rate on the same data the ensemble was trained on
empirical_error_rate = zero_one_loss(Y, boosted_classifier.predict(X))

# cross_val_score uses the estimator's default scorer (accuracy for a
# classifier), so 1 - mean accuracy is the 5-fold CV estimate of the error rate
true_error_rate_cv_estimate = 1 - cross_val_score(boosted_model, X, Y, cv=5).mean()

print(
    f"Empirical error rate: {empirical_error_rate:.3}\n"
    f"Cross-validation estimate of the true error rate: {true_error_rate_cv_estimate:.3}"
)

Empirical error rate: 0.0587
Cross-validation estimate of the true error rate: 0.0796
