Fit a tree to the spam data.
Now apply bagging and report your results.

In [1]:
import pandas as pd

from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import zero_one_loss
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Load the spam data

In [3]:
# Load the space-separated spam file; it has no header row, so pandas
# labels the columns with consecutive integers.
df = pd.read_csv('../data/spam.dat', sep=' ', header=None)

# The covariates are every column except the last one ...
X = df.iloc[:, :-1].to_numpy()

# ... and the final column is the response variable.
Y = df.iloc[:, -1].to_numpy()

# 'Fit and report' function

In [4]:
def fit_and_report(model, X, Y, cv=5):
    """Fit ``model`` on the full data set and print two error-rate estimates.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``. It is fitted in place on
        ``(X, Y)`` and is also handed to ``cross_val_score``.
    X
        Covariates passed to ``fit``, ``predict`` and ``cross_val_score``.
    Y
        Labels passed to ``fit``, ``zero_one_loss`` and ``cross_val_score``.
    cv : optional
        Forwarded unchanged as the ``cv`` argument of ``cross_val_score``.
        Defaults to 5, the value that used to be hard-coded.

    Returns
    -------
    None
        The two estimates are printed with three significant digits.
    """
    classifier = model.fit(X, Y)

    # Error on the very data the model was fitted to (optimistic by design).
    empirical_error_rate = zero_one_loss(Y, classifier.predict(X))

    # The mean cross-validation score is turned into an error rate as
    # 1 - score; this assumes the estimator's default score is accuracy
    # (the case for scikit-learn classifiers).
    true_error_rate_cv_estimate = 1 - cross_val_score(model, X, Y, cv=cv).mean()

    print(
        f"Empirical error rate: {empirical_error_rate:.3}\n"
        f"Cross-validation estimate of the true error rate: {true_error_rate_cv_estimate:.3}"
    )

# Single tree classifier

## Unlimited splitting

In [5]:
# Single tree with no depth or leaf-count limit passed.
# random_state is pinned because scikit-learn's tree builder is randomized
# (see the DecisionTreeClassifier `random_state` documentation); without it
# the reported error rates can change between runs of this cell.
model = DecisionTreeClassifier(random_state=0)
fit_and_report(model, X, Y)

Empirical error rate: 0.000652
Cross-validation estimate of the true error rate: 0.114


## Limited splitting

In [6]:
# Single tree restricted to depth 4 and at most 6 leaves.
# random_state is pinned because scikit-learn's tree builder is randomized
# (see the DecisionTreeClassifier `random_state` documentation); without it
# the reported error rates can change between runs of this cell.
model = DecisionTreeClassifier(max_depth=4, max_leaf_nodes=6, random_state=0)
fit_and_report(model, X, Y)

Empirical error rate: 0.101
Cross-validation estimate of the true error rate: 0.121


# Bag several tree classifiers

## Unlimited splitting

In [7]:
# Bag 20 unrestricted trees; max_samples=1.0 draws each bootstrap sample
# with as many rows as the training set.
# random_state seeds the resampling so that re-running this cell
# reproduces the same ensemble and the same reported error rates.
model = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=20,
    max_samples=1.0,
    random_state=0,
)
fit_and_report(model, X, Y)

Empirical error rate: 0.00196
Cross-validation estimate of the true error rate: 0.0878


## Limited splitting

In [8]:
# Bag 50 shallow trees (depth <= 4, at most 6 leaves each); max_samples=1.0
# draws each bootstrap sample with as many rows as the training set.
# NOTE(review): the unlimited-splitting bagging cell uses 20 estimators,
# not 50 - confirm the difference is intentional before comparing results.
# random_state seeds the resampling so that re-running this cell
# reproduces the same ensemble and the same reported error rates.
model = BaggingClassifier(
    DecisionTreeClassifier(max_depth=4, max_leaf_nodes=6),
    n_estimators=50,
    max_samples=1.0,
    random_state=0,
)
fit_and_report(model, X, Y)

Empirical error rate: 0.0898
Cross-validation estimate of the true error rate: 0.114
