In [1]:
import numpy as np
import pandas as pd
import scipy
from numpy.linalg import inv, det
from sklearn.tree import DecisionTreeClassifier
from functools import partial
from scipy.stats import norm

First, let's load the spam dataset and split it into a feature matrix `X` and a label vector `Y`.

In [2]:
# Parse the whitespace-delimited dataset: one sample per line, every field
# converted to float.
with open('spam.txt', 'r') as f:
    arr = [[float(value) for value in line.split()] for line in f]

df = pd.DataFrame(arr)
# Keep the label column (column 57) as integers inside the DataFrame.
df[57] = df[57].astype(int)

ds = df.to_numpy()

# All columns but the last are features; the last column is the label.
X = ds[:, :-1]
# to_numpy() upcasts the mixed int/float frame to a single float dtype, so the
# integer cast above is lost in `ds`; re-apply it to get integer class labels.
Y = ds[:, -1].astype(int)

We'll use scikit-learn's `DecisionTreeClassifier` as our base tree in order to speed up training.

In [3]:
# Baseline model: a single Gini-criterion decision tree with a fixed seed.
base_clf = DecisionTreeClassifier(
    criterion='gini',
    min_samples_split=10,
    random_state=42,
)

In [4]:
# Fit the baseline tree on the full dataset (X, Y).
base_clf.fit(X, Y)

Now we create a function to bag several decision trees.

In [5]:
def bag_trees(X, Y, num_bootstrap_samples=1000, min_samples_split=10, random_state=42):
    """
        Create a bagging classifier with many trees.

        Each tree is fit on its own bootstrap sample of (X, Y): len(X) rows
        drawn with replacement. The returned estimator predicts by majority
        vote over all trees.

        X, Y: the dataset to train on, numpy arrays with X.shape[0] = Y.shape[0].
            The vote is computed by summing the trees' predictions, so the
            labels in Y must be 0/1.
        num_bootstrap_samples: the number of trees to train
        min_samples_split: the minimum number of samples in a leaf node
            before it can be split (passed to every tree)
        random_state: integer seed, or None. Seeds the bootstrap sampling and,
            offset by the tree index, each decision tree, so the same seed
            and data give the same ensemble. None leaves both unseeded.
        returns:
            classifier: a Python function which is the estimator
                resulting from bagging. It takes a 2-D array of inputs and
                returns an int array of 0/1 predictions, one per row.
    """

    n = len(X)
    # A dedicated generator makes the bootstrap draws reproducible for a given
    # random_state and leaves numpy's global random state untouched.
    rng = np.random.default_rng(random_state)

    # The fitted decision trees that make up the ensemble
    models = []

    for i in range(num_bootstrap_samples):
        # Choose a bootstrap sample from X, Y
        ind_boot = rng.choice(n, size=n, replace=True)
        X_boot = X[ind_boot]
        Y_boot = Y[ind_boot]

        # Give every tree its own seed so the trees do not all share
        # identical tie-breaking.
        tree_seed = None if random_state is None else random_state + i

        # Fit a decision tree classifier to the bootstrapped
        # dataset. Store the decision tree for later use.
        tree = DecisionTreeClassifier(criterion='gini', min_samples_split=min_samples_split,
                                      random_state=tree_seed)
        tree.fit(X_boot, Y_boot)
        models.append(tree)

    # The bagging classifier
    def classifier(inp):

        # Get the output of each tree on inp and return
        # the majority vote. An input is labelled 1 only when strictly
        # more than half of the trees vote 1, so an exact tie yields 0.
        votes = np.array([model.predict(inp) for model in models])
        total = np.sum(votes, axis=0)
        threshold = len(models) / 2
        return (total > threshold).astype(int)

    return classifier

Now we'll use cross-validation to compare the bagging classifier and the base tree.

In [11]:
def create_random_splits(X, Y, num_ch, random_state=None):
    """
        Split the dataset (X, Y) into num_ch random chunks
        of equal size (except for possibly the last chunk).

        The rows of X and Y are shuffled with the same permutation, so row i
        of every X chunk still corresponds to entry i of the matching Y chunk.
        Each of the first num_ch - 1 chunks holds len(X) // num_ch rows; the
        last chunk holds all remaining rows.

        (X, Y): the dataset; X is a 2-D numpy array, Y a numpy array with
            len(Y) = len(X)
        num_ch: the number of chunks, must be at least 1
        random_state: optional seed for the shuffle. When None (the default)
            numpy's global random state is used, as before; otherwise a
            dedicated generator seeded with random_state is used so the
            split is reproducible.
        returns:
            Xs, Ys: two lists of length num_ch holding the chunks of X and Y
        raises:
            ValueError: if num_ch < 1
    """

    if num_ch < 1:
        raise ValueError(f'num_ch must be at least 1, got {num_ch}')

    # Randomly permute (X, Y)
    n = len(X)
    if random_state is None:
        indices = np.random.permutation(n)
    else:
        indices = np.random.default_rng(random_state).permutation(n)
    X_perm = X[indices]
    Y_perm = Y[indices]

    # Chunk i starts at i*k; every chunk ends where the next one starts,
    # except the last, which runs to the end and absorbs the remainder.
    k = n // num_ch
    starts = [i * k for i in range(num_ch)]
    stops = starts[1:] + [n]

    Xs = [X_perm[lo:hi, :] for lo, hi in zip(starts, stops)]
    Ys = [Y_perm[lo:hi] for lo, hi in zip(starts, stops)]

    return Xs, Ys

In [12]:
def cross_validation(X, Y, num_ch, num_bootstrap_samples=1000, min_samples_split=10, random_state=42):
    """
        Compare a single decision tree with a bagged ensemble of trees
            using num_ch-fold cross-validation.
        (X, Y): the dataset
        num_ch: the number of folds
        num_bootstrap_samples: the number of trees in each bagged model
        min_samples_split: min_samples_split used for every tree
        random_state: random state passed to the single tree and to bag_trees
        Returns:
            base_errors: a list of errors for the base tree on each fold
            bag_errors: a list of errors for the bagged model on each fold
    """
    Xs, Ys = create_random_splits(X, Y, num_ch)
    base_errors, bag_errors = [], []

    # Each chunk is the held-out test set exactly once; the
    # remaining chunks together form that fold's training set.
    for fold, (X_test, Y_test) in enumerate(zip(Xs, Ys)):
        X_train = np.concatenate(Xs[:fold] + Xs[fold+1:], axis=0)
        Y_train = np.concatenate(Ys[:fold] + Ys[fold+1:], axis=0)

        # Single tree: fit on the training chunks and record the
        # misclassification rate on the held-out chunk.
        tree = DecisionTreeClassifier(criterion='gini', min_samples_split=min_samples_split,
                                      random_state=random_state)
        tree.fit(X_train, Y_train)
        tree_error = (Y_test != tree.predict(X_test)).mean()

        # Bagged model: same training data, same held-out chunk.
        bagged = bag_trees(X_train, Y_train, num_bootstrap_samples=num_bootstrap_samples,
                           min_samples_split=min_samples_split, random_state=random_state)
        bagged_error = (Y_test != bagged(X_test)).mean()

        base_errors.append(tree_error)
        bag_errors.append(bagged_error)

    return base_errors, bag_errors

In [13]:
# Run 5-fold cross-validation with the default tree and bagging settings.
base_errors, bagged_errors = cross_validation(X, Y, num_ch=5)

In [17]:
# Report the held-out error of both models fold by fold. Iterating over the
# result lists (rather than a hard-coded fold count) keeps this cell correct
# for any number of folds.
for i, (base_err, bag_err) in enumerate(zip(base_errors, bagged_errors)):
    print(f'Fold: {i}, tree error: {base_err:.5f}, bagging model error: {bag_err:.5f}')

Fold: 0, tree error: 0.07717, bagging model error: 0.04348
Fold: 1, tree error: 0.07609, bagging model error: 0.05870
Fold: 2, tree error: 0.09348, bagging model error: 0.06196
Fold: 3, tree error: 0.08370, bagging model error: 0.05652
Fold: 4, tree error: 0.09989, bagging model error: 0.06080


In [18]:
# Average the per-fold errors into one cross-validation estimate per model.
mean_base_error, mean_bag_error = np.mean(base_errors), np.mean(bagged_errors)

print(f'Base tree mean CV error: {mean_base_error:.5f}')
print(f'Bagged model mean CV error: {mean_bag_error:.5f}')

Base tree mean CV error: 0.08607
Bagged model mean CV error: 0.05629


Thus we see that the bagged model does improve performance quite a bit: the mean CV error drops from about 8.6% to about 5.6%, a relative reduction of approximately 35%.