# Chapter 16 - Bootstrap Aggregation

In [1]:
from random import seed, randrange

### Bootstrap Resample

In [2]:
# Draw a bootstrap sample: rows picked at random from the dataset, with replacement
def subsample(dataset, ratio=1.0):
    """Return a list of rows drawn at random, with replacement, from dataset.

    Args:
        dataset: Indexable, sized collection of rows to draw from.
        ratio: Size of the sample as a fraction of len(dataset); the
            product is turned into a row count with round().

    Returns:
        A new list holding round(len(dataset) * ratio) rows. The rows are the
        same objects stored in dataset (no copies) and may repeat.
    """
    n_rows = len(dataset)
    target = round(n_rows * ratio)
    # One randrange() draw per sampled row, so the same row can be picked again
    return [dataset[randrange(n_rows)] for _ in range(target)]

In [3]:
# Arithmetic mean of a collection of numbers
def mean(numbers):
    """Return the arithmetic mean of numbers.

    Args:
        numbers: Sized, iterable collection of numeric values.

    Returns:
        sum(numbers) divided by float(len(numbers)).

    Raises:
        ZeroDivisionError: If numbers is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

# Compare the mean of bootstrap sample means against the true mean
seed(1)
# Population: 20 single-column rows of random digits, plus its exact mean
dataset = [[randrange(10)] for _ in range(20)]
true_mean = mean([row[0] for row in dataset])
print('True Mean: %.3f' % true_mean)
# Each bootstrap sample is 10% of the dataset (2 of the 20 rows)
ratio = 0.10
for size in (1, 10, 100):
    # Mean of each of `size` independent bootstrap samples
    sample_means = [
        mean([row[0] for row in subsample(dataset, ratio)])
        for _ in range(size)
    ]
    print('Samples=%d, Estimated Mean: %.3f' % (size, mean(sample_means)))

True Mean: 4.500
Samples=1, Estimated Mean: 4.000
Samples=10, Estimated Mean: 4.700
Samples=100, Estimated Mean: 4.570


### Sonar Case Study

In [4]:
# Functions to load and prepare data
from Codes.ch01_load_and_convert_data import load_csv, str_column_to_float, str_column_to_int
# k-Fold for classification evaluation
from Codes.ch06_algorithm_test_harnesses import evaluate_algorithm_kfold
# Decision Tree Classifier
from Codes.ch11_decision_tree import test_split, gini_index, get_split, to_terminal, split, build_tree, predict

In [5]:
# Majority vote over the predictions of a list of bagged trees
def bagging_predict(trees, row):
    """Return the prediction made most often by trees for row.

    Args:
        trees: Iterable of tree models, each passed to predict().
        row: The single data row to classify; forwarded to predict() as-is.

    Returns:
        The value with the highest count among the per-tree predictions.

    Raises:
        ValueError: If trees is empty, because max() receives no candidates.
    """
    votes = []
    for tree in trees:
        votes.append(predict(tree, row))
    # Score each distinct predicted value by how many trees voted for it
    candidates = set(votes)
    return max(candidates, key=votes.count)

# Bootstrap Aggregation: fit n_trees trees on bootstrap samples, then vote
def bagging(train, test, max_depth, min_size, sample_size, n_trees):
    """Fit an ensemble of trees on bootstrap samples and predict each test row.

    Args:
        train: Training rows; a fresh bootstrap sample is drawn from it with
            subsample() for every tree.
        test: Iterable of rows to predict.
        max_depth: Forwarded unchanged to build_tree().
        min_size: Forwarded unchanged to build_tree().
        sample_size: Ratio handed to subsample(), i.e. the bootstrap sample
            size as a fraction of len(train).
        n_trees: Number of trees to build.

    Returns:
        A list with one bagging_predict() result per row in test, in order.
    """
    # Each tree sees its own bootstrap sample, drawn just before it is built
    forest = [
        build_tree(subsample(train, sample_size), max_depth, min_size)
        for _ in range(n_trees)
    ]
    return [bagging_predict(forest, row) for row in test]

In [6]:
# Evaluate bagged decision trees on the sonar dataset
seed(1)

# Load the raw rows from the CSV file
filename = './data/sonar.all-data.csv'
dataset = load_csv(filename)
# Convert every attribute column (all but the last) from string to float
for col in range(len(dataset[0]) - 1):
    str_column_to_float(dataset, col)
# Convert the class labels in the last column to integers
class_col = len(dataset[0]) - 1
str_column_to_int(dataset, class_col)

# Cross-validation and tree settings shared by every run
n_folds = 5
max_depth = 6
min_size = 2
sample_size = 0.50
# Score ensembles of increasing size
for n_trees in (1, 5, 10, 50):
    scores = evaluate_algorithm_kfold(
        dataset, bagging, n_folds, max_depth, min_size, sample_size, n_trees
    )
    print('Trees: %d' % n_trees)
    print('Scores: %s' % scores)
    mean_accuracy = sum(scores) / float(len(scores))
    print('Mean Accuracy: %.3f%%' % mean_accuracy)

Trees: 1
Scores: [87.8048780487805, 68.29268292682927, 65.85365853658537, 65.85365853658537, 73.17073170731707]
Mean Accuracy: 72.195%
Trees: 5
Scores: [63.41463414634146, 80.48780487804879, 78.04878048780488, 82.92682926829268, 60.97560975609756]
Mean Accuracy: 73.171%
Trees: 10
Scores: [58.536585365853654, 78.04878048780488, 80.48780487804879, 85.36585365853658, 65.85365853658537]
Mean Accuracy: 73.659%
Trees: 50
Scores: [60.97560975609756, 75.60975609756098, 82.92682926829268, 73.17073170731707, 85.36585365853658]
Mean Accuracy: 75.610%


## Future Work

* Tune the Example. Explore different configurations for the number of trees and even individual tree configurations to see if you can further improve results.
* Bag Another Algorithm. Other algorithms can be used with bagging. For example, a k-nearest neighbor algorithm with a low value of k will have a high variance and is a good candidate for bagging.
* Regression Problems. Bagging can be used with regression trees. Instead of predicting the most common class value from the set of predictions, you can return the average of the predictions from the bagged trees. Experiment on regression problems.