In [9]:
from sklearn.datasets import fetch_covtype # dataset
from sklearn.model_selection import train_test_split # split dataset into training/test sets
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# download the dataset from:
# "http://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
cover_type = fetch_covtype() 

In [3]:
y = cover_type.target
X = cover_type.data

In [4]:
# since our dataset is rather large, and we will be doing cross validation on our training set,
# we set the train_size parameter to be 90% and set aside %10 to test on
# due to large class imbalances in our target matrix, we
# set the stratify parameter=y. this makes a split so that the proportion of classes in the 
# test and train sets will be similar
X_train, X_test, y_train, y_test=\
                            train_test_split(X, y, train_size=.90, random_state=42, stratify=y)

In [15]:
clf_stump = AdaBoostClassifier()
clf_stump.fit(X_train, y_train)
clf_stump.score(X_test, y_test)

0.4541668100925958

In [14]:
clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=None)

In [12]:
# the default for AdaBoost Classifier is a Decision Stump with max_depth=1
# when we take this default away, the decision tree continues splitting at each node until
# it reaches purity, no surprise the model with an perfectly fit tree preforms better 
clf.score(X_test, y_test)

0.94127568758390412

## Adaptive Boost Algorithm

In [None]:
Given: 
    N, (number of samples)
    estimators, (number of estimators)
    weakL, (weak learner)
    X_train, y_train (training set from data (X,y))

# Initialize weights
For each sample in range(N):
    weight_sample = 1/N

For each est in range(estimators):
    # fit a weak learner to the training data using the weight 
    est = weakL.fit(X_train, y_train, weight)
    # calculate error
    error = sum(weight * (est(x) != y)) / sum(weight)
    # calculate alpha
    alpha = (1/2)ln((1-error)/error)
    # Update weights
    weight *= exp(alpha * (est(x) != y))
    
    For each i in range(N):
        # estimator misclassifies y
        if est(x_i) != y: 
            # increase weight of i for next estimator
            weight_i(est + 1) = weight_i(est)/(2*error)
        # estimator classifies y correctly
        else:
            # decrease weight of i for next estimator
            weight_i(est + 1) = weight_i(est)/(2*(1-error))
            
# return sum of predictions for all est*alpha
return sum(est*alpha)
    

##### Reasons to use Ada Boost
- Besides deciding on the number of weak classifiers (assumption is made) and the initial weight, there are no parameters to tune
- Fast and versatile 
- You can have little prior knowledge about the weak classifier (for example, decision stumps)