In [None]:
from utils import css_from_file
# Apply the notebook stylesheet through the project helper. As the last expression
# of the cell, whatever it returns is rendered by Jupyter
# (presumably an HTML/CSS display object -- confirm in utils.py).
css_from_file('style/style.css')

In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import log_loss
from sklearn.ensemble import VotingClassifier

def load(path):
    """Read a CSV file and split it into features and the ``Activity`` target.

    Parameters
    ----------
    path : str or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The frame without its ``Activity`` column, and the ``Activity`` column
        itself. If the file has no ``Activity`` column, the series is all NaN.
    """
    frame = pd.read_csv(path)
    # Unlabelled files get a NaN placeholder so both return values always exist.
    if "Activity" not in frame:
        frame["Activity"] = np.nan
    features = frame.drop("Activity", axis=1)
    target = frame["Activity"]
    return features, target
    
# Load both splits as (features, target) pairs. For a file that has no
# "Activity" column, load() returns an all-NaN target series.
X_tr, y_tr = load("data/boehringer/train.csv")
X_te, y_te = load("data/boehringer/test.csv")

# Sanity check: (rows, feature columns) of each split.
print("training data shape", X_tr.shape)
print("testing data shape", X_te.shape)

Stacking
---------------------------

In general, stacking is a method of blending models in which the predictions from one set of models are used as features for another model. You can think of it as a committee of experts: over time, you learn whose opinion to trust.

In the previous exercise we used a very simple way to combine models:
a linear combination of their predictions.

There is an excellent article about ensembling here: http://mlwave.com/kaggle-ensembling-guide/

Another great source of information: https://www.kaggle.com/c/otto-group-product-classification-challenge/forums/t/14335/1st-place-winner-solution-gilberto-titericz-stanislav-semenov - here you can see stacking in action

Exercise
----------------------

1. Read the code below and understand what it does.
2. Try to use your classifiers from the previous exercise. Does stacking improve on a simple average?
3. Try other `mixer` models.
4. After fitting a LogisticRegression mixer, look at its `coef_` attribute. Which model has the highest weight?

In [None]:
from stacked_classifier import StackedClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, make_union
from cross_validation import cross_val_apply
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import BaggingClassifier, RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neural_network import MLPClassifier

# Exercise: fill this list with the base estimators whose predictions will be stacked.
clfs = [
]


# StackedClassifier is a project-local helper (stacked_classifier.py). Presumably the
# base estimators' cross-validated (cv=3) predictions become the input features of
# `mixer` -- confirm against its source.
# NOTE(review): loss='log' selects logistic loss; newer scikit-learn releases spell it
# 'log_loss' -- check against the installed version.
stacked_classifier = StackedClassifier(estimators=clfs,
                                       mixer=SGDClassifier(loss='log'),
                                       cv=3,
                                       n_jobs=-1,
                                       probability=True)

# cross_val_apply is a project-local helper (cross_validation.py). Presumably it returns
# the out-of-fold predict_proba output for every training row over cv=4 folds -- confirm.
oof_predictions = cross_val_apply(stacked_classifier,
                                  X_tr,
                                  y_tr,
                                  cv=4,
                                  n_jobs=-1, 
                                  decision_func="predict_proba")

# Score the predictions against the training labels (log loss: lower is better).
err = log_loss(y_tr, oof_predictions)
print("Your error is", err)

Double click to see the answers

<div class="spoiler">
from stacked_classifier import StackedClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, make_union
from cross_validation import cross_val_apply
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import BaggingClassifier, RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neural_network import MLPClassifier

class LazyTransformer(BaseEstimator, TransformerMixin):
    """Pass-through transformer: ``transform`` returns its input unchanged.

    Placed inside ``make_union`` below so the original features are kept
    alongside the output of the other transformer in the union.
    """

    def __init__(self):
        """Nothing to configure: the transformer takes no parameters."""

    def fit(self, x, y=None):
        """Learn nothing and return ``self``; ``x`` and ``y`` are ignored."""
        return self

    def transform(self, x):
        """Return ``x`` as-is."""
        return x

# Bagged ensemble of small neural networks. Every bag member is a pipeline that
#   1. concatenates a RandomTreesEmbedding encoding with the untouched input
#      features (LazyTransformer is a pass-through),
#   2. scales without centering (with_mean=False; presumably because the
#      embedding output is sparse -- confirm),
#   3. drops features whose variance does not exceed 0.001,
#   4. fits an MLP with hidden layer sizes (25,) and alpha=10.0.
# The bagging wrapper trains 10 such pipelines, each on 75% of the samples and
# 75% of the features.
nn_forest = BaggingClassifier(
    make_pipeline(
        make_union(
            RandomTreesEmbedding(n_estimators=10),
            LazyTransformer(),
        ),
        StandardScaler(with_mean=False),
        VarianceThreshold(0.001),
        MLPClassifier((25,), alpha=10.0, verbose=False),
    ),
    max_samples=0.75,
    max_features=0.75,
    n_estimators=10,
)

# Named base estimators for the stack: (label, estimator) pairs. The labels are
# only for readability here; the estimators are what gets stacked.
clfs = [
    # Bagged neural-net pipeline defined above.
    ('nn_forest', nn_forest),
    # Random forests with different split criteria, depths and feature budgets.
    ('rf', RandomForestClassifier(n_estimators=100, n_jobs=1)),
    ('rf_entropy', RandomForestClassifier(n_estimators=200, n_jobs=-1, criterion='entropy', max_depth=20,
                                          min_samples_split=2, min_samples_leaf=1, max_features=250,
                                          max_leaf_nodes=300, bootstrap=True,
                                          oob_score=False, random_state=123,
                                          verbose=0, warm_start=False, class_weight=None)),
    ('rf_entropy_3', RandomForestClassifier(n_estimators=300, n_jobs=-1, criterion='entropy', max_depth=50,
                                            max_features=350, random_state=123)),
    ('rf_3', RandomForestClassifier(min_samples_leaf=2, min_samples_split=4, n_estimators=200)),
    # Gradient-boosted trees, with and without a minimum child weight.
    ('xgb_1', XGBClassifier(n_estimators=200, max_depth=6)),
    ('xgb_2', XGBClassifier(n_estimators=200, max_depth=6, min_child_weight=5)),
    ('bag', BaggingClassifier(n_estimators=100)),
    # The SVC is fed standardized features; probability=True enables predict_proba.
    ('svc', make_pipeline(StandardScaler(), SVC(probability=True))),
]

# Drop the labels: only the estimator objects are handed to StackedClassifier.
clfs = [est for _, est in clfs]

# Same setup as the exercise cell, but with an ExtraTreesClassifier mixer
# instead of SGDClassifier.
stacked_classifier = StackedClassifier(estimators=clfs,
                                       mixer=ExtraTreesClassifier(),
                                       cv=3,
                                       n_jobs=-1,
                                       probability=True)

# Project-local helper; presumably returns the out-of-fold predict_proba output
# for every training row over cv=4 folds -- confirm in cross_validation.py.
oof_predictions = cross_val_apply(stacked_classifier,
                                  X_tr,
                                  y_tr,
                                  cv=4,
                                  n_jobs=-1, 
                                  decision_func="predict_proba")

# Score the predictions against the training labels (log loss: lower is better).
err = log_loss(y_tr, oof_predictions)
print("Your error is", err)
</div>