# Advanced Ensemble Learning
Ensemble learning techniques are used to improve classification/regressions in several ways. For a comprehensive guide, please see [https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/](here).   

In this notebook, I am going to practice some of the advanced techniques like 
1. Stacking
2. Blending
3. Bagging
4. Boosting

### Importing libraries

In [19]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

### Loading and manipulating data

In [46]:
# Load dataset
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
data = pd.read_csv('data/iris.csv', names=names)
X = data[['sepal-length', 'sepal-width', 'petal-length', 'petal-width']]
Y = data[['class']]
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.3, random_state=420)

# Normalizing the data
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
y_train = y_train['class'].ravel()

### Advanced Ensembling Techniques
Here we implement 3 simple ensembling techniques implemented through VotingClassifier:
1. **Stacking** - [more here](http://blog.kaggle.com/2016/12/27/a-kagglers-guide-to-model-stacking-in-practice/). 
2. 
3. 

In [64]:
# SkLearnHelper from here: # Like here: https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python
ntrain = x_train.shape[0]
ntest = x_test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
folder = KFold(n_splits= NFOLDS, random_state=SEED)
kf = folder.split(x_train)
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        #params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
        
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.fit(x_tr, y_tr)
        print x_te
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [65]:
# Features for models
knn_params = {
    'n_neighbors' : 2
}
lr_params = {
    'penalty' : 'l2'
}
nb_params = {
    'priors' : None
}
dt_params = {
    'criterion' : 'gini'
}


In [66]:
lr = SklearnHelper(clf=LogisticRegression, seed=SEED, params=lr_params)
knn = SklearnHelper(clf=KNeighborsClassifier, seed=SEED, params=knn_params)
nb = SklearnHelper(clf=GaussianNB, seed=SEED,params=nb_params)
dt = SklearnHelper(clf=DecisionTreeClassifier, seed=SEED,params=dt_params)

In [67]:
lr_oof_train, lr_oof_test = get_oof(lr, x_train, y_train, x_test)

[[ 0.69892655  0.32412926  0.70768633  0.99533647]
 [ 0.46595103 -1.67784561  0.31893487  0.08880325]
 [-1.16487758  0.10168761 -1.29160689 -1.46525368]
 [ 1.51434085  0.32412926  1.20750964  0.73632698]
 [-1.16487758 -0.12075404 -1.34714282 -1.33574894]
 [ 0.34946327  0.76901257  0.8742941   1.3838507 ]
 [-0.46595103  0.99145422 -1.40267874 -1.33574894]
 [-0.93190206  1.65877918 -1.29160689 -1.20624419]
 [-0.11648776 -0.56563734  0.70768633  1.51335545]
 [ 0.23297552 -0.12075404  0.59661449  0.73632698]
 [-0.81541431  2.32610414 -1.29160689 -1.46525368]
 [ 2.32975516  1.65877918  1.42965333  0.99533647]
 [ 1.28136534  0.32412926  0.48554264  0.218308  ]
 [ 1.04838982  0.32412926  1.15197372  1.3838507 ]
 [-0.34946327 -1.2329623   0.04125526 -0.17020624]
 [ 0.23297552 -0.56563734  0.09679118  0.08880325]
 [ 0.93190206  0.54657092  1.04090187  1.64286019]
 [-1.28136534 -0.12075404 -1.34714282 -1.20624419]
 [-1.28136534  0.76901257 -1.0694632  -1.33574894]
 [-0.23297552 -1.2329623   0.65

ValueError: could not convert string to float: Iris-virginica

In [None]:
for i, (train_index, test_index) in enumerate(kf):
    print i
    print train_index
    print test_index