# Ensemble Techniques
     - Bagging
     - Boosting
     - Stacking

## Loading datasets and pre-processing

In [1]:
!pip3 install pydataset



In [1]:
from pydataset import data
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [46]:
df = data('Housing')
df.head(3).values

array([[42000.0, 5850, 3, 1, 2, 'yes', 'no', 'yes', 'no', 'no', 1, 'no'],
       [38500.0, 4000, 2, 1, 1, 'yes', 'no', 'no', 'no', 'no', 0, 'no'],
       [49500.0, 3060, 3, 1, 1, 'yes', 'no', 'no', 'no', 'no', 0, 'no']],
      dtype=object)

In [3]:
d = dict(zip(['no', 'yes'], range(0,2)))

In [4]:
df.dtypes

price       float64
lotsize       int64
bedrooms      int64
bathrms       int64
stories       int64
driveway     object
recroom      object
fullbase     object
gashw        object
airco        object
garagepl      int64
prefarea     object
dtype: object

In [5]:
for i in zip(df.dtypes.index, df.dtypes):
    if str(i[1]) == 'object':
        df[i[0]] = df[i[0]].map(d)

In [6]:
df.head().values

array([[4.20e+04, 5.85e+03, 3.00e+00, 1.00e+00, 2.00e+00, 1.00e+00,
        0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00],
       [3.85e+04, 4.00e+03, 2.00e+00, 1.00e+00, 1.00e+00, 1.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00],
       [4.95e+04, 3.06e+03, 3.00e+00, 1.00e+00, 1.00e+00, 1.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00],
       [6.05e+04, 6.65e+03, 3.00e+00, 1.00e+00, 2.00e+00, 1.00e+00,
        1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00],
       [6.10e+04, 6.36e+03, 2.00e+00, 1.00e+00, 1.00e+00, 1.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00]])

In [7]:
pd.qcut(df['price'], 3)

1      (24999.999, 53000.0]
2      (24999.999, 53000.0]
3      (24999.999, 53000.0]
4        (53000.0, 74500.0]
5        (53000.0, 74500.0]
6        (53000.0, 74500.0]
7        (53000.0, 74500.0]
8        (53000.0, 74500.0]
9       (74500.0, 190000.0]
10      (74500.0, 190000.0]
11      (74500.0, 190000.0]
12     (24999.999, 53000.0]
13     (24999.999, 53000.0]
14     (24999.999, 53000.0]
15     (24999.999, 53000.0]
16     (24999.999, 53000.0]
17     (24999.999, 53000.0]
18     (24999.999, 53000.0]
19     (24999.999, 53000.0]
20     (24999.999, 53000.0]
21     (24999.999, 53000.0]
22       (53000.0, 74500.0]
23     (24999.999, 53000.0]
24     (24999.999, 53000.0]
25     (24999.999, 53000.0]
26     (24999.999, 53000.0]
27     (24999.999, 53000.0]
28     (24999.999, 53000.0]
29     (24999.999, 53000.0]
30     (24999.999, 53000.0]
31     (24999.999, 53000.0]
32     (24999.999, 53000.0]
33     (24999.999, 53000.0]
34     (24999.999, 53000.0]
35       (53000.0, 74500.0]
36       (53000.0, 7

In [8]:
## check the range of price
for i,j in enumerate(np.unique(pd.qcut(df['price'], 3))):
    print(i, j)

0 (24999.999, 53000.0]
1 (53000.0, 74500.0]
2 (74500.0, 190000.0]


In [9]:
## bucket the price
df['price'] = pd.qcut(df['price'], 3, labels = ['0', '1', '2']).cat.codes

In [10]:
df['price']

1      0
2      0
3      0
4      1
5      1
6      1
7      1
8      1
9      2
10     2
11     2
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     1
23     0
24     0
25     0
26     0
27     0
28     0
29     0
30     0
31     0
32     0
33     0
34     0
35     1
36     1
37     1
38     1
39     2
40     1
41     1
42     1
43     2
44     2
45     0
46     0
47     0
48     0
49     0
50     1
51     1
52     1
53     1
54     2
55     0
56     0
57     0
58     1
59     0
60     0
61     0
62     0
63     0
64     1
65     1
66     1
67     1
68     1
69     0
70     1
71     0
72     0
73     0
74     0
75     0
76     0
77     0
78     0
79     1
80     1
81     0
82     0
83     0
84     0
85     1
      ..
462    0
463    0
464    0
465    0
466    0
467    1
468    1
469    1
470    1
471    1
472    1
473    0
474    1
475    2
476    2
477    1
478    2
479    2
480    2
481    2
482    2
483    2
484    0
485    1
486    2
4

In [11]:
## Create datasets by splitting into X and y
y = df['price'] 
X = df.drop('price', 1)

In [12]:
print(X.shape)
print(y.shape)

(546, 11)
(546,)


## Baseline - Decision Tree

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [14]:
dt = DecisionTreeClassifier()

In [15]:
baseline_score = cross_val_score(dt, X, y, cv = 10, n_jobs = -1) 
print(baseline_score)
print("-----------------------------------------------------------")
print("Mean is {0: .3f} with std dev of {1: .3f} [Baseline]".format(baseline_score.mean(), baseline_score.std()))
    

[0.41818182 0.49090909 0.6        0.45454545 0.56363636 0.72727273
 0.7037037  0.59259259 0.61111111 0.53703704]
-----------------------------------------------------------
Mean is  0.570 with std dev of  0.095 [Baseline]


## Bagging ( Bootstrapping + Aggregation )

<div class="alert alert-block alert-info">
<b>ExtraTreesClassifier: Extremely Randomized Trees -</b> Apart from randomization involved in random forests, thresholds are drawn at random for each candidate feature and the best of these randomly-generated thresholds is picked as the splitting rule.  
</div>

In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC

  from numpy.core.umath_tests import inner1d


In [17]:
seed = 1075
np.random.seed(seed)

In [18]:
rf = RandomForestClassifier()
et = ExtraTreesClassifier()
knn = KNeighborsClassifier()
svc = SVC()

In [19]:
clf_array = [rf, et, knn, svc]

- StratifiedKFold
- Cross validation

In [20]:
for clf in clf_array:
    without_bagging_scores = cross_val_score(clf, X, y, cv = 10, n_jobs = -1)
    ## define bagging classifier
    bagging_clf = BaggingClassifier(clf, max_samples = 0.4, max_features = 10, random_state = seed)
    bagging_scores = cross_val_score(bagging_clf, X, y, cv = 10, n_jobs = -1)
    ## compare the scores of individual methods with the corresponding bagging classifier
    print("Mean is {1: .3f} with std dev of {2: .3f} [{0}]".format(clf.__class__.__name__, 
                                                                     without_bagging_scores.mean(), without_bagging_scores.std()))
    print("Mean is {1: .3f} with std dev of {2: .3f} [Bagging {0}]".format(clf.__class__.__name__, 
                                                                     bagging_scores.mean(), bagging_scores.std()))
    print("=============================================================")

Mean is  0.623 with std dev of  0.084 [RandomForestClassifier]
Mean is  0.639 with std dev of  0.069 [Bagging RandomForestClassifier]
Mean is  0.630 with std dev of  0.092 [ExtraTreesClassifier]
Mean is  0.654 with std dev of  0.073 [Bagging ExtraTreesClassifier]
Mean is  0.500 with std dev of  0.086 [KNeighborsClassifier]
Mean is  0.535 with std dev of  0.111 [Bagging KNeighborsClassifier]
Mean is  0.465 with std dev of  0.085 [SVC]
Mean is  0.535 with std dev of  0.083 [Bagging SVC]


<div class="alert alert-block alert-danger">
<b>Question: </b> 1. Will bagging always perform better than base model ?
</div>

<div class="alert alert-block alert-danger">
<b>Question: </b> 2. Sampling is done with or without replacement ?
    

### Voting ( Hard / soft )

<div class="alert alert-block alert-info">
<b> Hard voting</b> uses predicted class labels for majority rule voting. <b> Soft voting </b> predicts the class label based on the argmax of the sums of the predicted probabilities, which is recommended for an ensemble of well-calibrated classifiers. 
</div>

#### Hard voting

In [30]:
from sklearn.ensemble import VotingClassifier
warnings.filterwarnings('ignore')

In [31]:
eclf = VotingClassifier(estimators=[('Random Forests', rf), ('Extra Trees', et), ('KNeighbors', knn), 
                                    ('SVC', svc)], voting='hard')

In [32]:
for clf, label in zip([rf, et, knn, svc, eclf], ['Random Forest', 'Extra Trees', 'KNeighbors', 'SVC', 'Ensemble']):
    scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    print("Accuracy is {1: .3f} with std dev of {2: .3f} [{0}]".format(label, scores.mean(), scores.std()))
    

Accuracy is  0.627 with std dev of  0.077 [Random Forest]
Accuracy is  0.628 with std dev of  0.064 [Extra Trees]
Accuracy is  0.500 with std dev of  0.086 [KNeighbors]
Accuracy is  0.465 with std dev of  0.085 [SVC]
Accuracy is  0.594 with std dev of  0.084 [Ensemble]


#### Soft voting

In [33]:
svc = SVC(probability=True)
eclf = VotingClassifier(estimators=[('Random Forests', rf), ('Extra Trees', et), ('KNeighbors', knn), 
                                    ('SVC', svc)], voting='soft', weights = [5,10,1,1])

In [34]:
for clf, label in zip([rf, et, knn, svc, eclf], ['Random Forest', 'Extra Trees', 'KNeighbors', 'SVC', 'Ensemble']):
    scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    print("Accuracy is {1: .3f} with std dev of {2: .3f} [{0}]".format(label, scores.mean(), scores.std()))
    

Accuracy is  0.637 with std dev of  0.094 [Random Forest]
Accuracy is  0.630 with std dev of  0.084 [Extra Trees]
Accuracy is  0.500 with std dev of  0.086 [KNeighbors]
Accuracy is  0.465 with std dev of  0.085 [SVC]
Accuracy is  0.643 with std dev of  0.087 [Ensemble]


<div class="alert alert-block alert-danger">
<b>DIY: </b> What will happen in case of tie ? Check out the documentation. 
</div>

## Boosting

In [35]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from mlxtend.classifier import EnsembleVoteClassifier

In [36]:
# Define AdaBoost Classifier
ada_boost = AdaBoostClassifier()

# Define Gradient Boosting Classifier
grad_boost = GradientBoostingClassifier()

# Define XG Boosting Classifier
xgb_boost = XGBClassifier()

boost_array = [ada_boost, grad_boost, xgb_boost]

In [37]:
eclf = EnsembleVoteClassifier(clfs = boost_array, voting='hard')

In [38]:
labels = ['Ada Boost', 'Grad Boost', 'XG Boost', 'Ensemble']

In [39]:
for clf, label in zip([ada_boost, grad_boost, xgb_boost, eclf], labels):
    scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    print("Accuracy is {1: .3f} with std dev of {2: .3f} [{0}]".format(label, scores.mean(), scores.std()))

Accuracy is  0.641 with std dev of  0.082 [Ada Boost]
Accuracy is  0.658 with std dev of  0.111 [Grad Boost]
Accuracy is  0.663 with std dev of  0.101 [XG Boost]
Accuracy is  0.665 with std dev of  0.104 [Ensemble]


## Stacking

In [40]:
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier

In [41]:
clf1 = KNeighborsClassifier()
clf2 = RandomForestClassifier()
lr = LogisticRegression()
## Define stacking classifier
sclf = StackingClassifier(classifiers=[clf1, clf2], 
                          meta_classifier=lr)

In [42]:
labels = ['KNN', 'Random Forest', 'Stacking Classifier']
clf_array = [clf1, clf2, sclf]

In [43]:
for clf, label in zip(clf_array, labels):
    scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    print("Accuracy is {1: .3f} with std dev of {2: .3f} [{0}]".format(label, scores.mean(), scores.std()))

Accuracy is  0.500 with std dev of  0.086 [KNN]
Accuracy is  0.630 with std dev of  0.094 [Random Forest]
Accuracy is  0.630 with std dev of  0.086 [Stacking Classifier]


<div class="alert alert-block alert-danger">
<b> Questions: </b> 
    1. Which of these techniques can run in parallel ? 
</div>

## Practice on the following dataset
    - https://www.kaggle.com/lucidlenn/sloan-digital-sky-survey

## References
    - https://github.com/benedekrozemberczki/awesome-gradient-boosting-papers
    - https://github.com/talperetz/awesome-gradient-boosting