In [1]:
from sklearn.datasets import make_classification # create a classification dataset
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC # support vector classifier
from sklearn.model_selection import train_test_split

In [2]:
X,y = make_classification(n_samples=10000, n_features=10, n_informative=3)

In [3]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [4]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)

print("Decision tree accuracy: {} ".format(accuracy_score(y_test, y_pred)))

Decision tree accuracy: 0.91 


## Bagging

In [5]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500, # how many base learning models i wanna use
    max_samples=0.25, # i use 25% of my dataset rows part for splitting into 500 different parts
    bootstrap=True, # sampling with replacement is True
    random_state=42
    )

In [6]:
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.25,
                  n_estimators=500, random_state=42)

In [7]:
y_pred = bag.predict(X_test)

In [8]:
print("Accuracy score for bagging would be: {}".format(accuracy_score(y_test,y_pred)))

Accuracy score for bagging would be: 0.936


In [9]:
bag.estimators_samples_[0].shape # dataset is 25% for every sample which means 2000 rows for every model

(2000,)

In [10]:
bag.estimators_features_[0].shape # we used for every model all features because we didn't do the feature sampling

(10,)

## Bagging using SVM

In [11]:
bag = BaggingClassifier(base_estimator=SVC(),
                        n_estimators=500,
                        max_samples=0.25,
                        bootstrap=True,
                        random_state=42
                       )

In [12]:
bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print("Accuracy of Bagging using SVM: {}".format(accuracy_score(y_test, y_pred)))

Accuracy of Bagging using SVM: 0.9155


## Pasting

In [15]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500, # how many base learning models i wanna use
    max_samples=0.25, # i use 25% of my dataset rows part for splitting into 500 different parts
    bootstrap=False, # sampling with replacement is False
    random_state=42,
    verbose=1, # while training every information would be shown to me
    n_jobs=1 # by using this Process would be faster (this task will divide into all courses in my CPU)
    )

In [16]:
bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print("Accuracy of Bagging in Pasting type: {}".format(accuracy_score(y_test, y_pred)))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy of Bagging in Pasting type: 0.937


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


## Random Subspaces

In [17]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500, # how many base learning models i wanna use
    max_samples=1.0, # Here will not perform the row sampling
    bootstrap=False, # sampling with replacement is False
    random_state=42,
    max_features=0.5, # i will use 50% of columns
    bootstrap_features=True
    )

In [18]:
bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print("Accuracy of Bagging in Random Subspaces type: {}".format(accuracy_score(y_test, y_pred)))

Accuracy of Bagging in Random Subspaces type: 0.9385


In [19]:
bag.estimators_samples_[0].shape # so here i take all of the rows ( no row sampling performed)

(8000,)

In [20]:
bag.estimators_features_[0].shape # so here i take the 5 columns out of 10 (50% columns out of 100)

(5,)

## Random Patches

In [22]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500, # how many base learning models i wanna use
    max_samples=0.25,
    bootstrap=True, # sampling with replacement is True
    random_state=42,
    max_features=0.5, # i will use 50% of columns
    bootstrap_features=True
    )

In [23]:
bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print("Accuracy of Bagging in Random Patches type: {}".format(accuracy_score(y_test, y_pred)))

Accuracy of Bagging in Random Patches type: 0.929


## OOB Score (out of bag)

* when we giving the sampled data to the base models then there is the chance that there are some rows which will never give to the model for training or our model wouldn't seeing those rows or columns so this type of  data is called as **out of Bag sample**.These rows are around 37% of total rows.So i can use these type of rows as my testing data.

In [24]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    oob_score=True,
    random_state=42
)

In [25]:
bag.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.25,
                  n_estimators=500, oob_score=True, random_state=42)

In [26]:
bag.oob_score_

0.93475

In [27]:
y_pred = bag.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))

Accuracy 0.936


## Bagging Tips

* Bagging generally gives better results than Pasting
* Good results come around the 25% to 50% row sampling mark
* Random patches and subspaces should be used while dealing with high dimensional data
* To find the correct hyperparameter values we can do GridSearchCV/RandomSearchCV

## Applying GridSearchCV
Hyper parameter tuning

In [28]:
from sklearn.model_selection import GridSearchCV

In [33]:
parameters = {
    'n_estimators': [50,100,500], 
    'max_samples': [0.1,0.4,0.7,1.0],
    'bootstrap' : [True,False],
    'max_features' : [0.1,0.4,0.7,1.0]
    }

clf = BaggingClassifier()

In [34]:
grid = GridSearchCV(clf, parameters, cv=5, n_jobs=-1)

In [36]:
grid.fit(X_train,y_train)

KeyboardInterrupt: 

In [None]:
search.best_params_
search.best_score_

In [None]:
search.best_params_