In [1]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [2]:
# Synthetic classification data: 10,000 rows, 10 features, 3 of them informative.
# Seeded so the dataset (and every accuracy reported further down) is
# reproducible after a kernel restart.
X, y = make_classification(
    n_samples=10000,
    n_features=10,
    n_informative=3,
    random_state=42,
)

In [3]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Baseline: one decision tree, used as the reference point for the ensembles below.
clf = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
y_pred = clf.predict(x_test)
tree_accuracy = accuracy_score(y_test, y_pred)
print("Decision Tree accuracy:", tree_accuracy)

Decision Tree accuracy: 0.919


In [5]:
# Bagging: 500 decision trees, each trained on a bootstrap sample (drawn with
# replacement) half the size of the training set.
# `estimator` replaces the `base_estimator` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.5,
    bootstrap=True,
    random_state=42,
)

In [6]:
# Train the ensemble on the training split (the fitted estimator is displayed).
bag.fit(X=x_train, y=y_train)



In [7]:
# Predict labels for the held-out rows.
y_pred = bag.predict(X=x_test)

In [8]:
# Compare the ensemble's predictions with the true test labels.
bagging_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score of the bagging:", bagging_accuracy)

Accuracy score of the bagging: 0.95


In [9]:
# Show the base estimator stored on the fitted ensemble.
bag.estimator_

In [11]:
# Row indices drawn for the first estimator: max_samples=0.5 of the
# 8000 training rows gives 4000 indices.
bag.estimators_samples_[0].shape

(4000,)

In [13]:
# Feature indices used by the first estimator: max_features was not set on
# this ensemble, and the result shows all 10 features are used.
bag.estimators_features_[0].shape

(10,)

In [14]:
# Bagging using SVM: 500 support-vector classifiers, each trained on a
# bootstrap sample a quarter the size of the training set.
# `estimator` replaces the `base_estimator` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
bag = BaggingClassifier(
    estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42,
)

In [15]:
# Fit the SVM ensemble, then score it on the held-out split.
y_pred = bag.fit(x_train, y_train).predict(x_test)
svm_bagging_accuracy = accuracy_score(y_test, y_pred)
print("Bagging using svm :", svm_bagging_accuracy)



Bagging using svm : 0.942


In [16]:
# Pasting: same ensemble shape as bagging, but bootstrap=False means each
# tree's 25% row sample is drawn WITHOUT replacement.
# `estimator` replaces the `base_estimator` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False,
    random_state=42,
    verbose=1,   # print joblib progress messages during fit/predict
    n_jobs=-1,   # use all available cores
)

In [17]:
# Fit the pasting ensemble, then score it on the held-out split.
y_pred = bag.fit(x_train, y_train).predict(x_test)
pasting_accuracy = accuracy_score(y_test, y_pred)
print("Pasting classifier:", pasting_accuracy)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    5.9s remaining:   29.9s


Pasting classifier: 0.952


[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    6.2s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.0s finished


In [18]:
# Random patches: every tree sees a random subset of BOTH rows (25%) and
# features (50%); both subsets are drawn with replacement.
# `estimator` replaces the `base_estimator` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    max_features=0.5,
    bootstrap_features=True,
    random_state=42,
)

In [19]:
# Fit the random-patches ensemble, then score it on the held-out split.
y_pred = bag.fit(x_train, y_train).predict(x_test)
random_patches_accuracy = accuracy_score(y_test, y_pred)
print("Random patches classifier:", random_patches_accuracy)



Random patches classifier: 0.9365


In [20]:
# OOB score: with bootstrap sampling, each tree leaves some training rows
# unseen; oob_score=True uses those rows to estimate accuracy without a
# separate validation set.
# `estimator` replaces the `base_estimator` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    oob_score=True,
    random_state=42,
)

In [21]:
# Train the OOB-enabled ensemble (the fitted estimator is displayed).
bag.fit(X=x_train, y=y_train)



In [22]:
# Test-set accuracy, for comparison with the out-of-bag estimate.
y_pred = bag.predict(X=x_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score:", test_accuracy)

Accuracy score: 0.952


In [23]:
# Out-of-bag accuracy estimate, computed on the training data because the
# ensemble was built with oob_score=True.
bag.oob_score_

0.95425

In [24]:
#Bagging tips
#Bagging generally gives better results than pasting
#Good results usually come from sampling around 25% to 50% of the rows
#Random patches and random subspaces should be used with higher-dimensional datasets
#To find good hyperparameter values we can use GridSearchCV/RandomizedSearchCV

In [25]:
#Applying the GridsearchCV
from sklearn.model_selection import GridSearchCV

In [27]:
# Hyperparameter grid for the search: 3 x 4 x 2 x 4 = 96 candidate combinations.
parameters = dict(
    n_estimators=[50, 100, 500],
    max_samples=[0.1, 0.4, 0.7, 1.0],
    bootstrap=[True, False],
    max_features=[0.1, 0.4, 0.7, 1.0],
)

In [28]:
# Exhaustive 5-fold cross-validated search over `parameters`.
# - random_state on the estimator pins the row/feature draws, so the winning
#   combination is reproducible between runs.
# - n_jobs=-1 spreads the candidate fits (96 combinations x 5 folds) across
#   all available cores instead of running them serially.
search = GridSearchCV(
    BaggingClassifier(random_state=42),
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split only; the test split stays untouched.
search.fit(X=x_train, y=y_train)

In [None]:
# Hyperparameter combination selected by the grid search.
search.best_params_