In [1]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# *CLASSIFICATION*

In [2]:
# Synthetic classification dataset: 1000 rows, 10 features, 3 of them informative.
# random_state pins the generated data so the accuracies reported in the cells
# below can be reproduced after Restart Kernel -> Run All.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=3,
    random_state=42,
)

In [3]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Baseline: one seeded decision tree, scored on the held-out test rows.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = dt.predict(X_test)
print("Decision Tree Accuracy : ", accuracy_score(y_test, y_pred))

Decision Tree Accuracy :  0.89


# Bagging

In [5]:
# Bagging: 1000 decision trees, each trained on a bootstrap sample
# whose size is 25% of the training rows.
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),  # algorithm used for every base model
    n_estimators=1000,
    max_samples=0.25,  # fraction of training rows handed to each base model
    bootstrap=True,  # rows are sampled with replacement
    random_state=42,
)

In [6]:
# Train the bagged trees on the training split.
bag.fit(X_train, y_train)

In [7]:
# Accuracy of the bagged trees on the held-out test rows.
y_pred1 = bag.predict(X_test)
print("Bagging Accuracy Score : ", accuracy_score(y_test, y_pred1))

Bagging Accuracy Score :  0.905


In [8]:
# Shape of the row-index array drawn for the first base estimator
# (max_samples=0.25 of the training rows).
bag.estimators_samples_[0].shape

(200,)

In [9]:
# Shape of the feature-index array used by the first base estimator
# (max_features was not set for `bag`).
bag.estimators_features_[0].shape

(10,)

# Bagging using SVC

In [10]:
# Same bagging recipe as above, but with support-vector classifiers as base models.
bag1 = BaggingClassifier(
    estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,  # each SVC sees 25% of the training rows
    bootstrap=True,  # rows are sampled with replacement
    random_state=42,
)

In [11]:
# Fit the SVC ensemble, then score it on the held-out test rows.
y_pred = bag1.fit(X_train, y_train).predict(X_test)
print("Accuracy of Bagging via SVC", accuracy_score(y_test, y_pred))

Accuracy of Bagging via SVC 0.95


# Pasting

In [12]:
# Pasting: identical to bagging except that rows are drawn WITHOUT replacement.
bag2 = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False,  # no replacement -> pasting
    n_jobs=-1,  # number of CPU cores to use; -1 means all of them
    verbose=1,  # amount of progress output printed while the model works
    random_state=42,
)

In [13]:
# Fit the pasting ensemble, then score it on the held-out test rows.
y_pred = bag2.fit(X_train, y_train).predict(X_test)
print("accuracy of Pasting DT", accuracy_score(y_test, y_pred))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    3.5s remaining:    3.5s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.6s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.1s remaining:    0.1s


accuracy of Pasting DT 0.91


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished


# Random Subspaces

In [14]:
# Random Subspaces: every base model receives all rows but only a random
# selection of the feature columns.
bag3 = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0,  # no row sampling: all training rows go to every model
    bootstrap=False,  # rows are not resampled
    max_features=0.5,  # each model gets half of the features
    bootstrap_features=True,  # features are drawn at random, with replacement
    random_state=42,
)

In [15]:
# Fit the random-subspaces ensemble, then score it on the held-out test rows.
y_pred = bag3.fit(X_train, y_train).predict(X_test)
print("Random Subspaces Classifier accuracy:", accuracy_score(y_test, y_pred))

Random Subspaces Classifier accuracy: 0.93


In [16]:
# Shape of the row-index array for the first base estimator
# (max_samples=1.0, so no row subsampling is configured).
bag3.estimators_samples_[0].shape

(800,)

In [17]:
# Shape of the feature-index array for the first base estimator
# (max_features=0.5 of the available columns).
bag3.estimators_features_[0].shape

(5,)

# Random Patches

In [18]:
# Random Patches: both the rows and the feature columns are sampled per base model.
bag4 = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,  # row sampling: 25% of the training rows
    bootstrap=True,  # rows drawn with replacement
    max_features=0.5,  # column sampling: half of the features
    bootstrap_features=True,  # features drawn with replacement
    random_state=42,
)

In [19]:
# Fit the random-patches ensemble, then score it on the held-out test rows.
y_pred = bag4.fit(X_train, y_train).predict(X_test)
print("Random Patches classifier accuracy :  ", accuracy_score(y_test, y_pred))

Random Patches classifier accuracy :   0.925


# OOB (Out Of Bag samples)

In [20]:
# Bagging with out-of-bag (OOB) scoring switched on.
bag5 = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,  # rows drawn with replacement
    oob_score=True,  # use the out-of-bag rows for an accuracy check
    random_state=42,
)

In [21]:
# Fit the OOB-enabled ensemble and score it on the held-out test split.
# The value printed here is computed from predictions on X_test, so it is the
# TEST accuracy, not the out-of-bag estimate; the label says so explicitly.
# The out-of-bag estimate itself is exposed as bag5.oob_score_.
bag5.fit(X_train, y_train)
y_pred = bag5.predict(X_test)
print("Test Accuracy (bagging with oob_score=True)", accuracy_score(y_test, y_pred))

OOB Accuracy 0.905


In [22]:
# Out-of-bag score of the fitted ensemble (available because oob_score=True).
bag5.oob_score_ # validation-style accuracy, calculated on the out-of-bag rows of the training set

0.93625

# Applying GridSearchCV

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
# Hyper-parameter grid for the bagging classifier: 3 * 4 * 2 * 4 = 96 combinations.
parameters = dict(
    n_estimators=[50, 100, 500],
    max_samples=[0.1, 0.4, 0.7, 1.0],
    bootstrap=[True, False],
    max_features=[0.1, 0.4, 0.7, 1.0],
)

In [25]:
# Exhaustive 5-fold cross-validated search over `parameters`.
# random_state seeds the row/feature sampling of every candidate ensemble so
# best_params_ and best_score_ are reproducible on a fresh run.
# n_jobs=-1 spreads the candidate fits over all CPU cores; it does not affect
# the scores.
search = GridSearchCV(
    BaggingClassifier(random_state=42),
    parameters,
    cv=5,
    n_jobs=-1,
)

In [26]:
# Run the grid search on the training split only; the test split stays untouched.
search.fit(X_train, y_train)

In [27]:
# Parameter combination selected by the grid search.
search.best_params_ 

{'bootstrap': False,
 'max_features': 0.7,
 'max_samples': 0.4,
 'n_estimators': 500}

In [28]:
# Cross-validated score achieved by the selected parameter combination.
search.best_score_

np.float64(0.9450000000000001)

# *REGRESSION*

In [29]:
# Import the diabetes loader; the bare name on the last line makes the notebook
# display the loader's signature and default arguments.
from sklearn.datasets import load_diabetes
load_diabetes

<function sklearn.datasets._base.load_diabetes(*, return_X_y=False, as_frame=False, scaled=True)>

In [30]:
# Load the diabetes dataset and report its feature names and dimensions.
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target
print(f"Dataset Feature Name :{diabetes.feature_names}")
print(f"Dataset feature size : {diabetes.data.shape}")
print(f"Dataset target size : {diabetes.target.size}")

Dataset Feature Name :['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
Dataset feature size : (442, 10)
Dataset target size : 442


In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [32]:
from sklearn.model_selection import train_test_split

# 80/20 split of the diabetes data with a fixed seed.
# Note the regression targets use capitalised names (Y_train / Y_test).
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    test_size=0.20,
    random_state=123,
)
print(
    'Train/Test Sets Sizes : ',
    *(part.shape for part in (X_train, X_test, Y_train, Y_test)),
)

Train/Test Sets Sizes :  (353, 10) (89, 10) (353,) (89,)


In [33]:
# Three single-model baselines to compare against the bagged regressors.
lr = LinearRegression()
# Seeded so the tree, and therefore its R^2 below, is reproducible across runs;
# the other tree-based models in this notebook are seeded as well.
dt = DecisionTreeRegressor(random_state=42)
knn = KNeighborsRegressor()

In [34]:
# Train every baseline regressor on the same training split.
lr.fit(X_train, Y_train)
dt.fit(X_train, Y_train)
knn.fit(X_train, Y_train)

In [35]:
# Test-set predictions: 1 = linear regression, 2 = decision tree, 3 = k-NN.
y_pred1, y_pred2, y_pred3 = (
    lr.predict(X_test),
    dt.predict(X_test),
    knn.predict(X_test),
)

In [36]:
# R^2 of each baseline on the held-out test rows.
print("R^2 score for LR", r2_score(Y_test, y_pred1))
print("R^2 score for DT", r2_score(Y_test, y_pred2))
print("R^2 score for KNN", r2_score(Y_test, y_pred3))

R^2 score for LR 0.5675895725793205
R^2 score for DT 0.10443970544344838
R^2 score for KNN 0.438839665879189


In [37]:
from sklearn.ensemble import BaggingRegressor

# Bagging regressor with default settings; only the seed is fixed.
bag = BaggingRegressor(
    random_state=1,
)
bag.fit(X_train, Y_train)

In [38]:
# Score of the bagging regressor on the rows it was trained on.
bag.score(X_train, Y_train)

0.8968084151942686

In [39]:
# Compare the score on seen vs. unseen rows to judge how well the model generalizes.
Y_preds = bag.predict(X_test)

print('Training Coefficient of R^2 : %.3f' % bag.score(X_train, Y_train))
print('Test Coefficient of R^2 : %.3f' % bag.score(X_test, Y_test))

Training Coefficient of R^2 : 0.897
Test Coefficient of R^2 : 0.499


In [40]:
# Row and column counts of the diabetes feature matrix.
n_samples, n_features = diabetes.data.shape

In [41]:
# Search space for the bagging regressor. estimator=None lets BaggingRegressor
# fall back to its default base estimator.
params = dict(
    estimator=[None, LinearRegression(), KNeighborsRegressor()],
    n_estimators=[20, 50, 100],
    max_samples=[0.5, 1.0],
    max_features=[0.5, 1.0],
    bootstrap=[True, False],
    bootstrap_features=[True, False],
)

# 3-fold cross-validated grid search, parallelised over all cores.
bagging_regressor_grid = GridSearchCV(
    BaggingRegressor(random_state=1, n_jobs=-1),
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
bagging_regressor_grid.fit(X_train, Y_train)

# Report the refit best model on both splits, plus its cross-validated score.
print('Train R^2 Score : %.3f' % bagging_regressor_grid.best_estimator_.score(X_train, Y_train))
print('Test R^2 Score : %.3f' % bagging_regressor_grid.best_estimator_.score(X_test, Y_test))
print('Best R^2 Score Through Grid Search : %.3f' % bagging_regressor_grid.best_score_)
print('Best Parameters : ', bagging_regressor_grid.best_params_)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Train R^2 Score : 0.487
Test R^2 Score : 0.533
Best R^2 Score Through Grid Search : 0.446
Best Parameters :  {'bootstrap': False, 'bootstrap_features': True, 'estimator': LinearRegression(), 'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 100}
