In [23]:

# check scikit-learn version
import sklearn
print(sklearn.__version__)

1.2.2


## Bagging - Classifier Example !

In [2]:
import pandas as pd

url="https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv"
df=pd.read_csv(url)


In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Separate X and y

X = df.iloc[:,:8]
y = df.iloc[:,8]

In [5]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [6]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [7]:
X.shape

(768, 8)

In [8]:
y.shape

(768,)

In [9]:
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#train - test split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.25,random_state=42)

# define the model
model = BaggingClassifier()

#fit the model
model.fit(X_train,y_train)

#predict on Training set 
y_predict = model.predict(X_train)


#Check the accuracy on training data. Is it overfitting ?
accuracy_score(y_train,y_predict)

0.9826388888888888

In [10]:
#Lets do cross validation and check the accuracy. 

# evaluate the model
n_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=10 )

# report performance
print(n_scores)
print('Accuracy: ' , mean(n_scores))

[0.74137931 0.72413793 0.74137931 0.70689655 0.77586207 0.75862069
 0.78947368 0.71929825 0.75438596 0.80701754]
Accuracy:  0.7518451300665456


In [11]:
# Lets run it on test set 

#predict on test set 
y_predict_test = model.predict(X_test)


#Check the accuracy on test set
accuracy_score(y_test,y_predict_test)


0.7291666666666666

###  Let's improve 

In [12]:
# define the model
model = BaggingClassifier(n_estimators=30,oob_score=True)

#fit the model
model.fit(X_train,y_train)

#predict on Training set 
y_predict = model.predict(X_train)


#Check the accuracy on training data. Is it overfitting ?
accuracy_score(y_train,y_predict)

0.9982638888888888

In [13]:
model.oob_score_

0.7361111111111112

In [14]:
#Lets do cross validation and check the accuracy. 

# evaluate the model
n_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=10 )

# report performance
print(n_scores)
print('Accuracy: ' , mean(n_scores))

[0.77586207 0.62068966 0.77586207 0.77586207 0.77586207 0.72413793
 0.78947368 0.80701754 0.77192982 0.71929825]
Accuracy:  0.7535995160314579


In [15]:
# Lets run it on test set 

#predict on test set 
y_predict_test = model.predict(X_test)


#Check the accuracy on test data.
accuracy_score(y_test,y_predict_test)


0.7395833333333334

## Explore hyper parameter space !

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Define base estimator
base_estimator = DecisionTreeClassifier()

# Define bagging classifier
bagging_clf = BaggingClassifier(base_estimator=base_estimator)



param_grid = {
          "base_estimator__max_depth": [3,5,10,20],
          "base_estimator__min_samples_leaf": [1, 3, 5, 7, 10],
          "base_estimator__min_samples_split": [2, 5, 7],
          'max_features': [0.5, 0.7, 1.0],
          'max_samples': [0.5, 0.7, 1.0],
          'n_estimators': [2, 5, 10, 20,30,50]
}



# Perform grid search with cross-validation
grid_search = GridSearchCV(bagging_clf, param_grid=param_grid, cv=5)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Print the best score
print("Best score:", grid_search.best_score_)


Best hyperparameters: {'base_estimator__max_depth': 5, 'base_estimator__min_samples_leaf': 10, 'base_estimator__min_samples_split': 5, 'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 30}
Best score: 0.7830134932533734


### For Random Forest, in SKLearn you just need to select Random Forest Classifier - That's it !

In [17]:
from sklearn.ensemble import RandomForestClassifier
# define the model
model_rf = RandomForestClassifier()

#fit the model
model_rf.fit(X_train,y_train)

#predict on Training set 
y_predict = model_rf.predict(X_train)


#Check the accuracy on training data. Is it overfitting ?
accuracy_score(y_train,y_predict)

1.0

In [18]:
#Lets do cross validation and check the accuracy. 

# evaluate the model
n_scores = cross_val_score(model_rf, X_train, y_train, scoring='accuracy', cv=10 )

# report performance
print(n_scores)
print('Accuracy: ' , mean(n_scores))

[0.79310345 0.68965517 0.75862069 0.70689655 0.79310345 0.75862069
 0.8245614  0.77192982 0.78947368 0.73684211]
Accuracy:  0.762280701754386


In [19]:
# Lets run it on test set 

#predict on test set 
y_predict_test = model_rf.predict(X_test)


#Check the accuracy on test set
accuracy_score(y_test,y_predict_test)

0.7395833333333334

## Adaboost

In [131]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import AdaBoostClassifier

from sklearn.datasets import make_classification



X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=6)
print(X.shape, y.shape)

(1000, 10) (1000,)


In [132]:
model = AdaBoostClassifier()

n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=5, n_jobs=-1)

print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.853 (0.029)


In [133]:
n_scores

array([0.875, 0.86 , 0.865, 0.795, 0.87 ])

## Lets play with hyper parameters

In [137]:
# Number of trees to be built 
model = AdaBoostClassifier(n_estimators=2)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=5, n_jobs=-1)

print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.844 (0.022)


In [141]:
# Depth of DTs

base = DecisionTreeClassifier(max_depth=1)
model= AdaBoostClassifier(estimator=base)

n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=5, n_jobs=-1)

print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.853 (0.029)


## For regression 

In [142]:

from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import AdaBoostRegressor

X, y = make_regression(n_samples=1000, n_features=20, n_informative=15, noise=0.1, random_state=6)

model = AdaBoostRegressor()

n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1, error_score='raise')
print(n_scores)
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

[-77.34024412 -82.14866706 -74.12655219 -63.2081414  -75.63359601]
MAE: -74.491 (6.253)


## Gradient Boosting

In [143]:

from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)

model = GradientBoostingClassifier()

n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=5, n_jobs=-1)

print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Mean Accuracy: 0.902 (0.027)


In [144]:
n_scores

array([0.87 , 0.875, 0.94 , 0.9  , 0.925])

In [145]:
X.shape

(1000, 20)

In [146]:
y.shape

(1000,)

In [158]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_regression(n_samples=1000, n_features=20, n_informative=15, noise=0.1, random_state=7)

#model = GradientBoostingRegressor()
#model = GradientBoostingRegressor(n_estimators=10)
model = GradientBoostingRegressor(learning_rate=0.2)

n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)

print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: -57.815 (2.037)


In [152]:
n_scores

array([-133.76131763, -131.94606542, -141.7229317 , -148.05631799,
       -132.98009641])

In [21]:

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)