In [None]:
a

# Cost Sensitive Models -II
Source- https://machinelearningmastery.com/bagging-and-random-forest-for-imbalanced-classification/

In [1]:
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier

## Regular Bagging

In [2]:
# generate dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=4)
# define model
model = BaggingClassifier()
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.863


## Bagged decision trees with random undersampling for imbalanced classification
This implementation of Bagging is similar to the scikit-learn implementation. It includes an additional step to balance the training set at fit time using a ``RandomUnderSampler`

In [3]:
from imblearn.ensemble import BalancedBaggingClassifier

In [4]:
# define model
model = BalancedBaggingClassifier()
# define evaluation procedure
#cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=2, n_jobs=-1,error_score='raise')
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.967


## Regular Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.865


## Cost Sensitive Random Forest 
- class_weight='balanced' - weights are based on complete training data 
- class_weight = 'balanced_subsample' - weight are based on proportions in the bootstrapped sample

In [6]:
# define model
model = RandomForestClassifier(n_estimators=10, class_weight='balanced')
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.869


In [7]:
# define model
model = RandomForestClassifier(n_estimators=10, class_weight='balanced_subsample')
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.879


## Random Forest with random undersampling for imbalanced classification

In [8]:
from imblearn.ensemble import BalancedRandomForestClassifier

model = BalancedRandomForestClassifier()
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=100)
# evaluate model
scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)
# summarize performance
scores
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.934


## Easy Ensemble Classifier
- The Easy Ensemble involves creating balanced samples of the training dataset by selecting all examples from the minority class and a subset from the majority class.

- Rather than using pruned decision trees, boosted decision trees are used on each subset, specifically the AdaBoost algorithm
- Although an AdaBoost classifier is used on each subsample, alternate classifier models can be used via setting the base_estimator argument to the model

In [9]:
from imblearn.ensemble import EasyEnsembleClassifier

# define model
model = EasyEnsembleClassifier(n_estimators=10)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.961


In [11]:
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.ensemble import RandomForestClassifier
# generate dataset

# define model
model = EasyEnsembleClassifier(base_estimator=RandomForestClassifier(),n_estimators=20)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.977
