In [211]:
from imblearn.datasets import fetch_datasets
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
wine_quality_import = fetch_datasets()['wine_quality']

In [212]:
wine_quality = pd.DataFrame(wine_quality_import.data)

wine_quality['target'] = wine_quality_import.target

In [213]:
wine_quality.loc[wine_quality['target'] == -1, 'target'] = 0

In [214]:
X_train, X_test, y_train, y_test = train_test_split(wine_quality.drop('target',axis=1), wine_quality['target'], test_size=0.33, random_state=42)

In [215]:
# Random Forest
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector

num_pipe = SimpleImputer(strategy="mean", add_indicator=True)
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
)

preprocessor_tree = make_column_transformer(
    (num_pipe, selector(dtype_include="number")),
    (cat_pipe, selector(dtype_include="category")),
    n_jobs=2,
)

rf_clf = make_pipeline(
    preprocessor_tree, RandomForestClassifier()
)

In [471]:
## RANDOM PARAMETER GRID SEARCH
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 250, stop = 350, num = 10)]
# Number of features to consider at every split
max_features = ['log2','sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(30, 50, num = 5)]
max_depth.append(None)

criterion = ['gini', 'entropy']
# Minimum number of samples required to split a node
min_samples_split = [2,3,4,5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,3,4,5]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf':min_samples_leaf,
               'bootstrap': bootstrap,
                'criterion':criterion}

print(random_grid)

{'n_estimators': [250, 261, 272, 283, 294, 305, 316, 327, 338, 350], 'max_features': ['log2', 'sqrt'], 'max_depth': [30, 35, 40, 45, 50, None], 'min_samples_leaf': [1, 2, 3, 4, 5], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']}


In [254]:
rf_random = GridSearchCV(estimator = RandomForestClassifier(), 
                               param_grid = random_grid,
                               scoring='roc_auc',
                               cv = 3, 
                               verbose=2, 
                               n_jobs = -1,
                               return_train_score = True)

In [255]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 1920 candidates, totalling 5760 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [30, 35, 40, 45, 50, None],
                         'max_features': ['auto'],
                         'min_samples_leaf': [1, 6, 11, 16],
                         'min_samples_split': [2, 5, 8, 12],
                         'n_estimators': [250, 261, 272, 283, 294, 305, 316,
                                          327, 338, 350]},
             return_train_score=True, scoring='roc_auc', verbose=2)

In [256]:
rf_random.best_score_

0.8956388086414308

In [257]:
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 45,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 327}

In [None]:
brf_random = GridSearchCV(estimator = BalancedRandomForestClassifier(), 
                               param_grid = random_grid,
                               scoring='roc_auc',
                               cv = 3, 
                               verbose=2, 
                               n_jobs = -1,
                               return_train_score = True)

In [None]:
brf_random.fit(X_train, y_train)

In [None]:
brf_random.best_score_

In [None]:
brf_random.best_params_

In [248]:
# BALANCED BAGGING

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 150, stop = 300, num = 5)]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
bb_random_grid = {'n_estimators': n_estimators,
                 'bootstrap': bootstrap}

In [249]:
bb_random = GridSearchCV(estimator = BalancedBaggingClassifier(base_estimator=HistGradientBoostingClassifier(random_state=42)), 
                               param_grid = bb_random_grid,
                               scoring='roc_auc',
                               cv = 3, 
                               verbose=2, 
                               n_jobs = -1,
                               return_train_score = True)

In [250]:
bb_random.fit(X_train, y_train)
bb_random.best_score_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


0.8569596662369879

In [234]:
bb_random.best_params_

{'n_estimators': 207, 'bootstrap': True}

In [None]:
#Random Forest with oversampling using SMOTE and undersampling using RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline

from collections import Counter

# summarize class distribution - highly imbalanced dataset
counter = Counter(y_train)
print(counter)


## RANDOM PARAMETER GRID SEARCH
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np

# Number of trees in random forest
n_estimators = [260]
# Number of features to consider at every split
max_features = ['auto']
# Maximum number of levels in tree
max_depth = [90]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1]
# Method of selecting samples for training each tree
bootstrap = [True, False]

over_n = [float(x) for x in np.arange(0.1, 0.35, 0.05)]
# RandomUnderSampler undersampling
under_n = [float(x) for x in np.arange(0.35, 0.5, 0.05)]

# Create the random grid
random_grid = {'randomforestclassifier__n_estimators': n_estimators,
               'randomforestclassifier__max_features': max_features,
               'randomforestclassifier__max_depth': max_depth,
               'randomforestclassifier__min_samples_split': min_samples_split,
               'randomforestclassifier__min_samples_leaf': min_samples_leaf,
               'randomforestclassifier__bootstrap': bootstrap,
              'smote__sampling_strategy':over_n,
              'randomundersampler__sampling_strategy':under_n}

print(random_grid)


# define pipeline
# oversample positive (minority) to be x% the number of negative (majority)
over = SMOTE(sampling_strategy = 0.1)
# randomly undersample negative (majority) to reduce the number of negative to x% of the positive (minority)
under = RandomUnderSampler(sampling_strategy = 0.25) 

#steps = [('o', over), ('u', under), ('model', RandomForestClassifier())]
#pipeline = Pipeline(steps=steps)

pipeline = make_pipeline(over, under, RandomForestClassifier(n_estimators = 260,
 min_samples_split =2,
 min_samples_leaf=1,
 max_features= 'auto',
 max_depth= 90,
 bootstrap= False))

rf_sampling_random = GridSearchCV(pipeline,
                               param_grid = random_grid,
                               scoring='roc_auc',
                               cv = 3, verbose=2, 
                               n_jobs = -1,
                               return_train_score = True)



cvasd = cross_validate(pipeline,X_train, y_train, cv=5)

rf_sampling_random.fit(X_train,y_train)
rf_sampling_random.best_score_

rf_sampling_random.best_params_


In [229]:
# LOGISTIC REGRESSION
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate



num_pipe = make_pipeline(
    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True)
)
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"),
)

preprocessor_linear = make_column_transformer(
    (num_pipe, selector(dtype_include="number")),
    (cat_pipe, selector(dtype_include="category")),
    n_jobs=2,
)

lr_clf = make_pipeline(preprocessor_linear, LogisticRegression(max_iter=1000))
lr_clf.set_params(logisticregression__class_weight="balanced")

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(n_jobs=2,
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler()),
                                                                  ('simpleimputer',
                                                                   SimpleImputer(add_indicator=True))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001B78E858C08>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant'))

In [230]:
cv_result = cross_validate(lr_clf, X_train, y_train, scoring="roc_auc", cv=5)

In [231]:
cv_result['test_score'].mean()

0.7959038021320113

In [468]:
# Support Vector Classification with RandomOverSampling
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline 

num_pipe = make_pipeline(
    MinMaxScaler(feature_range=(0, 1))
)
cat_pipe = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
)

preprocessor_svc = make_column_transformer(
    (num_pipe, selector(dtype_include="number")),
    (cat_pipe, selector(dtype_include="category")),
    n_jobs=2,
)

smote = SMOTE(sampling_strategy=0.1)

rus = RandomUnderSampler(sampling_strategy=0.2)

svc_clf = make_pipeline(preprocessor_svc, smote, rus, SVC(kernel='rbf',C=1))

In [469]:
cv_result = cross_validate(svc_clf, X_train, y_train, scoring="roc_auc", cv=5)

In [470]:
cv_result['test_score'].mean()

0.8431015172536567

In [443]:
parameters = {'smote__sampling_strategy':[0.1,0.2,0.3,0.4],
              'rus__sampling_strategy':[0.1,0.2,0.3,0.4,0.5],
              'svc__kernel':('linear', 'rbf', 'poly'), 
              'svc__C':[1,3,5,10], 
              'svc__class_weight':['balanced', None],
              'svc__degree':[1,2]}

svc_grid = GridSearchCV(svc_clf, parameters, scoring="roc_auc", cv=3)

In [444]:
svc_grid.fit(X_train, y_train)
svc_grid.best_score_

1080 fits failed out of a total of 3600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1080 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\bened\appdata\local\programs\python\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\bened\appdata\local\programs\python\python37\lib\site-packages\imblearn\pipeline.py", line 268, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "c:\users\bened\appdata\local\programs\python\python37\lib\site-packages\imblearn\pipeline.py", line 232, in _fit
    **fit_params_steps[name],
  File "c:\users\bened\appdata\local\programs\python\python37

0.8577614378850189

In [445]:
svc_grid.best_params_

{'rus__sampling_strategy': 0.2,
 'smote__sampling_strategy': 0.1,
 'svc__C': 1,
 'svc__class_weight': None,
 'svc__degree': 2,
 'svc__kernel': 'rbf'}