In [16]:
from imblearn.datasets import fetch_datasets
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.pipeline import make_pipeline
abalone_raw = fetch_datasets()['abalone']

In [2]:
abalone = pd.DataFrame(abalone_raw.data)

abalone['target'] = abalone_raw.target

abalone

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,target
0,0.0,0.0,1.0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,-1
1,0.0,0.0,1.0,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,1
2,1.0,0.0,0.0,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,-1
3,0.0,0.0,1.0,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,-1
4,0.0,1.0,0.0,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,1
...,...,...,...,...,...,...,...,...,...,...,...
4172,1.0,0.0,0.0,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,-1
4173,0.0,0.0,1.0,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,-1
4174,0.0,0.0,1.0,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,-1
4175,1.0,0.0,0.0,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,-1


In [3]:
abalone.loc[abalone['target'] == -1, 'target'] = 0

In [13]:
X_train, X_test, y_train, y_test = train_test_split(abalone.drop('target',axis=1), abalone['target'], test_size=0.33, random_state=42)

In [14]:
# Random Forest
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector

num_pipe = SimpleImputer(strategy="mean", add_indicator=True)
cat_pipe = make_pipeline(
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
)

preprocessor_tree = make_column_transformer(
    (num_pipe, selector(dtype_include="number")),
    (cat_pipe, selector(dtype_include="category")),
    n_jobs=2,
)

rf_clf = make_pipeline(
    preprocessor_tree, RandomForestClassifier(n_estimators=300)
)

In [17]:
# Simple CV Baseline: RF
cv_result = cross_validate(rf_clf, X_train, y_train, scoring="roc_auc", cv=5)

In [18]:
cv_result['test_score'].mean()

0.829238055613635

In [50]:
## RANDOM PARAMETER GRID SEARCH
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np

# Number of trees in random forest
n_estimators = [int(x) for x in np.arange(100, 1001, 100)]
# Number of features to consider at every split
max_features = ['log2','sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.arange(5, 51, 5)]
max_depth.append(None)

criterion = ['gini', 'entropy']
# Minimum number of samples required to split a node
min_samples_split = [int(x) for x in np.arange(2, 53, 5)]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1]
# Method of selecting samples for training each tree
bootstrap = [False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split':min_samples_split,
               'min_samples_leaf':min_samples_leaf,
               'bootstrap': bootstrap,
                'criterion':criterion}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_features': ['log2', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, None], 'min_samples_split': [2, 7, 12, 17, 22, 27, 32, 37, 42, 47, 52], 'min_samples_leaf': [1], 'bootstrap': [False], 'criterion': ['gini', 'entropy']}


In [55]:
rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(), 
                               param_distributions = random_grid,
                               scoring='roc_auc',
                                n_iter=1000,
                               cv = 3, 
                               verbose=2, 
                               n_jobs = -1,
                               return_train_score = True)

In [56]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 2000 candidates, totalling 6000 fits


KeyboardInterrupt: 

In [22]:
rf_random.best_score_

0.8184100142102974

In [23]:
rf_random.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': 30,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 340}

In [57]:
brf_random = RandomizedSearchCV(estimator = BalancedRandomForestClassifier(), 
                              param_distributions = random_grid,
                               scoring='roc_auc',
                                n_iter=1000,
                               cv = 3, 
                               verbose=2, 
                               n_jobs = -1,
                               return_train_score = True)

In [25]:
brf_random.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


GridSearchCV(cv=3, estimator=BalancedRandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [False], 'criterion': ['gini', 'entropy'],
                         'max_depth': [30, 35, 40, 45, None],
                         'max_features': ['log2', 'sqrt'],
                         'min_samples_leaf': [1], 'min_samples_split': [2],
                         'n_estimators': [250, 260, 270, 280, 290, 300, 310,
                                          320, 330, 340]},
             return_train_score=True, scoring='roc_auc', verbose=2)

In [26]:
brf_random.best_score_

0.8544670457731334

In [27]:
brf_random.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': 35,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 320}

In [64]:
#Random Forest with oversampling using SMOTE and undersampling using RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline

over_n = [float(x) for x in np.arange(0.2, 0.351, 0.05)]
# RandomUnderSampler undersampling
under_n = [float(x) for x in np.arange(0.35, 0.5, 0.05)]

# Create the random grid
random_grid = {'randomforestclassifier__n_estimators': n_estimators,
               'randomforestclassifier__max_features': max_features,
               'randomforestclassifier__max_depth': max_depth,
               'randomforestclassifier__min_samples_split': min_samples_split,
               'randomforestclassifier__min_samples_leaf': min_samples_leaf,
               'randomforestclassifier__bootstrap': bootstrap,
               'randomforestclassifier__criterion':criterion,
              'smote__sampling_strategy':over_n,
              'randomundersampler__sampling_strategy':under_n}

print(random_grid)

{'randomforestclassifier__n_estimators': [190, 200, 210, 220], 'randomforestclassifier__max_features': ['log2', 'sqrt'], 'randomforestclassifier__max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, None], 'randomforestclassifier__min_samples_split': [2, 7, 12, 17, 22, 27, 32, 37, 42, 47, 52], 'randomforestclassifier__min_samples_leaf': [1], 'randomforestclassifier__bootstrap': [True], 'randomforestclassifier__criterion': ['gini', 'entropy'], 'smote__sampling_strategy': [0.2, 0.25, 0.3, 0.35], 'randomundersampler__sampling_strategy': [0.35, 0.39999999999999997, 0.44999999999999996, 0.49999999999999994]}


In [65]:
# define pipeline
# oversample positive (minority) to be x% the number of negative (majority)
over = SMOTE(sampling_strategy = 0.1)
# randomly undersample negative (majority) to reduce the number of negative to x% of the positive (minority)
under = RandomUnderSampler(sampling_strategy = 0.25) 

pipeline = make_pipeline(over, under, RandomForestClassifier(n_estimators = 260,
 min_samples_split =2,
 min_samples_leaf=1,
 max_features= 'auto',
 max_depth= 90,
 bootstrap= False))

rf_sampling_random = RandomizedSearchCV(pipeline,
                               param_distributions = random_grid,
                               scoring='roc_auc',
                                n_iter=1000,
                               cv = 3, 
                               verbose=2, 
                               n_jobs = -1,
                               return_train_score = True)

In [66]:
rf_sampling_random.fit(X_train,y_train)


Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


KeyboardInterrupt: 

In [31]:
rf_sampling_random.best_score_


0.8447047312198078

In [32]:
rf_sampling_random.best_params_

{'randomforestclassifier__bootstrap': False,
 'randomforestclassifier__criterion': 'entropy',
 'randomforestclassifier__max_depth': 40,
 'randomforestclassifier__max_features': 'sqrt',
 'randomforestclassifier__min_samples_leaf': 1,
 'randomforestclassifier__min_samples_split': 2,
 'randomforestclassifier__n_estimators': 330,
 'randomundersampler__sampling_strategy': 0.49999999999999994,
 'smote__sampling_strategy': 0.2}

In [62]:
# BALANCED BAGGING

# Number of trees in random forest
n_estimators = [int(x) for x in np.arange(190, 401, 20)]

# Method of selecting samples for training each tree
bootstrap = [True]

over_n = [float(x) for x in np.arange(0.25, 0.351, 0.05)]
# RandomUnderSampler undersampling
under_n = [float(x) for x in np.arange(0.35, 0.451, 0.05)]

# Create the random grid
bb_random_grid = {'balancedbaggingclassifier__n_estimators': n_estimators,
                 'balancedbaggingclassifier__bootstrap': bootstrap,
                 'smote__sampling_strategy':over_n,
              'randomundersampler__sampling_strategy':under_n}


# define pipeline
# oversample positive (minority) to be x% the number of negative (majority)
over = SMOTE(sampling_strategy = 0.1)
# randomly undersample negative (majority) to reduce the number of negative to x% of the positive (minority)
under = RandomUnderSampler(sampling_strategy = 0.25) 

pipeline = make_pipeline(over, under, BalancedBaggingClassifier(base_estimator=HistGradientBoostingClassifier(random_state=42)))

In [70]:
bb_random = RandomizedSearchCV(estimator = pipeline, 
                               param_distributions = bb_random_grid,
                               scoring='roc_auc',
                                n_iter=50,
                               cv = 3, 
                               verbose=2, 
                               n_jobs = -1,
                               return_train_score = True)

In [35]:
bb_random.fit(X_train, y_train)


Fitting 3 folds for each of 24 candidates, totalling 72 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('smote', SMOTE(sampling_strategy=0.1)),
                                       ('randomundersampler',
                                        RandomUnderSampler(sampling_strategy=0.25)),
                                       ('balancedbaggingclassifier',
                                        BalancedBaggingClassifier(base_estimator=HistGradientBoostingClassifier(random_state=42)))]),
             n_jobs=-1,
             param_grid={'balancedbaggingclassifier__bootstrap': [True],
                         'balancedbaggingclassifier__n_estimators': [190, 200,
                                                                     210, 220],
                         'randomundersampler__sampling_strategy': [0.35,
                                                                   0.39999999999999997,
                                                                   0.44999999999999996],
                         'smote__sampling_str

In [36]:
bb_random.best_score_

0.8481549542805858

In [37]:
bb_random.best_params_

{'balancedbaggingclassifier__bootstrap': True,
 'balancedbaggingclassifier__n_estimators': 220,
 'randomundersampler__sampling_strategy': 0.35,
 'smote__sampling_strategy': 0.3}

In [38]:
# LOGISTIC REGRESSION
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate



num_pipe = make_pipeline(
    StandardScaler()
)
cat_pipe = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
)

preprocessor_linear = make_column_transformer(
    (num_pipe, selector(dtype_include="number")),
    (cat_pipe, selector(dtype_include="category")),
    n_jobs=2,
)

lr_clf = make_pipeline(preprocessor_linear, LogisticRegression(max_iter=1000))
lr_clf.set_params(logisticregression__class_weight="balanced")

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(n_jobs=2,
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000024F9DFBC448>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000024F9C72DC08>)])),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

In [39]:
cv_result = cross_validate(lr_clf, X_train, y_train, scoring="roc_auc", cv=5)

In [40]:
cv_result['test_score'].mean()

0.8437581473293818

In [41]:
# Support Vector Classification with RandomOverSampling
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline 

num_pipe = make_pipeline(
    MinMaxScaler(feature_range=(0, 1))
)
cat_pipe = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
)

preprocessor_svc = make_column_transformer(
    (num_pipe, selector(dtype_include="number")),
    (cat_pipe, selector(dtype_include="category")),
    n_jobs=2,
)

smote = SMOTE(sampling_strategy=0.1)

rus = RandomUnderSampler(sampling_strategy=0.2)

svc_clf = make_pipeline(preprocessor_svc, smote, rus, SVC(kernel='rbf',C=1))

In [42]:
cv_result = cross_validate(svc_clf, X_train, y_train, scoring="roc_auc", cv=5)

In [43]:
cv_result['test_score'].mean()

0.823595647887335

In [44]:
parameters = {'smote__sampling_strategy':[float(x) for x in np.arange(0.2, 0.351, 0.05)],
              'randomundersampler__sampling_strategy':[float(x) for x in np.arange(0.35, 0.451, 0.05)],
              'svc__kernel':('linear', 'rbf', 'poly'), 
              'svc__C':[10,100,1000,5000,10000],
              'svc__gamma':[1,0.1,0.001,0.0001], 
              'svc__degree':[1,2,3]}

svc_grid = RandomizedSearchCV(svc_clf, param_distributions=parameters, n_iter=50, scoring="roc_auc", cv=3)

In [45]:
svc_grid.fit(X_train, y_train)


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=2,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('minmaxscaler',
                                                                                          MinMaxScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x0000024FA5CCBB08>),
                                                                        ('pipeline-2',
                                                                         Pipeline(steps=[('onehotencoder',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                            

In [46]:
svc_grid.best_score_

0.8665701923461099

In [47]:
svc_grid.best_params_

{'randomundersampler__sampling_strategy': 0.44999999999999996,
 'smote__sampling_strategy': 0.3,
 'svc__C': 1000,
 'svc__degree': 2,
 'svc__gamma': 0.1,
 'svc__kernel': 'rbf'}