In [2]:
from imblearn.datasets import fetch_datasets
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
wine_quality_import = fetch_datasets()['wine_quality']

In [3]:
wine_quality = pd.DataFrame(wine_quality_import.data)

wine_quality['target'] = wine_quality_import.target

In [4]:
wine_quality.loc[wine_quality['target'] == -1, 'target'] = 0

In [5]:
X_train, X_test, y_train, y_test = train_test_split(wine_quality.drop('target',axis=1), wine_quality['target'], test_size=0.33, random_state=42)

In [6]:
# Random Forest
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector

num_pipe = SimpleImputer(strategy="mean", add_indicator=True)
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
)

preprocessor_tree = make_column_transformer(
    (num_pipe, selector(dtype_include="number")),
    (cat_pipe, selector(dtype_include="category")),
    n_jobs=2,
)

rf_clf = make_pipeline(
    preprocessor_tree, RandomForestClassifier()
)

In [7]:
## RANDOM PARAMETER GRID SEARCH
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 300, num = 20)]
# Number of features to consider at every split
max_features = ['auto']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 12, num = 3)]
# Minimum number of samples required at each leaf node
min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 16, num = 2)]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [50, 63, 76, 89, 102, 115, 128, 142, 155, 168, 181, 194, 207, 221, 234, 247, 260, 273, 286, 300], 'max_features': ['auto'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 7, 12], 'min_samples_leaf': [1, 16], 'bootstrap': [True, False]}


In [8]:
rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(), 
                               param_distributions = random_grid,
                               scoring='roc_auc',
                               n_iter = 200, 
                               cv = 3, verbose=2, 
                               random_state=42, 
                               n_jobs = -1,
                               return_train_score = True)

In [9]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=200,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto'],
                                        'min_samples_leaf': [1, 16],
                                        'min_samples_split': [2, 7, 12],
                                        'n_estimators': [50, 63, 76, 89, 102,
                                                         115, 128, 142, 155,
                                                         168, 181, 194, 207,
                                                         221, 234, 247, 260,
                                                         273, 286, 300]},
                   rand

In [10]:
rf_random.best_score_

0.8922213122241888

In [11]:
rf_random.best_params_

{'n_estimators': 207,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 70,
 'bootstrap': False}

In [221]:
brf_random = RandomizedSearchCV(estimator = BalancedRandomForestClassifier(), 
                               param_distributions = random_grid,
                               scoring='roc_auc',
                               n_iter = 200, 
                               cv = 3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1,
                               return_train_score = True)

In [222]:
brf_random.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


RandomizedSearchCV(cv=3, estimator=BalancedRandomForestClassifier(), n_iter=200,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto'],
                                        'min_samples_leaf': [1, 16],
                                        'min_samples_split': [2, 7, 12],
                                        'n_estimators': [50, 63, 76, 89, 102,
                                                         115, 128, 142, 155,
                                                         168, 181, 194, 207,
                                                         221, 234, 247, 260,
                                                         273, 286, 300]},
               

In [223]:
brf_random.best_score_

0.8716610261955227

In [224]:
brf_random.best_params_

{'n_estimators': 286,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 40,
 'bootstrap': False}

In [14]:
# BALANCED BAGGING

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 150, stop = 300, num = 5)]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
bb_random_grid = {'n_estimators': n_estimators,
                 'bootstrap': bootstrap}

In [13]:
bb_random = GridSearchCV(estimator = BalancedBaggingClassifier(base_estimator=HistGradientBoostingClassifier(random_state=42)), 
                               param_grid = bb_random_grid,
                               scoring='roc_auc',
                               cv = 3, verbose=2, 
                               n_jobs = -1,
                               return_train_score = True)

NameError: name 'BalancedBaggingClassifier' is not defined

In [None]:
bb_random.fit(X_train, y_train)
bb_random.best_score_

In [234]:
bb_random.best_params_

{'n_estimators': 207, 'bootstrap': True}

In [229]:
# LOGISTIC REGRESSION
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate



num_pipe = make_pipeline(
    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True)
)
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"),
)

preprocessor_linear = make_column_transformer(
    (num_pipe, selector(dtype_include="number")),
    (cat_pipe, selector(dtype_include="category")),
    n_jobs=2,
)

lr_clf = make_pipeline(preprocessor_linear, LogisticRegression(max_iter=1000))
lr_clf.set_params(logisticregression__class_weight="balanced")

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(n_jobs=2,
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler()),
                                                                  ('simpleimputer',
                                                                   SimpleImputer(add_indicator=True))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001B78E858C08>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant'))

In [230]:
cv_result = cross_validate(lr_clf, X_train, y_train, scoring="roc_auc", cv=5)

In [231]:
cv_result['test_score'].mean()

0.7959038021320113