# Integration with sklearn pipelines

In this notebook, provide some illustration for integration with sklearn pipelines.

In [1]:
import keras
import imblearn

import numpy as np

import smote_variants as sv
import imblearn.datasets as imb_datasets

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

random_seed= 3

2022-08-15 13:24:39.187491: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-08-15 13:24:39.194945: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-15 13:24:39.194970: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Preparing the data

In [None]:
np.random.seed(random_seed)

In [None]:
libras= imb_datasets.fetch_datasets()['libras_move']
X, y= libras['data'], libras['target']

In [None]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.33)

## Fitting a pipeline

In [None]:
oversampler = ('smote_variants', 'MulticlassOversampling', 
                {'oversampler': 'distance_SMOTE', 'oversampler_params': {}})

classifier = ('sklearn.neighbors', 'KNeighborsClassifier', {})

#oversampler= sv.MulticlassOversampling('distance_SMOTE')
#classifier= KNeighborsClassifier(n_neighbors= 5)

In [None]:
model= Pipeline([('scale', StandardScaler()), 
            ('clf', sv.classifiers.OversamplingClassifier(oversampler, classifier))])

In [None]:
model.fit(X, y)

2022-08-15 11:07:39,755:INFO:MulticlassOversampling: Running multiclass oversampling with strategy eq_1_vs_many_successive
2022-08-15 11:07:39,757:INFO:MulticlassOversampling: Sampling minority class with label: 1
2022-08-15 11:07:39,760:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-15 11:07:39,762:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-15 11:07:39,763:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-15 11:07:40,021:INFO:distance_SMOTE: simplex sampling with n_dim 2


## Grid search

In [None]:
param_grid= {'clf__oversampler':[('smote_variants', 'distance_SMOTE', {'proportion': 0.5}),
                                 ('smote_variants', 'distance_SMOTE', {'proportion': 1.0}),
                                 ('smote_variants', 'distance_SMOTE', {'proportion': 1.5})]}

In [None]:
grid= GridSearchCV(model, param_grid= param_grid, cv= 3, n_jobs= 1, verbose= 2, scoring= 'accuracy')

In [None]:
grid.fit(X, y)

2022-08-15 11:07:40,813:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-15 11:07:40,815:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-15 11:07:40,816:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-15 11:07:40,969:INFO:distance_SMOTE: simplex sampling with n_dim 2


Fitting 3 folds for each of 3 candidates, totalling 9 fits


2022-08-15 11:07:41,240:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-15 11:07:41,242:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-15 11:07:41,245:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-15 11:07:41,369:INFO:distance_SMOTE: simplex sampling with n_dim 2


[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 0.5}); total time=   0.4s


2022-08-15 11:07:41,637:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-15 11:07:41,639:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-15 11:07:41,649:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-15 11:07:41,805:INFO:distance_SMOTE: simplex sampling with n_dim 2


[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 0.5}); total time=   0.4s


2022-08-15 11:07:42,080:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-15 11:07:42,081:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-15 11:07:42,082:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-15 11:07:42,229:INFO:distance_SMOTE: simplex sampling with n_dim 2


[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 0.5}); total time=   0.4s


2022-08-15 11:07:42,476:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-15 11:07:42,478:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-15 11:07:42,479:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-15 11:07:42,619:INFO:distance_SMOTE: simplex sampling with n_dim 2


[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 1.0}); total time=   0.4s


2022-08-15 11:07:42,798:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-15 11:07:42,800:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-15 11:07:42,801:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-15 11:07:42,899:INFO:distance_SMOTE: simplex sampling with n_dim 2


[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 1.0}); total time=   0.3s


2022-08-15 11:07:43,122:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-15 11:07:43,124:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-15 11:07:43,125:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-15 11:07:43,218:INFO:distance_SMOTE: simplex sampling with n_dim 2


[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 1.0}); total time=   0.3s


2022-08-15 11:07:43,438:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-15 11:07:43,440:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-15 11:07:43,441:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-15 11:07:43,532:INFO:distance_SMOTE: simplex sampling with n_dim 2


[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 1.5}); total time=   0.3s


2022-08-15 11:07:43,659:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-15 11:07:43,663:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-15 11:07:43,665:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-15 11:07:43,753:INFO:distance_SMOTE: simplex sampling with n_dim 2
2022-08-15 11:07:43,848:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-15 11:07:43,850:INFO:NearestNeighborsWithMetricTensor: NN fitting wit

[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 1.5}); total time=   0.2s
[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 1.5}); total time=   0.2s


2022-08-15 11:07:43,967:INFO:distance_SMOTE: simplex sampling with n_dim 2


In [None]:
print(grid.best_score_)
print(grid.cv_results_)

0.9138888888888889
{'mean_fit_time': array([0.16049767, 0.13478096, 0.10484409]), 'std_fit_time': array([0.02110984, 0.02172018, 0.00081145]), 'mean_score_time': array([0.26054605, 0.21101316, 0.13537391]), 'std_score_time': array([0.00443091, 0.02703684, 0.05381506]), 'param_clf__oversampler': masked_array(data=[('smote_variants', 'distance_SMOTE', {'proportion': 0.5}),
                   ('smote_variants', 'distance_SMOTE', {'proportion': 1.0}),
                   ('smote_variants', 'distance_SMOTE', {'proportion': 1.5})],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'clf__oversampler': ('smote_variants', 'distance_SMOTE', {'proportion': 0.5})}, {'clf__oversampler': ('smote_variants', 'distance_SMOTE', {'proportion': 1.0})}, {'clf__oversampler': ('smote_variants', 'distance_SMOTE', {'proportion': 1.5})}], 'split0_test_score': array([0.81666667, 0.81666667, 0.81666667]), 'split1_test_score': array([0.94166667, 0.94166667, 0.958