# Integration with sklearn pipelines

In this notebook, we provide some illustrations of how the oversamplers can be integrated with sklearn pipelines.

In [1]:
import keras
import imblearn

import numpy as np

import smote_variants as sv
import imblearn.datasets as imb_datasets

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Seed shared by the notebook so that the stochastic steps are repeatable
random_seed = 3

Using TensorFlow backend.


## Preparing the data

In [2]:
# Seed NumPy's global random number generator with the notebook-wide seed
np.random.seed(seed=random_seed)

In [3]:
# Fetch the imblearn benchmark collection and pick the 'libras_move' entry
libras = imb_datasets.fetch_datasets()['libras_move']

# Feature matrix and target vector of the selected dataset
X = libras['data']
y = libras['target']

In [4]:
# Hold out 33% of the samples as a test set. No random_state is passed,
# so the shuffle draws from the global NumPy RNG seeded earlier.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
)

## Fitting a pipeline

In [5]:
# distance_SMOTE with its default parameters, wrapped in the multiclass
# oversampling driver
base_sampler = sv.distance_SMOTE()
oversampler = sv.MulticlassOversampling(base_sampler)

# k-nearest-neighbours classifier voting over 5 neighbours
classifier = KNeighborsClassifier(n_neighbors=5)

In [6]:
# Two-step pipeline: standardize the features, then hand them to the
# OversamplingClassifier, which combines the oversampler and the classifier
model = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('clf', sv.OversamplingClassifier(oversampler, classifier)),
])

In [7]:
# Fit on the training split only: fitting on the full (X, y) would let the
# held-out X_test / y_test leak into both the scaler statistics and the
# classifier, and would leave the train/test split above unused.
model.fit(X_train, y_train)

2019-07-28 15:48:30,348:INFO:MulticlassOversampling: Running multiclass oversampling with strategy equalize_1_vs_many_successive
2019-07-28 15:48:30,350:INFO:MulticlassOversampling: Sampling minority class with label: 1
2019-07-28 15:48:30,352:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")


Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', OversamplingClassifier(classifier=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
            oversampler=<smote_variants._smote_variants.MulticlassOversampling object at 0x7fa8d717ee48>))])

## Grid search

In [8]:
# Candidate values for the `oversampler` parameter of the pipeline's 'clf'
# step: distance_SMOTE instances differing only in `proportion`
param_grid = {
    'clf__oversampler': [
        sv.distance_SMOTE(proportion=proportion)
        for proportion in (0.5, 1.0, 1.5)
    ],
}

In [9]:
# Exhaustive search over param_grid, scored by accuracy with 3-fold
# cross-validation; runs in a single job and logs each fit (verbose=2)
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=1,
    verbose=2,
)

In [10]:
# Run the cross-validated grid search on the full dataset
grid.fit(X, y=y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
2019-07-28 15:48:30,993:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}")
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
2019-07-28 15:48:31,021:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}")
2019-07-28 15:48:31,043:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}")
2019-07-28 15:48:31,065:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2019-07-28 15:48:31,091:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2019-07-28 15:48:31,119:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2019-

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] clf__oversampler=('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}") 
[CV]  clf__oversampler=('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}"), total=   0.0s
[CV] clf__oversampler=('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}") 
[CV]  clf__oversampler=('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}"), total=   0.0s
[CV] clf__oversampler=('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}") 
[CV]  clf__oversampler=('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}"), total=   0.0s
[CV] clf__oversampler=('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}") 
[CV]  clf__oversampler=('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}"), total=   0.0s
[CV] clf__oversampler=('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}") 
[CV]  clf__oversa

2019-07-28 15:48:31,216:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1}")
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.3s finished
2019-07-28 15:48:31,253:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}")


[CV]  clf__oversampler=('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1}"), total=   0.0s
[CV] clf__oversampler=('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1}") 
[CV]  clf__oversampler=('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1}"), total=   0.0s


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', OversamplingClassifier(classifier=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
            oversampler=<smote_variants._smote_variants.MulticlassOversampling object at 0x7fa8d717ee48>))]),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'clf__oversampler': [<smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e5c0>, <smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e5f8>, <smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e630>]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=2)

In [11]:
# Report the best mean cross-validated score, followed by the raw
# per-candidate cross-validation results, one per line
print(grid.best_score_, grid.cv_results_, sep='\n')

0.9222222222222223
{'mean_fit_time': array([0.00849088, 0.01005777, 0.01242367]), 'std_fit_time': array([0.00347365, 0.00070242, 0.0003146 ]), 'mean_score_time': array([0.0064021 , 0.00667214, 0.00908041]), 'std_score_time': array([0.0005612 , 0.00025586, 0.00024809]), 'param_clf__oversampler': masked_array(data=[<smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e5c0>,
                   <smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e5f8>,
                   <smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e630>],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'clf__oversampler': <smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e5c0>}, {'clf__oversampler': <smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e5f8>}, {'clf__oversampler': <smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e630>}], 'split0_test_score': array([0.833