# Speed test

In this notebook, we compare the performance of the ```smote_variants``` package with that of the ```imblearn``` package through the three oversamplers implemented in common. Note that the implementations contain different logic to determine the number of samples to be generated. Generally, ```imblearn``` implementations are more flexible, whereas ```smote_variants``` implementations are simpler to use.

In [1]:
import smote_variants as sv
import common_datasets.binary_classification as bin_clas

import matplotlib.pyplot as plt
import time
import numpy as np
import pandas as pd

import logging

# Raising the level of the 'smote_variants' logger to CRITICAL, so that records
# below that severity logged through it do not clutter the notebook output.
logger = logging.getLogger('smote_variants')
logger.setLevel(logging.CRITICAL)

2022-08-16 16:05:32.494081: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-08-16 16:05:32.666357: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-16 16:05:32.666386: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def measure(sv, datasets):
    """
    Measure the mean runtime of oversamplers over a set of datasets.

    Each oversampler class is instantiated without arguments and its
    ``sample`` method is called once per dataset; the instantiation is part
    of the timed region. A run raising an exception is reported on the
    standard output and is left out of the mean.

    Note that inside this function the parameter ``sv`` shadows any
    module-level name ``sv`` (e.g. an import alias).

    Args:
        sv (list(type)): oversampler classes; each must be callable without
            arguments and the created object must implement ``sample(X, y)``
        datasets (list(function)): dataset loading functions, each returning
            a dict with the keys 'name', 'data' and 'target'
    Returns:
        dict: maps the class name of each oversampler to a one-element list
            holding its mean runtime in seconds over the datasets on which
            it succeeded; oversamplers failing on every dataset are absent
    """

    runtimes = {}
    # iterating through all datasets
    for loader in datasets:
        data = loader()
        print('processing: %s' % data['name'])

        X = data['data']
        y = data['target']
        for sampler_class in sv:
            # an oversampler can fail on edge-case datasets; such a run is
            # reported and skipped instead of aborting the whole measurement
            try:
                # perf_counter is a monotonic, high resolution clock, hence
                # suited for measuring short intervals
                t0 = time.perf_counter()
                sampler_class().sample(X, y)
                elapsed = time.perf_counter() - t0
            except Exception as exception:
                print('failed', sampler_class.__name__, str(exception))
                continue

            # appending the result
            runtimes.setdefault(sampler_class.__name__, []).append(elapsed)

    # aggregating to the mean runtime, kept in a one-element list
    return {name: [np.mean(values)] for name, values in runtimes.items()}


In [3]:
# Measuring the runtime of every oversampler returned by sv.get_all_oversamplers()
# on the datasets selected by the loader filters below.
# NOTE(review): an earlier version of this comment referred to the techniques shared
# with imblearn and to 104 datasets; the code uses all smote_variants oversamplers
# and the recorded output lists 10 datasets -- confirm the intended scope.

sv_techniques= sv.get_all_oversamplers()

results= measure(sv_techniques,
                 bin_clas.get_filtered_data_loaders(n_bounds=(1, 200), 
                                                    n_attr_encoded_bounds=(1, 20)))

processing: cleveland-0_vs_4


2022-08-16 16:05:57.955971: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-08-16 16:05:57.956064: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-08-16 16:05:57.956089: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (daa): /proc/driver/nvidia/version does not exist
2022-08-16 16:05:57.956885: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


processing: glass-0-1-5_vs_2
processing: glass-0-1-6_vs_2
processing: glass-0-1-6_vs_5
processing: glass-0-4_vs_5
processing: glass-0-6_vs_5
processing: shuttle-c2-vs-c4
processing: winequality-white-9_vs_4
processing: iris0
processing: appendicitis


In [4]:
# Reporting the mean runtime measured for each oversampler, in seconds

print({name: runtimes[0] for name, runtimes in results.items()})

{'A_SUWO': 0.06874444484710693, 'ADASYN': 0.003873300552368164, 'ADG': 5.060082960128784, 'ADOMS': 0.0985719919204712, 'AHC': 0.001047801971435547, 'AMSCO': 19.409874176979066, 'AND_SMOTE': 0.00766148567199707, 'ANS': 0.0068471431732177734, 'ASMOBD': 0.03955373764038086, 'Assembled_SMOTE': 0.004727840423583984, 'Borderline_SMOTE1': 0.0040040254592895504, 'Borderline_SMOTE2': 0.0025308847427368162, 'CBSO': 0.007673573493957519, 'CCR': 0.0032512903213500976, 'CE_SMOTE': 0.1817408561706543, 'cluster_SMOTE': 0.02349236011505127, 'CURE_SMOTE': 0.0022810220718383787, 'DBSMOTE': 0.03819341659545898, 'DE_oversampling': 0.0767374038696289, 'DEAGO': 3.6618275880813598, 'distance_SMOTE': 0.003498554229736328, 'DSMOTE': 0.5715477705001831, 'DSRBF': 40.746605324745175, 'E_SMOTE': 0.19726550579071045, 'Edge_Det_SMOTE': 0.0034935712814331056, 'G_SMOTE': 0.007154273986816406, 'GASMOTE': 0.8189513683319092, 'Gaussian_SMOTE': 0.004134678840637207, 'polynom_fit_SMOTE_star': 0.0004878044128417969, 'polyno

In [5]:
sorted = dict(list(sorted(list(results.items()), key=lambda x: x[1][0])))

In [6]:
for key, item in sorted.items():
    print(f"'{key}': {item[0]},")

'NoSMOTE': 0.00015900135040283203,
'ROSE': 0.00033104419708251953,
'polynom_fit_SMOTE_bus': 0.00037889480590820314,
'polynom_fit_SMOTE_star': 0.0004878044128417969,
'MSMOTE': 0.0010436534881591796,
'AHC': 0.001047801971435547,
'SPY': 0.0010745525360107422,
'RWO_sampling': 0.0012566089630126954,
'Random_SMOTE': 0.0014075994491577148,
'NT_SMOTE': 0.0015106916427612305,
'Safe_Level_SMOTE': 0.0016789674758911134,
'polynom_fit_SMOTE_mesh': 0.0018918991088867187,
'MCT': 0.002189373970031738,
'CURE_SMOTE': 0.0022810220718383787,
'Gazzah': 0.002281856536865234,
'polynom_fit_SMOTE_poly': 0.002485060691833496,
'Borderline_SMOTE2': 0.0025308847427368162,
'SMOTE': 0.0029209613800048827,
'Selected_SMOTE': 0.003140377998352051,
'CCR': 0.0032512903213500976,
'SMOTE_RSB': 0.0032561779022216796,
'SMOTE_Cosine': 0.0034481048583984374,
'SMOTE_D': 0.0034512758255004885,
'Edge_Det_SMOTE': 0.0034935712814331056,
'distance_SMOTE': 0.003498554229736328,
'SMMO': 0.0038316965103149412,
'SMOTE_TomekLinks': 0.003