<center><h1  style="color:white; background-color:#000000; border-radius: 0px; padding:25px;"> Theoretical study of SMOTE </h1></center>

This notebook reproduces the numerical experiments presented in "Theoretical and experimental study of SMOTE: limitations and comparisons of rebalancing strategies".

<ins>Datasets used in the experiments:</ins>
- [Phoneme](#Phoneme)
- Pima
- Abalone
- Haberman
- Yeast


In [None]:
import os
import sys
sys.path.insert(1, os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
from pathlib import Path


from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler,NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE


from oversampling_strategies.oversampling_strategies import MGS, NoSampling, CVSmoteModel

from validation.classif_experiments import run_eval, subsample_to_ratio_indices,read_subsampling_indices, compute_metrics_several_protocols

In [None]:
# Folder handed to the subsampling, evaluation and metric helpers below as their
# output/input directory. Edit the path to store results elsewhere; the folder
# (and any missing parent) is created on the fly.
output_dir_path = "../saved_experiments"
Path(output_dir_path).mkdir(exist_ok=True, parents=True)
output_dir_path

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;" name='Phoneme'> Phoneme</h1></center>

<a name='Phoneme'></a>

## Run protocol :

In [None]:
from data.data import load_phoneme_data

# Full Phoneme dataset: the loader's two outputs are unpacked into X and y.
(X_phoneme, y_phoneme) = load_phoneme_data()

In [None]:
# Chained subsampling of the Phoneme data with ratios 0.2 -> 0.1 -> 0.01.
# Every step after the first starts from the indices kept by the previous step.
# NOTE(review): `ratio` is presumably the targeted minority-class proportion —
# confirm against subsample_to_ratio_indices.
indices_kept_20 = subsample_to_ratio_indices(
    X=X_phoneme,
    y=y_phoneme,
    ratio=0.2,
    seed_sub=11,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='phoneme_sub_original_to_20',
)

indices_kept_10 = subsample_to_ratio_indices(
    X=X_phoneme,
    y=y_phoneme,
    ratio=0.1,
    seed_sub=9,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='phoneme_sub_20_to_10',
    has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_20,
)

indices_kept_1 = subsample_to_ratio_indices(
    X=X_phoneme,
    y=y_phoneme,
    ratio=0.01,
    seed_sub=5,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='phoneme_sub_10_to_1',
    has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_10,
)

In [None]:
# Keep only the rows selected by the last (ratio=0.01) subsampling step.
X_phoneme_1 = X_phoneme[indices_kept_1, :]
y_phoneme_1 = y_phoneme[indices_kept_1]

In [None]:
# Plain random forest and its class-weighted counterpart.
# They are named `rf_model` / `balanced_rf_model` because these are the names
# referenced in `list_oversampling_and_params` below; defining them in this cell
# lets the Phoneme section run on a fresh kernel.
rf_model = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5)
balanced_rf_model = RandomForestClassifier(
    n_estimators=100, criterion='gini', class_weight='balanced', n_jobs=5)

# Splitter handed to CVSmoteModel. It is not seeded (random_state=None), so its
# folds change from one run to the next.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    splitter=splitter_stratified_cv,
    model=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5),
    list_k_max=15, list_k_step=1)

# Forest with max_depth tuned by grid search on ROC AUC.
# Defined here but not part of the list below.
list_depth = [5, 8, 11]
grid_RF = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=10),
    param_grid={'max_depth': list_depth}, scoring='roc_auc')

# One 4-tuple per compared strategy.
# NOTE(review): presumably (name, sampler, sampler params, classifier) — confirm
# against run_eval.
list_oversampling_and_params = [
    ('None', NoSampling(), {}, rf_model),
    ('c_weight', NoSampling(), {}, balanced_rf_model),
    ('RUS', RandomUnderSampler(sampling_strategy="majority", replacement=False), {}, rf_model),
    ('ROS', RandomOverSampler(sampling_strategy="minority"), {}, rf_model),
    ('NearMiss1', NearMiss(sampling_strategy="majority", version=1), {}, rf_model),
    ('BorderlineS_1', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-1"), {}, rf_model),
    ('BorderlineS_2', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-2"), {}, rf_model),
    ('SMOTE', SMOTE(sampling_strategy="minority"), {}, rf_model),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, rf_model),
]

In [None]:
init_name_file = '2024-02-14-RF100_phoneme_depthNone_test'
# 100 repetitions of the protocol on the ratio=0.01 Phoneme data. Repetition i uses
# a 5-fold stratified split seeded with 100 + i and the file name
# '<init_name_file><i>.npy'.
for i in range(100):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_phoneme_1,
        y=y_phoneme_1,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [None]:
# Metrics to evaluate, as (scoring function, label, input kind) triples.
# NOTE(review): 'pred' / 'proba' presumably select hard predictions vs. predicted
# probabilities — confirm against compute_metrics_several_protocols.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [None]:
# Gather the 100 Phoneme runs stored under `output_dir_path` into two result
# tables (kept as the "mean" and "std" frames).
df_final_mean_phoneme, df_final_std_phoneme = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_phoneme_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=100,
)

# Display the mean table with 3 decimals.
df_final_mean_phoneme.style.format(precision=3)

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Pima </h1></center>

## Run protocol :

In [None]:
from data.data import load_pima_data

# Full Pima dataset: the loader's two outputs are unpacked into X and y.
(X_pima, y_pima) = load_pima_data()

In [None]:
# Chained subsampling of the Pima data with ratios 0.2 -> 0.1; the second step
# starts from the indices kept by the first one.
indices_kept_20 = subsample_to_ratio_indices(
    X=X_pima,
    y=y_pima,
    ratio=0.2,
    seed_sub=15,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='pima_sub_original_to_20',
)

indices_kept_10 = subsample_to_ratio_indices(
    X=X_pima,
    y=y_pima,
    ratio=0.1,
    seed_sub=7,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='pima_sub_20_to_10',
    has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_20,
)


In [None]:
# Rebuild the ratio=0.1 Pima subsample from the 'pima_sub_20_to_10' file;
# get_indexes=False asks for the data (X, y) rather than the indices.
X_pima_10, y_pima_10 = read_subsampling_indices(
    X=X_pima,
    y=y_pima,
    dir_subsampling=output_dir_path,
    name_subsampling_file='pima_sub_20_to_10',
    get_indexes=False,
)

In [None]:
# Plain random forest and its class-weighted counterpart, shared by the entries
# of `list_oversampling_and_params` below.
rf_model = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5)
balanced_rf_model = RandomForestClassifier(
    n_estimators=100, criterion='gini', class_weight='balanced', n_jobs=5)

# Splitter handed to CVSmoteModel. It is not seeded (random_state=None), so its
# folds change from one run to the next.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    splitter=splitter_stratified_cv,
    model=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5),
    list_k_max=15, list_k_step=1)

# Forest with max_depth tuned by grid search on ROC AUC ('None_tuned_depth' entry).
list_depth = [4, 8, 12]
grid_RF = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=10),
    param_grid={'max_depth': list_depth}, scoring='roc_auc')

# One 4-tuple per compared strategy.
# NOTE(review): presumably (name, sampler, sampler params, classifier) — confirm
# against run_eval.
list_oversampling_and_params = [
    ('None', NoSampling(), {}, rf_model),
    ('c_weight', NoSampling(), {}, balanced_rf_model),
    ('RUS', RandomUnderSampler(sampling_strategy="majority", replacement=False), {}, rf_model),
    ('ROS', RandomOverSampler(sampling_strategy="minority"), {}, rf_model),
    ('NearMiss1', NearMiss(sampling_strategy="majority", version=1), {}, rf_model),
    ('BorderlineS_1', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-1"), {}, rf_model),
    ('BorderlineS_2', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-2"), {}, rf_model),
    ('SMOTE', SMOTE(sampling_strategy="minority"), {}, rf_model),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, rf_model),
    ('None_tuned_depth', NoSampling(), {}, grid_RF),
]

In [None]:
init_name_file = '2024-02-14-RF100_pima_depthNone_test'
# 100 repetitions of the protocol on the ratio=0.1 Pima data. Repetition i uses
# a 5-fold stratified split seeded with 100 + i and the file name
# '<init_name_file><i>.npy'.
for i in range(100):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_pima_10,
        y=y_pima_10,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [None]:
# Metrics to evaluate, as (scoring function, label, input kind) triples.
# NOTE(review): 'pred' / 'proba' presumably select hard predictions vs. predicted
# probabilities — confirm against compute_metrics_several_protocols.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [None]:
# Gather the 100 Pima runs stored under `output_dir_path` into two result
# tables (kept as the "mean" and "std" frames).
df_final_mean_pima, df_final_std_pima = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_pima_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=100,
)

# Display the mean table with 3 decimals.
df_final_mean_pima.style.format(precision=3)

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Abalone </h1></center>

## Run protocol :

In [None]:
from data.data import load_abalone_data

# Full Abalone dataset: the loader's two outputs are unpacked into X and y.
(X_abalone, y_abalone) = load_abalone_data()

In [None]:
# Plain random forest and its class-weighted counterpart.
# They are named `rf_model` / `balanced_rf_model` because these are the names
# referenced in `list_oversampling_and_params` below; defining them in this cell
# keeps the Abalone section independent of objects created in other sections.
rf_model = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5)
balanced_rf_model = RandomForestClassifier(
    n_estimators=100, criterion='gini', class_weight='balanced', n_jobs=5)

# Splitter handed to CVSmoteModel. It is not seeded (random_state=None), so its
# folds change from one run to the next.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    splitter=splitter_stratified_cv,
    model=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5),
    list_k_max=15, list_k_step=1)

# Forest with max_depth tuned by grid search on ROC AUC.
# Defined here but not part of the list below.
list_depth = [3, 7, 10, 13]
grid_RF = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=10),
    param_grid={'max_depth': list_depth}, scoring='roc_auc')

# One 4-tuple per compared strategy.
# NOTE(review): presumably (name, sampler, sampler params, classifier) — confirm
# against run_eval.
list_oversampling_and_params = [
    ('None', NoSampling(), {}, rf_model),
    ('c_weight', NoSampling(), {}, balanced_rf_model),
    ('RUS', RandomUnderSampler(sampling_strategy="majority", replacement=False), {}, rf_model),
    ('ROS', RandomOverSampler(sampling_strategy="minority"), {}, rf_model),
    ('NearMiss1', NearMiss(sampling_strategy="majority", version=1), {}, rf_model),
    ('BorderlineS_1', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-1"), {}, rf_model),
    ('BorderlineS_2', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-2"), {}, rf_model),
    ('SMOTE', SMOTE(sampling_strategy="minority"), {}, rf_model),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, rf_model),
]

In [None]:
init_name_file = '2024-02-14-RF100_abalone_depthNone_test'
# 100 repetitions of the protocol on the full Abalone data (no subsampling).
# Repetition i uses a 5-fold stratified split seeded with 100 + i and the file
# name '<init_name_file><i>.npy'.
for i in range(100):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_abalone,
        y=y_abalone,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [None]:
# Metrics to evaluate, as (scoring function, label, input kind) triples.
# NOTE(review): 'pred' / 'proba' presumably select hard predictions vs. predicted
# probabilities — confirm against compute_metrics_several_protocols.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [None]:
# Gather the 100 Abalone runs stored under `output_dir_path` into two result
# tables (kept as the "mean" and "std" frames).
df_final_mean_abalone, df_final_std_abalone = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_abalone_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=100,
)

# Display the mean table with 3 decimals.
df_final_mean_abalone.style.format(precision=3)

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Haberman </h1></center>

## Run protocol :

In [None]:
from data.data import load_haberman_data

# Full Haberman dataset: the loader's two outputs are unpacked into X and y.
(X_haberman, y_haberman) = load_haberman_data()

In [None]:
# Subsample Haberman with ratio=0.1 and keep only the selected rows.
# NOTE(review): the file name ends in '_to_20' although ratio=0.1 — looks like a
# copy-paste leftover; left unchanged so previously saved index files still match.
indices_kept_10 = subsample_to_ratio_indices(
    X=X_haberman,
    y=y_haberman,
    ratio=0.1,
    seed_sub=11,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='haberman_sub_original_to_20',
)
X_haberman_10 = X_haberman[indices_kept_10, :]
y_haberman_10 = y_haberman[indices_kept_10]

In [None]:
# Plain random forest and its class-weighted counterpart.
# They are named `rf_model` / `balanced_rf_model` because these are the names
# referenced in `list_oversampling_and_params` below; defining them in this cell
# keeps the Haberman section independent of objects created in other sections.
rf_model = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5)
balanced_rf_model = RandomForestClassifier(
    n_estimators=100, criterion='gini', class_weight='balanced', n_jobs=5)

# Splitter handed to CVSmoteModel. It is not seeded (random_state=None), so its
# folds change from one run to the next.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    splitter=splitter_stratified_cv,
    model=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5),
    list_k_max=15, list_k_step=1)

# Forest with max_depth tuned by grid search on ROC AUC.
# Defined here but not part of the list below.
list_depth = [5, 15, 19, 24, 29, 32]
grid_RF = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=10),
    param_grid={'max_depth': list_depth}, scoring='roc_auc')

# One 4-tuple per compared strategy.
# NOTE(review): presumably (name, sampler, sampler params, classifier) — confirm
# against run_eval.
list_oversampling_and_params = [
    ('None', NoSampling(), {}, rf_model),
    ('c_weight', NoSampling(), {}, balanced_rf_model),
    ('RUS', RandomUnderSampler(sampling_strategy="majority", replacement=False), {}, rf_model),
    ('ROS', RandomOverSampler(sampling_strategy="minority"), {}, rf_model),
    ('NearMiss1', NearMiss(sampling_strategy="majority", version=1), {}, rf_model),
    ('BorderlineS_1', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-1"), {}, rf_model),
    ('BorderlineS_2', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-2"), {}, rf_model),
    ('SMOTE', SMOTE(sampling_strategy="minority"), {}, rf_model),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, rf_model),
]

In [None]:
init_name_file = '2024-02-14-RF100_haberman_depthNone_test'
# 100 repetitions of the protocol on the ratio=0.1 Haberman data. Repetition i
# uses a 5-fold stratified split seeded with 100 + i and the file name
# '<init_name_file><i>.npy'.
for i in range(100):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_haberman_10,
        y=y_haberman_10,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [None]:

# Metrics to evaluate, as (scoring function, label, input kind) triples.
# precision_score, recall_score, roc_auc_score and compute_metrics_several_protocols
# are already imported in the notebook's first code cell, so they are not
# re-imported here.
# NOTE(review): 'pred' / 'proba' presumably select hard predictions vs. predicted
# probabilities — confirm against compute_metrics_several_protocols.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [None]:
# Gather the 100 Haberman runs stored under `output_dir_path` into two result
# tables (kept as the "mean" and "std" frames).
df_final_mean_haberman, df_final_std_haberman = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_haberman_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=100,
)

# Display the mean table with 3 decimals.
df_final_mean_haberman.style.format(precision=3)

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Yeast </h1></center>

In [None]:
from data.data import load_yeast_data

# Full Yeast dataset: the loader's two outputs are unpacked into X and y.
(X_yeast, y_yeast) = load_yeast_data()

In [None]:
# Subsample Yeast with ratio=0.01 and keep only the selected rows.
# The index variable is named after the ratio (`_1`), in line with the
# 'yeast_sub_original_to_1' file name and with X_yeast_1 / y_yeast_1.
indices_kept_1 = subsample_to_ratio_indices(
    X=X_yeast,
    y=y_yeast,
    ratio=0.01,
    seed_sub=15,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='yeast_sub_original_to_1',
)
X_yeast_1 = X_yeast[indices_kept_1, :]
y_yeast_1 = y_yeast[indices_kept_1]

In [None]:
# TODO: display a summary table gathering the metrics of all the datasets here.

<center><h1  style="color:white; background-color:#008b96; border-radius: 10px; padding:15px;"> END </h1></center>