<center><h1  style="color:white; background-color:#000000; border-radius: 0px; padding:25px;"> Theoretical study of SMOTE : </h1></center>

This notebook presents the protocols used for the numerical illustrations of the paper "Theoretical and experimental study of SMOTE: limitations and comparisons of rebalancing strategies".
It covers the classification protocol.

In [None]:
import os
import sys
# Insert the parent of the current working directory into the import path
# (at position 1, i.e. right after the first entry). Presumably this is what
# makes the project packages imported in this notebook (oversampling_strategies,
# validation, data) resolvable when it is run from its own sub-folder —
# confirm against the repository layout.
sys.path.insert(1, os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
from pathlib import Path


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import TimeSeriesSplit

from oversampling_strategies.oversampling_strategies import MGS
from oversampling_strategies.oversampling_strategies import NoSampling 
from oversampling_strategies.oversampling_strategies import CVSmoteModel
from oversampling_strategies.oversampling_strategies import BS_imb
from oversampling_strategies.oversampling_strategies import RUS_imb
from oversampling_strategies.oversampling_strategies import ROS_imb
from oversampling_strategies.oversampling_strategies import NM1_imb

from validation.classif_experiments import run_eval
from validation.classif_experiments import subsample_to_ratio_indices

In [None]:
# Experiment outputs go to <parent of the working directory>/saved_experiments.
# Edit `output_dir_path` here to store them somewhere else.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
output_dir_path = os.path.join(parent_dir, 'saved_experiments')

# Create the folder (and any missing parents); no error if it already exists.
Path(output_dir_path).mkdir(parents=True, exist_ok=True)

output_dir_path

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Pima </h1></center>

In [None]:
# NOTE(review): this cell only displays the path <parent of cwd>/data/externals/abalone.data.
# The value is not assigned to anything and it points at the abalone file although it sits
# in the Pima section — looks like a leftover check; confirm whether it can be removed.
os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), 'data','externals', 'abalone.data')

## Run protocol :

In [None]:
# Load the Pima data through the project loader. The two returned objects are
# indexed below as X_pima[rows, :] and y_pima[rows].
from data.data import load_pima_data
X_pima, y_pima = load_pima_data()

In [None]:


# Two chained sub-sampling steps on Pima: first with ratio=0.2, then with
# ratio=0.1 starting from the indices kept by the first step.
# (`ratio` is presumably the target minority-class proportion — confirm in
# validation.classif_experiments.)
indices_kept_20 = subsample_to_ratio_indices(
    X=X_pima,
    y=y_pima,
    ratio=0.2,
    seed_sub=15,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='pima_sub_original_to_20',
)

indices_kept_10 = subsample_to_ratio_indices(
    X=X_pima,
    y=y_pima,
    ratio=0.1,
    seed_sub=9,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='pima_sub_20_to_10',
    has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_20,
)

# Restrict features and labels to the rows kept after the second step.
X_pima_10 = X_pima[indices_kept_10, :]
y_pima_10 = y_pima[indices_kept_10]

In [None]:
# Settings shared by every random forest in this cell.
rf_common = {'n_estimators': 100, 'criterion': 'gini'}

# Plain forest and its class-weighted counterpart.
RF = RandomForestClassifier(n_jobs=5, **rf_common)
RF2 = RandomForestClassifier(class_weight='balanced', n_jobs=5, **rf_common)

# Splitter handed to CVSmoteModel. NOTE(review): shuffle=True with
# random_state=None means these folds are not reproducible between runs.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    splitter=splitter_stratified_cv,
    model=RandomForestClassifier(n_jobs=5, **rf_common),
    list_k_max=15,
    list_k_step=1,
)

# Forest whose max_depth is selected by grid search on ROC AUC.
list_depth = [4, 8, 12]
grid_RF = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=10, **rf_common),
    param_grid={'max_depth': list_depth},
    scoring='roc_auc',
)

# One tuple per compared strategy; presumably
# (label, sampling strategy, extra sampler parameters, classifier) —
# confirm against run_eval.
list_oversampling_and_params = [
    ('None', NoSampling(), {}, RF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, RF),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('RUS', RUS_imb(), {}, RF),
    ('ROS', ROS_imb(), {}, RF),
    ('c_weight', NoSampling(), {}, RF2),
    ('NearMiss1', NM1_imb(), {}, RF),
    ('BorderlineS_1', BS_imb(), {'K': 5, 'kind': "borderline-1"}, RF),
    ('BorderlineS_2', BS_imb(), {'K': 5, 'kind': "borderline-2"}, RF),
    ('None_tuned_depth', NoSampling(), {}, grid_RF),
]

In [None]:
# Two repetitions of the protocol on the sub-sampled Pima data. Each repetition
# uses its own shuffled 5-fold stratified split (random_state = 100 + i) and its
# own file name <init_name_file><i>.npy passed to run_eval.
init_name_file = '2024-02-14-RF100_pima_depthNone_test'
for i in range(2):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_pima_10,
        y=y_pima_10,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from validation.classif_experiments import compute_metrics_several_protocols

In [None]:
# Metrics to compute, one tuple each: (scoring function, label, 'pred' or 'proba').
# The third field presumably selects hard predictions vs. predicted
# probabilities as the metric input — confirm in compute_metrics_several_protocols.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [None]:
# Aggregate the Pima runs: same file prefix and number of repetitions (2)
# as the run-protocol loop above.
pima_summary = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_pima_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=2,
)
df_final_mean_pima, df_final_std_pima = pima_summary

df_final_mean_pima

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Phoneme </h1></center>

## Run protocol :

In [None]:
# Load the Phoneme data through the project loader. The two returned objects are
# indexed below as X_phoneme[rows, :] and y_phoneme[rows].
from data.data import load_phoneme_data
X_phoneme, y_phoneme = load_phoneme_data()

In [None]:
# Three chained sub-sampling steps on Phoneme: ratio 0.2 -> 0.1 -> 0.01, each
# step starting from the indices kept by the previous one.
# (`ratio` is presumably the target minority-class proportion — confirm in
# validation.classif_experiments.)
indices_kept_20 = subsample_to_ratio_indices(
    X=X_phoneme, y=y_phoneme, ratio=0.2, seed_sub=11,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='phoneme_sub_original_to_20')

# Fix: this step is the "20 -> 10" one (see the variable and file names) but was
# called with ratio=0.2, i.e. the same ratio as the previous step. It now uses
# ratio=0.1, like the equivalent Pima step.
indices_kept_10 = subsample_to_ratio_indices(
    X=X_phoneme, y=y_phoneme, ratio=0.1, seed_sub=9,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='phoneme_sub_20_to_10', has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_20)

indices_kept_1 = subsample_to_ratio_indices(
    X=X_phoneme, y=y_phoneme, ratio=0.01, seed_sub=5,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='phoneme_sub_10_to_1', has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_10)

# Restrict features and labels to the rows kept after the last step.
X_phoneme_1, y_phoneme_1 = X_phoneme[indices_kept_1, :], y_phoneme[indices_kept_1]

In [None]:
# Settings shared by every random forest in this cell.
rf_common = {'n_estimators': 100, 'criterion': 'gini'}

# Plain forest and its class-weighted counterpart.
RF = RandomForestClassifier(n_jobs=5, **rf_common)
RF2 = RandomForestClassifier(class_weight='balanced', n_jobs=5, **rf_common)

# Splitter handed to CVSmoteModel. NOTE(review): shuffle=True with
# random_state=None means these folds are not reproducible between runs.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    splitter=splitter_stratified_cv,
    model=RandomForestClassifier(n_jobs=5, **rf_common),
    list_k_max=15,
    list_k_step=1,
)

# Forest whose max_depth is selected by grid search on ROC AUC.
list_depth = [5, 8, 11]
grid_RF = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=10, **rf_common),
    param_grid={'max_depth': list_depth},
    scoring='roc_auc',
)

# One tuple per compared strategy; presumably
# (label, sampling strategy, extra sampler parameters, classifier) —
# confirm against run_eval.
list_oversampling_and_params = [
    ('None', NoSampling(), {}, RF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, RF),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('RUS', RUS_imb(), {}, RF),
    ('ROS', ROS_imb(), {}, RF),
    ('c_weight', NoSampling(), {}, RF2),
    ('NearMiss1', NM1_imb(), {}, RF),
    ('BorderlineS_1', BS_imb(), {'K': 5, 'kind': "borderline-1"}, RF),
    ('BorderlineS_2', BS_imb(), {'K': 5, 'kind': "borderline-2"}, RF),
    ('None_tuned_depth', NoSampling(), {}, grid_RF),
]

In [None]:
# 100 repetitions of the protocol on the sub-sampled Phoneme data. Each
# repetition uses its own shuffled 5-fold stratified split
# (random_state = 100 + i) and its own file name <init_name_file><i>.npy.
init_name_file = '2024-02-14-RF100_phoneme_depthNone_test'
for i in range(100):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_phoneme_1,
        y=y_phoneme_1,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from validation.classif_experiments import compute_metrics_several_protocols

In [None]:
# Metrics to compute, one tuple each: (scoring function, label, 'pred' or 'proba').
# The third field presumably selects hard predictions vs. predicted
# probabilities as the metric input — confirm in compute_metrics_several_protocols.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [None]:
# Aggregate the Phoneme runs: same file prefix and number of repetitions (100)
# as the run-protocol loop above.
phoneme_summary = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_phoneme_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=100,
)
df_final_mean_phoneme, df_final_std_phoneme = phoneme_summary

df_final_mean_phoneme

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Abalone </h1></center>

## Run protocol :

In [None]:
# Load the Abalone data through the project loader. Unlike the other datasets in
# this notebook, the two returned objects are passed to run_eval as-is
# (no sub-sampling step).
from data.data import load_abalone_data
X_abalone, y_abalone = load_abalone_data()

In [None]:
# Settings shared by every random forest in this cell.
rf_common = {'n_estimators': 100, 'criterion': 'gini'}

# Plain forest and its class-weighted counterpart.
RF = RandomForestClassifier(n_jobs=5, **rf_common)
RF2 = RandomForestClassifier(class_weight='balanced', n_jobs=5, **rf_common)

# Splitter handed to CVSmoteModel. NOTE(review): shuffle=True with
# random_state=None means these folds are not reproducible between runs.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    splitter=splitter_stratified_cv,
    model=RandomForestClassifier(n_jobs=5, **rf_common),
    list_k_max=15,
    list_k_step=1,
)

# Forest whose max_depth is selected by grid search on ROC AUC.
list_depth = [3, 7, 10, 13]
grid_RF = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=10, **rf_common),
    param_grid={'max_depth': list_depth},
    scoring='roc_auc',
)

# One tuple per compared strategy; presumably
# (label, sampling strategy, extra sampler parameters, classifier) —
# confirm against run_eval.
list_oversampling_and_params = [
    ('None', NoSampling(), {}, RF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, RF),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('RUS', RUS_imb(), {}, RF),
    ('ROS', ROS_imb(), {}, RF),
    ('c_weight', NoSampling(), {}, RF2),
    ('NearMiss1', NM1_imb(), {}, RF),
    ('BorderlineS_1', BS_imb(), {'K': 5, 'kind': "borderline-1"}, RF),
    ('BorderlineS_2', BS_imb(), {'K': 5, 'kind': "borderline-2"}, RF),
    ('None_tuned_depth', NoSampling(), {}, grid_RF),
]

In [None]:
# 100 repetitions of the protocol on the Abalone data. Each repetition uses its
# own shuffled 5-fold stratified split (random_state = 100 + i) and its own
# file name <init_name_file><i>.npy.
init_name_file = '2024-02-14-RF100_abalone_depthNone_test'
for i in range(100):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_abalone,
        y=y_abalone,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from validation.classif_experiments import compute_metrics_several_protocols
# Metrics to compute, one tuple each: (scoring function, label, 'pred' or 'proba').
# The third field presumably selects hard predictions vs. predicted
# probabilities as the metric input — confirm in compute_metrics_several_protocols.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [None]:
# Aggregate the Abalone runs: same file prefix and number of repetitions (100)
# as the run-protocol loop above.
abalone_summary = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_abalone_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=100,
)
df_final_mean_abalone, df_final_std_abalone = abalone_summary

df_final_mean_abalone

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Haberman </h1></center>

## Run protocol :

In [None]:
# Load the Haberman data through the project loader. The two returned objects are
# indexed below as X_haberman[rows, :] and y_haberman[rows].
from data.data import load_haberman_data
X_haberman,y_haberman = load_haberman_data()

In [None]:
# Sub-sample the Haberman data with ratio=0.1.
# (`ratio` is presumably the target minority-class proportion — confirm in
# validation.classif_experiments.)
#
# Fix: the indices were previously computed on the Phoneme arrays
# (X_phoneme / y_phoneme) and then used to index the Haberman arrays, and the
# call reused the Phoneme file name 'phoneme_sub_original_to_20'. The indices
# are now computed on the Haberman data and stored under their own name.
indices_kept_10 = subsample_to_ratio_indices(
    X=X_haberman, y=y_haberman, ratio=0.1, seed_sub=9,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='haberman_sub_original_to_10')

# Restrict features and labels to the kept rows.
X_haberman_10, y_haberman_10 = X_haberman[indices_kept_10, :], y_haberman[indices_kept_10]

In [None]:
# Settings shared by every random forest in this cell.
rf_common = {'n_estimators': 100, 'criterion': 'gini'}

# Plain forest and its class-weighted counterpart.
RF = RandomForestClassifier(n_jobs=5, **rf_common)
RF2 = RandomForestClassifier(class_weight='balanced', n_jobs=5, **rf_common)

# Splitter handed to CVSmoteModel. NOTE(review): shuffle=True with
# random_state=None means these folds are not reproducible between runs.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    splitter=splitter_stratified_cv,
    model=RandomForestClassifier(n_jobs=5, **rf_common),
    list_k_max=15,
    list_k_step=1,
)

# Forest whose max_depth is selected by grid search on ROC AUC.
list_depth = [5, 15, 19, 24, 29, 32]
grid_RF = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=10, **rf_common),
    param_grid={'max_depth': list_depth},
    scoring='roc_auc',
)

# One tuple per compared strategy; presumably
# (label, sampling strategy, extra sampler parameters, classifier) —
# confirm against run_eval.
list_oversampling_and_params = [
    ('None', NoSampling(), {}, RF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, RF),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('RUS', RUS_imb(), {}, RF),
    ('ROS', ROS_imb(), {}, RF),
    ('c_weight', NoSampling(), {}, RF2),
    ('NearMiss1', NM1_imb(), {}, RF),
    ('BorderlineS_1', BS_imb(), {'K': 5, 'kind': "borderline-1"}, RF),
    ('BorderlineS_2', BS_imb(), {'K': 5, 'kind': "borderline-2"}, RF),
    ('None_tuned_depth', NoSampling(), {}, grid_RF),
]

In [None]:
# 100 repetitions of the protocol on the sub-sampled Haberman data. Each
# repetition uses its own shuffled 5-fold stratified split
# (random_state = 100 + i) and its own file name <init_name_file><i>.npy.
# NOTE(review): the file prefix says 'ga4' although the data is Haberman; it is
# kept unchanged because the metrics cell reads files with this exact prefix.
init_name_file = '2024-02-14-RF100_ga4_depthNone_test'
for i in range(100):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_haberman_10,
        y=y_haberman_10,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [None]:

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from validation.classif_experiments import compute_metrics_several_protocols
# Metrics to compute, one tuple each: (scoring function, label, 'pred' or 'proba').
# The third field presumably selects hard predictions vs. predicted
# probabilities as the metric input — confirm in compute_metrics_several_protocols.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [None]:
# Aggregate the Haberman runs: same file prefix and number of repetitions (100)
# as the run-protocol loop above.
# NOTE(review): the 'ga4' prefix and variable names refer to the Haberman
# experiment; they are left as-is so the files written by the run loop are found.
ga4_summary = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_ga4_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=100,
)
df_final_mean_ga4, df_final_std_ga4 = ga4_summary

df_final_mean_ga4

<center><h1  style="color:white; background-color:#008b96; border-radius: 10px; padding:15px;"> END </h1></center>