<center><h1  style="color:white; background-color:#000000; border-radius: 0px; padding:25px;"> Theoretical study of SMOTE </h1></center>

This notebook reproduces the numerical experiments presented in "Theoretical and experimental study of SMOTE: limitations and comparisons of rebalancing strategies".

<ins>Datasets used in the experiments:</ins>
- [Phoneme](#Phoneme)
- Pima
- Abalone
- Haberman
- Yeast
- MagicTel
- California
- House_16H


In [1]:
import os
import sys
sys.path.insert(1, os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
from pathlib import Path


from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler,NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE


from oversampling_strategies.oversampling_strategies import MGS, NoSampling, CVSmoteModel

from validation.classif_experiments import run_eval, subsample_to_ratio_indices,read_subsampling_indices, compute_metrics_several_protocols

In [2]:
# Folder that receives every artefact written by this notebook (subsampling
# files and experiment results). It is created on first use.
output_dir_path = "../saved_experiments"  ## Fill it
os.makedirs(output_dir_path, exist_ok=True)
output_dir_path

'../saved_experiments'

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;" name='Phoneme'> Phoneme</h1></center>

## <a name='Phoneme'></a>

## Run protocol :

In [None]:
# Load the Phoneme features (X) and labels (y) through the project loader.
from data.data import load_phoneme_data
X_phoneme, y_phoneme = load_phoneme_data()

In [None]:
# Three nested subsampling steps (ratio 0.2, then 0.1, then 0.01). Each later
# step is given the indices kept by the previous one, and every step is asked
# to use its own file name inside output_dir_path.
indices_kept_20 = subsample_to_ratio_indices(
    X=X_phoneme, y=y_phoneme,
    ratio=0.2, seed_sub=11,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='phoneme_sub_original_to_20',
)

indices_kept_10 = subsample_to_ratio_indices(
    X=X_phoneme, y=y_phoneme,
    ratio=0.1, seed_sub=9,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='phoneme_sub_20_to_10',
    has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_20,
)

indices_kept_1 = subsample_to_ratio_indices(
    X=X_phoneme, y=y_phoneme,
    ratio=0.01, seed_sub=5,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='phoneme_sub_10_to_1',
    has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_10,
)

In [None]:
# Keep only the rows selected by the last (ratio=0.01) subsampling step.
X_phoneme_1 = X_phoneme[indices_kept_1, :]
y_phoneme_1 = y_phoneme[indices_kept_1]

In [None]:
# Plain forest and its class-weighted twin; every strategy below that names
# rf_model shares this one estimator instance.
rf_model = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5)
balanced_rf_model = RandomForestClassifier(
    n_estimators=100, criterion='gini', class_weight='balanced', n_jobs=5,
)

# CVSmoteModel wraps its own forest and an unseeded (random_state=None)
# 5-fold splitter.
# NOTE(review): list_k_max / list_k_step presumably describe the grid of SMOTE
# neighbour counts that gets cross-validated -- confirm in oversampling_strategies.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    model=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5),
    splitter=splitter_stratified_cv,
    list_k_max=15,
    list_k_step=1,
)

# Depth-tuned forest: built here but not referenced by the strategy list below.
list_depth = [5, 8, 11]
grid_RF = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=10),
    param_grid={'max_depth': list_depth},
    scoring='roc_auc',
)

# One tuple per strategy: (label, resampler, params dict, classifier).
list_oversampling_and_params = [
    ('None', NoSampling(), {}, rf_model),
    ('c_weight', NoSampling(), {}, balanced_rf_model),
    ('RUS', RandomUnderSampler(sampling_strategy="majority", replacement=False), {}, rf_model),
    ('ROS', RandomOverSampler(sampling_strategy="minority"), {}, rf_model),
    ('NearMiss1', NearMiss(sampling_strategy="majority", version=1), {}, rf_model),
    ('BorderlineS_1', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-1"), {}, rf_model),
    ('BorderlineS_2', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-2"), {}, rf_model),
    ('SMOTE', SMOTE(sampling_strategy="minority"), {}, rf_model),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, rf_model),
]

In [None]:
init_name_file = '2024-02-14-RF100_phoneme_depthNone_test'
# 100 repetitions: repetition i gets a 5-fold stratified split seeded with
# 100 + i and a result file name suffixed with i.
for i in range(100):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_phoneme_1,
        y=y_phoneme_1,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [None]:
# Each entry is (metric callable, metric label, output kind).
# NOTE(review): 'pred' / 'proba' presumably select hard predictions versus
# predicted probabilities -- confirm in validation.classif_experiments.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [None]:
# Aggregate the Phoneme runs (same file prefix as the run loop, n_iter=100)
# into the two tables returned by compute_metrics_several_protocols.
df_final_mean_phoneme, df_final_std_phoneme = compute_metrics_several_protocols(
    init_name_file='2024-02-14-RF100_phoneme_depthNone_test',
    n_iter=100,
    output_dir=output_dir_path,
    list_metric=list_metric,
    bool_roc_auc_only=True,
)

# Show the mean table with three decimals.
df_final_mean_phoneme.style.format(precision=3)

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Pima </h1></center>

## Run protocol :

In [3]:
# Load the Pima features (X) and labels (y) through the project loader.
from data.data import load_pima_data
X_pima, y_pima = load_pima_data()

In [4]:
# Two nested subsampling steps (ratio 0.2, then 0.1); the second step is given
# the indices kept by the first one.
indices_kept_20 = subsample_to_ratio_indices(
    X=X_pima, y=y_pima,
    ratio=0.2, seed_sub=15,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='pima_sub_original_to_20',
)

indices_kept_10 = subsample_to_ratio_indices(
    X=X_pima, y=y_pima,
    ratio=0.1, seed_sub=7,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='pima_sub_20_to_10',
    has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_20,
)


In [5]:
# Rebuild the subsampled Pima data from the 'pima_sub_20_to_10' file;
# get_indexes=False asks for the (X, y) pair rather than the indices.
X_pima_10, y_pima_10 = read_subsampling_indices(
    X=X_pima,
    y=y_pima,
    dir_subsampling=output_dir_path,
    name_subsampling_file='pima_sub_20_to_10',
    get_indexes=False,
)

In [6]:
# Show the indices kept by the ratio=0.1 subsampling step.
indices_kept_10

array([338, 306, 397, 155,  99, 702, 662,  22, 314,  72, 414, 386, 360,
         9,  19,  53, 635, 696, 370, 378, 366, 284, 440, 242, 638, 165,
       719, 185, 614, 363, 291, 493, 750, 237, 283, 611, 321, 604, 683,
       255, 317, 355,  70,  56, 618, 502,  17, 109, 757, 359, 402, 612,
       539, 116, 748,   1,   3,   5,   7,  10,  12,  18,  20,  21,  27,
        28,  29,  30,  32,  33,  34,  35,  36,  40,  41,  42,  44,  46,
        47,  49,  50,  51,  52,  54,  55,  57,  58,  59,  60,  62,  63,
        65,  67,  68,  69,  71,  73,  74,  75,  76,  77,  79,  80,  81,
        82,  83,  85,  86,  87,  89,  90,  91,  92,  94,  95,  96,  97,
        98, 101, 102, 103, 104, 105, 106, 107, 108, 112, 113, 117, 118,
       119, 121, 122, 123, 126, 127, 133, 134, 135, 136, 137, 138, 139,
       140, 141, 142, 144, 145, 146, 147, 148, 149, 150, 151, 153, 156,
       157, 158, 160, 161, 162, 163, 166, 167, 168, 169, 172, 173, 174,
       176, 178, 180, 181, 182, 183, 184, 190, 191, 194, 196, 20

In [21]:
# Plain forest and its class-weighted twin; every strategy below that names
# rf_model shares this one estimator instance.
rf_model = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5)
balanced_rf_model = RandomForestClassifier(
    n_estimators=100, criterion='gini', class_weight='balanced', n_jobs=5,
)

# CVSmoteModel wraps its own forest and an unseeded (random_state=None)
# 5-fold splitter.
# NOTE(review): list_k_max / list_k_step presumably describe the grid of SMOTE
# neighbour counts that gets cross-validated -- confirm in oversampling_strategies.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    model=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5),
    splitter=splitter_stratified_cv,
    list_k_max=15,
    list_k_step=1,
)

# Depth-tuned forest: only the commented-out 'None_tuned_depth' entry below
# refers to it.
list_depth = [4, 8, 12]
grid_RF = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=10),
    param_grid={'max_depth': list_depth},
    scoring='roc_auc',
)

# One tuple per strategy: (label, resampler, params dict, classifier).
list_oversampling_and_params = [
    ('None', NoSampling(), {}, rf_model),
    ('c_weight', NoSampling(), {}, balanced_rf_model),
    ('RUS', RandomUnderSampler(sampling_strategy="majority", replacement=False), {}, rf_model),
    ('ROS', RandomOverSampler(sampling_strategy="minority"), {}, rf_model),
    ('NearMiss1', NearMiss(sampling_strategy="majority", version=1), {}, rf_model),
    ('BorderlineS_1', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-1"), {}, rf_model),
    ('BorderlineS_2', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-2"), {}, rf_model),
    ('SMOTE', SMOTE(sampling_strategy="minority"), {}, rf_model),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, rf_model),
    #('None_tuned_depth', NoSampling(), {}, grid_RF),
]

In [22]:
init_name_file = '2024-02-14-RF100_pima_depthNone_test'
# Run all 20 repetitions (suffixes 0..19). The metrics cell of this section is
# called with n_iter=20, so looping only over 10..19 made a fresh
# "Restart & Run All" depend on files 0..9 left behind by an earlier session.
# NOTE(review): assumes compute_metrics_several_protocols reads the files
# suffixed 0..n_iter-1, as the other sections of this notebook suggest.
for i in range(20):
    # Repetition i: 5-fold stratified split seeded with 100 + i.
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    name_file = init_name_file + str(i) + '.npy'
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_pima_10,
        y=y_pima_10,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [23]:
# Each entry is (metric callable, metric label, output kind).
# NOTE(review): 'pred' / 'proba' presumably select hard predictions versus
# predicted probabilities -- confirm in validation.classif_experiments.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [24]:
# Aggregate the Pima runs (same file prefix as the run loop, n_iter=20) into
# the two tables returned by compute_metrics_several_protocols.
df_final_mean_pima, df_final_std_pima = compute_metrics_several_protocols(
    init_name_file='2024-02-14-RF100_pima_depthNone_test',
    n_iter=20,
    output_dir=output_dir_path,
    list_metric=list_metric,
    bool_roc_auc_only=True,
)

# Show the mean table with three decimals.
df_final_mean_pima.style.format(precision=3)

Unnamed: 0,None,c_weight,RUS,ROS,NearMiss1,BorderlineS_1,BorderlineS_2,SMOTE,cv_smote,MGS
ROC AUC,0.762,0.769,0.766,0.761,0.737,0.768,0.765,0.764,0.76,0.754


<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Abalone </h1></center>

## Run protocol :

In [3]:
# Load the Abalone features (X) and labels (y) through the project loader.
from data.data import load_abalone_data
X_abalone, y_abalone = load_abalone_data()

In [4]:
# Peek at the Abalone feature array.
X_abalone

array([[0.455 , 0.365 , 0.095 , ..., 0.2245, 0.101 , 0.15  ],
       [0.35  , 0.265 , 0.09  , ..., 0.0995, 0.0485, 0.07  ],
       [0.53  , 0.42  , 0.135 , ..., 0.2565, 0.1415, 0.21  ],
       ...,
       [0.6   , 0.475 , 0.205 , ..., 0.5255, 0.2875, 0.308 ],
       [0.625 , 0.485 , 0.15  , ..., 0.531 , 0.261 , 0.296 ],
       [0.71  , 0.555 , 0.195 , ..., 0.9455, 0.3765, 0.495 ]])

In [5]:
# Count how many samples carry each label in y_abalone.
from collections import Counter
Counter(y_abalone)

Counter({0: 4135, 1: 42})

In [8]:
# Plain forest and its class-weighted twin; every strategy below that names
# rf_model shares this one estimator instance.
rf_model = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5)
balanced_rf_model = RandomForestClassifier(
    n_estimators=100, criterion='gini', class_weight='balanced', n_jobs=5,
)

# CVSmoteModel wraps its own forest and an unseeded (random_state=None)
# 5-fold splitter.
# NOTE(review): list_k_max / list_k_step presumably describe the grid of SMOTE
# neighbour counts that gets cross-validated -- confirm in oversampling_strategies.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    model=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5),
    splitter=splitter_stratified_cv,
    list_k_max=15,
    list_k_step=1,
)

# Depth-tuned forest: built here but not referenced by the strategy list below.
list_depth = [3, 7, 10, 13]
grid_RF = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=10),
    param_grid={'max_depth': list_depth},
    scoring='roc_auc',
)

# One tuple per strategy: (label, resampler, params dict, classifier).
list_oversampling_and_params = [
    ('None', NoSampling(), {}, rf_model),
    ('c_weight', NoSampling(), {}, balanced_rf_model),
    ('RUS', RandomUnderSampler(sampling_strategy="majority", replacement=False), {}, rf_model),
    ('ROS', RandomOverSampler(sampling_strategy="minority"), {}, rf_model),
    ('NearMiss1', NearMiss(sampling_strategy="majority", version=1), {}, rf_model),
    ('BorderlineS_1', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-1"), {}, rf_model),
    ('BorderlineS_2', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-2"), {}, rf_model),
    ('SMOTE', SMOTE(sampling_strategy="minority"), {}, rf_model),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, rf_model),
]

In [9]:
init_name_file = '2024-02-14-RF100_abalone_depthNone_test'
# A single repetition (i = 0): 5-fold stratified split seeded with 100 + i,
# result file name suffixed with i. The full Abalone data is used as is.
for i in range(1):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_abalone,
        y=y_abalone,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [10]:
# Each entry is (metric callable, metric label, output kind).
# NOTE(review): 'pred' / 'proba' presumably select hard predictions versus
# predicted probabilities -- confirm in validation.classif_experiments.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [11]:
# Aggregate the Abalone runs (same file prefix as the run loop, n_iter=1) into
# the two tables returned by compute_metrics_several_protocols.
df_final_mean_abalone, df_final_std_abalone = compute_metrics_several_protocols(
    init_name_file='2024-02-14-RF100_abalone_depthNone_test',
    n_iter=1,
    output_dir=output_dir_path,
    list_metric=list_metric,
    bool_roc_auc_only=True,
)

# Show the mean table with three decimals.
df_final_mean_abalone.style.format(precision=3)

Unnamed: 0,None,c_weight,RUS,ROS,NearMiss1,BorderlineS_1,BorderlineS_2,SMOTE,cv_smote,MGS
ROC AUC,0.69,0.653,0.764,0.67,0.563,0.74,0.763,0.746,0.782,0.81


<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Haberman </h1></center>

## Run protocol :

In [12]:
# Load the Haberman features (X) and labels (y) through the project loader.
from data.data import load_haberman_data
X_haberman,y_haberman = load_haberman_data()

In [13]:
# Single subsampling step at ratio=0.1, followed by the matching row selection.
# NOTE(review): the file name ends in '_to_20' although ratio=0.1 is requested
# -- confirm which of the two is intended before renaming anything.
indices_kept_10 = subsample_to_ratio_indices(
    X=X_haberman, y=y_haberman,
    ratio=0.1, seed_sub=11,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='haberman_sub_original_to_20',
)
X_haberman_10 = X_haberman[indices_kept_10, :]
y_haberman_10 = y_haberman[indices_kept_10]

In [14]:
# Plain forest and its class-weighted twin; every strategy below that names
# rf_model shares this one estimator instance.
rf_model = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5)
balanced_rf_model = RandomForestClassifier(
    n_estimators=100, criterion='gini', class_weight='balanced', n_jobs=5,
)

# CVSmoteModel wraps its own forest and an unseeded (random_state=None)
# 5-fold splitter.
# NOTE(review): list_k_max / list_k_step presumably describe the grid of SMOTE
# neighbour counts that gets cross-validated -- confirm in oversampling_strategies.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    model=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5),
    splitter=splitter_stratified_cv,
    list_k_max=15,
    list_k_step=1,
)

# Depth-tuned forest: built here but not referenced by the strategy list below.
list_depth = [5, 15, 19, 24, 29, 32]
grid_RF = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=10),
    param_grid={'max_depth': list_depth},
    scoring='roc_auc',
)

# One tuple per strategy: (label, resampler, params dict, classifier).
list_oversampling_and_params = [
    ('None', NoSampling(), {}, rf_model),
    ('c_weight', NoSampling(), {}, balanced_rf_model),
    ('RUS', RandomUnderSampler(sampling_strategy="majority", replacement=False), {}, rf_model),
    ('ROS', RandomOverSampler(sampling_strategy="minority"), {}, rf_model),
    ('NearMiss1', NearMiss(sampling_strategy="majority", version=1), {}, rf_model),
    ('BorderlineS_1', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-1"), {}, rf_model),
    ('BorderlineS_2', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-2"), {}, rf_model),
    ('SMOTE', SMOTE(sampling_strategy="minority"), {}, rf_model),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, rf_model),
]

In [15]:
init_name_file = '2024-02-14-RF100_haberman_depthNone_test'
# A single repetition (i = 0): 5-fold stratified split seeded with 100 + i,
# result file name suffixed with i.
for i in range(1):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_haberman_10,
        y=y_haberman_10,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [16]:
# These names are also imported by the first cell of the notebook; the imports
# are repeated here so this cell can be run on its own.
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from validation.classif_experiments import compute_metrics_several_protocols

# Each entry is (metric callable, metric label, output kind).
# NOTE(review): 'pred' / 'proba' presumably select hard predictions versus
# predicted probabilities -- confirm in validation.classif_experiments.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [17]:
# Aggregate the Haberman runs (same file prefix as the run loop, n_iter=1)
# into the two tables returned by compute_metrics_several_protocols.
df_final_mean_haberman, df_final_std_haberman = compute_metrics_several_protocols(
    init_name_file='2024-02-14-RF100_haberman_depthNone_test',
    n_iter=1,
    output_dir=output_dir_path,
    list_metric=list_metric,
    bool_roc_auc_only=True,
)

# Show the mean table with three decimals.
df_final_mean_haberman.style.format(precision=3)

Unnamed: 0,None,c_weight,RUS,ROS,NearMiss1,BorderlineS_1,BorderlineS_2,SMOTE,cv_smote,MGS
ROC AUC,0.706,0.722,0.752,0.712,0.656,0.733,0.702,0.745,0.764,0.76


<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Yeast </h1></center>

In [14]:
# Load the Yeast features (X) and labels (y) through the project loader.
from data.data import load_yeast_data
X_yeast,y_yeast = load_yeast_data()

In [15]:
# Single subsampling step at ratio=0.01, followed by the matching row selection.
# The indices are stored as indices_kept_1 (they were previously bound to
# indices_kept_10, which mislabelled a ratio=0.01 selection and clobbered the
# name the other sections use for their ratio=0.1 step).
indices_kept_1 = subsample_to_ratio_indices(
    X=X_yeast, y=y_yeast,
    ratio=0.01, seed_sub=15,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='yeast_sub_original_to_1',
)
X_yeast_1, y_yeast_1 = X_yeast[indices_kept_1, :], y_yeast[indices_kept_1]

In [16]:
# Plain forest and its class-weighted twin; every strategy below that names
# rf_model shares this one estimator instance.
rf_model = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5)
balanced_rf_model = RandomForestClassifier(
    n_estimators=100, criterion='gini', class_weight='balanced', n_jobs=5,
)

# CVSmoteModel wraps its own forest and an unseeded (random_state=None)
# 5-fold splitter. list_k_max is 7 here, whereas the other sections use 15.
# NOTE(review): list_k_max / list_k_step presumably describe the grid of SMOTE
# neighbour counts that gets cross-validated -- confirm in oversampling_strategies.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    model=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5),
    splitter=splitter_stratified_cv,
    list_k_max=7,
    list_k_step=1,
)

# One tuple per strategy: (label, resampler, params dict, classifier).
list_oversampling_and_params = [
    ('None', NoSampling(), {}, rf_model),
    ('c_weight', NoSampling(), {}, balanced_rf_model),
    ('RUS', RandomUnderSampler(sampling_strategy="majority", replacement=False), {}, rf_model),
    ('ROS', RandomOverSampler(sampling_strategy="minority"), {}, rf_model),
    ('NearMiss1', NearMiss(sampling_strategy="majority", version=1), {}, rf_model),
    ('BorderlineS_1', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-1"), {}, rf_model),
    ('BorderlineS_2', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-2"), {}, rf_model),
    ('SMOTE', SMOTE(sampling_strategy="minority"), {}, rf_model),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, rf_model),
]

In [17]:
init_name_file = '2024-02-14-RF100_yeast_depthNone_test'
# 10 repetitions: repetition i gets a 5-fold stratified split seeded with
# 100 + i and a result file name suffixed with i.
for i in range(10):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_yeast_1,
        y=y_yeast_1,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute Metrics :

In [18]:
# These names are also imported by the first cell of the notebook; the imports
# are repeated here so this cell can be run on its own.
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from validation.classif_experiments import compute_metrics_several_protocols

# Each entry is (metric callable, metric label, output kind).
# NOTE(review): 'pred' / 'proba' presumably select hard predictions versus
# predicted probabilities -- confirm in validation.classif_experiments.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [19]:
# Aggregate the Yeast runs (same file prefix as the run loop, n_iter=10) into
# the two tables returned by compute_metrics_several_protocols.
# The results are bound to Yeast-specific names: they used to be assigned to
# df_final_mean_haberman / df_final_std_haberman, which silently overwrote the
# Haberman tables computed earlier in the notebook.
df_final_mean_yeast, df_final_std_yeast = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_yeast_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=10,
)

# Show the mean table with three decimals.
df_final_mean_yeast.style.format(precision=3)

Unnamed: 0,None,c_weight,RUS,ROS,NearMiss1,BorderlineS_1,BorderlineS_2,SMOTE,cv_smote,MGS
ROC AUC,0.9,0.927,0.923,0.906,0.725,0.949,0.944,0.946,0.941,0.935


<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> MagicTel </h1></center>

In [3]:
# Load the MagicTel features (X) and labels (y) through the project loader.
from data.data import load_magictel_data
X_magic,y_magic = load_magictel_data()

In [4]:
# Three nested subsampling steps (ratio 0.2, then 0.1, then 0.01). Each later
# step is given the indices kept by the previous one, and every step is asked
# to use its own file name inside output_dir_path.
indices_kept_20 = subsample_to_ratio_indices(
    X=X_magic, y=y_magic,
    ratio=0.2, seed_sub=11,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='magicTel_sub_original_to_20',
)

indices_kept_10 = subsample_to_ratio_indices(
    X=X_magic, y=y_magic,
    ratio=0.1, seed_sub=9,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='magicTel_sub_20_to_10',
    has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_20,
)

indices_kept_1 = subsample_to_ratio_indices(
    X=X_magic, y=y_magic,
    ratio=0.01, seed_sub=5,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='magicTel_sub_10_to_1',
    has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_10,
)

In [5]:
# Rebuild the subsampled MagicTel data from the 'magicTel_sub_10_to_1' file;
# get_indexes=False asks for the (X, y) pair rather than the indices.
X_magic_1, y_magic_1 = read_subsampling_indices(
    X=X_magic,
    y=y_magic,
    dir_subsampling=output_dir_path,
    name_subsampling_file='magicTel_sub_10_to_1',
    get_indexes=False,
)

In [6]:
# Show the first 50 indices kept by the last (ratio=0.01) subsampling step.
indices_kept_1[:50]

array([ 863, 4583, 4525, 4043, 4982, 5598, 4891, 6560, 1218, 1363,  162,
       5197, 4395, 2620, 4782, 1396, 5223,   14, 4209, 6676,  263, 1834,
       6198, 1540, 1999, 5394, 2958, 1490, 4127, 4295, 2791, 4380, 2372,
       4825, 4935, 1822, 3035, 2161, 4965, 4771, 1952, 4539, 2853, 5202,
       4346, 5080, 1226, 3800, 3558, 5568])

In [7]:
# Plain forest and its class-weighted twin; every strategy below that names
# rf_model shares this one estimator instance.
rf_model = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5)
balanced_rf_model = RandomForestClassifier(
    n_estimators=100, criterion='gini', class_weight='balanced', n_jobs=5,
)

# CVSmoteModel wraps its own forest and an unseeded (random_state=None)
# 5-fold splitter.
# NOTE(review): list_k_max / list_k_step presumably describe the grid of SMOTE
# neighbour counts that gets cross-validated -- confirm in oversampling_strategies.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    model=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5),
    splitter=splitter_stratified_cv,
    list_k_max=15,
    list_k_step=1,
)

# One tuple per strategy: (label, resampler, params dict, classifier).
list_oversampling_and_params = [
    ('None', NoSampling(), {}, rf_model),
    ('c_weight', NoSampling(), {}, balanced_rf_model),
    ('RUS', RandomUnderSampler(sampling_strategy="majority", replacement=False), {}, rf_model),
    ('ROS', RandomOverSampler(sampling_strategy="minority"), {}, rf_model),
    ('NearMiss1', NearMiss(sampling_strategy="majority", version=1), {}, rf_model),
    ('BorderlineS_1', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-1"), {}, rf_model),
    ('BorderlineS_2', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-2"), {}, rf_model),
    ('SMOTE', SMOTE(sampling_strategy="minority"), {}, rf_model),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, rf_model),
]

In [8]:
init_name_file = '2024-02-14-RF100_magicTel_depthNone_test'
# 10 repetitions: repetition i gets a 5-fold stratified split seeded with
# 100 + i and a result file name suffixed with i.
for i in range(10):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_magic_1,
        y=y_magic_1,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute Metrics :

In [9]:
# These names are also imported by the first cell of the notebook; the imports
# are repeated here so this cell can be run on its own.
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from validation.classif_experiments import compute_metrics_several_protocols

# Each entry is (metric callable, metric label, output kind).
# NOTE(review): 'pred' / 'proba' presumably select hard predictions versus
# predicted probabilities -- confirm in validation.classif_experiments.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [10]:
# Aggregate the MagicTel runs (same file prefix as the run loop, n_iter=10)
# into the two tables returned by compute_metrics_several_protocols.
# The results are bound to MagicTel-specific names: they used to be assigned
# to df_final_mean_haberman / df_final_std_haberman, which silently overwrote
# the Haberman tables computed earlier in the notebook.
df_final_mean_magictel, df_final_std_magictel = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_magicTel_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=10,
)

# Show the mean table with three decimals.
df_final_mean_magictel.style.format(precision=3)

Unnamed: 0,None,c_weight,RUS,ROS,NearMiss1,BorderlineS_1,BorderlineS_2,SMOTE,cv_smote,MGS
ROC AUC,0.814,0.828,0.889,0.815,0.539,0.836,0.811,0.868,0.87,0.893


<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> California </h1></center>

In [11]:
# Load the California features (X) and labels (y) through the project loader.
from data.data import load_california_data
X_california,y_california = load_california_data()

In [12]:
# Peek at the California feature array.
X_california

array([[   2.1827    ,   26.        ,    4.52142857, ...,    2.17857143,
          40.05      , -122.1       ],
       [   3.0755    ,   32.        ,    4.62306778, ...,    4.59928656,
          32.77      , -117.06      ],
       [   1.8235    ,   40.        ,    4.70114943, ...,    3.55555556,
          37.75      , -122.16      ],
       ...,
       [   1.9327    ,   10.        ,    5.78947368, ...,    3.93859649,
          38.57      , -121.92      ],
       [   3.75      ,   38.        ,    5.27522936, ...,    2.37614679,
          38.72      , -121.9       ],
       [   2.8542    ,   37.        ,    5.25373134, ...,    3.55223881,
          38.84      , -121.81      ]])

In [13]:
# Dimensions of the California feature array.
X_california.shape

(20634, 8)

In [14]:
# Three nested subsampling steps (ratio 0.2, then 0.1, then 0.01). Each later
# step is given the indices kept by the previous one, and every step is asked
# to use its own file name inside output_dir_path.
indices_kept_20 = subsample_to_ratio_indices(
    X=X_california, y=y_california,
    ratio=0.2, seed_sub=11,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='california_sub_original_to_20',
)

indices_kept_10 = subsample_to_ratio_indices(
    X=X_california, y=y_california,
    ratio=0.1, seed_sub=9,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='california_sub_20_to_10',
    has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_20,
)

indices_kept_1 = subsample_to_ratio_indices(
    X=X_california, y=y_california,
    ratio=0.01, seed_sub=5,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='california_sub_10_to_1',
    has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_10,
)

In [15]:
# Rebuild the subsampled California data from the 'california_sub_10_to_1'
# file; get_indexes=False asks for the (X, y) pair rather than the indices.
X_california_1, y_california_1 = read_subsampling_indices(
    X=X_california,
    y=y_california,
    dir_subsampling=output_dir_path,
    name_subsampling_file='california_sub_10_to_1',
    get_indexes=False,
)

In [16]:
# Show the indices kept by the last (ratio=0.01) subsampling step.
indices_kept_1

array([18782, 12352, 20361, ..., 10314, 10315, 10316])

In [17]:
# Show the indices kept by the intermediate (ratio=0.1) subsampling step.
indices_kept_10

array([12137, 10798, 17461, ..., 10314, 10315, 10316])

In [18]:
# Plain forest and its class-weighted twin; every strategy below that names
# rf_model shares this one estimator instance.
rf_model = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5)
balanced_rf_model = RandomForestClassifier(
    n_estimators=100, criterion='gini', class_weight='balanced', n_jobs=5,
)

# CVSmoteModel wraps its own forest and an unseeded (random_state=None)
# 5-fold splitter.
# NOTE(review): list_k_max / list_k_step presumably describe the grid of SMOTE
# neighbour counts that gets cross-validated -- confirm in oversampling_strategies.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    model=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5),
    splitter=splitter_stratified_cv,
    list_k_max=15,
    list_k_step=1,
)

# One tuple per strategy: (label, resampler, params dict, classifier).
list_oversampling_and_params = [
    ('None', NoSampling(), {}, rf_model),
    ('c_weight', NoSampling(), {}, balanced_rf_model),
    ('RUS', RandomUnderSampler(sampling_strategy="majority", replacement=False), {}, rf_model),
    ('ROS', RandomOverSampler(sampling_strategy="minority"), {}, rf_model),
    ('NearMiss1', NearMiss(sampling_strategy="majority", version=1), {}, rf_model),
    ('BorderlineS_1', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-1"), {}, rf_model),
    ('BorderlineS_2', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-2"), {}, rf_model),
    ('SMOTE', SMOTE(sampling_strategy="minority"), {}, rf_model),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, rf_model),
]

In [19]:
init_name_file = '2024-02-14-RF100_california_depthNone_test'
# 5 repetitions: repetition i gets a 5-fold stratified split seeded with
# 100 + i and a result file name suffixed with i.
for i in range(5):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_california_1,
        y=y_california_1,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute Metrics :

In [20]:
# These names are also imported by the first cell of the notebook; the imports
# are repeated here so this cell can be run on its own.
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from validation.classif_experiments import compute_metrics_several_protocols

# Each entry is (metric callable, metric label, output kind).
# NOTE(review): 'pred' / 'proba' presumably select hard predictions versus
# predicted probabilities -- confirm in validation.classif_experiments.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [21]:
# Aggregate the California runs (same file prefix as the run loop, n_iter=5)
# into the two tables returned by compute_metrics_several_protocols.
# The results are bound to California-specific names: they used to be assigned
# to df_final_mean_haberman / df_final_std_haberman, which silently overwrote
# the Haberman tables computed earlier in the notebook.
df_final_mean_california, df_final_std_california = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_california_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=5,
)

# Show the mean table with three decimals.
df_final_mean_california.style.format(precision=3)

Unnamed: 0,None,c_weight,RUS,ROS,NearMiss1,BorderlineS_1,BorderlineS_2,SMOTE,cv_smote,MGS
ROC AUC,0.85,0.875,0.882,0.878,0.632,0.89,0.875,0.905,0.906,0.919


<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> House_16H </h1></center>

In [8]:
# Load the House_16H features (X) and labels (y) through the project loader.
from data.data import load_house_data
X_house,y_house = load_house_data()

In [9]:
# Peek at the House_16H feature array.
X_house

array([[1.55120e+04, 4.60869e-01, 4.92520e-02, ..., 2.60116e-01,
        5.22460e-02, 7.74059e-01],
       [1.55000e+03, 4.70968e-01, 2.58100e-03, ..., 2.85267e-01,
        6.06060e-02, 1.42857e-01],
       [4.74100e+03, 4.85341e-01, 2.11000e-04, ..., 3.15433e-01,
        6.51160e-02, 6.87500e-01],
       ...,
       [9.77000e+02, 4.82088e-01, 1.24872e-01, ..., 1.87817e-01,
        1.14504e-01, 3.33333e-01],
       [3.06000e+02, 5.09804e-01, 0.00000e+00, ..., 2.82258e-01,
        9.09090e-02, 0.00000e+00],
       [6.19800e+03, 5.08874e-01, 1.35530e-02, ..., 3.74757e-01,
        3.03030e-02, 6.00000e-01]])

In [10]:
# Dimensions of the House_16H feature array.
X_house.shape

(22784, 16)

In [11]:
# Three nested subsampling steps (ratio 0.2, then 0.1, then 0.01). Each later
# step is given the indices kept by the previous one, and every step is asked
# to use its own file name inside output_dir_path.
indices_kept_20 = subsample_to_ratio_indices(
    X=X_house, y=y_house,
    ratio=0.2, seed_sub=11,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='house16_sub_original_to_20',
)

indices_kept_10 = subsample_to_ratio_indices(
    X=X_house, y=y_house,
    ratio=0.1, seed_sub=9,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='house16_sub_20_to_10',
    has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_20,
)

indices_kept_1 = subsample_to_ratio_indices(
    X=X_house, y=y_house,
    ratio=0.01, seed_sub=5,
    output_dir_subsampling=output_dir_path,
    name_subsampling_file='house16_sub_10_to_1',
    has_previous_under_sampling=True,
    previous_under_sampling=indices_kept_10,
)

In [12]:
# Rebuild the subsampled House_16H data; get_indexes=False asks for the (X, y)
# pair rather than the indices.
# The file name must be the one used by the House_16H subsampling chain
# ('house16_sub_10_to_1'). This cell previously read 'magicTel_sub_10_to_1',
# i.e. the selection computed for another dataset was applied to X_house.
X_house_1, y_house_1 = read_subsampling_indices(
    X=X_house,
    y=y_house,
    dir_subsampling=output_dir_path,
    name_subsampling_file='house16_sub_10_to_1',
    get_indexes=False,
)

In [13]:
# Show the indices kept by the last (ratio=0.01) subsampling step.
indices_kept_1

array([17644,   636,  9292, ..., 22779, 22780, 22782])

In [None]:
# Plain forest and its class-weighted twin; every strategy below that names
# rf_model shares this one estimator instance.
rf_model = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5)
balanced_rf_model = RandomForestClassifier(
    n_estimators=100, criterion='gini', class_weight='balanced', n_jobs=5,
)

# CVSmoteModel wraps its own forest and an unseeded (random_state=None)
# 5-fold splitter.
# NOTE(review): list_k_max / list_k_step presumably describe the grid of SMOTE
# neighbour counts that gets cross-validated -- confirm in oversampling_strategies.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    model=RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=5),
    splitter=splitter_stratified_cv,
    list_k_max=15,
    list_k_step=1,
)

# One tuple per strategy: (label, resampler, params dict, classifier).
list_oversampling_and_params = [
    ('None', NoSampling(), {}, rf_model),
    ('c_weight', NoSampling(), {}, balanced_rf_model),
    ('RUS', RandomUnderSampler(sampling_strategy="majority", replacement=False), {}, rf_model),
    ('ROS', RandomOverSampler(sampling_strategy="minority"), {}, rf_model),
    ('NearMiss1', NearMiss(sampling_strategy="majority", version=1), {}, rf_model),
    ('BorderlineS_1', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-1"), {}, rf_model),
    ('BorderlineS_2', BorderlineSMOTE(sampling_strategy="minority", k_neighbors=5, kind="borderline-2"), {}, rf_model),
    ('SMOTE', SMOTE(sampling_strategy="minority"), {}, rf_model),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, rf_model),
]

In [None]:
init_name_file = '2024-02-14-RF100_house16_depthNone_test'
# 5 repetitions: repetition i gets a 5-fold stratified split seeded with
# 100 + i and a result file name suffixed with i.
for i in range(5):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_house_1,
        y=y_house_1,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute Metrics :

In [None]:
# These names are also imported by the first cell of the notebook; the imports
# are repeated here so this cell can be run on its own.
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from validation.classif_experiments import compute_metrics_several_protocols

# Each entry is (metric callable, metric label, output kind).
# NOTE(review): 'pred' / 'proba' presumably select hard predictions versus
# predicted probabilities -- confirm in validation.classif_experiments.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [None]:
# Aggregate the House_16H runs (same file prefix as the run loop, n_iter=5)
# into the two tables returned by compute_metrics_several_protocols.
# The results are bound to House-specific names: they used to be assigned to
# df_final_mean_haberman / df_final_std_haberman, which silently overwrote the
# Haberman tables computed earlier in the notebook.
df_final_mean_house, df_final_std_house = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_house16_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=5,
)

# Show the mean table with three decimals.
df_final_mean_house.style.format(precision=3)

In [None]:
# TODO: display a single summary table gathering the metrics of every dataset here.

<center><h1  style="color:white; background-color:#008b96; border-radius: 10px; padding:15px;"> END </h1></center>