<center><h1  style="color:white; background-color:#000000; border-radius: 0px; padding:25px;"> Theoretical study of SMOTE : </h1></center>

This notebook presents the protocols used for the numerical illustrations of the paper "Theoretical and experimental study of SMOTE: limitations and comparisons of rebalancing strategies".
Only the classification protocol is used here.

In [None]:
import sys
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import TimeSeriesSplit

from oversampling_strategies.oversampling_strategies import MGS
from oversampling_strategies.oversampling_strategies import NoSampling 
from oversampling_strategies.oversampling_strategies import CVSmoteModel
from validation.classif_experiments import run_eval

In [None]:
# Put the project checkout on sys.path (presumably where the oversampling_strategies,
# validation and data packages live — confirm).
# The location can be overridden with the SMOTE_STRATEGIES_ROOT environment variable;
# the default is the path that was previously hard-coded here.
# NOTE(review): on a fresh kernel the project imports in the first code cell execute
# before this insertion — confirm they resolve without it, or run this cell first.
PROJECT_ROOT = os.environ.get(
    'SMOTE_STRATEGIES_ROOT',
    '/home/abdoulaye_sakho/S1/th_smote/smote_strategies_study',
)
if PROJECT_ROOT not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.insert(1, PROJECT_ROOT)

In [None]:
# Directory passed as output_dir to run_eval and compute_metrics_several_protocols.
# Set the SMOTE_OUTPUT_DIR environment variable to point elsewhere; the default is
# the path that was previously hard-coded here.
output_dir_path = os.environ.get(
    'SMOTE_OUTPUT_DIR',
    "/home/abdoulaye_sakho/S1/th_smote/theoretical_smote/saved_experiments",
)  ## Fill it

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Pima </h1></center>

## Run protocol :

In [None]:
# Fetch the Pima dataset through the project loader; the returned pair is passed
# as X / y to run_eval in the evaluation cell of this section.
from data.data import load_pima_data

X_pima, y_pima = load_pima_data()

In [None]:
# Keyword arguments shared by every random forest built in this cell.
rf_settings = {'n_estimators': 100, 'criterion': 'gini', 'n_jobs': 5}

# RF is the plain forest; RF2 adds class_weight='balanced' (RF2 is not referenced
# in the strategy list below).
RF = RandomForestClassifier(**rf_settings)
RF2 = RandomForestClassifier(class_weight='balanced', **rf_settings)

# CVSmoteModel gets its own forest plus an unseeded, shuffled 5-fold stratified splitter.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    splitter=splitter_stratified_cv,
    model=RandomForestClassifier(**rf_settings),
    list_k_max=15,
    list_k_step=1,
)

# One 4-tuple per protocol — presumably (name, oversampler, oversampler params,
# classifier); confirm against run_eval. 'cv_smote' pairs NoSampling() with CVSmoteRF.
list_oversampling_and_params = [
    ('no_sampling', NoSampling(), {}, RF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, RF),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
]

In [None]:
# Two evaluation runs on Pima: run i uses a 5-fold stratified split seeded with
# 100 + i and hands run_eval the name '<init_name_file><i>.npy'.
init_name_file = '2024-02-14-RF100_pima_depthNone_test'
for i in range(2):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_pima,
        y=y_pima,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from validation.classif_experiments import compute_metrics_several_protocols

In [None]:
# (metric function, label, 'pred' | 'proba') triples handed to
# compute_metrics_several_protocols. The third field presumably selects whether the
# metric receives predicted labels or probabilities — confirm in that function.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [None]:
# Score the Pima runs: same file prefix as the evaluation cell, n_iter equal to
# the number of runs launched there. The call returns two objects, unpacked here
# as the mean and std tables.
df_final_mean_pima, df_final_std_pima = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_pima_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=2,
)

# Last expression of the cell: display the mean table.
df_final_mean_pima

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Phoneme </h1></center>

## Run protocol :

In [None]:
# Load the Phoneme dataset; the returned pair is passed as X / y to run_eval in
# the evaluation cell of this section.
# The loader actually called is imported here, and it is called exactly once:
# its result is unpacked directly rather than being called again.
from data.data import load_phoneme_data

X_phoneme, y_phoneme = load_phoneme_data()

In [None]:
# Keyword arguments shared by every random forest built in this cell.
rf_settings = {'n_estimators': 100, 'criterion': 'gini', 'n_jobs': 5}

# RF is the plain forest; RF2 adds class_weight='balanced' (RF2 is not referenced
# in the strategy list below).
RF = RandomForestClassifier(**rf_settings)
RF2 = RandomForestClassifier(class_weight='balanced', **rf_settings)

# CVSmoteModel gets its own forest plus an unseeded, shuffled 5-fold stratified splitter.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    splitter=splitter_stratified_cv,
    model=RandomForestClassifier(**rf_settings),
    list_k_max=15,
    list_k_step=1,
)

# One 4-tuple per protocol — presumably (name, oversampler, oversampler params,
# classifier); confirm against run_eval. 'cv_smote' pairs NoSampling() with CVSmoteRF.
list_oversampling_and_params = [
    ('no_sampling', NoSampling(), {}, RF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, RF),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
]

In [None]:
# Two evaluation runs on Phoneme: run i uses a 5-fold stratified split seeded with
# 100 + i and hands run_eval the name '<init_name_file><i>.npy'.
init_name_file = '2024-02-14-RF100_phoneme_depthNone_test'
for i in range(2):
    name_file = f"{init_name_file}{i}.npy"
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_phoneme,
        y=y_phoneme,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from validation.classif_experiments import compute_metrics_several_protocols

In [None]:
# (metric function, label, 'pred' | 'proba') triples handed to
# compute_metrics_several_protocols. The third field presumably selects whether the
# metric receives predicted labels or probabilities — confirm in that function.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [None]:
# Score the Phoneme runs: same file prefix as the evaluation cell, n_iter equal to
# the number of runs launched there. The call returns two objects, unpacked here
# as the mean and std tables.
df_final_mean_phoneme, df_final_std_phoneme = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_phoneme_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=2,
)

# Last expression of the cell: display the mean table.
df_final_mean_phoneme

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> GA4 </h1></center>

## Run protocol :

In [None]:
# Fetch the GA4 dataset through the project loader. It returns three objects:
# the X / y pair plus meta_df_ga4, which the GA4 splitter cell consumes as meta_df.
from data.data import load_ga4_data

X_ga4, y_ga4, meta_df_ga4 = load_ga4_data()

In [None]:
# Keyword arguments shared by every random forest built in this cell.
rf_settings = {'n_estimators': 100, 'criterion': 'gini', 'n_jobs': 5}

# RF is the plain forest; RF2 adds class_weight='balanced' (RF2 is not referenced
# in the strategy list below).
RF = RandomForestClassifier(**rf_settings)
RF2 = RandomForestClassifier(class_weight='balanced', **rf_settings)

# CVSmoteModel gets its own forest plus an unseeded, shuffled 5-fold stratified splitter.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    splitter=splitter_stratified_cv,
    model=RandomForestClassifier(**rf_settings),
    list_k_max=15,
    list_k_step=1,
)

# One 4-tuple per protocol — presumably (name, oversampler, oversampler params,
# classifier); confirm against run_eval. 'cv_smote' pairs NoSampling() with CVSmoteRF.
list_oversampling_and_params = [
    ('no_sampling', NoSampling(), {}, RF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, RF),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
]

In [None]:
# Project splitter used for the GA4 evaluation, configured from the GA4 metadata
# frame and its 'user_pseudo_id' column.
# NOTE(review): presumably a time-ordered split that keeps user groups apart —
# confirm against MyTimeSeriesSplit_groupout.
from validation.classif_experiments import MyTimeSeriesSplit_groupout

splitter_Mytss_groupout = MyTimeSeriesSplit_groupout(
    n_splits=10,
    starting_split=5,
    meta_df=meta_df_ga4,
    col_name_id='user_pseudo_id',
)

In [None]:
# Single evaluation run on GA4 using the group-out time-series splitter built in
# the previous cell; run_eval receives the name '<init_name_file>0.npy'.
# The GA4 arrays are passed here so that the data matches both the file prefix
# and the splitter, which is built on meta_df_ga4.
# No StratifiedKFold is created in this loop: run_eval only receives
# splitter_Mytss_groupout.
init_name_file = '2024-02-14-RF100_ga4_depthNone_test'
for i in range(1):
    name_file = init_name_file + str(i) + '.npy'
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_ga4,
        y=y_ga4,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_Mytss_groupout,
    )

## Compute metrics :

In [None]:

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from validation.classif_experiments import compute_metrics_several_protocols
# (metric function, label, 'pred' | 'proba') triples handed to
# compute_metrics_several_protocols. The third field presumably selects whether the
# metric receives predicted labels or probabilities — confirm in that function.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [None]:
# Score the GA4 run: same file prefix as the evaluation cell, n_iter equal to the
# single run launched there. The call returns two objects, unpacked here as the
# mean and std tables.
df_final_mean_ga4, df_final_std_ga4 = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_ga4_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=1,
)

# Last expression of the cell: display the mean table.
df_final_mean_ga4

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Abalone </h1></center>

## Run protocol :

In [None]:
# Fetch the Abalone dataset through the project loader and unpack the returned
# pair into X_abalone / y_abalone.
from data.data import load_abalone_data

X_abalone, y_abalone = load_abalone_data()

In [None]:
# Keyword arguments shared by every random forest built in this cell.
rf_settings = {'n_estimators': 100, 'criterion': 'gini', 'n_jobs': 5}

# RF is the plain forest; RF2 adds class_weight='balanced' (RF2 is not referenced
# in the strategy list below).
RF = RandomForestClassifier(**rf_settings)
RF2 = RandomForestClassifier(class_weight='balanced', **rf_settings)

# CVSmoteModel gets its own forest plus an unseeded, shuffled 5-fold stratified splitter.
splitter_stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
CVSmoteRF = CVSmoteModel(
    splitter=splitter_stratified_cv,
    model=RandomForestClassifier(**rf_settings),
    list_k_max=15,
    list_k_step=1,
)

# One 4-tuple per protocol — presumably (name, oversampler, oversampler params,
# classifier); confirm against run_eval. 'cv_smote' pairs NoSampling() with CVSmoteRF.
list_oversampling_and_params = [
    ('no_sampling', NoSampling(), {}, RF),
    ('MGS', MGS(K=5, n_points=5, llambda=1.0), {}, RF),
    ('cv_smote', NoSampling(), {}, CVSmoteRF),
]

In [None]:
# Two evaluation runs on Abalone: run i uses a 5-fold stratified split seeded with
# 100 + i and hands run_eval the name '<init_name_file><i>.npy'.
# The Abalone arrays loaded in this section are passed here so that the data
# matches the 'abalone' file prefix.
init_name_file = '2024-02-14-RF100_abalone_depthNone_test'
for i in range(2):
    splitter_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=100 + i)
    name_file = init_name_file + str(i) + '.npy'
    run_eval(
        output_dir=output_dir_path,
        name_file=name_file,
        X=X_abalone,
        y=y_abalone,
        list_oversampling_and_params=list_oversampling_and_params,
        splitter=splitter_stratified,
    )

## Compute metrics :

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from validation.classif_experiments import compute_metrics_several_protocols
# (metric function, label, 'pred' | 'proba') triples handed to
# compute_metrics_several_protocols. The third field presumably selects whether the
# metric receives predicted labels or probabilities — confirm in that function.
list_metric = [
    (precision_score, 'precision', 'pred'),
    (recall_score, 'recall', 'pred'),
    (roc_auc_score, 'roc_auc', 'proba'),
]

In [None]:
# Score the Abalone runs: same file prefix as the evaluation cell, n_iter equal to
# the number of runs launched there. The call returns two objects, unpacked here
# as the mean and std tables.
df_final_mean_abalone, df_final_std_abalone = compute_metrics_several_protocols(
    output_dir=output_dir_path,
    init_name_file='2024-02-14-RF100_abalone_depthNone_test',
    list_metric=list_metric,
    bool_roc_auc_only=True,
    n_iter=2,
)

# Last expression of the cell: display the mean table.
df_final_mean_abalone

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Yeast </h1></center>

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Wine </h1></center>

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> CreditCard </h1></center>

<center><h1  style="color:white; background-color:#808b96; border-radius: 10px; padding:15px;"> Haberman </h1></center>

<center><h1  style="color:white; background-color:#008b96; border-radius: 10px; padding:15px;"> END </h1></center>