#### [00] Importing Libraries

In [1]:
import pandas as pd

from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks 
from imblearn.under_sampling import EditedNearestNeighbours 

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE 

**!! Edit kdPIC parameter**

Specify the `kdPIC` value and the band to be calculated.

In [2]:
# Code inserted into the input file names built in the run cell
# ("sample_<kdPIC>.csv" and "sample_<kdPIC>_no_outlier.csv"); edit before running.
kdPIC = "F"

#### [01] Under- and Over-Sampling Function Definitions

In [3]:
def ratio_multiplier(y, multiplier=None):
    """Scale the per-class sample counts of ``y`` by class-specific factors.

    The single-argument form ``ratio_multiplier(y)`` is kept so the function
    can still be called with only the label vector.

    Parameters
    ----------
    y : iterable of hashable
        Class labels, one entry per sample.
    multiplier : mapping, optional
        Maps a class label to the factor its count is multiplied by. Classes
        absent from the mapping keep their original count. When omitted, the
        notebook-level ``strategy_under2`` mapping is used, as before.

    Returns
    -------
    collections.Counter
        Class label -> target count; scaled counts are truncated with ``int``.

    Raises
    ------
    NameError
        If ``multiplier`` is not given and ``strategy_under2`` is not defined.
    """
    from collections import Counter

    if multiplier is None:
        # The original body read this global unconditionally, which fails on a
        # fresh kernel because nothing in the notebook defines it.
        try:
            multiplier = strategy_under2
        except NameError:
            raise NameError(
                "ratio_multiplier needs a `multiplier` mapping: pass it "
                "explicitly or define `strategy_under2` before calling."
            ) from None

    class_counts = Counter(y)
    return Counter({
        label: int(count * multiplier[label]) if label in multiplier else count
        for label, count in class_counts.items()
    })

def finalise(X, y):
    """Return a copy of the feature frame ``X`` with ``y`` stored as ``nama_valid``.

    ``X`` itself is not modified; the label column is set on the copy.
    """
    return X.assign(**{'nama_valid': y})

def create_dataset(name_file,
                   gs_folder="gs://bps-gcp-bucket/MLST2023/preprocessing/",
                   random_state=None):
    """Rebalance one sample CSV and write four resampled variants next to it.

    The file ``gs_folder + name_file`` is read, its ``nama_valid`` column is
    used as the class label and the 17 band/index columns listed below as
    features. A reference class size is taken as ``int(0.8 * median class
    count)``. Classes larger than ``int(1.3 * reference)`` are randomly
    under-sampled down to that limit; classes smaller than
    ``int(0.7 * reference)`` are over-sampled up to that limit.

    Each combination of a cleaning step (Tomek links / edited nearest
    neighbours, both applied to the majority class) and an over-sampler
    (SMOTE / Borderline-SMOTE) is written as
    ``<stem>_<tl|enn>_<smote|border_smote>.csv`` in ``gs_folder``.

    Parameters
    ----------
    name_file : str
        File name of the input CSV inside ``gs_folder``.
    gs_folder : str, optional
        Folder prefix used for both reading and writing.
    random_state : int or None, optional
        Seed forwarded to the random under-sampler and both over-samplers so
        a run can be reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    None
        Results are written to CSV files only.
    """
    df_sample = pd.read_csv(gs_folder + name_file)

    y = df_sample.nama_valid
    X = df_sample[['B1_p15', 'B2_p15', 'B3_p15',
           'B4_p15', 'B5_p15', 'B6_p15', 'B7_p15', 'B8_p15', 'B8A_p15', 'B11_p15',
           'B12_p15', 'NDVI_p50', 'NDWI_p50', 'NDBI_p50', 'SAVI_p50', 'EVI_p50',
           'GNDVI_p50']]

    class_counts = y.value_counts()
    reference_size = int(0.8 * class_counts.median())
    upper_limit = int(1.3 * reference_size)
    lower_limit = int(0.7 * reference_size)

    # One target per class. The original sized the under-sampling list with
    # the number of *small* classes, so zip() silently dropped large classes.
    undersample_targets = {
        label: upper_limit
        for label in class_counts[class_counts > upper_limit].index
    }
    oversample_targets = {
        label: lower_limit
        for label in class_counts[class_counts < lower_limit].index
    }

    # sampling_strategy is passed by keyword: recent imbalanced-learn releases
    # do not accept it positionally.
    cleaners = {
        "tl": TomekLinks(sampling_strategy='majority'),
        "enn": EditedNearestNeighbours(sampling_strategy='majority'),
    }
    oversamplers = {
        "smote": SMOTE(sampling_strategy=oversample_targets,
                       random_state=random_state),
        "border_smote": BorderlineSMOTE(sampling_strategy=oversample_targets,
                                        random_state=random_state),
    }

    # Strip only the final extension instead of assuming it is 4 characters.
    stem = name_file.rsplit(".", 1)[0]

    for clean_tag, cleaner in cleaners.items():
        X_clean, y_clean = cleaner.fit_resample(X, y)

        # Cleaning may have shrunk the majority class below its target; an
        # under-sampler cannot be asked for more samples than are present.
        remaining = pd.Series(y_clean).value_counts()
        capped_targets = {
            label: min(target, int(remaining[label]))
            for label, target in undersample_targets.items()
            if label in remaining.index
        }
        rus_ = RandomUnderSampler(sampling_strategy=capped_targets,
                                  random_state=random_state)
        X_bal, y_bal = rus_.fit_resample(X_clean, y_clean)

        for over_tag, oversampler in oversamplers.items():
            X_out, y_out = oversampler.fit_resample(X_bal, y_bal)
            out_path = gs_folder + stem + "_" + clean_tag + "_" + over_tag + ".csv"
            finalise(X_out, y_out).to_csv(out_path, index=False)

#### [02] Running Under- and Over-Sampling

In [4]:
# Resample both the plain sample file and its "_no_outlier" variant for the chosen kdPIC.
name_file = [f"sample_{kdPIC}{suffix}.csv" for suffix in ("", "_no_outlier")]
for i in name_file:
    create_dataset(i)

