In [1]:
# Random seed for the notebook; it is passed as `random_state` to the tree-based
# feature-selection model further down so that step is repeatable.
seed = 123

In [2]:
from shutil import copyfile

# Copy the shared helper module (first argument: source) into the working
# directory (second argument: destination) under the name `p53_helpers.py`,
# which is the module name imported right after this call.
copyfile(
    "/kaggle/input/bdc-p53/p53_helper_functions.py",
    "/kaggle/working/p53_helpers.py",
)

from p53_helpers import *

# Load the p53 dataset (features X, class labels Y); expected to contain no NAs.
X, Y = load_p53_ds()
# Stratified 80/20 train/test split (class ratios are printed by the helper).
X_train, X_test, Y_train, Y_test = split_p53(X, Y)

Using TensorFlow backend.



Import completed after 3.4 sec

CLASS RATIOS
Training set active classes:   121/24927 (0.485 %)
Test set active classes:       30/6232 (0.481 %)

MATRIX DIMENSIONS
TRAINING SET
. Features:   (24927, 5408)
. Classes:    (24927,)
TEST SET
. Features:   (6232, 5408)
. Classes:    (6232,)


In [3]:
# Dimensionality reduction, with settings optimised in the 1__RFC__DimRed_A/B notebooks
# (this configuration scored MCC=0.5889 on the downsampled dataset).
#
# Two feature selectors are combined in a FeatureUnion:
#   - "fs_univ":  univariate selection of the 20 best features scored with f_classif
#   - "fs_model": model-based selection driven by an extra-trees classifier; with
#                 threshold=-np.inf the cut-off is left to max_features=125
#                 (assuming these are the scikit-learn classes re-exported by p53_helpers)
FU = FeatureUnion(
    transformer_list=[
        ("fs_univ", SelectKBest(score_func=f_classif, k=20)),
        (
            "fs_model",
            SelectFromModel(
                estimator=ExtraTreesClassifier(n_estimators=100, random_state=seed),
                threshold=-np.inf,
                max_features=125,
            ),
        ),
    ]
)

# Full preprocessing chain: drop correlated features, standardise, then select features.
pipe = Pipeline(
    steps=[
        ("remove_corr", RemoveCorrelatedFeatures(n_remove=10)),
        ("scale", StandardScaler()),
        ("fs", FU),
    ]
)

# Run the preprocessing once on the training split; the transformed matrix is the common
# baseline input on which the resampling algorithms are evaluated.
# NOTE(review): the preprocessing is fitted on all of X_train before any cross-validation
# split is made, so validation folds have already influenced feature selection - confirm
# this is acceptable for comparing resamplers.
X_train_prepped = pipe.fit_transform(X_train, Y_train)

In [4]:
CV = 4  # number of cross-validation folds handed to cv() for every configuration

# Pre-allocated result table: one row per evaluated configuration. Rows that are never
# written stay all-NaN and are expected to be dropped after the run.
results = pd.DataFrame(
    index=np.arange(1000),
    columns=[
        "seed", "upsampler", "downsampler", "up_class_ratio", "down_vs_up_ratio", "final_class_ratio",
        "train_mcc", "val_mcc", "train_bal_acc", "val_bal_acc",
    ],
)


def _make_forest(random_state):
    """Return the random forest used to score every configuration (identical settings everywhere)."""
    return RandomForestClassifier(
        random_state=random_state,
        class_weight="balanced",
        min_samples_leaf=10,
        n_estimators=500,
        max_depth=15,
    )


def _store_result(row, values):
    """Write `values` into row `row` of `results`, pairing them with the columns in order.

    Each cell is written with a single `.loc[row, col]` assignment. The chained form
    `results[col][row] = value` assigns through an intermediate Series: pandas flags it
    as chained assignment, and with Copy-on-Write enabled the frame is not updated at all.
    If `values` is shorter than the column list, the remaining columns are left untouched.
    """
    for col, value in zip(results.columns, values):
        results.loc[row, col] = value


i = 0  # index of the next free row in `results`

# the stochastic nature of the resampling algorithms resulted in large variability, therefore the different
# algorithms were repeated and the average validation fold performance (4-fold CV) was used to evaluate
# the downstream performance of the different resampling algorithms
#
# The loop variable is deliberately NOT called `seed`, and the per-configuration pipeline is
# NOT called `pipe`: reusing those names would overwrite the notebook-level seed and the
# fitted preprocessing pipeline defined in earlier cells.
for run_seed in [1]:

    # upsampling minority class
    for up in [RandomOverSampler, SMOTE, ADASYN, SVMSMOTE]:

        for down in [RandomUnderSampler]:

            for up_ratio in [0.01, 0.05, 0.1, 0.2]:

                for down_vs_up_ratio in [1, 1.25, 1.5, 1.75, 2]:

                    # sampling_strategy given to the under-sampler; this is the value
                    # stored in the "final_class_ratio" column
                    down_ratio = down_vs_up_ratio * up_ratio

                    resampling_pipe = Pipeline([
                        ("up", up(sampling_strategy=up_ratio, random_state=run_seed)),
                        ("down", down(sampling_strategy=down_ratio, random_state=run_seed)),
                        ("clf", _make_forest(run_seed)),
                    ])

                    out = [run_seed, up.__name__, down.__name__, up_ratio, down_vs_up_ratio, down_ratio]

                    print("=" * 80)
                    print(" | ".join(str(o) for o in out[1:]))
                    print("=" * 80)

                    # 4-fold CV of the resampling + random-forest pipeline
                    scores = cv(
                        model=resampling_pipe,
                        x=X_train_prepped,
                        y=Y_train,
                        cv=CV,
                        n_jobs=-1
                    )
                    out.extend(scores)

                    # write to results df
                    _store_result(i, out)

                    i += 1

    # add as a reference: no resampling
    out = [run_seed, "no_resampling", "no_resampling", "NA", "NA", "NA"]

    scores = cv(
        model=_make_forest(run_seed),
        x=X_train_prepped,
        y=Y_train,
        cv=CV,
        n_jobs=-1
    )

    out.extend(scores)

    _store_result(i, out)

    i += 1

RandomOverSampler | RandomUnderSampler | 0.01 | 1 | 0.01
test_balanced_acc              0.75 (± 0.02)       [0.73, 0.76, 0.77, 0.75]
test_mcc                       0.49 (± 0.05)       [0.57, 0.43, 0.49, 0.47]
--------------------------------------------------------------------------------
Training MCC:                  0.80 (± 0.02)       [0.77, 0.82, 0.82, 0.79]

RandomOverSampler | RandomUnderSampler | 0.01 | 1.25 | 0.0125
test_balanced_acc              0.75 (± 0.02)       [0.73, 0.76, 0.77, 0.75]
test_mcc                       0.47 (± 0.04)       [0.54, 0.42, 0.47, 0.45]
--------------------------------------------------------------------------------
Training MCC:                  0.78 (± 0.02)       [0.76, 0.8, 0.82, 0.77]

RandomOverSampler | RandomUnderSampler | 0.01 | 1.5 | 0.015
test_balanced_acc              0.76 (± 0.03)       [0.73, 0.76, 0.8, 0.75]
test_mcc                       0.48 (± 0.05)       [0.56, 0.42, 0.5, 0.44]
----------------------------------------------------

In [5]:
# Discard the pre-allocated rows that were never filled in: any row that still
# contains a NaN cell is removed (row-wise, "any" is the dropna default).
results = results.dropna()

# Rank configurations by validation MCC, best first; the bare expression is the cell output.
results.sort_values(by="val_mcc", ascending=False)

Unnamed: 0,seed,upsampler,downsampler,up_class_ratio,down_vs_up_ratio,final_class_ratio,train_mcc,val_mcc,train_bal_acc,val_bal_acc
60,1,SVMSMOTE,RandomUnderSampler,0.01,1,0.01,0.797386,0.528798,0.998602,0.775316
61,1,SVMSMOTE,RandomUnderSampler,0.01,1.25,0.0125,0.779014,0.518783,0.998414,0.783314
21,1,SMOTE,RandomUnderSampler,0.01,1.25,0.0125,0.785985,0.514576,0.998495,0.774926
20,1,SMOTE,RandomUnderSampler,0.01,1,0.01,0.801048,0.513831,0.998643,0.762682
65,1,SVMSMOTE,RandomUnderSampler,0.05,1,0.05,0.698824,0.509715,0.944499,0.795491
...,...,...,...,...,...,...,...,...,...,...
58,1,ADASYN,RandomUnderSampler,0.2,1.75,0.35,0.57735,0.402969,0.988413,0.829107
56,1,ADASYN,RandomUnderSampler,0.2,1.25,0.25,0.584662,0.398823,0.988573,0.812804
59,1,ADASYN,RandomUnderSampler,0.2,2,0.4,0.568799,0.397877,0.988232,0.820835
55,1,ADASYN,RandomUnderSampler,0.2,1,0.2,0.600348,0.397699,0.993,0.808758


In [6]:
# Persist the results table (row index included) to the Kaggle working directory.
# The path has no placeholders, so a plain string literal is used instead of an f-string.
results.to_csv("/kaggle/working/results_resampling.csv")