In [1]:
# Notebook-level random seed; passed as `random_state` to the estimators built in the cells below.
seed = 123

In [2]:
from shutil import copyfile

# Copy the helper module from the attached dataset into the working directory
# (under a shorter file name) so that the import below can find it.
# NOTE(review): presumably /kaggle/working is on sys.path -- confirm.
helper_src = "/kaggle/input/bdc-p53/p53_helper_functions.py"
helper_dst = "/kaggle/working/p53_helpers.py"
copyfile(helper_src, helper_dst)

# must come after the copy above: the module does not exist in the working dir before it
from p53_helpers import *

# load dataset (no NAs)
X, Y = load_p53_ds()
# stratified 80/20 split
X_train, X_test, Y_train, Y_test = split_p53(X, Y)

Using TensorFlow backend.



Import completed after 3.9 sec

CLASS RATIOS
Training set active classes:   121/24927 (0.485 %)
Test set active classes:       30/6232 (0.481 %)

MATRIX DIMENSIONS
TRAINING SET
. Features:   (24927, 5408)
. Classes:    (24927,)
TEST SET
. Features:   (6232, 5408)
. Classes:    (6232,)


In [3]:
# Preprocessing, based on the previously optimized preprocessing pipeline.

# Model-based feature selection: with threshold=-inf only `max_features` limits the
# selection, i.e. the 1000 features ranked highest by the linear SVM are kept.
svm_selector = SelectFromModel(
    LinearSVC(max_iter=15000, random_state=seed, class_weight="balanced"),
    threshold=-np.inf,
    max_features=1000,
)

# The three feature views are computed side by side and concatenated.
FU = FeatureUnion([
    ("pca", PCA(n_components=10)),
    ("fs_univ", SelectKBest(f_classif, k=5)),
    ("fs_model", svm_selector),
])

pipe = Pipeline([
    ("remove_corr", RemoveCorrelatedFeatures(n_remove=500)),
    ("ss", StandardScaler()),
    ("pp", FU),
])

# Run the preprocessing steps once on the full training set.
# NOTE(review): the supervised selectors see every training label here, before the
# cross-validation below splits the data, so the validation folds are not independent
# of the feature selection -- confirm this is acceptable for comparing resamplers.
X_train_prepped = pipe.fit_transform(X_train, Y_train)

# Simple LogReg classifier as a baseline reference to evaluate the resampling algorithms against.
baseline_lr = LogisticRegression(
    max_iter=5000, penalty="none", class_weight="balanced", random_state=seed
)
_ = cv(model=baseline_lr, x=X_train_prepped, y=Y_train)



test_balanced_acc              0.74 (± 0.06)       [0.77, 0.68, 0.79, 0.79, 0.67]
test_mcc                       0.52 (± 0.11)       [0.66, 0.41, 0.52, 0.62, 0.37]
--------------------------------------------------------------------------------
Training MCC:                  1.00 (± 0.00)       [1.0, 1.0, 1.0, 1.0, 1.0]



In [4]:
CV = 4

# Search grid of the resampling experiment; every combination is evaluated once per seed.
RESAMPLING_SEEDS = [1, 2, 3]
UPSAMPLERS = [RandomOverSampler, SMOTE, ADASYN, SVMSMOTE]
DOWNSAMPLERS = [RandomUnderSampler]
UP_RATIOS = [0.01, 0.05, 0.1, 0.2, 0.4]
DOWN_VS_UP_RATIOS = [1, 1.25, 1.5, 1.75, 2]

# one row per grid point, plus one "no resampling" reference row per seed
n_runs = len(RESAMPLING_SEEDS) * (
    len(UPSAMPLERS) * len(DOWNSAMPLERS) * len(UP_RATIOS) * len(DOWN_VS_UP_RATIOS) + 1
)

results = pd.DataFrame(
    index=np.arange(n_runs), columns=[
    "seed", "upsampler", "downsampler", "up_class_ratio", "down_vs_up_ratio","final_class_ratio",
    "train_mcc", "val_mcc", "train_bal_acc", "val_bal_acc"]
)


def record_run(row, values):
    """Store `values` in row `row` of `results`, pairing them with the columns in order.

    Each cell is written with a single `.at` access. The previous chained form
    (`results[col][row] = value`) assigns into an intermediate Series, which pandas
    does not guarantee to propagate to the frame (and never does under Copy-on-Write).
    Surplus values (or columns) are ignored, exactly as `zip` truncates.
    """
    for col, value in zip(results.columns, values):
        results.at[row, col] = value


i = 0

# the stochastic nature of the resampling algorithms resulted in large variability, therefore the different
# algorithms were repeated and the average validation fold performance (4-fold CV) was used to evaluate
# the downstream performance of the different resampling algorithms
#
# The loop variable is deliberately NOT called `seed`: that would overwrite the notebook-level
# seed used by the preprocessing cell and change its result on re-execution.
for run_seed in RESAMPLING_SEEDS:

    # upsampling minority class
    for up in UPSAMPLERS:

        for down in DOWNSAMPLERS:

            for up_ratio in UP_RATIOS:

                for down_vs_up_ratio in DOWN_VS_UP_RATIOS:

                    # class ratio requested from the downsampler, expressed relative to
                    # the ratio already reached by upsampling
                    down_ratio = down_vs_up_ratio * up_ratio

                    # separate name so the fitted preprocessing `pipe` is not clobbered
                    resampling_pipe = Pipeline([
                        ("up", up(sampling_strategy=up_ratio, random_state=run_seed)),
                        ("down", down(sampling_strategy=down_ratio, random_state=run_seed)),
                        ("lr", LogisticRegression(max_iter=5000, penalty="none", random_state=run_seed, class_weight="balanced"))
                    ])

                    out = [run_seed, up.__name__, down.__name__, up_ratio, down_vs_up_ratio, down_ratio]

                    print("=" * 80)
                    print(" | ".join(str(o) for o in out[1:]))
                    print("=" * 80)

                    # CV-fold (CV = 4) cross-validation w/ LR
                    # NOTE(review): `cv` is expected to return the scores in the order
                    # train_mcc, val_mcc, train_bal_acc, val_bal_acc -- confirm in p53_helpers.
                    scores = cv(
                        model=resampling_pipe,
                        x=X_train_prepped,
                        y=Y_train,
                        cv=CV,
                        n_jobs=-1
                    )
                    out.extend(scores)

                    # write to results df
                    record_run(i, out)

                    i += 1


    # add as a reference: no resampling (per seed)
    out = [run_seed, "no_resampling", "no_resampling", "NA", "NA", "NA"]

    scores = cv(
        model=LogisticRegression(max_iter=5000, penalty="none", random_state=run_seed, class_weight="balanced"),
        x=X_train_prepped,
        y=Y_train,
        cv=CV,
        n_jobs=-1
    )

    out.extend(scores)

    record_run(i, out)

    i += 1

RandomOverSampler | RandomUnderSampler | 0.01 | 1 | 0.01
test_balanced_acc              0.72 (± 0.03)       [0.7, 0.75, 0.74, 0.68]
test_mcc                       0.45 (± 0.03)       [0.49, 0.44, 0.47, 0.41]
--------------------------------------------------------------------------------
Training MCC:                  1.00 (± 0.00)       [1.0, 1.0, 1.0, 1.0]

RandomOverSampler | RandomUnderSampler | 0.01 | 1.25 | 0.0125
test_balanced_acc              0.73 (± 0.03)       [0.78, 0.73, 0.71, 0.71]
test_mcc                       0.45 (± 0.08)       [0.58, 0.36, 0.44, 0.4]
--------------------------------------------------------------------------------
Training MCC:                  0.94 (± 0.01)       [0.92, 0.93, 0.95, 0.94]

RandomOverSampler | RandomUnderSampler | 0.01 | 1.5 | 0.015
test_balanced_acc              0.75 (± 0.02)       [0.75, 0.76, 0.71, 0.76]
test_mcc                       0.43 (± 0.06)       [0.52, 0.37, 0.39, 0.45]
-------------------------------------------------------



test_balanced_acc              0.77 (± 0.04)       [0.76, 0.83, 0.76, 0.71]
test_mcc                       0.44 (± 0.04)       [0.47, 0.44, 0.47, 0.38]
--------------------------------------------------------------------------------
Training MCC:                  0.84 (± 0.04)       [0.79, 0.83, 0.9, 0.84]

ADASYN | RandomUnderSampler | 0.4 | 2 | 0.8
test_balanced_acc              0.77 (± 0.04)       [0.75, 0.85, 0.77, 0.73]
test_mcc                       0.42 (± 0.02)       [0.4, 0.44, 0.45, 0.4]
--------------------------------------------------------------------------------
Training MCC:                  0.80 (± 0.03)       [0.75, 0.79, 0.85, 0.8]

SVMSMOTE | RandomUnderSampler | 0.01 | 1 | 0.01
test_balanced_acc              0.71 (± 0.02)       [0.7, 0.72, 0.72, 0.68]
test_mcc                       0.43 (± 0.04)       [0.44, 0.45, 0.46, 0.36]
--------------------------------------------------------------------------------
Training MCC:                  1.00 (± 0.00)       [1.0, 1.0

In [5]:
# Discard every row that still contains a missing value
# (e.g. pre-allocated rows that were never filled in).
results = results.dropna(axis="index")

# Show all runs ranked by validation MCC, best first (displayed as the cell output).
results.sort_values(by="val_mcc", ascending=False)

Unnamed: 0,seed,upsampler,downsampler,up_class_ratio,down_vs_up_ratio,final_class_ratio,train_mcc,val_mcc,train_bal_acc,val_bal_acc
15,1,RandomOverSampler,RandomUnderSampler,0.2,1,0.2,1,0.5138,1,0.746512
299,3,SVMSMOTE,RandomUnderSampler,0.4,1.5,0.6,0.880879,0.507897,0.999295,0.774926
187,2,SVMSMOTE,RandomUnderSampler,0.1,1.25,0.125,0.950308,0.500565,0.999738,0.762641
203,3,RandomOverSampler,RandomUnderSampler,0.01,1.25,0.0125,0.939727,0.498245,0.999677,0.763219
45,1,SMOTE,RandomUnderSampler,0.4,1,0.4,1,0.498079,1,0.746774
...,...,...,...,...,...,...,...,...,...,...
3,1,RandomOverSampler,RandomUnderSampler,0.01,1.75,0.0175,0.842108,0.400187,0.999006,0.741687
149,2,SMOTE,RandomUnderSampler,0.4,1.75,0.7,0.834865,0.399441,0.998918,0.745424
81,1,SVMSMOTE,RandomUnderSampler,0.05,1.25,0.0625,0.938648,0.398166,0.999671,0.717138
42,1,SMOTE,RandomUnderSampler,0.2,1.5,0.3,0.874591,0.397713,0.999254,0.721123


In [6]:
# Persist the results table (index included) to the working directory.
# Plain string literal: the former f-string had no placeholders.
results.to_csv("/kaggle/working/results_resampling.csv")