In [1]:
seed = 123

In [2]:
from shutil import copyfile

copyfile(
    src = "/kaggle/input/bdc-p53/p53_helper_functions.py", 
    dst = "/kaggle/working/p53_helpers.py"
)

from p53_helpers import *

# load dataset (no NAs)
X, Y = load_p53_ds()
# sratified 80/20 split
X_train, X_test, Y_train, Y_test = split_p53(X, Y)

Using TensorFlow backend.



Import completed after 3.7 sec

CLASS RATIOS
Training set active classes:   121/24927 (0.485 %)
Test set active classes:       30/6232 (0.481 %)

MATRIX DIMENSIONS
TRAINING SET
. Features:   (24927, 5408)
. Classes:    (24927,)
TEST SET
. Features:   (6232, 5408)
. Classes:    (6232,)


In [3]:
# make model and return the prop of the minority class to allow optimization the class weights
def make_model(
    x, y,
    u1=256,
    d1=0.5,
    lr=0.001,
    smart_init=True
):
    
    # define metrics
    metrics = [
        mcc,
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'), 
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc', curve="PR")
    ]
    
    if smart_init:
        n_active = y.sum()
        n_inactive = len(y) - n_active
        b0 = tf.keras.initializers.Constant(np.log([n_active / n_inactive]))
    else:
        b0=None # random initialization

    # build & compile
    model = keras.Sequential([
        keras.layers.Dense(u1, activation='relu', input_shape=(x.shape[-1], )),
        keras.layers.Dropout(d1),
        keras.layers.Dense(1, activation='sigmoid', bias_initializer=b0)
    ])
    
    model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics
    )
    
    return model

In [4]:
# to numpy arrays
X_train, Y_train = np.array(X_train), np.array(Y_train)

# Preprocessing
pipe = Pipeline([
    ("scale", StandardScaler()),
    ("clip", ClipFeatures(c=3))
])

X_train_prepped = pipe.fit_transform(X_train, Y_train)

In [5]:
# optimize class_weight hyperparameter after resampling

results = pd.DataFrame(
    index=np.arange(1000), columns=[
    "seed", "fold", "upsampler", "up_ratio", "cw_0",
    "precision", "recall", "auc", "mcc"]
)

i = 0
CV = 4

for seed in [1, 2, 3]:
    
    print("=" * 50)
    print("\nRandom seed", seed)
    print("=" * 50)
    
    SKF = StratifiedKFold(n_splits=CV, shuffle=True, random_state=seed)
    folds = SKF.split(X_train_prepped, Y_train)

    for f, (train_idx, val_idx) in enumerate(folds):

        print('\nFold ', f)
        print("-" * 50)

        X_t, Y_t = X_train_prepped[train_idx], Y_train[train_idx]
        X_v, Y_v = X_train_prepped[val_idx], Y_train[val_idx]

        for upsampler in [SVMSMOTE]:
            
            print("\nUpsampler: ", upsampler.__name__)
            
            for up_ratio in [0.1, 0.2]:
                
                print("up_ratio : ", up_ratio)
                
                up = upsampler(sampling_strategy=up_ratio, m_neighbors=10, k_neighbors=10)
                X_t_resampled, Y_t_resampled = up.fit_resample(X_t, Y_t)

                # optimize class weight 
                for cw_0 in [0, 0.1, 0.2, 0.3]:

                    model = make_model(x=X_t_resampled, y=Y_t_resampled, lr=0.001)

                    if cw_0:
                        cw = {0: cw_0, 1: 1 - cw_0}
                    else:
                        cw = None # cw_0 = 0 no csl

                    print(f"class_weight:       {str(cw)}")

                    early_stopping = tf.keras.callbacks.EarlyStopping(
                        monitor='val_mcc',
                        verbose=1,
                        patience=20,
                        mode='max',
                        restore_best_weights=True
                    )

                    out = model.fit(
                        X_t_resampled,
                        Y_t_resampled,
                        batch_size=4096,
                        epochs=75,
                        callbacks=[early_stopping],
                        validation_data=(X_v, Y_v),
                        verbose=0,
                        class_weight=cw
                    )

                    summary=[seed, f, upsampler.__name__, up_ratio, cw_0]

                    scores = [
                        out.history["val_" + m][-1]
                        for m in ["precision", "recall", "auc", "mcc"]
                    ]

                    summary.extend(scores)

                    for col, value in zip(results.columns, summary):
                        results[col][i] = value

                    i += 1


Random seed 1

Fold  0
--------------------------------------------------

Upsampler:  SVMSMOTE
up_ratio :  0.1
class_weight:       None
Restoring model weights from the end of the best epoch.
Epoch 00062: early stopping
class_weight:       {0: 0.1, 1: 0.9}
Restoring model weights from the end of the best epoch.
Epoch 00046: early stopping
class_weight:       {0: 0.2, 1: 0.8}
Restoring model weights from the end of the best epoch.
Epoch 00037: early stopping
class_weight:       {0: 0.3, 1: 0.7}
Restoring model weights from the end of the best epoch.
Epoch 00067: early stopping
up_ratio :  0.2
class_weight:       None
Restoring model weights from the end of the best epoch.
Epoch 00043: early stopping
class_weight:       {0: 0.1, 1: 0.9}
Restoring model weights from the end of the best epoch.
Epoch 00041: early stopping
class_weight:       {0: 0.2, 1: 0.8}
Restoring model weights from the end of the best epoch.
Epoch 00045: early stopping
class_weight:       {0: 0.3, 1: 0.7}
Restoring m

In [6]:
results = results.dropna(axis=0)

results.sort_values("mcc", ascending=False)

Unnamed: 0,seed,fold,upsampler,up_ratio,cw_0,precision,recall,auc,mcc
18,1,2,SVMSMOTE,0.1,0.2,0.814815,0.709677,0.648853,0.778309
17,1,2,SVMSMOTE,0.1,0.1,0.807692,0.677419,0.682786,0.759926
23,1,2,SVMSMOTE,0.2,0.3,0.807692,0.677419,0.61199,0.759926
22,1,2,SVMSMOTE,0.2,0.2,0.758621,0.709677,0.610076,0.758493
21,1,2,SVMSMOTE,0.2,0.1,0.647059,0.709677,0.606268,0.720779
...,...,...,...,...,...,...,...,...,...
27,1,3,SVMSMOTE,0.1,0.3,0.461538,0.4,0.313962,0.429432
28,1,3,SVMSMOTE,0.2,0,0.413793,0.4,0.304321,0.42484
25,1,3,SVMSMOTE,0.1,0.1,0.378378,0.466667,0.254256,0.423925
26,1,3,SVMSMOTE,0.1,0.2,0.393939,0.433333,0.306224,0.420391


In [7]:
results.to_csv(f"/kaggle/working/results_NN_CSL_resampled.csv")