In [1]:
seed = 123

In [2]:
from shutil import copyfile

copyfile(
    src = "/kaggle/input/bdc-p53/p53_helper_functions.py", 
    dst = "/kaggle/working/p53_helpers.py"
)

from p53_helpers import *

# load dataset (no NAs)
X, Y = load_p53_ds()
# sratified 80/20 split
X_train, X_test, Y_train, Y_test = split_p53(X, Y)

Using TensorFlow backend.



Import completed after 4.5 sec

CLASS RATIOS
Training set active classes:   121/24927 (0.485 %)
Test set active classes:       30/6232 (0.481 %)

MATRIX DIMENSIONS
TRAINING SET
. Features:   (24927, 5408)
. Classes:    (24927,)
TEST SET
. Features:   (6232, 5408)
. Classes:    (6232,)


In [3]:
# make model and return the prop of the minority class to allow optimization the class weights
def make_model(
    x, y,
    u1=256,
    d1=0.5,
    lr=0.001,
    smart_init=True
):
    
    # define metrics
    metrics = [
        mcc,
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'), 
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc', curve="PR")
    ]
    
    if smart_init:
        n_active = y.sum()
        n_inactive = len(y) - n_active
        b0 = tf.keras.initializers.Constant(np.log([n_active / n_inactive]))
    else:
        b0=None # random initialization

    # build & compile
    model = keras.Sequential([
        keras.layers.Dense(u1, activation='relu', input_shape=(x.shape[-1], )),
        keras.layers.Dropout(d1),
        keras.layers.Dense(1, activation='sigmoid', bias_initializer=b0)
    ])
    
    model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics
    )
    
    return model

In [4]:
# to numpy arrays
X_train, Y_train = np.array(X_train), np.array(Y_train)

# Preprocessing
pipe = Pipeline([
    ("scale", StandardScaler()),
    ("clip", ClipFeatures(c=3))
])

X_train_prepped = pipe.fit_transform(X_train, Y_train)

In [5]:
# optimize class_weight hyperparameter after resampling

results = pd.DataFrame(
    index=np.arange(1000), columns=[
    "seed", "fold", "upsampler", "up_ratio", "cw_0",
    "precision", "recall", "auc", "mcc"]
)

i = 0
CV = 4

for seed in [1, 2, 3]:
    
    print("=" * 50)
    print("Random seed", seed)
    print("=" * 50)
    
    SKF = StratifiedKFold(n_splits=CV, shuffle=True, random_state=seed)
    folds = SKF.split(X_train_prepped, Y_train)

    for f, (train_idx, val_idx) in enumerate(folds):

        print('\nFold ', f)
        print("-" * 50)

        X_t, Y_t = X_train_prepped[train_idx], Y_train[train_idx]
        X_v, Y_v = X_train_prepped[val_idx], Y_train[val_idx]

        # optimize class weight 
        for cw_0 in [0.2, 0.3, 0.4, 0.5]:

            model = make_model(x=X_t, y=Y_t, lr=0.001)

            if cw_0:
                cw = {0: cw_0, 1: 1 - cw_0}
            else:
                cw = None # cw_0 = 0 no csl

            print(f"class_weight:       {str(cw)}")

            early_stopping = tf.keras.callbacks.EarlyStopping(
                monitor='val_mcc',
                verbose=1,
                patience=20,
                mode='max',
                restore_best_weights=True
            )

            out = model.fit(
                X_t,
                Y_t,
                batch_size=4096,
                epochs=75,
                callbacks=[early_stopping],
                validation_data=(X_v, Y_v),
                verbose=0,
                class_weight=cw
            )

            summary=[seed, f, "no_resampling", "no_resampling", cw_0]

            scores = [
                out.history["val_" + m][-1]
                for m in ["precision", "recall", "auc", "mcc"]
            ]
            
            print(f"Val fold MCC:       {scores[-1]:.2f}")
            
            summary.extend(scores)

            for col, value in zip(results.columns, summary):
                results[col][i] = value

            i += 1

Random seed 1

Fold  0
--------------------------------------------------
class_weight:       {0: 0.2, 1: 0.8}
Restoring model weights from the end of the best epoch.
Epoch 00038: early stopping
Val fold MCC:       0.47
class_weight:       {0: 0.3, 1: 0.7}
Restoring model weights from the end of the best epoch.
Epoch 00043: early stopping
Val fold MCC:       0.53
class_weight:       {0: 0.4, 1: 0.6}
Restoring model weights from the end of the best epoch.
Epoch 00040: early stopping
Val fold MCC:       0.56
class_weight:       {0: 0.5, 1: 0.5}
Restoring model weights from the end of the best epoch.
Epoch 00041: early stopping
Val fold MCC:       0.53

Fold  1
--------------------------------------------------
class_weight:       {0: 0.2, 1: 0.8}
Restoring model weights from the end of the best epoch.
Epoch 00053: early stopping
Val fold MCC:       0.52
class_weight:       {0: 0.3, 1: 0.7}
Restoring model weights from the end of the best epoch.
Epoch 00040: early stopping
Val fold MCC:  

In [6]:
results = results.dropna(axis=0)

results.sort_values("mcc", ascending=False)

Unnamed: 0,seed,fold,upsampler,up_ratio,cw_0,precision,recall,auc,mcc
10,1,2,no_resampling,no_resampling,0.4,0.842105,0.516129,0.617497,0.67389
9,1,2,no_resampling,no_resampling,0.3,0.8,0.516129,0.625074,0.660289
45,3,3,no_resampling,no_resampling,0.3,0.68,0.566667,0.563631,0.652744
44,3,3,no_resampling,no_resampling,0.2,0.642857,0.6,0.570328,0.642139
8,1,2,no_resampling,no_resampling,0.2,0.761905,0.516129,0.618224,0.640393
46,3,3,no_resampling,no_resampling,0.4,0.695652,0.533333,0.585067,0.638138
36,3,1,no_resampling,no_resampling,0.2,0.714286,0.5,0.607992,0.63239
39,3,1,no_resampling,no_resampling,0.5,0.777778,0.466667,0.57215,0.630167
37,3,1,no_resampling,no_resampling,0.3,0.681818,0.5,0.607717,0.623213
25,2,2,no_resampling,no_resampling,0.3,0.714286,0.483871,0.529204,0.610179


In [7]:
results.to_csv(f"/kaggle/working/results_NN_CSL_no_resampling.csv")