In [1]:
seed = 123

In [2]:
# import helper functions
from shutil import copyfile

copyfile(
    src = "/kaggle/input/bdc-p53/p53_helper_functions.py", 
    dst = "/kaggle/working/p53_helpers.py"
)

from p53_helpers import *

# load dataset (no NAs)
X, Y = load_p53_ds()

# sratified 80/20 split
X_train, X_test, Y_train, Y_test = split_p53(X, Y)

Using TensorFlow backend.



Import completed after 3.1 sec

CLASS RATIOS
Training set active classes:   121/24927 (0.485 %)
Test set active classes:       30/6232 (0.481 %)

MATRIX DIMENSIONS
TRAINING SET
. Features:   (24927, 5408)
. Classes:    (24927,)
TEST SET
. Features:   (6232, 5408)
. Classes:    (6232,)


In [3]:
def make_model(
    x, y,
    u1=256,
    d1=0.5,
    lr=0.001,
    smart_init=True
):
    
    # define metrics
    metrics = [
        mcc,
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'), 
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc', curve="PR")
    ]
    
    if smart_init:
        n_active = y.sum()
        n_inactive = len(y) - n_active
        b0 = tf.keras.initializers.Constant(np.log([n_active / n_inactive]))
    else:
        b0=None # random initialization

    # build & compile
    model = keras.Sequential([
        keras.layers.Dense(u1, activation='relu', input_shape=(x.shape[-1], )),
        keras.layers.Dropout(d1),
        keras.layers.Dense(1, activation='sigmoid', bias_initializer=b0)
    ])
    
    model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics
    )
    
    return model

In [4]:
CV = 4

# to numpy arrays
X_train, Y_train = np.array(X_train), np.array(Y_train)

# init results df
results = pd.DataFrame(
    index=np.arange(5000), 
    columns=[
        "i", "fold", "n_remove", "scaler", "clip", "u1", "d1", "batch", "epoch", "patience", "lr",
        "precision", "recall", "auc", "mcc"
    ]
)

i = 0

SKF = StratifiedKFold(n_splits=CV, shuffle=True, random_state=seed)
folds = SKF.split(X_train, Y_train)

for f, (train_idx, val_idx) in enumerate(folds):
    
    print('\nFold ', f)
    
    X_t, Y_t = X_train[train_idx], Y_train[train_idx]
    X_v, Y_v = X_train[val_idx], Y_train[val_idx]

    for n_remove in [0]:
        
        if n_remove:
            rm = RemoveCorrelatedFeatures(n_remove=n_remove)
            X_t_ = rm.fit_transform(X_t)
            X_v_ = rm.transform(X_v)
            
        else:
            X_t_, X_v_ = X_t, X_v

        # scale
        scaler = StandardScaler()
        X_t_scaled = scaler.fit_transform(X_t_)
        X_v_scaled = scaler.transform(X_v_)
        scaler_name = scaler.__class__.__name__
        
        for clip in [None, 3, 5]:

            if clip:
                for x in [X_t_scaled, X_v_scaled]:
                    np.clip(x, -clip, clip, out=x)
            
            u1, d1 = 256, 0.5
            batch_size = 4096
            epoch = 75
            patience = 20

            for lr in [0.01, 0.005, 0.001]:

                early_stopping = tf.keras.callbacks.EarlyStopping(
                    monitor='val_mcc',
                    verbose=1,
                    patience=patience,
                    mode='max',
                    restore_best_weights=True
                )

                model = make_model(x=X_t_scaled, y=Y_t, lr=lr)

                out = model.fit(
                    X_t_scaled,
                    Y_t,
                    batch_size=batch_size,
                    epochs=epoch,
                    callbacks=[early_stopping],
                    validation_data=(X_v_scaled, Y_v)
                )

                summary = [i, f, n_remove, scaler_name, str(clip), u1, d1, batch_size, epoch, patience, lr]
                scores = [
                    out.history["val_" + m][-1]
                    for m in ["precision", "recall", "auc", "mcc"]
                ]
                summary.extend(scores)

                for col, value in zip(results.columns, summary):
                    results[col][i] = value

                i += 1


Fold  0
Train on 18695 samples, validate on 6232 samples
Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 00033: early stopping
Train on 18695 samples, validate on 6232 samples
Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 4

In [5]:
results = results.dropna(axis=0)
results

Unnamed: 0,i,fold,n_remove,scaler,clip,u1,d1,batch,epoch,patience,lr,precision,recall,auc,mcc
0,0,0,0,StandardScaler,,256,0.5,4096,75,20,0.01,0.733333,0.366667,0.369028,0.527347
1,1,0,0,StandardScaler,,256,0.5,4096,75,20,0.005,0.785714,0.366667,0.354943,0.543207
2,2,0,0,StandardScaler,,256,0.5,4096,75,20,0.001,0.705882,0.4,0.449071,0.540245
3,3,0,0,StandardScaler,3.0,256,0.5,4096,75,20,0.01,0.857143,0.2,0.334897,0.425686
4,4,0,0,StandardScaler,3.0,256,0.5,4096,75,20,0.005,0.611111,0.366667,0.319881,0.490657
5,5,0,0,StandardScaler,3.0,256,0.5,4096,75,20,0.001,0.666667,0.4,0.510447,0.528053
6,6,0,0,StandardScaler,5.0,256,0.5,4096,75,20,0.01,0.705882,0.4,0.43861,0.549365
7,7,0,0,StandardScaler,5.0,256,0.5,4096,75,20,0.005,0.6875,0.366667,0.436673,0.506759
8,8,0,0,StandardScaler,5.0,256,0.5,4096,75,20,0.001,0.666667,0.4,0.47348,0.528053
9,9,1,0,StandardScaler,,256,0.5,4096,75,20,0.01,0.5,0.4,0.353625,0.451157


In [6]:
results.to_csv("results.csv")