### Build env for this notebook (temp. fix):

```
conda env create -n dnn_cpu python=3.12 numpy pandas scikit-learn tensorflow-cpu
conda activate dnn_cpu
```

In [45]:
import pickle
with open("../data/tmp/ml_data_at_sugaramino.pickle", "rb") as handle:
    ml_datasets = pickle.load(handle)

In [46]:
feature_name, X, y, sample_names, feature_names = ml_datasets[11]
feature_name

'3Di_KMER2'

In [61]:
from tensorflow import keras


def create_model(input_shape):
    # TODO possibly more layers, try without dropout
    model = keras.Sequential(
        [
            keras.layers.Input(shape=(input_shape,)),
            keras.layers.Dense(512, activation="relu"),
            keras.layers.Dropout(0.5),
            keras.layers.Dense(256, activation="relu"),
            keras.layers.Dropout(0.5),
            keras.layers.Dense(128, activation="relu"),
            keras.layers.Dense(1, activation="sigmoid"),
        ]
    )

    model.compile(
        optimizer="adam",  # Adam(learning_rate=0.001)
        loss="binary_crossentropy",
        metrics=["Precision", "Recall", "AUC"],
    )  # ['accuracy', keras.metrics.Precision(), keras.metrics.Recall(), keras.metrics.AUC()])
    #     metrics=['Precision', 'Recall', 'AUC']

    # TODO class weights
    # TODO early stopping?
    # TODO macro
    return model


In [62]:
X.shape

(54, 400)

In [None]:
import pandas as pd
import numpy as np
# from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (
    cross_val_score,
    RepeatedStratifiedKFold,
    StratifiedKFold,
)
from sklearn.metrics import f1_score, balanced_accuracy_score
def nested_crossval(feature_name, X, y, sample_names, feature_names):
    print(f"=== {feature_name} ===")
    # model = make_pipeline(VarianceThreshold(), StandardScaler(), SelectKBest(), SVC())
    preprocess = make_pipeline(VarianceThreshold(0.0), StandardScaler())

    history_logs = []
    scores = []

    # scale : 1 / (n_features * X.var()). 
    # larger variance and more features leads to a smoother decision boundary, 
    # where each sample has less influence
    # TODO 3-fold cv?
    # TODO try LOO
    splits=3
    repeats=10
    cross_validator = RepeatedStratifiedKFold(n_splits=splits, n_repeats=repeats)
    for fold_count, (train_idx, val_idx) in enumerate(cross_validator.split(X, y)):
        print(f"Fold {fold_count+1} out of {splits*repeats}")

        X_train, X_test = X[train_idx], X[val_idx]
        y_train, y_test = y[train_idx], y[val_idx]

        X_train = preprocess.fit_transform(X_train, y_train)
        X_test = preprocess.transform(X_test)

        model = create_model(X_train.shape[1])

        # TODO tune validation split?
        history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_split=0.2, verbose=0)
        # TODO balanced_accuracy for direct compare to SVM
        # TODO macro averaged?? maybe use sklearn score
        res = model.evaluate(X_test, y_test)
        print(res)
        # score = model.score(X_test, y_test)
        scores.append(res)
        # history_logs.append(history)
        # print(res)

    return scores


In [64]:
scores = nested_crossval(*ml_datasets[11])

=== 3Di_KMER2 ===
Fold 1 out of 30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - AUC: 1.0000 - Precision: 1.0000 - Recall: 1.0000 - loss: 0.0242
[0.024161580950021744, 1.0, 1.0, 1.0]
Fold 2 out of 30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - AUC: 1.0000 - Precision: 1.0000 - Recall: 1.0000 - loss: 0.0163
[0.016255078837275505, 1.0, 1.0, 1.0]
Fold 3 out of 30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - AUC: 1.0000 - Precision: 0.9000 - Recall: 1.0000 - loss: 0.0568
[0.056788042187690735, 0.8999999761581421, 1.0, 1.0]
Fold 4 out of 30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - AUC: 1.0000 - Precision: 0.9091 - Recall: 1.0000 - loss: 0.1146
[0.11458232998847961, 0.9090909361839294, 1.0, 1.0]
Fold 5 out of 30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - AUC: 1.0000 - Precision: 0.9000 - Recall: 1.0000 - loss: 0.0907
[0.09073749929666519, 0.89

In [52]:
scores = np.array(scores)
print(scores.shape)
print(
    f"Loss {scores[:,0].mean():.2f}+-{scores[:,0].std():.2f}, Precision {scores[:,1].mean():.2f}+-{scores[:,1].std():.2f} Recall {scores[:,2].mean():.2f}+-{scores[:,2].std():.2f}, AUC {scores[:,3].mean():.2f}+-{scores[:,3].std():.2f}"
)

(30, 4)
Loss 0.12+-0.12, Precision 0.94+-0.06 Recall 0.97+-0.06, AUC 1.00+-0.01


In [53]:
# from sklearn.utils import class_weight
# import numpy as np

# class_weights = class_weight.compute_class_weight(
#     class_weight='balanced',
#     classes=np.unique(y_train),
#     y=y_train
# )
# class_weights_dict = dict(enumerate(class_weights))

# model.fit(X_train, y_train, 
#           epochs=20, 
#           batch_size=32, 
#           class_weight=class_weights_dict, 
#           validation_data=(X_val, y_val))

In [54]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

np.random.seed(0)
tf.random.set_seed(0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = keras.Sequential([
    keras.layers.Input(shape=(len(feature_names),)),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=["Accuracy","Precision", "Recall", "AUC"])

history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_split=0.2)

res = model.evaluate(X_test, y_test)
# print(f"Test Accuracy: {test_acc:.4f}")
print(res)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - AUC: 0.5263 - Accuracy: 0.4412 - Precision: 0.5000 - Recall: 0.1579 - loss: 0.8281 - val_AUC: 0.8571 - val_Accuracy: 0.7778 - val_Precision: 0.5000 - val_Recall: 0.5000 - val_loss: 0.5743
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - AUC: 0.7456 - Accuracy: 0.6765 - Precision: 0.7500 - Recall: 0.6316 - loss: 0.6137 - val_AUC: 0.9286 - val_Accuracy: 0.6667 - val_Precision: 0.4000 - val_Recall: 1.0000 - val_loss: 0.6309
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - AUC: 0.7228 - Accuracy: 0.6765 - Precision: 0.6818 - Recall: 0.7895 - loss: 0.6050 - val_AUC: 0.8571 - val_Accuracy: 0.5556 - val_Precision: 0.3333 - val_Recall: 1.0000 - val_loss: 0.6332
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - AUC: 0.9772 - Accuracy: 0.8824 - Precision: 0.8261 - Recall: 1.0000 - loss: 0.4001 - val_AUC: 