### Build env for this notebook (temp. fix):

```
conda env create -n dnn_cpu python=3.12 numpy pandas scikit-learn tensorflow-cpu
conda activate dnn_cpu
```

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
# from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (
    cross_val_score,
    RepeatedStratifiedKFold,
    StratifiedKFold,
)
from sklearn.metrics import f1_score, balanced_accuracy_score, make_scorer
np.random.seed(0)
tf.random.set_seed(0)

In [1]:
import pickle
with open("../data/tmp_data/svc_at_sugaramino_ml_data.pickle", "rb") as handle:
    ml_datasets = pickle.load(handle)

In [3]:
ml_datasets[12]

Name: PROSTT5_3DI, Features: 1024, Samples: 54, Classes: [0 1], Labels: ['amino acid transmembrane transporter activity'
 'sugar transmembrane transporter activity'])

0 DUMMY
1 AAC
2 PAAC
3 AA_KMER3
4 3Di_COMP
5 3Di_KMER2
6 3Di_KMER3
7 COMB_KMER1
8 COMB_KMER2
9 COMB_KMER3
10 PROTT5_AA
11 PROSTT5_AA
12 PROSTT5_3DI
13 PSSM_50_1
14 PSSM_50_3
15 PSSM_90_1
16 PSSM_90_3
17 PSSM_META
18 META
19 META_STD


In [None]:
from tensorflow import keras

# TODO test other model

# model = Sequential([
#     Dense(512, activation='relu', input_shape=(1024,)),
#     BatchNormalization(),
#     Dropout(0.3),

#     Dense(128, activation='relu'),
#     Dropout(0.3),

#     Dense(1, activation='sigmoid') 
# ])

# model.compile(
#     optimizer=Adam(learning_rate=1e-3),
#     loss=BinaryCrossentropy(),
#     metrics=[BinaryAccuracy()]
# )

def create_model(input_shape):
    # TODO possibly more layers, try without dropout
    model = keras.Sequential(
        [
            keras.layers.Input(shape=(input_shape,)),
            keras.layers.Dense(512, activation="relu"),
            keras.layers.Dropout(0.5),
            keras.layers.Dense(256, activation="relu"),
            keras.layers.Dropout(0.5),
            keras.layers.Dense(128, activation="relu"),
            keras.layers.Dense(1, activation="sigmoid"),
        ]
    )

    model.compile(
        optimizer="adam",  # Adam(learning_rate=0.001)
        loss="binary_crossentropy",
        metrics=[
            keras.metrics.F1Score(average="macro"),
            keras.metrics.FalseNegatives(),
            keras.metrics.FalsePositives(),
            keras.metrics.TruePositives(),
            keras.metrics.TrueNegatives(),
        ],
    )  # ['accuracy', keras.metrics.Precision(), keras.metrics.Recall(), keras.metrics.AUC()])
    #     metrics=['Precision', 'Recall', 'AUC']

    # TODO try BatchNormalization
    # TODO class weights
    # TODO early stopping?
    # TODO macro
    return model

In [None]:
def nested_crossval(ml_dataset, model, scores_dict, splits=5, repeats=5):
    print(f"=== {ml_dataset.name} ===")
    preprocess = make_pipeline(VarianceThreshold(0.0), StandardScaler())


    X,y = ml_dataset.X, ml_dataset.y

    train_scores = list()
    test_scores = list() #

    cross_validator = RepeatedStratifiedKFold(n_splits=splits, n_repeats=repeats)
    for fold_count, (train_idx, val_idx) in enumerate(cross_validator.split(X, y)):
        print(f"Fold {fold_count+1} out of {splits*repeats}")

        X_train, X_test = X[train_idx], X[val_idx]
        y_train, y_test = y[train_idx], y[val_idx]

        X_train = preprocess.fit_transform(X_train, y_train)
        X_test = preprocess.transform(X_test)

        # TODO batch size epochs
        history = model.fit(X_train, y_train.reshape(-1, 1), epochs=50, batch_size=8, validation_split=0.2, verbose=0)
        # TODO balanced_accuracy for direct compare to SVM
        # TODO macro averaged?? maybe use sklearn score
        res = model.evaluate(X_test, y_test.reshape(-1, 1))
        y_prob = model.predict(X_test)
        y_pred = (y_prob > 0.5).astype(int).flatten()

        for score_name, score_func in scores_dict.items():
            print(score_name, score_func(y_test, y_pred))

        # print("amino",f1_score(y_true=y_test, y_pred=y_pred, pos_label=0))
        # print("sugar",f1_score(y_true=y_test, y_pred=y_pred, pos_label=1))
        test_score_model = model.score(X_test, y_test)
        scores.append(res)
        # history_logs.append(history)
        # print(res)

    return scores


In [122]:
scoring_outer = {
    "Balanced Accuracy": balanced_accuracy_score,
    "F1 Macro": lambda y_test,y_pred: f1_score(y_true=y_test, y_pred=y_pred, average="macro"),
    "F1 Amino Acid": lambda y_test,y_pred: f1_score(y_true=y_test, y_pred=y_pred, pos_label=0),
    "F1 Sugar": lambda y_test,y_pred: f1_score(y_true=y_test, y_pred=y_pred, pos_label=1),
}

In [123]:
for pos, ml_dataset in enumerate(ml_datasets):
    print(pos, ml_dataset.name)

0 DUMMY
1 AAC
2 PAAC
3 AA_KMER3
4 3Di_COMP
5 3Di_KMER2
6 3Di_KMER3
7 COMB_KMER1
8 COMB_KMER2
9 COMB_KMER3
10 PROTT5_AA
11 PROSTT5_AA
12 PROSTT5_3DI
13 PSSM_50_1
14 PSSM_50_3
15 PSSM_90_1
16 PSSM_90_3
17 PSSM_META
18 META
19 META_STD


In [124]:
ml_datasets[10]

Name: PROTT5_AA, Features: 1024, Samples: 54, Classes: [0 1], Labels: ['amino acid transmembrane transporter activity'
 'sugar transmembrane transporter activity'])

In [125]:
prott5 = ml_datasets[10]
model = create_model(1024)
scores = nested_crossval(ml_dataset=prott5, model=model, scores_dict=scoring_outer)

=== PROTT5_AA ===
Fold 1 out of 25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - f1_score: 0.8571 - false_negatives_10: 0.0000e+00 - false_positives_10: 1.0000 - loss: 1.4055 - true_negatives_10: 4.0000 - true_positives_10: 6.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Balanced Accuracy 0.9
F1 Macro 0.905982905982906
F1 Amino Acid 0.8888888888888888
F1 Sugar 0.9230769230769231
Fold 2 out of 25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - f1_score: 1.0000 - false_negatives_10: 0.0000e+00 - false_positives_10: 0.0000e+00 - loss: 6.1944e-11 - true_negatives_10: 5.0000 - true_positives_10: 6.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Balanced Accuracy 1.0
F1 Macro 1.0
F1 Amino Acid 1.0
F1 Sugar 1.0
Fold 3 out of 25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - f1_score: 1.0000 - false_negatives_10: 0.0000e+00 - false_positives_10: 0.0000e+0

KeyboardInterrupt: 

In [None]:
scores = np.array(scores)
print(scores.shape)
print(
    f"Loss {scores[:,0].mean():.2f}+-{scores[:,0].std():.2f}, Precision {scores[:,1].mean():.2f}+-{scores[:,1].std():.2f} Recall {scores[:,2].mean():.2f}+-{scores[:,2].std():.2f}, AUC {scores[:,3].mean():.2f}+-{scores[:,3].std():.2f}"
)

(50, 4)
Loss 0.14+-0.22, Precision 0.96+-0.07 Recall 0.97+-0.07, AUC 0.99+-0.03


In [None]:
# from sklearn.utils import class_weight
# import numpy as np

# class_weights = class_weight.compute_class_weight(
#     class_weight='balanced',
#     classes=np.unique(y_train),
#     y=y_train
# )
# class_weights_dict = dict(enumerate(class_weights))

# model.fit(X_train, y_train, 
#           epochs=20, 
#           batch_size=32, 
#           class_weight=class_weights_dict, 
#           validation_data=(X_val, y_val))

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

np.random.seed(0)
tf.random.set_seed(0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = keras.Sequential([
    keras.layers.Input(shape=(len(feature_names),)),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=["Accuracy","Precision", "Recall", "AUC"])

history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_split=0.2)

res = model.evaluate(X_test, y_test)
# print(f"Test Accuracy: {test_acc:.4f}")
print(res)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - AUC: 0.5263 - Accuracy: 0.4412 - Precision: 0.5000 - Recall: 0.1579 - loss: 0.8281 - val_AUC: 0.8571 - val_Accuracy: 0.7778 - val_Precision: 0.5000 - val_Recall: 0.5000 - val_loss: 0.5743
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - AUC: 0.7456 - Accuracy: 0.6765 - Precision: 0.7500 - Recall: 0.6316 - loss: 0.6137 - val_AUC: 0.9286 - val_Accuracy: 0.6667 - val_Precision: 0.4000 - val_Recall: 1.0000 - val_loss: 0.6309
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - AUC: 0.7228 - Accuracy: 0.6765 - Precision: 0.6818 - Recall: 0.7895 - loss: 0.6050 - val_AUC: 0.8571 - val_Accuracy: 0.5556 - val_Precision: 0.3333 - val_Recall: 1.0000 - val_loss: 0.6332
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - AUC: 0.9772 - Accuracy: 0.8824 - Precision: 0.8261 - Recall: 1.0000 - loss: 0.4001 - val_AUC: 