In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import kerastuner
import tensorflow
import seaborn as sns
import pandas as pd

from tensorflow import keras

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import rdMolDescriptors

from sklearn import datasets, metrics
from sklearn.metrics import auc, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import StandardScaler



from scipy import interp
from tensorflow.keras.layers import Embedding, Dense 
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


from kerastuner.tuners import BayesianOptimization

from tensorflow.keras import backend as K 
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras import initializers

2024-03-26 09:48:52.692663: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-26 09:48:52.692723: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-26 09:48:52.698490: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-26 09:48:52.713116: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  import kerastuner


In [2]:
import os
# Show the kernel's current working directory; the relative CSV paths
# ('train_df2.csv', 'test_df2.csv') read in the next cell resolve against it.
os.getcwd()

'/data/home/ldhyun7222/hERGAT/NN with self-attention'

## Prepare the dataset

In [3]:
# Read both splits and discard the two SMILES text columns straight away,
# leaving the remaining feature columns plus the 'Class' label.
train_df = pd.read_csv('train_df2.csv').drop(columns=['SMILES', 'cano_smiles'])
test_df = pd.read_csv('test_df2.csv').drop(columns=['SMILES', 'cano_smiles'])

# Features / label for each individual split.
x_train, y_train = train_df.drop(columns='Class'), train_df['Class']
x_test, y_test = test_df.drop(columns='Class'), test_df['Class']

# Stack train on top of test and renumber the rows 0..n-1; this pooled frame
# is what the cross-validation further down splits into folds.
X_df = pd.concat([train_df, test_df], axis=0).reset_index().drop(columns='index')
X, y = X_df.drop(columns='Class'), X_df['Class']

# Sanity check: rows x feature columns, then number of labels.
for pooled_part in (X, y):
    print(pooled_part.shape)

(9238, 1029)
(9238,)


In [4]:
# Kept as a cell-local import, as in the original cell.
from tensorflow.keras import regularizers

# --- Training hyperparameters ---
batch_size = 32
epochs = 100

# Number of feature columns fed to the network.
input_dim = x_train.shape[1]

# Weight initialiser (He normal) used by the hidden Dense layers.
initializer = tf.keras.initializers.HeNormal()

# L2 weight penalty with factor 0.001.
regularizer = regularizers.l2(0.001)

# Callbacks: write the model to "hERGattention.h5" whenever val_loss improves,
# and stop training once val_loss has not improved for 30 epochs.
callbacks = [
    ModelCheckpoint("hERGattention.h5", monitor="val_loss", save_best_only=True),
    EarlyStopping(monitor="val_loss", patience=30, verbose=1),
]

In [5]:
# PregTaboo
# These cell-local imports are retained so the names they bind stay available;
# the layers below are all built through `tf.keras.layers` for consistency.
from keras.layers import Dense, Dropout, MultiHeadAttention
from keras import backend as K


# Model definition: feature-wise attention gate followed by a small MLP.
inputs = tf.keras.layers.Input(shape=(input_dim,))

# Attention gate: a linear layer scores every input feature, softmax turns the
# scores into per-sample weights over the feature axis, and the input is
# rescaled element-wise by those weights.
dense_v = tf.keras.layers.Dense(input_dim, activation=None)(inputs)
attn_score = tf.keras.layers.Softmax(axis=-1)(dense_v)
cal_score = tf.math.multiply(inputs, attn_score)

Dense1 = tf.keras.layers.Dense(512, activation='relu', kernel_initializer=initializer)(cal_score)
Dense1_BN = tf.keras.layers.BatchNormalization()(Dense1)
# Named `Dropout_layer` (not `Dropout`) so the imported Dropout class is not
# overwritten by a tensor; same name as the cross-validation cell uses.
Dropout_layer = tf.keras.layers.Dropout(rate=0.25)(Dense1_BN)

Dense2 = tf.keras.layers.Dense(256, activation='relu', kernel_initializer=initializer, kernel_regularizer=regularizer)(Dropout_layer)
Dense2_BN = tf.keras.layers.BatchNormalization()(Dense2)
Dense3 = tf.keras.layers.Dense(16, activation='relu', kernel_initializer=initializer, kernel_regularizer=regularizer)(Dense2_BN)
Dense3_BN = tf.keras.layers.BatchNormalization()(Dense3)

# Single sigmoid unit: output is the predicted probability of the positive class.
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(Dense3_BN)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1029)]               0         []                            
                                                                                                  
 dense (Dense)               (None, 1029)                 1059870   ['input_1[0][0]']             
                                                                                                  
 softmax (Softmax)           (None, 1029)                 0         ['dense[0][0]']               
                                                                                                  
 tf.math.multiply (TFOpLamb  (None, 1029)                 0         ['input_1[0][0]',             
 da)                                                                 'softmax[0][0]']         

2024-03-26 09:49:02.205221: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [6]:
from tensorflow.keras.optimizers import Adam

# Binary cross-entropy matches the model's single sigmoid output; accuracy is
# tracked as the training metric.
model.compile(
    loss="binary_crossentropy",
    metrics=["accuracy"],
    optimizer=Adam(beta_1=0.9, beta_2=0.999, learning_rate=0.001),
)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    auc,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
)
import numpy as np

# Per-fold scalar scores
accuracies = []
precisions = []
recalls = []
roc_scores = []
pr_aucs = []
f1_scores = []

# Per-fold curve data for the plots in the following cells
tpr_list = []
fpr_list = []
roc_auc_list = []
precision_list = []
recall_list = []
pr_auc_list = []

# Stratified 10-fold cross-validation over the pooled train+test data
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

for train_index, test_index in skf.split(X, y):
    # NOTE(review): these names overwrite the hold-out y_train / y_test
    # created in the data-preparation cell.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Build a fresh network for this fold on top of the shared `inputs` tensor:
    # feature-wise softmax gate followed by three Dense/BatchNorm stages.
    dense_v = tf.keras.layers.Dense(input_dim, activation=None, kernel_initializer=tf.keras.initializers.HeNormal(seed=42))(inputs)
    attn_score = tf.keras.layers.Softmax(axis=-1)(dense_v)
    cal_score = tf.math.multiply(inputs, attn_score)

    Dense1 = tf.keras.layers.Dense(512, activation='relu', kernel_initializer=tf.keras.initializers.HeNormal(seed=42))(cal_score)
    Dense1_BN = tf.keras.layers.BatchNormalization()(Dense1)
    Dropout_layer = tf.keras.layers.Dropout(rate=0.25)(Dense1_BN)

    Dense2 = tf.keras.layers.Dense(256, activation='relu', kernel_initializer=tf.keras.initializers.HeNormal(seed=42), kernel_regularizer=regularizer)(Dropout_layer)
    Dense2_BN = tf.keras.layers.BatchNormalization()(Dense2)

    Dense3 = tf.keras.layers.Dense(16, activation='relu', kernel_initializer=tf.keras.initializers.HeNormal(seed=42), kernel_regularizer=regularizer)(Dense2_BN)
    Dense3_BN = tf.keras.layers.BatchNormalization()(Dense3)

    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(Dense3_BN)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    # Compile
    model.compile(optimizer=Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999), loss="binary_crossentropy", metrics=["accuracy"],)

    # Train.
    # NOTE(review): the held-out fold is used both as validation_data (which the
    # val_loss callbacks monitor) and for the scores below, so the scores may be
    # optimistic -- confirm this is intended.
    # NOTE(review): the same callback objects are passed to every fit() call;
    # verify that any state the checkpoint callback keeps between calls (its
    # best val_loss so far) is meant to carry over from fold to fold.
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(X_test, y_test), callbacks=callbacks)

    # Predict once per fold and reuse: probabilities for the ranking metrics,
    # labels thresholded at 0.5 for accuracy / precision / recall / F1.
    y_pred_proba = model.predict(X_test).ravel()
    test_preds = (y_pred_proba >= 0.5).astype(int)

    # Threshold-based scores
    accuracies.append(accuracy_score(y_test, test_preds))
    precisions.append(precision_score(y_test, test_preds))
    recalls.append(recall_score(y_test, test_preds))
    # f1_score yields 0 (with a warning) when precision + recall == 0, where
    # the manual 2PR/(P+R) formula produced NaN and poisoned the average.
    f1_scores.append(f1_score(y_test, test_preds))

    # ROC curve and AUROC from the probabilities
    roc_scores.append(roc_auc_score(y_test, y_pred_proba))
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    tpr_list.append(tpr)
    fpr_list.append(fpr)
    roc_auc_list.append(roc_auc)

    # PR curve and AUPR from the probabilities. The area must be computed from
    # the continuous scores: feeding the already-thresholded 0/1 labels (as
    # before) collapses the curve to a single operating point.
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    pr_auc = auc(recall, precision)
    pr_aucs.append(pr_auc)
    precision_list.append(precision)
    recall_list.append(recall)
    pr_auc_list.append(pr_auc)



    

In [None]:
# 각 평가 지표에 대한 평균 계산
print('Average Accuracy:', np.mean(accuracies))
print('Average Precision:', np.mean(precisions))
print('Average Recall/Sensitivity:', np.mean(recalls))
print('Average ROC AUC Score:', np.mean(roc_scores))
print('Average PR AUC:', np.mean(pr_aucs))
print('Average F1 Score:', np.mean(f1_scores))

## NN with self-attention 성능평가 (performance evaluation)

# Stratified 10-fold cross-validation: AUROC 그래프 그리기

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# One ROC curve per cross-validation fold, plus the chance diagonal.
plt.figure()
# Folds are numbered from 1 so the legend matches the per-fold PR plot.
for fold_no, (fold_fpr, fold_tpr, fold_auc) in enumerate(zip(fpr_list, tpr_list, roc_auc_list), start=1):
    plt.plot(fold_fpr, fold_tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.3f)' % (fold_no, fold_auc))

plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
# Save before show(): the figure is written to the working directory.
plt.savefig('Stratified NN with attention_AUROC.png')
plt.show()

In [None]:

# Mean ROC curve over folds: every fold's TPR is resampled onto a common FPR
# grid so the curves can be averaged point by point.
mean_fpr = np.linspace(0, 1, 100)
mean_tpr_list = []

for fold_fpr, fold_tpr in zip(fpr_list, tpr_list):
    # np.interp is the function the deprecated `scipy.interp` alias pointed to.
    interp_tpr = np.interp(mean_fpr, fold_fpr, fold_tpr)
    # Pin the curve to the origin; several ROC points can share FPR = 0, so the
    # interpolated value there is otherwise not guaranteed to be 0.
    interp_tpr[0] = 0.0
    mean_tpr_list.append(interp_tpr)

mean_tpr = np.mean(mean_tpr_list, axis=0)
# Pin the end of the averaged curve to (1, 1).
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)

# Point-wise spread of the fold curves, drawn as a +/- 1 std band.
std_tpr = np.std(mean_tpr_list, axis=0)

# Draw the mean ROC curve
plt.figure()
plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean NN_ROC (AUC = %0.3f)' % mean_auc, lw=2, alpha=.8)
plt.fill_between(mean_fpr, mean_tpr - std_tpr, mean_tpr + std_tpr, color='grey', alpha=0.2, label=r'$\pm$ 1 std. dev.')
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=.8)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# 각 fold별로 AUPR Curve

In [None]:
# One precision-recall curve per cross-validation fold (folds numbered from 1).
plt.figure()
for fold_no, (fold_recall, fold_precision, fold_pr_auc) in enumerate(
    zip(recall_list, precision_list, pr_auc_list), start=1
):
    plt.plot(
        fold_recall,
        fold_precision,
        lw=2,
        alpha=0.3,
        label='PR fold %d (AUC = %0.3f)' % (fold_no, fold_pr_auc),
    )

plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.0])
plt.legend(loc="lower left")
# Written to the working directory before the figure is shown.
plt.savefig('Stratified NN with attention_AUPR.png')
plt.show()

### 평균값으로 AUPR 그래프 그리기

In [None]:
from scipy.interpolate import interp1d

# Common recall grid on which every fold's precision is resampled.
mean_recall = np.linspace(0, 1, 100)
# The legend reports the mean of the per-fold PR AUCs (not the area under the
# averaged curve).
mean_pr_auc = np.mean(pr_auc_list)

# Interpolate each fold's precision onto the common recall grid.
# Fold-specific loop names are used so the `precision` / `recall` variables
# left by the cross-validation cell are not overwritten.
interp_precision_list = []
for fold_precision, fold_recall in zip(precision_list, recall_list):
    # precision_recall_curve returns recall in decreasing order, so index -1 is
    # the lowest-recall end and index 0 the highest-recall end. fill_value is
    # (below-range, above-range), hence (precision[-1], precision[0]).
    interp_func = interp1d(
        fold_recall,
        fold_precision,
        kind='linear',
        bounds_error=False,
        fill_value=(fold_precision[-1], fold_precision[0]),
    )
    interp_precision_list.append(interp_func(mean_recall))

# Point-wise mean and spread of the interpolated fold curves.
mean_precision = np.mean(interp_precision_list, axis=0)
std_precision = np.std(interp_precision_list, axis=0)

# Draw the mean PR curve with a +/- 1 std band.
plt.figure()
plt.plot(mean_recall, mean_precision, label='Mean PR (AUC = {:.3f})'.format(mean_pr_auc), lw=2, alpha=.8)
plt.fill_between(mean_recall, mean_precision - std_precision, mean_precision + std_precision, color='gray', alpha=0.2, label='Std Deviation')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.ylim([0.0, 1.0])

plt.legend(loc='best')
plt.show()


In [None]:
# np.savetxt('../cross validation figure/Stratified 10-cross validation/nn_with_attention_mean_fpr.csv', mean_fpr, delimiter=',')
# np.savetxt('../cross validation figure/Stratified 10-cross validation/nn_with_attention_mean_tpr.csv', mean_tpr, delimiter=',')
# np.savetxt('../cross validation figure/Stratified 10-cross validation/nn_with_attention_mean_precision.csv', mean_precision, delimiter=',')
# np.savetxt('../cross validation figure/Stratified 10-cross validation/nn_with_attention_mean_recall.csv', mean_recall, delimiter=',')



# np.savetxt('../cross validation figure/Stratified 10-cross validation/nn_with_attention_std_tpr.csv', std_tpr, delimiter=',')
# np.savetxt('../cross validation figure/Stratified 10-cross validation/nn_with_attention_std_precision.csv', std_precision, delimiter=',')
# np.savetxt('../cross validation figure/Stratified 10-cross validation/nn_with_attention_pr_auc_list.csv', pr_auc_list, delimiter=',')