In [1]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
import pandas as pd # Tambahkan impor Pandas

# Atur seed untuk reproducibility
tf.random.set_seed(42)
np.random.seed(42)


2025-10-09 10:27:57.768254: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-09 10:27:58.352958: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-09 10:28:00.505448: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


In [2]:
# Baca CSV tanpa header
df = pd.read_csv("https://raw.githubusercontent.com/adistyadito/LSTM-MBA/refs/heads/main/GSARPC3.csv")

# Tampilkan beberapa baris pertama
df

# Check for missing values
print(f'Data kosong = {df.isnull().sum()}')

Data kosong = cid           0
smiles        0
acvalue       0
categories    0
dtype: int64


In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Reshape
from tensorflow.keras.optimizers import Adam

In [4]:
from rdkit import Chem
from rdkit.Chem import AllChem # Tetep perlu buat beberapa hal di RDKit
from rdkit.Chem import rdFingerprintGenerator # Import generator
smiles_clean = df['smiles'].tolist()

mols = []
for smiles in smiles_clean:
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        mols.append(mol)
    else:
        print(f"Warning: Could not create mol object for SMILES: {smiles}")
        pass # Skip adding None

# --- Langkah 4: Generate Fingerprint dari List Objek Molekul (mols) ---
# Parameter fingerprint
radius = 2
nBits = 1024 # Ukuran vektor fingerprint

# Bikin generatornya
fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)

fingerprints = []
# Sekarang lu iterasi di list 'mols' yang isinya objek molekul
for mol in mols:
    # Cek lagi kalau-kalau ada None di list mols (meskipun seharusnya nggak ada kalau langkah 3 bener)
    if mol is not None:
        # Panggil generator untuk bikin fingerprint
        fp = fpgen.GetFingerprint(mol)
        # Ubah BitVector ke numpy array
        fingerprints.append(np.array(fp))
    else:
        # Handle kalau ada None di mols (misalnya append vektor nol)
        fingerprints.append(np.zeros(nBits))

# Konversi list fingerprints jadi NumPy array besar (Ini X lu)
X = np.array(fingerprints)

# --- Langkah 5: Siapkan Data Target (y) ---
# Pastiin nama kolom target lu bener
df['label'] = df['categories'].map({'inhibitor': 1, 'neutral': 0})
y = df['label'].values

In [5]:
# ====================================================================
# B. FUNGSI UNTUK MEMBANGUN MODEL (Wajib untuk CV)
# ====================================================================

def build_lstm_model(nBits, units=32, dropout_rate=0.7, learning_rate=0.001):
    """Membangun Model LSTM dengan hyperparameter yang ditentukan."""
    model = Sequential()
    
    # Input dan Reshape ke 3D (samples, 1, nBits)
    model.add(tf.keras.Input(shape=(nBits,)))
    model.add(Reshape((1, nBits))) 

    # Layer LSTM (Ganti ke 'relu' sesuai request)
    model.add(LSTM(units, activation='tanh', recurrent_activation='sigmoid', return_sequences=False))

    # Layer Dropout (Dinaikkan untuk regulasi)
    model.add(Dropout(dropout_rate))

    # Output Layer (untuk binary classification)
    model.add(Dense(1, activation='sigmoid')) 

    # Compile
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy', 'AUC'])
    return model

In [6]:
from sklearn.metrics import matthews_corrcoef

# ====================================================================
# C. IMPLEMENTASI STRATIFIED K-FOLD CROSS-VALIDATION (dengan MCC)
# ====================================================================

# Hyperparameter yang akan difiksasi
N_BITS = 1024 
N_SPLITS = 5      # Jumlah Fold
BATCH_SIZE = 32
EPOCHS = 100    # Dibuat besar, Early Stopping yang akan mengontrol

# Inisialisasi Stratified K-Fold
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
fold_results = []
fold_no = 1

# Definisikan Callback Early Stopping
early_stopping_callback = EarlyStopping(
    monitor='val_loss', 
    patience=10, 
    restore_best_weights=True,
    verbose=1 
)

# Loop untuk setiap Fold
for train_index, val_index in skf.split(X, y):
    print(f"\n==========================================")
    print(f"--- FOLD {fold_no}/{N_SPLITS} ---")
    print(f"==========================================")
    
    # A. Pembagian Data per Fold
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # B. Buat Ulang Model BARU
    model_cv = build_lstm_model(nBits=N_BITS)
    
    # C. Training Model
    print("Mulai Training...")
    history = model_cv.fit(
        X_train, y_train,
        epochs=EPOCHS, 
        batch_size=BATCH_SIZE,
        validation_data=(X_val, y_val), 
        callbacks=[early_stopping_callback],
        verbose=1
    )
    
    # D. Evaluasi dan Simpan Hasil
    print("\nEvaluasi Fold...")
    loss, accuracy, auc = model_cv.evaluate(X_val, y_val, verbose=0)
    
    # Hitung prediksi buat MCC
    y_val_pred_prob = model_cv.predict(X_val, verbose=0)
    y_val_pred = (y_val_pred_prob > 0.5).astype("int32")
    mcc = matthews_corrcoef(y_val, y_val_pred)
    
    print(f"Fold {fold_no} Selesai di Epoch {len(history.history['loss'])}.")
    print(f"Fold {fold_no} - FINAL METRICS (Val): AUC: {auc:.4f}, Acc: {accuracy:.4f}, MCC: {mcc:.4f}, Loss: {loss:.4f}")
    
    fold_results.append({'AUC': auc, 'Accuracy': accuracy, 'MCC': mcc, 'Loss': loss})

    fold_no += 1

# Setelah semua fold, lo bisa bikin summary
print("\n===== HASIL AKHIR (Cross-Validation) =====")
for metric in ['AUC', 'Accuracy', 'MCC', 'Loss']:
    values = [f[metric] for f in fold_results]
    print(f"{metric}: {np.mean(values):.4f} ± {np.std(values):.4f}")



--- FOLD 1/5 ---
Mulai Training...


2025-10-09 10:28:11.660836: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Epoch 1/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 36ms/step - AUC: 0.6454 - accuracy: 0.5965 - loss: 0.6753 - val_AUC: 0.8632 - val_accuracy: 0.7188 - val_loss: 0.6365
Epoch 2/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - AUC: 0.8235 - accuracy: 0.7382 - loss: 0.6072 - val_AUC: 0.8907 - val_accuracy: 0.7812 - val_loss: 0.5757
Epoch 3/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - AUC: 0.8527 - accuracy: 0.7697 - loss: 0.5570 - val_AUC: 0.9020 - val_accuracy: 0.8125 - val_loss: 0.5134
Epoch 4/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - AUC: 0.8773 - accuracy: 0.7972 - loss: 0.4902 - val_AUC: 0.8977 - val_accuracy: 0.8281 - val_loss: 0.4658
Epoch 5/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - AUC: 0.9107 - accuracy: 0.8425 - loss: 0.4403 - val_AUC: 0.8963 - val_accuracy: 0.8281 - val_loss: 0.4387
Epoch 6/100
[1m16/16[0m [32m━

In [7]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

xgb = XGBClassifier(eval_metric="logloss")

# Get AUC scores
auc_scores = cross_val_score(xgb, X, y, cv=5, scoring="roc_auc")
print("XGB AUC:", auc_scores.mean())

# Get Accuracy scores
acc_scores = cross_val_score(xgb, X, y, cv=5, scoring="accuracy")
print("XGB Accuracy:", acc_scores.mean())

# Optional: Show both metrics together
print(f"XGB Results - AUC: {auc_scores.mean():.4f}, Accuracy: {acc_scores.mean():.4f}")

XGB AUC: 0.790072496947497
XGB Accuracy: 0.7280511811023622
XGB Results - AUC: 0.7901, Accuracy: 0.7281
