In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import tensorflow
import seaborn as sns
import pandas as pd

from tensorflow import keras

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import rdMolDescriptors

from sklearn import datasets, metrics
from sklearn.metrics import auc, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import StandardScaler


from scipy import interp
from tensorflow.keras.layers import Embedding, Dense 
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint



from tensorflow.keras import backend as K 
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras import initializers

2025-06-25 13:22:23.790534: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-06-25 13:22:24.570807: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-06-25 13:22:24.570871: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from rdkit import Chem
from rdkit.Chem import AllChem, QED, DataStructs
import joblib
import os

# XGBoost import
from xgboost import XGBClassifier
# Load the raw dataset; the code below reads its 'SMILES' and 'lun' columns.
# The location defaults to the original absolute path but can be overridden
# with the LUN_DATA_CSV environment variable, so the notebook can be re-run on
# another machine without editing this cell.
DATA_CSV_PATH = os.environ.get(
    'LUN_DATA_CSV',
    '/data/home/dbswn0814/2025JCM/data/single task/lun_data.csv',
)
df = pd.read_csv(DATA_CSV_PATH)

def preprocess_dataframe(df):
    """Parse the ``SMILES`` column and drop rows RDKit cannot turn into a molecule.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with a ``SMILES`` column. It is not modified.

    Returns
    -------
    tuple
        ``(df_clean, mols)``: a re-indexed (0..n-1) copy of ``df`` that keeps
        only the rows whose SMILES parsed and sanitized, and the list of the
        corresponding molecule objects in the same row order.
    """
    # Work on a re-indexed copy so positional indices match row labels.
    df_copy = df.reset_index(drop=True)
    mols, none_list = [], []
    for i in range(len(df_copy)):
        smiles = str(df_copy.loc[i, 'SMILES'])
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            none_list.append(i)
            print(f'Index {i}: None SMILES, 제외')
            continue
        try:
            Chem.SanitizeMol(mol)
        except Exception:
            # Only ordinary errors mark the row as invalid; a bare `except`
            # would also swallow KeyboardInterrupt / SystemExit.
            none_list.append(i)
            print(f'Index {i}: 유효하지 않은 SMILES, 제외')
        else:
            mols.append(mol)
    if none_list:
        # Drop rejected rows and re-index so df_copy stays aligned with mols.
        df_copy = df_copy.drop(index=none_list).reset_index(drop=True)
    return df_copy, mols

# Run the preprocessing: keep only the rows whose SMILES RDKit can parse.
df_clean, mols = preprocess_dataframe(df)
# Features are joined to the labels by row position below, so the molecule
# list and the cleaned frame must have the same length.
assert len(mols) == len(df_clean), 'mols and df_clean are out of sync'

# Morgan fingerprints as fixed-length bit vectors, one row per molecule.
FP_RADIUS = 2
FP_NBITS = 1024
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, FP_RADIUS, nBits=FP_NBITS) for mol in mols]
arr_list = []
for bit in fps:
    arr = np.zeros((FP_NBITS,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(bit, arr)
    arr_list.append(arr)
x_fingerprint = np.array(arr_list, dtype=np.float32)

# QED property descriptors, standardized column-wise.
qe_props = [QED.properties(mol) for mol in mols]
qe_df = pd.DataFrame(qe_props)
# NOTE(review): the scaler is fitted on every molecule before the
# cross-validation split performed later in this cell, so rows that end up in
# test/validation folds contribute to the scaling statistics. Consider fitting
# it on the training part of each fold only — confirm whether the saved splits
# are consumed by scale-sensitive models.
scaler = StandardScaler()
qe_scaled = scaler.fit_transform(qe_df)
qe_scaled_df = pd.DataFrame(qe_scaled, columns=qe_df.columns)

# Final feature matrix: fingerprint bits followed by the scaled QED columns.
features = np.hstack((x_fingerprint, qe_scaled_df.values))
final_df = pd.concat([pd.DataFrame(features), df_clean['lun'].reset_index(drop=True)], axis=1)
# Drop rows with any missing value (feature or label) without mutating in place.
final_df = final_df.dropna()

X = final_df.drop('lun', axis=1).values
y = final_df['lun'].values

# Outer cross-validation: stratified, shuffled folds with a fixed seed.
n_splits = 20
outer_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Every artifact (data splits, metrics, fitted models) is written here.
save_dir = './grid_search'
os.makedirs(save_dir, exist_ok=True)

# Fold-independent objects, built once. The splitters use an integer
# random_state, so reusing one instance gives the same splits as
# re-creating it inside the loop.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 6],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

fold = 1
for train_idx, test_idx in outer_cv.split(X, y):
    print(f'=== Fold {fold} ===')
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Persist the held-out test fold; it is only saved here, not scored.
    # NOTE(review): the 'LR_' prefix is kept although this loop trains XGBoost
    # — presumably other notebooks load these files by name; confirm.
    test_save_path = os.path.join(save_dir, f'LR_test_fold{fold}.pt')
    test_data = {
        'features': torch.tensor(X_test),
        'labels': torch.tensor(y_test),
    }
    torch.save(test_data, test_save_path)
    print(f'save Test: {test_save_path}')

    # Carve a stratified 10% validation set out of the outer training fold.
    tr_idx, val_idx = next(sss.split(X_train, y_train))
    X_tr, y_tr = X_train[tr_idx], y_train[tr_idx]
    X_val, y_val = X_train[val_idx], y_train[val_idx]

    # Persist the train and validation parts.
    train_save_path = os.path.join(save_dir, f'LR_train_fold{fold}.pt')
    torch.save(
        {'features': torch.tensor(X_tr), 'labels': torch.tensor(y_tr)},
        train_save_path,
    )

    print(f'save Train: {train_save_path}')

    val_save_path = os.path.join(save_dir, f'LR_val_fold{fold}.pt')
    torch.save(
        {'features': torch.tensor(X_val), 'labels': torch.tensor(y_val)},
        val_save_path,
    )
    print(f'save Validation: {val_save_path}')

    # Class-imbalance handling: weight positives by the negative/positive ratio.
    # NOTE(review): assumes labels are 0/1 and both occur in y_tr — confirm.
    neg_count = (y_tr == 0).sum()
    pos_count = (y_tr == 1).sum()
    scale_pos_weight = neg_count / pos_count

    # Inner CV grid search over the XGBoost hyper-parameters, scored by ROC AUC.
    base_model = XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        eval_metric='auc',
        scale_pos_weight=scale_pos_weight,
        random_state=42,
    )
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        cv=inner_cv,
        scoring='roc_auc',
        n_jobs=-1,
    )

    # Hyper-parameter search on the training part only.
    grid_search.fit(X_tr, y_tr)
    best_model = grid_search.best_estimator_
    print('Best params:', grid_search.best_params_)

    # Score the selected model on the validation part.
    y_val_pred = best_model.predict(X_val)
    y_val_proba = best_model.predict_proba(X_val)[:, 1]
    val_metrics = {
        'accuracy': accuracy_score(y_val, y_val_pred),
        'roc_auc': roc_auc_score(y_val, y_val_proba),
        'avg_precision': average_precision_score(y_val, y_val_proba),
        'precision': precision_score(y_val, y_val_pred),
        'recall': recall_score(y_val, y_val_pred),
        'f1': f1_score(y_val, y_val_pred),
    }
    print(f'Validation Metrics: {val_metrics}')
    metrics_path = os.path.join(save_dir, f'XGB_val_metrics_fold{fold}.pkl')
    joblib.dump(val_metrics, metrics_path)

    # Persist the best estimator of this fold.
    model_path = os.path.join(save_dir, f'xgboost_grid_fold{fold}.pkl')
    joblib.dump(best_model, model_path)
    print(f'Fold {fold}, save model: {model_path}\n')

    fold += 1

print('fin.')

=== Fold 1 ===
save Test: ./grid_search/LR_test_fold1.pt
save Train: ./grid_search/LR_train_fold1.pt
save Validation: ./grid_search/LR_val_fold1.pt
Best params: {'colsample_bytree': 1.0, 'max_depth': 6, 'n_estimators': 100, 'subsample': 1.0}
Validation Metrics: {'accuracy': 0.7575757575757576, 'roc_auc': 0.7222222222222222, 'avg_precision': 0.7278875604440395, 'precision': 0.7777777777777778, 'recall': 0.7777777777777778, 'f1': 0.7777777777777778}
Fold 1, save model: ./grid_search/xgboost_grid_fold1.pkl

=== Fold 2 ===
save Test: ./grid_search/LR_test_fold2.pt
save Train: ./grid_search/LR_train_fold2.pt
save Validation: ./grid_search/LR_val_fold2.pt
Best params: {'colsample_bytree': 1.0, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Validation Metrics: {'accuracy': 0.7878787878787878, 'roc_auc': 0.8925925925925926, 'avg_precision': 0.8898820396627416, 'precision': 0.8235294117647058, 'recall': 0.7777777777777778, 'f1': 0.7999999999999999}
Fold 2, save model: ./grid_search/xgbo