In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import tensorflow
import seaborn as sns
import pandas as pd

from tensorflow import keras

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import rdMolDescriptors

from sklearn import datasets, metrics
from sklearn.metrics import auc, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import StandardScaler


from scipy import interp
from tensorflow.keras.layers import Embedding, Dense 
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint



from tensorflow.keras import backend as K 
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras import initializers

2025-06-25 13:20:17.637617: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-06-25 13:20:18.429573: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-06-25 13:20:18.429646: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from rdkit import Chem
from rdkit.Chem import AllChem, QED, DataStructs
import joblib
import os
# XGBoost import
from xgboost import XGBClassifier

# Data loading and preprocessing helper (same as before)
def preprocess_dataframe(df):
    """Parse the ``SMILES`` column with RDKit and drop rows that cannot be used.

    Each value of ``df['SMILES']`` is converted to ``str`` and passed to
    ``Chem.MolFromSmiles``. Rows for which the parser returns ``None``, or for
    which ``Chem.SanitizeMol`` raises, are reported on stdout and removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``SMILES`` column. The input frame is not modified.

    Returns
    -------
    tuple
        ``(df_clean, mols)``: ``df_clean`` is a copy of ``df`` restricted to
        the rows that were kept, re-indexed from 0; ``mols`` is the list of
        molecule objects for those rows, in the same order.
    """
    df_copy = df.copy().reset_index(drop=True)
    mols, none_list = [], []
    for i, raw_smiles in enumerate(df_copy['SMILES']):
        mol = Chem.MolFromSmiles(str(raw_smiles))
        if mol is None:
            none_list.append(i)
            print(f'Index {i}: None SMILES, 제외')
            continue
        try:
            Chem.SanitizeMol(mol)
        except Exception:
            # Only swallow ordinary errors; KeyboardInterrupt / SystemExit
            # must still be able to stop a long-running cell.
            none_list.append(i)
            print(f'Index {i}: 유효하지 않은 SMILES, 제외')
        else:
            mols.append(mol)
    if none_list:
        # Positions in none_list are valid labels because the index was reset above.
        df_copy = df_copy.drop(index=none_list).reset_index(drop=True)
    return df_copy, mols

# End-to-end pipeline: load the CSV, featurize each molecule (Morgan
# fingerprint bits + standardized QED properties), then run a stratified
# outer CV. For every fold the test/train/validation splits, the validation
# metrics and the fitted XGBoost model are written to `save_dir`.

# Tunable constants (previously repeated inline).
DATA_PATH = '/data/home/dbswn0814/2025JCM/data/single task/liv_data.csv'
LABEL_COL = 'liv'
FP_RADIUS = 2
FP_BITS = 1024
VAL_FRACTION = 0.1

# 1) Load data
df = pd.read_csv(DATA_PATH)
df_clean, mols = preprocess_dataframe(df)
# Features are built from `mols`, labels from `df_clean`; they must stay row-aligned.
assert len(mols) == len(df_clean), 'mols and df_clean are out of sync'

# 2) Build Morgan fingerprints
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, FP_RADIUS, nBits=FP_BITS) for mol in mols]
arr_list = []
for bit in fps:
    arr = np.zeros((FP_BITS,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(bit, arr)
    arr_list.append(arr)
x_fingerprint = np.array(arr_list, dtype=np.float32)

# 3) Compute QED properties and standardize them
# NOTE(review): the scaler is fit on every row before the CV split below, so
# test-fold statistics leak into the scaling. Left unchanged here so saved
# features stay comparable with earlier runs — consider fitting per fold.
qe_props = [QED.properties(mol) for mol in mols]
qe_df = pd.DataFrame(qe_props)
scaler = StandardScaler()
qe_scaled = scaler.fit_transform(qe_df)
qe_scaled_df = pd.DataFrame(qe_scaled, columns=qe_df.columns)

# 4) Combine features and prepare X, y
features = np.hstack((x_fingerprint, qe_scaled_df.values))
final_df = pd.concat(
    [pd.DataFrame(features), df_clean[LABEL_COL].reset_index(drop=True)],
    axis=1,
).dropna()
X = final_df.drop(LABEL_COL, axis=1).values
y = final_df[LABEL_COL].values

# 5) Outer CV setup
n_splits = 20
outer_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

save_dir = './processed'
os.makedirs(save_dir, exist_ok=True)

for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), start=1):
    print(f'=== Fold {fold} ===')
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # 5-1) Test split is only saved here, not evaluated
    test_data = {'features': torch.tensor(X_test), 'labels': torch.tensor(y_test)}
    test_save_path = os.path.join(save_dir, f'XGB_test_fold{fold}.pt')
    torch.save(test_data, test_save_path)
    print(f'save Test data: {test_save_path}')

    # 5-2) Train/validation split (the splitter yields exactly one split)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=VAL_FRACTION, random_state=0)
    tr_idx, val_idx = next(sss.split(X_train, y_train))
    X_tr, X_val = X_train[tr_idx], X_train[val_idx]
    y_tr, y_val = y_train[tr_idx], y_train[val_idx]

    # 5-3) Save train/validation splits
    torch.save({'features': torch.tensor(X_tr), 'labels': torch.tensor(y_tr)},
               os.path.join(save_dir, f'XGB_train_fold{fold}.pt'))
    torch.save({'features': torch.tensor(X_val), 'labels': torch.tensor(y_val)},
               os.path.join(save_dir, f'XGB_val_fold{fold}.pt'))
    print(f'save Train/Val data (fold {fold})')

    # 5-4) Class-imbalance weight: ratio of negatives to positives in the training split
    neg_count = np.sum(y_tr == 0)
    pos_count = np.sum(y_tr == 1)
    if pos_count == 0:
        # Previously this divided by zero and passed inf to the model.
        raise ValueError(
            f'Fold {fold}: training split has no positive samples; '
            'cannot compute scale_pos_weight'
        )
    scale_pos_weight = neg_count / pos_count

    # 5-5) Train the XGBoost model (fixed hyperparameters, no grid search)
    # NOTE(review): no eval_set is passed to fit(), so eval_metric presumably
    # has no effect on training — confirm against the installed xgboost version.
    model = XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        eval_metric='auc',
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        n_estimators=100,
        max_depth=3,
        subsample=0.8,
        colsample_bytree=0.8)

    model.fit(X_tr, y_tr)

    # 5-6) Evaluate on the validation split
    y_val_pred = model.predict(X_val)
    y_val_proba = model.predict_proba(X_val)[:, 1]
    val_metrics = {
        'accuracy': accuracy_score(y_val, y_val_pred),
        'roc_auc': roc_auc_score(y_val, y_val_proba),
        'avg_precision': average_precision_score(y_val, y_val_proba),
        'precision': precision_score(y_val, y_val_pred),
        'recall': recall_score(y_val, y_val_pred),
        'f1': f1_score(y_val, y_val_pred)
    }
    print(f'Validation Metrics: {val_metrics}')
    joblib.dump(val_metrics, os.path.join(save_dir, f'XGB_val_metrics_fold{fold}.pkl'))

    # 5-7) Save the fitted model
    model_path = os.path.join(save_dir, f'xgboost_fold{fold}.pkl')
    joblib.dump(model, model_path)
    print(f'Fold {fold}, save model: {model_path}\n')

print('fin.')

=== Fold 1 ===
save Test data: ./processed/XGB_test_fold1.pt
save Train/Val data (fold 1)
Validation Metrics: {'accuracy': 0.6060606060606061, 'roc_auc': 0.6397058823529411, 'avg_precision': 0.7005119201939197, 'precision': 0.625, 'recall': 0.5882352941176471, 'f1': 0.6060606060606061}
Fold 1, save model: ./processed/xgboost_fold1.pkl

=== Fold 2 ===
save Test data: ./processed/XGB_test_fold2.pt
save Train/Val data (fold 2)
Validation Metrics: {'accuracy': 0.48484848484848486, 'roc_auc': 0.6066176470588236, 'avg_precision': 0.7129960246188465, 'precision': 0.5, 'recall': 0.5294117647058824, 'f1': 0.5142857142857143}
Fold 2, save model: ./processed/xgboost_fold2.pkl

=== Fold 3 ===
save Test data: ./processed/XGB_test_fold3.pt
save Train/Val data (fold 3)
Validation Metrics: {'accuracy': 0.5757575757575758, 'roc_auc': 0.6544117647058824, 'avg_precision': 0.7245004616505122, 'precision': 0.6, 'recall': 0.5294117647058824, 'f1': 0.5625}
Fold 3, save model: ./processed/xgboost_fold3.pkl

=