In [None]:
import numpy as np
from rdkit import Chem 
from rdkit.Chem import AllChem
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, precision_score, average_precision_score
import os, joblib

# import utils

# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# for reproducibility: seed the global PyTorch and NumPy generators
np.random.seed(777)
torch.manual_seed(777)

# Stand-alone generator with a fixed seed, available for reproducible shuffling
g = torch.Generator()
g.manual_seed(777)

# GPU-only settings (device is 'cuda' exactly when CUDA is available)
if device == 'cuda':
    torch.cuda.manual_seed(777)
    torch.cuda.manual_seed_all(777)  # also covers multi-GPU setups
    # Ask cuDNN for deterministic kernels and switch off its auto-tuner
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
# Early-stopping helper
class EarlyStopping:
    """Signal when the validation loss has stopped improving.

    Call the instance once per epoch with the current validation loss.
    A call counts as an improvement when the loss beats the best loss seen
    so far by at least ``delta``; otherwise an internal counter is
    incremented. Once the counter reaches ``patience`` the ``early_stop``
    flag is set to True.

    Args:
        patience: number of consecutive non-improving calls tolerated.
        delta: minimum decrease in loss that counts as an improvement.

    Attributes:
        best_score: negated best loss so far (None before the first valid call).
        best_loss: best validation loss so far (``inf`` before the first valid call).
        counter: consecutive non-improving calls since the last improvement.
        early_stop: True once ``counter`` has reached ``patience``.
    """

    def __init__(self, patience=10, delta=0):
        self.patience = patience  # how many non-improving epochs to wait
        self.delta = delta  # minimum change that qualifies as an improvement
        self.best_score = None
        self.early_stop = False
        self.counter = 0
        # np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0.
        self.best_loss = np.inf

    def __call__(self, val_loss):
        """Record one epoch's validation loss and update the stop flag."""
        # NaN is the only value that differs from itself. Without this guard a
        # NaN loss fails every comparison below, is stored as the "best" score,
        # and early stopping can never trigger again.
        if val_loss != val_loss:
            self._register_no_improvement()
            return

        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss)
        elif score < self.best_score + self.delta:
            self._register_no_improvement()
        else:
            self.best_score = score
            self.save_checkpoint(val_loss)
            self.counter = 0

    def _register_no_improvement(self):
        """Count one non-improving epoch and raise the stop flag when patience runs out."""
        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, val_loss):
        """Called when the validation loss improved; remembers the new best loss."""
        self.best_loss = val_loss

In [None]:
class CustomDataset(Dataset):
    """In-memory dataset that pairs feature rows with targets as float32 tensors."""

    def __init__(self, X, y):
        features = torch.tensor(X, dtype=torch.float32)
        targets = torch.tensor(y, dtype=torch.float32)
        self.X, self.y = features, targets
        self.length = len(features)  # sample count, cached for __len__ / __getitem__

    def __len__(self):
        """Number of samples held by the dataset."""
        return self.length

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        # An index at or beyond the end wraps around via modulo; any smaller
        # index is passed to the tensors unchanged.
        position = idx % self.length if idx >= self.length else idx
        return self.X[position], self.y[position]

In [None]:
# Outlier removal helper
def remove_outliers(data, column, threshold):
    """Replace every value of ``data[column]`` greater than ``threshold`` with NaN.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    def _nan_if_above(value):
        # Values that do not compare greater than the threshold pass through as-is.
        if value > threshold:
            return np.nan
        return value

    data[column] = data[column].apply(_nan_if_above)
    return data

# Fingerprint loading helper
def load_fingerprints(nbits, radius, file_fingerprint):
    """Load the fingerprint file for the given ``nbits`` / ``radius`` pair.

    The file is expected at
    ``<file_fingerprint>/fingerprints_<nbits>_<radius>.joblib``.

    Raises:
        FileNotFoundError: if that file does not exist.
    """
    fingerprint_path = os.path.join(file_fingerprint, f"fingerprints_{nbits}_{radius}.joblib")
    if not os.path.exists(fingerprint_path):
        raise FileNotFoundError(f"Fingerprint file for nBits={nbits}, radius={radius} not found.")

    print(f"Loading fingerprints from {fingerprint_path}")
    # NOTE(review): joblib.load unpickles the file, so only point this at trusted files.
    return joblib.load(fingerprint_path)

In [None]:
class MLP(nn.Module):
    """Three-layer perceptron (nBits -> 128 -> 16 -> 1) with a sigmoid output.

    Each hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout. The
    constructor re-seeds the global PyTorch RNG with ``seed``, so two
    instances built with the same arguments start from identical weights.

    Args:
        nBits: width of the input feature vector.
        drop_rate: dropout probability applied after each hidden layer.
        seed: seed used for the global RNG and for weight initialisation.
    """

    def __init__(self, nBits, drop_rate, seed=777):
        super().__init__()

        # Pin the global random seed (and cuDNN behaviour when CUDA is present)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

        # Layers
        self.fc1 = nn.Linear(nBits, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 16)
        self.bn2 = nn.BatchNorm1d(16)
        self.fc3 = nn.Linear(16, 1)

        self.dropout = nn.Dropout(drop_rate)

        # Overwrite the default parameter values with the seeded scheme below
        self._initialize_weights(seed)

    def forward(self, x):
        """Map a batch of inputs to one sigmoid-activated output per row."""
        hidden = self.dropout(torch.relu(self.bn1(self.fc1(x))))
        hidden = self.dropout(torch.relu(self.bn2(self.fc2(hidden))))
        return torch.sigmoid(self.fc3(hidden))

    def _initialize_weights(self, seed):
        """Re-seed, then apply Kaiming-uniform weights, zero biases and identity batch norm."""
        torch.manual_seed(seed)  # fix the seed right before the random draws

        # Fully connected layers, paired with the nonlinearity assumed for the gain.
        # The weights are drawn in fc1, fc2, fc3 order.
        dense_layers = (
            (self.fc1, 'relu'),
            (self.fc2, 'relu'),
            (self.fc3, 'linear'),
        )
        for layer, nonlinearity in dense_layers:
            nn.init.kaiming_uniform_(layer.weight, a=0, mode='fan_in', nonlinearity=nonlinearity)

        for layer, _ in dense_layers:
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

        # BatchNorm layers: scale 1, shift 0
        for norm in (self.bn1, self.bn2):
            nn.init.ones_(norm.weight)
            nn.init.zeros_(norm.bias)

In [None]:
# Training data locations.
# The original cell left both paths as empty strings, which makes
# pd.read_csv('') fail and stops Restart & Run All at this cell. They now point
# at the same files that main() reads.
nBits = 1024
file_path = '/DAS_Storage4/Federate_learning/Processed_data/DeepPK_merge/Deeppk_merged_ADMET_fdamdd_merged.csv'
file_fingerprint = '/DAS_Storage4/yohan/ADMET'
data = pd.read_csv(file_path, low_memory=False)
fingerprints = load_fingerprints(nBits, 2, file_fingerprint)

# Name of the target column
target_endpoint = 'BBB_logbb(cls)'

# Filter out rows where the target value is NaN
data_task = data.dropna(subset=[target_endpoint])

# Extract the fingerprints and target values.
# NOTE(review): the frame's index labels are used as positions into
# `fingerprints`; this presumably relies on the default RangeIndex produced by
# read_csv and on `fingerprints` being row-aligned with the CSV -- confirm.
X_morgan_np = fingerprints[data_task.index]
y = np.array(data_task[target_endpoint])

X_train, X_test, y_train, y_test = train_test_split(X_morgan_np, y, test_size=0.1, shuffle=True, random_state=42)

In [None]:
# Metric names in the order they are printed in the summaries.
_METRIC_NAMES = ("ROC_AUC", "PR_AUC", "ACC", "Precision", "Recall", "F1")


def _evaluate(model, loader, criterion=None):
    """Run ``model`` in eval mode over ``loader``; return (targets, probabilities, mean batch loss or None)."""
    model.eval()
    targets, probabilities = [], []
    total_loss = 0.0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device).float().unsqueeze(1)  # match the (batch, 1) model output
            outputs = model(inputs)
            if criterion is not None:
                total_loss += criterion(outputs, labels).item()
            probabilities.extend(outputs.cpu().numpy().ravel())
            targets.extend(labels.cpu().numpy().ravel())
    mean_loss = total_loss / len(loader) if criterion is not None else None
    return np.asarray(targets), np.asarray(probabilities), mean_loss


def _binary_metrics(y_true, y_prob, threshold=0.5):
    """Compute the six reported classification metrics, thresholding probabilities at ``threshold``."""
    y_hat = (y_prob > threshold).astype(int)
    return {
        "ROC_AUC": roc_auc_score(y_true, y_prob),
        "PR_AUC": average_precision_score(y_true, y_prob),
        "ACC": accuracy_score(y_true, y_hat),
        "Precision": precision_score(y_true, y_hat),
        "Recall": recall_score(y_true, y_hat),
        "F1": f1_score(y_true, y_hat),
    }


def _print_summary(fold_metrics):
    """Print mean ± std across folds for every metric, each paired with its own std."""
    for name in _METRIC_NAMES:
        values = [metrics[name] for metrics in fold_metrics]
        print(f"Mean {name}: {np.mean(values):.4f} ± {np.std(values):.4f}")


def main(target_endpoint, drop_rate, lr, weight_decay, model_class,
         k_folds=5, num_epochs=100, patience=10, batch_size=64,
         nbits=None, radius=2,
         file_path='/DAS_Storage4/Federate_learning/Processed_data/DeepPK_merge/Deeppk_merged_ADMET_fdamdd_merged.csv',
         file_fingerprint='/DAS_Storage4/yohan/ADMET'):
    """Train ``model_class`` with K-fold cross-validation and print validation/test metrics.

    Rows whose ``target_endpoint`` is NaN are dropped, 10% of the remainder is
    held out as a fixed test set, and the other 90% is split into ``k_folds``
    folds. For each fold a fresh model is trained with Adam and BCE loss until
    early stopping triggers or ``num_epochs`` is reached, then scored on the
    fold's validation split and on the held-out test set. Mean ± std over the
    folds is printed for both.

    Args:
        target_endpoint: name of the binary label column in the CSV.
        drop_rate: dropout probability forwarded to ``model_class``.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        model_class: callable accepting ``nBits=`` and ``drop_rate=`` and
            returning an ``nn.Module`` that outputs probabilities.
        k_folds: number of cross-validation folds.
        num_epochs: maximum number of epochs per fold (at least 1).
        patience: early-stopping patience in epochs.
        batch_size: mini-batch size for all data loaders.
        nbits: fingerprint width; ``None`` uses the notebook-level ``nBits``.
        radius: fingerprint radius used in the fingerprint file name.
        file_path: CSV file holding the endpoint columns.
        file_fingerprint: directory holding the fingerprint ``.joblib`` file.

    Raises:
        ValueError: if ``num_epochs`` is smaller than 1.

    Notes:
        * ``k_folds``, ``num_epochs`` and ``patience`` used to be read from
          globals that the notebook never defines; the defaults here are
          reviewer-chosen values -- adjust them to the intended settings.
        * NOTE(review): metrics are computed with the weights of the last
          epoch that ran, not with the best-validation-loss weights, because
          no model checkpoint is kept.
    """
    if nbits is None:
        nbits = nBits  # fall back to the notebook-level setting
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    data = pd.read_csv(file_path, low_memory=False)
    fingerprints = load_fingerprints(nbits, radius, file_fingerprint)

    # Filter out rows where the target value is NaN
    data_task = data.dropna(subset=[target_endpoint])

    # Select the fingerprint rows matching the kept samples.
    # NOTE(review): index labels are used as positions; this presumably relies
    # on the default RangeIndex from read_csv -- confirm.
    X_morgan_np = fingerprints[data_task.index]
    y = np.array(data_task[target_endpoint])

    X_train, X_test, y_train, y_test = train_test_split(
        X_morgan_np, y, test_size=0.1, shuffle=True, random_state=42
    )

    # The held-out test set is the same for every fold, so build its loader once.
    test_loader = DataLoader(CustomDataset(X_test, y_test), batch_size=batch_size, shuffle=False)

    val_fold_metrics, test_fold_metrics = [], []
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
        print(f'Fold {fold + 1}/{k_folds}')

        # Split the training portion into this fold's train / validation parts
        train_dataset = CustomDataset(X_train[train_idx], y_train[train_idx])
        val_dataset = CustomDataset(X_train[val_idx], y_train[val_idx])

        # BatchNorm1d cannot run in training mode on a batch holding a single
        # sample, so drop the trailing batch only when it would contain one row.
        drop_last = len(train_dataset) > batch_size and len(train_dataset) % batch_size == 1
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        model = model_class(nBits=nbits, drop_rate=drop_rate).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = nn.BCELoss().to(device)

        early_stopping = EarlyStopping(patience=patience, delta=0.001)

        for epoch in range(num_epochs):
            model.train()
            running_loss = 0.0
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs.to(device))
                loss = criterion(outputs, labels.to(device).float().unsqueeze(1))  # match output shape
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

            # Validation pass; the predictions of the last epoch that ran are
            # the ones scored after the loop.
            val_true, val_prob, val_loss = _evaluate(model, val_loader, criterion)
            print(f'Epoch {epoch+1}, Validation Loss: {val_loss}, Train Loss: {running_loss/len(train_loader)}')

            early_stopping(val_loss)
            if early_stopping.early_stop:
                print(f"Early stopping at epoch {epoch+1}")
                break

        # Score the held-out test set, then the validation split, for this fold
        test_true, test_prob, _ = _evaluate(model, test_loader)
        test_fold_metrics.append(_binary_metrics(test_true, test_prob))
        val_fold_metrics.append(_binary_metrics(val_true, val_prob))

    # Cross-validation summary
    _print_summary(val_fold_metrics)

    # Test-set summary over the per-fold models
    print("\nFinal Test Set Results (Mean ± Std):")
    _print_summary(test_fold_metrics)

In [None]:
# Run the cross-validated MLP experiment on the BBB classification endpoint.
main(
    target_endpoint='BBB_logbb(cls)',
    drop_rate=0.2,
    lr=0.01,
    weight_decay=0.001,
    model_class=MLP,
)