In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, precision_score, average_precision_score
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from scipy.stats import pearsonr
from utils import EarlyStopping, load_fingerprints, MLP, CustomDataset
from deepchem.data import NumpyDataset
from deepchem.splits import ScaffoldSplitter

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Single source of truth for every RNG seeded in this cell.
SEED = 777

# For reproducibility: seed the global PyTorch and NumPy generators.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Dedicated generator with a fixed seed for reproducible shuffling
# (presumably handed to a DataLoader in a later cell -- TODO confirm).
g = torch.Generator()
g.manual_seed(SEED)

if device == 'cuda':
    # Seeds the generators of all visible GPUs, so the multi-GPU case is
    # covered without a separate torch.cuda.manual_seed call.
    torch.cuda.manual_seed_all(SEED)
    # Ask cuDNN for deterministic algorithms and disable its autotuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Run configuration.
nBits = 1024       # presumably the fingerprint bit-vector length -- confirm against load_fingerprints
num_epochs = 300   # presumably the upper bound on training epochs -- confirm in the training cell
k_folds = 5        # presumably the number of folds / splits -- confirm in the training cell
patience = 10      # presumably the EarlyStopping patience -- confirm in the training cell

# Input locations; left empty here and must be filled in before the data is loaded.
file_path = ''
file_fingerprint = ''

In [None]:
def generate_scaffold_splits(train_dataset, n_splits=5, seed=42, frac_train=0.8):
    """Build ``n_splits`` scaffold-based (train, valid) splits of a dataset.

    Each round reshuffles the row order, selects the rows in that order and
    passes the reordered dataset to ``ScaffoldSplitter.train_test_split``.

    Parameters
    ----------
    train_dataset
        Object supporting ``len()`` and ``select(indices)``
        (presumably a DeepChem dataset -- confirm against the caller).
    n_splits : int, default 5
        Number of (train, valid) pairs to generate.
    seed : int, default 42
        Seed for the private RNG that drives the shuffles and the per-split
        seeds. The global NumPy RNG is left untouched.
    frac_train : float, default 0.8
        Fraction forwarded to ``train_test_split``; 0.8 gives Train:Valid = 4:1.

    Returns
    -------
    list of tuple
        ``n_splits`` pairs ``(train_subset, valid_subset)`` as returned by the
        splitter.
    """
    splitter = ScaffoldSplitter()

    # A private legacy RandomState yields the same stream that
    # np.random.seed(seed) followed by np.random.* calls would, but without
    # resetting the notebook-wide global seed as a hidden side effect.
    rng = np.random.RandomState(seed)

    indices = np.arange(len(train_dataset))
    scaffold_splits = []

    for _ in range(n_splits):
        # Shuffle the row order in place; each round permutes the previous
        # round's order rather than starting again from arange.
        rng.shuffle(indices)
        shuffled_dataset = train_dataset.select(indices)

        # Scaffold-split the reordered data into train / valid.
        # NOTE(review): how much the shuffle and the per-split seed change the
        # resulting partition depends on ScaffoldSplitter -- confirm in DeepChem.
        train_subset, valid_subset = splitter.train_test_split(
            shuffled_dataset, frac_train=frac_train, seed=rng.randint(1_000_000)
        )
        scaffold_splits.append((train_subset, valid_subset))

    return scaffold_splits