In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import tensorflow
import seaborn as sns
import pandas as pd

from tensorflow import keras

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import rdMolDescriptors

from sklearn import datasets, metrics
from sklearn.metrics import auc, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import StandardScaler


from scipy import interp
from tensorflow.keras.layers import Embedding, Dense 
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint



from tensorflow.keras import backend as K 
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras import initializers

2025-06-26 08:33:03.586156: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-06-26 08:33:04.382915: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-06-26 08:33:04.382979: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
import os

# Set CUDA_VISIBLE_DEVICES to "-1" for this process (the conventional way to
# hide every GPU from CUDA-based libraries so the work below stays on CPU).
os.environ.update({"CUDA_VISIBLE_DEVICES": "-1"})

In [3]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight
from sklearn.preprocessing import StandardScaler
from rdkit import Chem
from rdkit.Chem import AllChem, QED, DataStructs
import joblib
import os

# 1) Load and preprocess data
# Read the raw table from disk; the steps below read its 'SMILES' and 'liv' columns.
df = pd.read_csv(
    filepath_or_buffer='/data/home/dbswn0814/2025JCM/data/single task/liv_data.csv'
)

def preprocess_dataframe(df):
    """Parse the ``SMILES`` column with RDKit and drop rows that cannot be used.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; must contain a ``SMILES`` column. It is not modified.

    Returns
    -------
    (pandas.DataFrame, list)
        A copy of ``df`` with the unusable rows removed and the index reset to
        ``0..n-1``, plus the list of parsed molecules. The two are row-aligned:
        ``mols[k]`` was built from row ``k`` of the returned frame.

    Notes
    -----
    A row is dropped (and a message printed) when its SMILES is missing, when
    ``Chem.MolFromSmiles`` returns ``None``, or when ``Chem.SanitizeMol`` raises.
    """
    df_copy = df.copy().reset_index(drop=True)
    mols, invalid_idxs = [], []
    for i, smi in enumerate(df_copy['SMILES']):
        # Reject missing values explicitly instead of relying on str(nan) -> 'nan'
        # happening to be unparseable.
        mol = None if pd.isna(smi) else Chem.MolFromSmiles(str(smi))
        if mol is None:
            invalid_idxs.append(i)
            print(f"Index {i} SMILES is None and will be dropped.")
            continue
        try:
            Chem.SanitizeMol(mol)
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still propagate.
            invalid_idxs.append(i)
            print(f"Index {i} has invalid SMILES and will be dropped.")
            continue
        mols.append(mol)
    if invalid_idxs:
        # Positions equal index labels here because the index was reset above.
        df_copy = df_copy.drop(invalid_idxs, axis=0).reset_index(drop=True)
    return df_copy, mols

# Run preprocessing: df_clean and mols come back row-aligned.
df_clean, mols = preprocess_dataframe(df)
# Features are built from `mols` and labels from `df_clean`; fail loudly if they drift apart.
assert len(mols) == len(df_clean), "parsed molecules and cleaned rows are misaligned"

# Generate Morgan fingerprints (radius 2, 1024 bits) and densify them to float32.
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024) for mol in mols]
fingerprints = []
for fp in fps:
    arr = np.zeros((1024,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    fingerprints.append(arr)
x_fp = np.array(fingerprints, dtype=np.float32)

# Compute QED properties and scale
# NOTE(review): the scaler is fitted on every molecule *before* the train/test
# split below, so test-set statistics leak into the scaled features, and the
# fitted scaler is never persisted. Consider fitting on the training rows only
# and saving the scaler next to the model.
eq_props = [QED.properties(mol) for mol in mols]
qe_df = pd.DataFrame(eq_props)
scaler = StandardScaler()
qe_scaled = scaler.fit_transform(qe_df)
qe_scaled_df = pd.DataFrame(qe_scaled, columns=qe_df.columns)

# Combine features and labels; rows with a missing feature or label are dropped.
features = np.hstack((x_fp, qe_scaled_df.values))
final_df = pd.concat([pd.DataFrame(features), df_clean['liv'].reset_index(drop=True)], axis=1)
final_df = final_df.dropna().reset_index(drop=True)

X = final_df.drop('liv', axis=1).values
y = final_df['liv'].values

# 2) Split into train/test (hold-out), stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0
)

# Save test set
save_dir = './oob_grid_search_rf'
os.makedirs(save_dir, exist_ok=True)
torch.save({
    'features': torch.tensor(X_test, dtype=torch.float32),
    'labels':   torch.tensor(y_test, dtype=torch.long)
}, os.path.join(save_dir, 'RF_test_holdout.pt'))

# 3) Compute class weights on training data
# Key the weights by the actual class labels. enumerate() would key them by
# position (0..k-1), which is only correct when the labels are exactly 0..k-1.
classes = np.unique(y_train)
weights = class_weight.compute_class_weight('balanced', classes=classes, y=y_train)
cw = dict(zip(classes.tolist(), weights.tolist()))

# 4) Parameter grid for Random Forest
grid_params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'max_features': ['sqrt', 'log2']
}

# 5) Manual grid search with OOB score
# Every candidate is fitted on the whole training split and ranked by its
# out-of-bag score; ties keep the first candidate seen (strict '>').
best_score = -np.inf
best_params = None
best_rf = None
for params in ParameterGrid(grid_params):
    rf = RandomForestClassifier(
        oob_score=True,
        random_state=42,
        n_jobs=-1,
        class_weight=cw,
        **params
    )
    rf.fit(X_train, y_train)
    oob = rf.oob_score_
    print(f"Params: {params}, OOB score: {oob:.4f}")
    if oob > best_score:
        best_score = oob
        best_params = params
        best_rf = rf

print(f"Best params: {best_params}, Best OOB score: {best_score:.4f}")

# 6) Best model on full training set
# The winning candidate was already fitted on X_train/y_train with the same
# constructor arguments and seed, so a refit would rebuild the same forest;
# the fitted estimator is kept instead of training it a second time.

# 7) Save best params and model
joblib.dump(best_params, os.path.join(save_dir, 'RF_best_params.pkl'))
joblib.dump(best_rf, os.path.join(save_dir, 'random_forest_oob_model.pkl'))

print('OOB-based grid search completed. Test set saved at RF_test_holdout.pt')

Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 50}, OOB score: 0.6583
Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}, OOB score: 0.6667
Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 150}, OOB score: 0.6667
Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 50}, OOB score: 0.6333
Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 100}, OOB score: 0.6333
Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 150}, OOB score: 0.6500
Params: {'max_depth': None, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 50}, OOB score: 0.6458
Params: {'max_depth': None, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 100}, OOB score: 0.6500
Params: {'max_depth': None, 'max_features': 'log2', 'min_samples_split': 2,