<a href="https://colab.research.google.com/github/bopeng-sue/Optimal-Biofluid-Matrices-for-Human-Exposome-Biomonitoring/blob/main/preliminary_model_and_feature_screening.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages and load the dataset

In [None]:
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem, MACCSkeys, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from xgboost import XGBClassifier

# Suppress every Python warning for the rest of the session so the per-model
# progress output stays readable. Note that this also hides deprecation warnings.
warnings.filterwarnings('ignore')
# Disable RDKit log output for all loggers matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

In [None]:
# Load the dataset from an Excel workbook located in the working directory.
df = pd.read_excel('dataset_1211.xlsx')
# Preview the first two rows to check the column layout
# (padel_descriptor relies on the positional column order).
df.head(2)

Unnamed: 0.1,Unnamed: 0,Biomonitor Compound PubChem CID,Blood study numbers,Urine study numbers,Tendency for the selection of blood samples,Biospecimen,CAS Number,Name,SMILES,hhlb(hours),...,JGI10,JGT,VE1_D,VE3_D,VR1_D,VR2_D,SRW5,AMW,WTPT-3,XLogP
0,0,70,0,2,0.0,Urine,816-66-0,4-Methyl-2-oxovaleric acid,CC(C)CC(=O)C(=O)O,0.61,...,0.0,0.601944,0.046505,-2.761367,35.180772,3.908975,0.0,6.845421,7.15891,0.904
1,1,89,0,1,0.0,Urine,484-78-6,Hydroxykynurenine,C1=CC(=C(C(=C1)O)N)C(=O)CC(C(=O)O)N,0.43,...,0.0,0.556317,0.130944,-3.25278,192.246721,12.01542,0.0,8.002847,14.772289,-2.464


# Feature (descriptor) functions

In [None]:
def padel_descriptor(df: pd.DataFrame) -> np.ndarray:
    """
    Extract the pre-computed descriptor columns (column index 16 to the end,
    assumed to hold the PaDEL descriptors) as a finite float matrix.

    NaN is replaced with 0, +inf with 1e10 and -inf with -1e10. The input
    DataFrame is never modified.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame; every column from position 16 onwards must be
        convertible to float.

    Returns
    -------
    np.ndarray
        A 2D float array of shape (n_rows, n_columns - 16) with no NaN or
        infinite entries.
    """
    # Cast explicitly to float so NaN/inf handling also works when pandas
    # hands back an object-dtype block.
    X = df.iloc[:, 16:].to_numpy(dtype=float)

    # np.nan_to_num would otherwise map +/-inf to the largest representable
    # floats (~1.8e308), which overflow in downstream estimators; cap them at
    # a large but safe magnitude instead. The call returns a new array.
    return np.nan_to_num(X, nan=0.0, posinf=1e10, neginf=-1e10)

In [None]:
def maccs_descriptor(smiles_list):
    """
    Generates MACCS fingerprints for a list of SMILES strings and returns them as a DataFrame.

    SMILES strings that RDKit cannot parse yield an all-zero row, matching the
    behaviour of ecfp_descriptor and fcfp_descriptor, so the output always has
    one row per input in the original order.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings for which MACCS keys will be generated.

    Returns
    -------
    pd.DataFrame
        A DataFrame of integer MACCS bit values (0 or 1) with column names
        'MACCS_1' through 'MACCS_167'. Column 'MACCS_k' holds position k-1 of
        the RDKit bit string.
        NOTE(review): RDKit's bit 0 is presumably an unused placeholder, so
        'MACCS_1' would be constant zero — confirm against the RDKit docs.
    """
    n_keys = 167  # length of the bit string produced by GenMACCSKeys
    column_names = [f'MACCS_{i}' for i in range(1, n_keys + 1)]

    rows = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparsable SMILES: keep the row (aligned with the input) as zeros
            # instead of crashing inside GenMACCSKeys.
            rows.append([0] * n_keys)
        else:
            bit_string = MACCSkeys.GenMACCSKeys(mol).ToBitString()
            rows.append([int(bit) for bit in bit_string])

    # astype(int) keeps an integer dtype even when there are no rows
    return pd.DataFrame(rows, columns=column_names).astype(int)

In [None]:
def ecfp_descriptor(smiles_list, radius=2, n_bits=1024):
    """
    Generates ECFP (Extended-Connectivity Fingerprints) for a list of SMILES strings
    and returns them as a NumPy array.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings for which ECFP fingerprints will be generated.
    radius : int, optional (default=2)
        The fingerprint radius to use. For ECFP4, radius=2 is standard.
    n_bits : int, optional (default=1024)
        Length of the bit vector.

    Returns
    -------
    np.ndarray
        An integer array of shape (len(smiles_list), n_bits); row i is the bit
        vector of the i-th SMILES. SMILES that RDKit cannot parse give an
        all-zero row. Empty input gives shape (0, n_bits).
    """
    smiles_list = list(smiles_list)  # allow any iterable; we need its length

    # Pre-allocating fixes the result to 2D with a single integer dtype, even
    # for empty input, and leaves rows of invalid molecules at zero.
    fingerprints = np.zeros((len(smiles_list), n_bits), dtype=int)

    for row, smiles in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue
        # NOTE(review): newer RDKit releases appear to deprecate this call in
        # favour of rdFingerprintGenerator — confirm before upgrading RDKit.
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits, useFeatures=False)
        fingerprints[row] = np.array(fp)

    return fingerprints

In [None]:
def fcfp_descriptor(smiles_list, radius=2, n_bits=1024):
    """
    Generates FCFP (Feature-based Circular Fingerprints) for a list of SMILES strings
    and returns them as a NumPy array.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings for which FCFP fingerprints will be generated.
    radius : int, optional (default=2)
        The fingerprint radius to use.
    n_bits : int, optional (default=1024)
        Length of the bit vector.

    Returns
    -------
    np.ndarray
        An integer array of shape (len(smiles_list), n_bits); row i is the bit
        vector of the i-th SMILES. SMILES that RDKit cannot parse give an
        all-zero row. Empty input gives shape (0, n_bits).
    """
    smiles_list = list(smiles_list)  # allow any iterable; we need its length

    # Pre-allocating fixes the result to 2D with a single integer dtype, even
    # for empty input, and leaves rows of invalid molecules at zero.
    fingerprints = np.zeros((len(smiles_list), n_bits), dtype=int)

    for row, smiles in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue
        # useFeatures=True is the only difference from the ECFP variant.
        # NOTE(review): newer RDKit releases appear to deprecate this call in
        # favour of rdFingerprintGenerator — confirm before upgrading RDKit.
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits, useFeatures=True)
        fingerprints[row] = np.array(fp)

    return fingerprints

In [None]:
def rdkit_descriptors_to_X(df: pd.DataFrame, smiles_col='SMILES', correlation_threshold=0.95) -> np.ndarray:
    """
    Build a filtered RDKit descriptor matrix for the molecules in ``df``.

    Every descriptor listed in ``Descriptors._descList`` is computed per SMILES.
    Unparsable SMILES get an all-NaN row, and all NaNs are then set to 0.
    Zero-variance descriptors are dropped first. After that, any descriptor whose
    absolute correlation with an earlier descriptor (in column order) exceeds
    ``correlation_threshold`` is dropped. Both filters are computed on all rows
    passed in.

    Parameters
    ----------
    df : pd.DataFrame
        A DataFrame containing the SMILES column.
    smiles_col : str, optional
        The name of the column containing SMILES strings.
    correlation_threshold : float, optional
        Absolute correlation above which the later descriptor of a pair is removed.

    Returns
    -------
    np.ndarray
        A 2D NumPy array containing the filtered RDKit descriptors, one row per input row.
    """
    # Descriptor names and a calculator covering all of them
    names = [entry[0] for entry in Descriptors._descList]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(names)
    missing_row = [np.nan] * len(names)

    # One descriptor row per SMILES; molecules that fail to parse become NaNs
    rows = []
    for smiles in df[smiles_col]:
        mol = Chem.MolFromSmiles(smiles)
        rows.append(missing_row if mol is None else calc.CalcDescriptors(mol))

    table = pd.DataFrame(rows, columns=names).fillna(0)

    # Filter 1: constant descriptors carry no information
    spread = table.var()
    constant_cols = [name for name, value in spread.items() if value == 0]
    table = table.drop(columns=constant_cols)

    # Filter 2: keep only the strict upper triangle of |corr| so each pair is
    # inspected once, then drop every column that exceeds the threshold against
    # any column positioned before it.
    abs_corr = table.corr().abs()
    upper_mask = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper = abs_corr.where(upper_mask)
    redundant = [col for col in upper.columns if (upper[col] > correlation_threshold).any()]
    table = table.drop(columns=redundant, errors='ignore')

    return table.values

# Model and descriptor screening

In [None]:
def run_descriptor_model_screening(df: pd.DataFrame,
                                   target_col: str = 'Tendency for the selection of blood samples',
                                   smiles_col: str = 'SMILES',
                                   descriptor_method: str = 'maccs',
                                   correlation_threshold: float = 0.95):
    """
    Apply the specified descriptor method to the input DataFrame, extract feature matrix X,
    then evaluate multiple classification models with 5-fold stratified cross-validation
    and print the results. Includes a progress bar for the model evaluation loop.

    Each model is cross-validated once and all four metrics are scored on the
    same fitted folds, rather than refitting once per metric.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame containing at least the target column and a SMILES column.
    target_col : str, optional
        Name of the target column. Values > 0.5 become class 1, everything else
        (including missing values, for which the comparison is False) class 0.
    smiles_col : str, optional
        Name of the SMILES column.
    descriptor_method : str, optional
        The descriptor method to use (case-insensitive).
        Choose from: 'padel', 'maccs', 'fcfp', 'ecfp', 'rdkit'.
    correlation_threshold : float, optional
        Used only by rdkit_descriptors_to_X if that method is chosen.

    Returns
    -------
    pd.DataFrame
        One row per model (indexed by model name) with the mean and standard
        deviation across folds of accuracy, macro F1, macro precision and macro recall.

    Raises
    ------
    ValueError
        If descriptor_method is not one of the supported names.

    Notes
    -----
    NOTE(review): features are passed to the estimators unscaled; scale-sensitive
    models (SVM, MLP, KNN, logistic regression) may be penalised on the
    continuous 'padel'/'rdkit' descriptors — confirm whether that is intended.
    """
    method = descriptor_method.lower()

    # Extract and binarize the target
    y = (df[target_col] > 0.5).astype(int)

    # Compute X based on the chosen descriptor method
    if method == 'padel':
        X = padel_descriptor(df)  # returns a numpy array
    elif method == 'maccs':
        X = maccs_descriptor(df[smiles_col].tolist()).values
    elif method == 'fcfp':
        X = fcfp_descriptor(df[smiles_col].tolist())  # returns numpy array
    elif method == 'ecfp':
        X = ecfp_descriptor(df[smiles_col].tolist())  # returns numpy array
    elif method == 'rdkit':
        X = rdkit_descriptors_to_X(df, smiles_col=smiles_col, correlation_threshold=correlation_threshold)
    else:
        raise ValueError("Invalid descriptor_method. Choose from 'padel', 'maccs', 'fcfp', 'ecfp', 'rdkit'.")

    # Define models (default hyper-parameters, fixed seeds where supported)
    models = {
        "Random Forest": RandomForestClassifier(random_state=42),
        "SVM": SVC(),
        # NOTE(review): use_label_encoder looks deprecated/ignored in recent
        # xgboost releases — confirm before removing.
        "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
        "MLP": MLPClassifier(random_state=42, max_iter=1000),
        "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "AdaBoost": AdaBoostClassifier(random_state=42),
        "Extra Trees": ExtraTreesClassifier(random_state=42),
        "LightGBM": LGBMClassifier(random_state=42, verbose=-1),
        "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
        "KNN": KNeighborsClassifier()
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Report label -> scikit-learn scorer name. The order fixes both the column
    # order of the result table and the order of the printed summary.
    scoring = {
        "Accuracy": "accuracy",
        "F1": "f1_macro",
        "Precision": "precision_macro",
        "Recall": "recall_macro",
    }

    results = {}

    # Use tqdm to show progress over models
    for name, model in tqdm(models.items(), desc=f"Evaluating models with {descriptor_method.upper()} descriptors"):
        # One cross-validation run per model: 5 fits instead of 20.
        cv_scores = cross_validate(model, X, y, cv=skf, scoring=scoring)

        summary = {}
        report_parts = []
        for metric in scoring:
            fold_scores = cv_scores[f"test_{metric}"]
            mean, std = fold_scores.mean(), fold_scores.std()
            summary[f"{metric} (mean)"] = mean
            summary[f"{metric} (std)"] = std
            report_parts.append(f"{metric} = {mean:.4f} ± {std:.4f}")

        results[name] = summary
        print(f"{descriptor_method.upper()} - {name}: " + ", ".join(report_parts))

    results_df = pd.DataFrame(results).T
    return results_df


In [None]:
# Descriptor families to benchmark, in evaluation order
descriptor_methods = ['padel', 'maccs', 'fcfp', 'ecfp', 'rdkit']

# Result table of each descriptor family, keyed by method name
all_results = {}

# Screen every model once per descriptor family
for method in descriptor_methods:
    results = run_descriptor_model_screening(
        df,
        descriptor_method=method,
        target_col='Tendency for the selection of blood samples',
        smiles_col='SMILES',
    )
    all_results[method] = results


# Dump each result table, followed by a separator line
for method, result_df in all_results.items():
    print(
        f"Results for {method.upper()}:",
        result_df,
        "======================================",
        sep="\n",
    )


Evaluating models with PADEL descriptors:   8%|▊         | 1/12 [00:14<02:35, 14.14s/it]

PADEL - Random Forest: Accuracy = 0.9104 ± 0.0152, F1 = 0.9069 ± 0.0150, Precision = 0.9037 ± 0.0161, Recall = 0.9136 ± 0.0113


Evaluating models with PADEL descriptors:  17%|█▋        | 2/12 [00:15<01:06,  6.69s/it]

PADEL - SVM: Accuracy = 0.6362 ± 0.0175, F1 = 0.4670 ± 0.0345, Precision = 0.6856 ± 0.1038, Recall = 0.5348 ± 0.0204


Evaluating models with PADEL descriptors:  25%|██▌       | 3/12 [00:52<03:05, 20.62s/it]

PADEL - XGBoost: Accuracy = 0.9068 ± 0.0152, F1 = 0.9025 ± 0.0163, Precision = 0.9003 ± 0.0151, Recall = 0.9080 ± 0.0194


Evaluating models with PADEL descriptors:  33%|███▎      | 4/12 [01:12<02:40, 20.11s/it]

PADEL - MLP: Accuracy = 0.7780 ± 0.0199, F1 = 0.7627 ± 0.0227, Precision = 0.7743 ± 0.0206, Recall = 0.7641 ± 0.0271


Evaluating models with PADEL descriptors:  42%|████▏     | 5/12 [01:28<02:11, 18.77s/it]

PADEL - Logistic Regression: Accuracy = 0.8284 ± 0.0240, F1 = 0.8232 ± 0.0228, Precision = 0.8215 ± 0.0216, Recall = 0.8333 ± 0.0182


Evaluating models with PADEL descriptors:  50%|█████     | 6/12 [01:32<01:21, 13.64s/it]

PADEL - Decision Tree: Accuracy = 0.8508 ± 0.0452, F1 = 0.8417 ± 0.0479, Precision = 0.8452 ± 0.0454, Recall = 0.8417 ± 0.0491


Evaluating models with PADEL descriptors:  58%|█████▊    | 7/12 [04:15<05:13, 62.61s/it]

PADEL - Gradient Boosting: Accuracy = 0.8937 ± 0.0073, F1 = 0.8880 ± 0.0060, Precision = 0.8901 ± 0.0135, Recall = 0.8900 ± 0.0063


Evaluating models with PADEL descriptors:  67%|██████▋   | 8/12 [04:50<03:35, 53.82s/it]

PADEL - AdaBoost: Accuracy = 0.8937 ± 0.0303, F1 = 0.8886 ± 0.0310, Precision = 0.8874 ± 0.0314, Recall = 0.8918 ± 0.0300


Evaluating models with PADEL descriptors:  75%|███████▌  | 9/12 [04:55<01:55, 38.39s/it]

PADEL - Extra Trees: Accuracy = 0.9254 ± 0.0146, F1 = 0.9224 ± 0.0145, Precision = 0.9190 ± 0.0157, Recall = 0.9293 ± 0.0115


Evaluating models with PADEL descriptors:  83%|████████▎ | 10/12 [05:41<01:22, 41.01s/it]

PADEL - LightGBM: Accuracy = 0.9030 ± 0.0285, F1 = 0.8986 ± 0.0293, Precision = 0.8967 ± 0.0314, Recall = 0.9022 ± 0.0267


Evaluating models with PADEL descriptors:  92%|█████████▏| 11/12 [53:31<15:06, 906.76s/it]

PADEL - CatBoost: Accuracy = 0.9086 ± 0.0091, F1 = 0.9045 ± 0.0090, Precision = 0.9018 ± 0.0106, Recall = 0.9094 ± 0.0073


Evaluating models with PADEL descriptors: 100%|██████████| 12/12 [53:32<00:00, 267.69s/it]

PADEL - KNN: Accuracy = 0.8209 ± 0.0161, F1 = 0.8129 ± 0.0199, Precision = 0.8156 ± 0.0184, Recall = 0.8206 ± 0.0282



Evaluating models with MACCS descriptors:   8%|▊         | 1/12 [00:05<00:55,  5.00s/it]

MACCS - Random Forest: Accuracy = 0.9179 ± 0.0171, F1 = 0.9146 ± 0.0177, Precision = 0.9101 ± 0.0178, Recall = 0.9215 ± 0.0173


Evaluating models with MACCS descriptors:  17%|█▋        | 2/12 [00:05<00:23,  2.33s/it]

MACCS - SVM: Accuracy = 0.9067 ± 0.0284, F1 = 0.9033 ± 0.0286, Precision = 0.8995 ± 0.0286, Recall = 0.9115 ± 0.0257


Evaluating models with MACCS descriptors:  25%|██▌       | 3/12 [00:06<00:17,  1.94s/it]

MACCS - XGBoost: Accuracy = 0.9123 ± 0.0125, F1 = 0.9081 ± 0.0127, Precision = 0.9062 ± 0.0142, Recall = 0.9115 ± 0.0126


Evaluating models with MACCS descriptors:  33%|███▎      | 4/12 [00:56<02:45, 20.65s/it]

MACCS - MLP: Accuracy = 0.9105 ± 0.0161, F1 = 0.9057 ± 0.0168, Precision = 0.9053 ± 0.0172, Recall = 0.9073 ± 0.0168


Evaluating models with MACCS descriptors:  42%|████▏     | 5/12 [00:56<01:33, 13.39s/it]

MACCS - Logistic Regression: Accuracy = 0.8993 ± 0.0216, F1 = 0.8947 ± 0.0217, Precision = 0.8927 ± 0.0222, Recall = 0.8991 ± 0.0194


Evaluating models with MACCS descriptors:  50%|█████     | 6/12 [00:57<00:53,  8.94s/it]

MACCS - Decision Tree: Accuracy = 0.8974 ± 0.0228, F1 = 0.8918 ± 0.0244, Precision = 0.8920 ± 0.0241, Recall = 0.8931 ± 0.0259


Evaluating models with MACCS descriptors:  58%|█████▊    | 7/12 [01:03<00:40,  8.13s/it]

MACCS - Gradient Boosting: Accuracy = 0.9104 ± 0.0194, F1 = 0.9071 ± 0.0192, Precision = 0.9044 ± 0.0181, Recall = 0.9154 ± 0.0153


Evaluating models with MACCS descriptors:  67%|██████▋   | 8/12 [01:06<00:26,  6.58s/it]

MACCS - AdaBoost: Accuracy = 0.8805 ± 0.0248, F1 = 0.8731 ± 0.0266, Precision = 0.8761 ± 0.0263, Recall = 0.8719 ± 0.0277


Evaluating models with MACCS descriptors:  75%|███████▌  | 9/12 [01:10<00:16,  5.53s/it]

MACCS - Extra Trees: Accuracy = 0.9179 ± 0.0209, F1 = 0.9141 ± 0.0222, Precision = 0.9109 ± 0.0217, Recall = 0.9187 ± 0.0231


Evaluating models with MACCS descriptors:  83%|████████▎ | 10/12 [01:11<00:08,  4.14s/it]

MACCS - LightGBM: Accuracy = 0.9086 ± 0.0261, F1 = 0.9042 ± 0.0272, Precision = 0.9016 ± 0.0278, Recall = 0.9075 ± 0.0262


Evaluating models with MACCS descriptors:  92%|█████████▏| 11/12 [03:41<00:49, 49.03s/it]

MACCS - CatBoost: Accuracy = 0.9179 ± 0.0261, F1 = 0.9150 ± 0.0259, Precision = 0.9120 ± 0.0256, Recall = 0.9234 ± 0.0210


Evaluating models with MACCS descriptors: 100%|██████████| 12/12 [03:42<00:00, 18.53s/it]

MACCS - KNN: Accuracy = 0.8898 ± 0.0311, F1 = 0.8847 ± 0.0320, Precision = 0.8839 ± 0.0332, Recall = 0.8886 ± 0.0305



Evaluating models with FCFP descriptors:   8%|▊         | 1/12 [00:07<01:19,  7.22s/it]

FCFP - Random Forest: Accuracy = 0.9048 ± 0.0208, F1 = 0.9009 ± 0.0212, Precision = 0.8975 ± 0.0214, Recall = 0.9073 ± 0.0202


Evaluating models with FCFP descriptors:  17%|█▋        | 2/12 [00:08<00:39,  3.97s/it]

FCFP - SVM: Accuracy = 0.8955 ± 0.0208, F1 = 0.8920 ± 0.0207, Precision = 0.8885 ± 0.0206, Recall = 0.9015 ± 0.0176


Evaluating models with FCFP descriptors:  25%|██▌       | 3/12 [00:13<00:38,  4.24s/it]

FCFP - XGBoost: Accuracy = 0.8844 ± 0.0315, F1 = 0.8789 ± 0.0333, Precision = 0.8772 ± 0.0330, Recall = 0.8834 ± 0.0351


Evaluating models with FCFP descriptors:  33%|███▎      | 4/12 [02:12<06:36, 49.60s/it]

FCFP - MLP: Accuracy = 0.9105 ± 0.0149, F1 = 0.9060 ± 0.0149, Precision = 0.9053 ± 0.0181, Recall = 0.9082 ± 0.0137


Evaluating models with FCFP descriptors:  42%|████▏     | 5/12 [02:13<03:43, 31.97s/it]

FCFP - Logistic Regression: Accuracy = 0.8899 ± 0.0260, F1 = 0.8843 ± 0.0262, Precision = 0.8845 ± 0.0287, Recall = 0.8861 ± 0.0238


Evaluating models with FCFP descriptors:  50%|█████     | 6/12 [02:13<02:07, 21.27s/it]

FCFP - Decision Tree: Accuracy = 0.8713 ± 0.0199, F1 = 0.8637 ± 0.0224, Precision = 0.8652 ± 0.0195, Recall = 0.8644 ± 0.0268


Evaluating models with FCFP descriptors:  58%|█████▊    | 7/12 [02:30<01:38, 19.63s/it]

FCFP - Gradient Boosting: Accuracy = 0.8881 ± 0.0156, F1 = 0.8827 ± 0.0160, Precision = 0.8808 ± 0.0169, Recall = 0.8863 ± 0.0158


Evaluating models with FCFP descriptors:  67%|██████▋   | 8/12 [02:39<01:05, 16.27s/it]

FCFP - AdaBoost: Accuracy = 0.8824 ± 0.0249, F1 = 0.8762 ± 0.0255, Precision = 0.8760 ± 0.0275, Recall = 0.8772 ± 0.0233


Evaluating models with FCFP descriptors:  75%|███████▌  | 9/12 [02:46<00:40, 13.47s/it]

FCFP - Extra Trees: Accuracy = 0.9011 ± 0.0151, F1 = 0.8968 ± 0.0158, Precision = 0.8933 ± 0.0157, Recall = 0.9024 ± 0.0165


Evaluating models with FCFP descriptors:  83%|████████▎ | 10/12 [02:47<00:19,  9.60s/it]

FCFP - LightGBM: Accuracy = 0.8937 ± 0.0264, F1 = 0.8885 ± 0.0288, Precision = 0.8874 ± 0.0257, Recall = 0.8930 ± 0.0328


Evaluating models with FCFP descriptors:  92%|█████████▏| 11/12 [04:26<00:37, 37.10s/it]

FCFP - CatBoost: Accuracy = 0.8974 ± 0.0154, F1 = 0.8926 ± 0.0162, Precision = 0.8903 ± 0.0153, Recall = 0.8967 ± 0.0179


Evaluating models with FCFP descriptors: 100%|██████████| 12/12 [04:27<00:00, 22.30s/it]

FCFP - KNN: Accuracy = 0.8675 ± 0.0261, F1 = 0.8610 ± 0.0274, Precision = 0.8599 ± 0.0275, Recall = 0.8642 ± 0.0294



Evaluating models with ECFP descriptors:   8%|▊         | 1/12 [00:07<01:23,  7.55s/it]

ECFP - Random Forest: Accuracy = 0.9030 ± 0.0337, F1 = 0.8994 ± 0.0345, Precision = 0.8958 ± 0.0351, Recall = 0.9067 ± 0.0329


Evaluating models with ECFP descriptors:  17%|█▋        | 2/12 [00:09<00:43,  4.31s/it]

ECFP - SVM: Accuracy = 0.9030 ± 0.0209, F1 = 0.8991 ± 0.0208, Precision = 0.8970 ± 0.0225, Recall = 0.9058 ± 0.0172


Evaluating models with ECFP descriptors:  25%|██▌       | 3/12 [00:14<00:39,  4.39s/it]

ECFP - XGBoost: Accuracy = 0.9104 ± 0.0164, F1 = 0.9055 ± 0.0172, Precision = 0.9050 ± 0.0179, Recall = 0.9063 ± 0.0167


Evaluating models with ECFP descriptors:  33%|███▎      | 4/12 [01:37<04:44, 35.57s/it]

ECFP - MLP: Accuracy = 0.8900 ± 0.0213, F1 = 0.8842 ± 0.0224, Precision = 0.8832 ± 0.0222, Recall = 0.8861 ± 0.0234


Evaluating models with ECFP descriptors:  42%|████▏     | 5/12 [01:38<02:40, 22.97s/it]

ECFP - Logistic Regression: Accuracy = 0.9030 ± 0.0162, F1 = 0.8978 ± 0.0166, Precision = 0.8983 ± 0.0180, Recall = 0.8994 ± 0.0170


Evaluating models with ECFP descriptors:  50%|█████     | 6/12 [01:38<01:31, 15.33s/it]

ECFP - Decision Tree: Accuracy = 0.8862 ± 0.0231, F1 = 0.8809 ± 0.0230, Precision = 0.8798 ± 0.0266, Recall = 0.8838 ± 0.0184


Evaluating models with ECFP descriptors:  58%|█████▊    | 7/12 [01:57<01:22, 16.41s/it]

ECFP - Gradient Boosting: Accuracy = 0.9011 ± 0.0150, F1 = 0.8953 ± 0.0151, Precision = 0.8981 ± 0.0171, Recall = 0.8943 ± 0.0132


Evaluating models with ECFP descriptors:  67%|██████▋   | 8/12 [02:07<00:57, 14.31s/it]

ECFP - AdaBoost: Accuracy = 0.8768 ± 0.0327, F1 = 0.8699 ± 0.0351, Precision = 0.8704 ± 0.0354, Recall = 0.8707 ± 0.0360


Evaluating models with ECFP descriptors:  75%|███████▌  | 9/12 [02:12<00:34, 11.55s/it]

ECFP - Extra Trees: Accuracy = 0.9104 ± 0.0226, F1 = 0.9071 ± 0.0231, Precision = 0.9030 ± 0.0235, Recall = 0.9154 ± 0.0220


Evaluating models with ECFP descriptors:  83%|████████▎ | 10/12 [02:13<00:16,  8.38s/it]

ECFP - LightGBM: Accuracy = 0.9011 ± 0.0153, F1 = 0.8960 ± 0.0158, Precision = 0.8948 ± 0.0166, Recall = 0.8978 ± 0.0155


Evaluating models with ECFP descriptors:  92%|█████████▏| 11/12 [04:14<00:42, 42.81s/it]

ECFP - CatBoost: Accuracy = 0.9142 ± 0.0093, F1 = 0.9098 ± 0.0092, Precision = 0.9091 ± 0.0115, Recall = 0.9121 ± 0.0076


Evaluating models with ECFP descriptors: 100%|██████████| 12/12 [04:15<00:00, 21.28s/it]

ECFP - KNN: Accuracy = 0.8638 ± 0.0381, F1 = 0.8587 ± 0.0384, Precision = 0.8562 ± 0.0381, Recall = 0.8658 ± 0.0367



Evaluating models with RDKIT descriptors:   8%|▊         | 1/12 [00:05<01:03,  5.77s/it]

RDKIT - Random Forest: Accuracy = 0.9254 ± 0.0229, F1 = 0.9225 ± 0.0236, Precision = 0.9188 ± 0.0240, Recall = 0.9303 ± 0.0223


Evaluating models with RDKIT descriptors:  17%|█▋        | 2/12 [00:06<00:27,  2.73s/it]

RDKIT - SVM: Accuracy = 0.6157 ± 0.0023, F1 = 0.3811 ± 0.0009, Precision = 0.3078 ± 0.0011, Recall = 0.5000 ± 0.0000


Evaluating models with RDKIT descriptors:  25%|██▌       | 3/12 [00:12<00:38,  4.30s/it]

RDKIT - XGBoost: Accuracy = 0.9254 ± 0.0234, F1 = 0.9221 ± 0.0246, Precision = 0.9183 ± 0.0236, Recall = 0.9277 ± 0.0266


Evaluating models with RDKIT descriptors:  33%|███▎      | 4/12 [00:17<00:35,  4.50s/it]

RDKIT - MLP: Accuracy = 0.5407 ± 0.1480, F1 = 0.5066 ± 0.1250, Precision = 0.5592 ± 0.1417, Recall = 0.5307 ± 0.1056


Evaluating models with RDKIT descriptors:  42%|████▏     | 5/12 [00:28<00:47,  6.77s/it]

RDKIT - Logistic Regression: Accuracy = 0.8937 ± 0.0267, F1 = 0.8889 ± 0.0273, Precision = 0.8868 ± 0.0276, Recall = 0.8937 ± 0.0268


Evaluating models with RDKIT descriptors:  50%|█████     | 6/12 [00:28<00:28,  4.72s/it]

RDKIT - Decision Tree: Accuracy = 0.8937 ± 0.0159, F1 = 0.8875 ± 0.0174, Precision = 0.8882 ± 0.0149, Recall = 0.8873 ± 0.0203


Evaluating models with RDKIT descriptors:  58%|█████▊    | 7/12 [00:49<00:50, 10.00s/it]

RDKIT - Gradient Boosting: Accuracy = 0.9198 ± 0.0225, F1 = 0.9162 ± 0.0235, Precision = 0.9128 ± 0.0233, Recall = 0.9213 ± 0.0237


Evaluating models with RDKIT descriptors:  67%|██████▋   | 8/12 [00:57<00:36,  9.15s/it]

RDKIT - AdaBoost: Accuracy = 0.8974 ± 0.0354, F1 = 0.8924 ± 0.0372, Precision = 0.8905 ± 0.0370, Recall = 0.8958 ± 0.0375


Evaluating models with RDKIT descriptors:  75%|███████▌  | 9/12 [01:00<00:21,  7.32s/it]

RDKIT - Extra Trees: Accuracy = 0.9254 ± 0.0176, F1 = 0.9223 ± 0.0180, Precision = 0.9191 ± 0.0180, Recall = 0.9286 ± 0.0169


Evaluating models with RDKIT descriptors:  83%|████████▎ | 10/12 [01:04<00:12,  6.23s/it]

RDKIT - LightGBM: Accuracy = 0.9236 ± 0.0258, F1 = 0.9202 ± 0.0272, Precision = 0.9168 ± 0.0262, Recall = 0.9262 ± 0.0295


Evaluating models with RDKIT descriptors: 100%|██████████| 12/12 [06:34<00:00, 32.92s/it]

RDKIT - CatBoost: Accuracy = 0.9217 ± 0.0150, F1 = 0.9188 ± 0.0153, Precision = 0.9144 ± 0.0152, Recall = 0.9274 ± 0.0145
RDKIT - KNN: Accuracy = 0.7574 ± 0.0198, F1 = 0.7402 ± 0.0187, Precision = 0.7455 ± 0.0214, Recall = 0.7374 ± 0.0169
Results for PADEL:
                     Accuracy (mean)  Accuracy (std)  F1 (mean)  F1 (std)  \
Random Forest               0.910436        0.015246   0.906857  0.015018   
SVM                         0.636154        0.017467   0.467033  0.034516   
XGBoost                     0.906767        0.015236   0.902542  0.016311   
MLP                         0.777985        0.019936   0.762673  0.022683   
Logistic Regression         0.828366        0.024015   0.823172  0.022805   
Decision Tree               0.850796        0.045203   0.841689  0.047874   
Gradient Boosting           0.893666        0.007331   0.888036  0.005965   
AdaBoost                    0.893700        0.030295   0.888624  0.030974   
Extra Trees                 0.925355        0.014




In [None]:
# Stack the per-descriptor result tables into one long table.
# Each table is indexed by model name. That index is moved into an explicit
# 'Model' column first, because ignore_index=True below and index=False in
# to_csv would otherwise discard it and leave the rows unattributable.
combined_results = pd.concat(
    [
        result_df.rename_axis('Model').reset_index().assign(Descriptor_Method=method)
        for method, result_df in all_results.items()
    ],
    ignore_index=True
)

# Save the combined results to a CSV file
combined_results.to_csv('all_descriptor_results.csv', index=False)

print("All results have been saved to 'all_descriptor_results.csv'.")

All results have been saved to 'all_descriptor_results.csv'.
