<a href="https://colab.research.google.com/github/Yanbelo/Aromatase/blob/main/YanBeloFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install rdkit-pypi
!pip install padelpy
import pandas as pd
import numpy as np
from sklearn.utils import resample
import os
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import AllChem
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix)
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier,
                              StackingClassifier, BaggingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
import joblib
from rdkit import RDLogger
import glob
from padelpy import padeldescriptor

# Load dataset
x = pd.read_csv('data.csv', sep=';')
x1 = x[['Molecule ChEMBL ID', 'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value', 'Standard Units']]
x1.dropna(inplace=True)
df = x1[x1['Standard Units'].str.contains('nM') & x1['Standard Type'].str.contains('IC50')]
df = df[~df['Standard Type'].str.contains('pIC50|Log IC50')]

# Labeling
active = df.loc[df['Standard Value'] <= 1000].assign(label=1)
inactive = df.loc[df['Standard Value'] >= 1800].assign(label=0)
combined = pd.concat([active, inactive], axis=0)
combined[['Smiles', 'label']].to_csv("aromatase.smi", index=None, header=None, sep='\t')

# Standardize SMILES
def standardize_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        Chem.rdmolops.RemoveStereochemistry(mol)
        mol = rdMolStandardize.ChargeParent(mol)
        return Chem.MolToSmiles(mol, isomericSmiles=False)
    return None

combined['Standardized_Smiles'] = combined['Smiles'].apply(standardize_smiles)
combined.to_csv('aromatase_standardized.csv', index=False)

# Filtering necessary columns
new_df = pd.read_csv('aromatase_standardized.csv', usecols=['Molecule ChEMBL ID', 'Standardized_Smiles', 'label'])
df4 = new_df[['Standardized_Smiles', 'Molecule ChEMBL ID']]
df4.to_csv('molecule.smi', sep='\t', index=False, header=False)




Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5
Collecting padelpy
  Downloading padelpy-0.1.16-py3-none-any.whl.metadata (7.7 kB)
Downloading padelpy-0.1.16-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m74.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: padelpy
Successfully installed padelpy-0.1.16


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[17:20:42] Running LargestFragmentChooser
[17:20:42] Fragment: COc1ccc2c(c1)C1C(CC2)C1c1ccncc1
[17:20:42] New largest fragment: COc1ccc2c(c1)C1C(CC2)C1c1ccncc1 (36)
[17:20:42] Running Uncharger
[17:20:42] Initializing MetalDisconnector
[17:20:42] Running MetalDisconnector
[17:20:42] Initializing Normalizer
[17:20:42] Running Normalizer
[17:20:42] Initializing MetalDisconnector
[17:20:42] Running MetalDisconnector
[17:20:42] Initializing Normalizer
[17:20:42] Running Normalizer
[17:20:42] Running LargestFragmentChooser
[17:20:42] Fragment: COCOc1ccc2c(c1)OC(c1ccc(OC)c(CC=C(C)C)c1)CC2=O
[17:20:42] New largest fragment: COCOc1ccc2c(c1)OC(c1ccc(OC)c(CC=C(C)C)c1)CC2=O (54)
[17:20:42] Running Uncharger
[17:20:42] Initializing MetalDisconnector
[17:20:42] Running MetalDisconnector
[17:20:42] Initializing Normalizer
[17:20:42] Running Normalizer
[17:20:42] Initializing MetalDisconnector
[17:20:42] Running MetalDisconnector
[17:20

In [3]:
! pip install padelpy



In [4]:
! wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
! unzip fingerprints_xml.zip

--2025-01-25 17:20:49--  https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip [following]
--2025-01-25 17:20:49--  https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10871 (11K) [application/zip]
Saving to: ‘fingerprints_xml.zip’


2025-01-25 17:20:49 (53.6 MB/s) - ‘fingerprints_xml.zip’ saved [10871/10871]

Archive:  fingerprints_xml.zip
  inflating: AtomPairs2DFingerprintCount.xml  
  inflating: AtomPairs2DFin

In [5]:
import glob
xml_files = glob.glob("*.xml")
xml_files.sort()
xml_files

['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']

In [6]:
!pip install xgboost




In [7]:
FP_list = ['AtomPairs2DCount',
 'AtomPairs2D',
 'EState',
 'CDKextended',
 'CDK',
 'CDKgraphonly',
 'KlekotaRothCount',
 'KlekotaRoth',
 'MACCS',
 'PubChem',
 'SubstructureCount',
 'Substructure']

In [8]:
fp = dict(zip(FP_list, xml_files))
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [9]:
df3 = new_df
df3

Unnamed: 0,Molecule ChEMBL ID,label,Standardized_Smiles
0,CHEMBL95109,1,COc1ccc2c(c1)OC(c1ccccc1)CC2n1ccnc1
1,CHEMBL2311169,1,CC12CCC3C(CC=C4CCCCC43C)C1CCC2=O
2,CHEMBL304770,1,CC(=O)OC1CC2C3CCC(=O)C3(C)CCC2C2(C)CCCC=C12
3,CHEMBL73219,1,CCCCCC1CC2C(CCC3(C)C(O)CCC23)C2(C)CCCC=C12
4,CHEMBL243960,1,O=C(c1ccc(-c2ccccc2)cc1)c1ccccc1Cn1ccnc1
...,...,...,...
3229,CHEMBL3752500,0,CSC1=C2CCC3C4CCC(=O)C4(C)CCC3C2(C)CCC1=O
3230,CHEMBL5274290,0,CC(=O)OC1CC2C3CCC(=O)C3(C)CCC2C2(C)CCC(=O)C=C12
3231,CHEMBL5270376,0,CC(=O)N1N=C2C(CC3C4CCC5=CC(=O)C=CC5(C)C4CCC23C...
3232,CHEMBL5289628,0,COc1ccc(C2C3CC4C5CCC6CC(O)CCC6(C)C5CCC4(C)C3=N...


In [10]:

import pandas as pd
new_df = pd.read_csv('aromatase_standardized.csv', usecols=['Molecule ChEMBL ID', 'Standardized_Smiles', 'label'])
print(new_df.head())


df4 = pd.concat( [df3['Standardized_Smiles'],df3['Molecule ChEMBL ID']], axis=1 )
df4.to_csv('molecule.smi', sep='\t', index=False, header=False)
df4

  Molecule ChEMBL ID  label                          Standardized_Smiles
0        CHEMBL95109      1          COc1ccc2c(c1)OC(c1ccccc1)CC2n1ccnc1
1      CHEMBL2311169      1             CC12CCC3C(CC=C4CCCCC43C)C1CCC2=O
2       CHEMBL304770      1  CC(=O)OC1CC2C3CCC(=O)C3(C)CCC2C2(C)CCCC=C12
3        CHEMBL73219      1   CCCCCC1CC2C(CCC3(C)C(O)CCC23)C2(C)CCCC=C12
4       CHEMBL243960      1     O=C(c1ccc(-c2ccccc2)cc1)c1ccccc1Cn1ccnc1


Unnamed: 0,Standardized_Smiles,Molecule ChEMBL ID
0,COc1ccc2c(c1)OC(c1ccccc1)CC2n1ccnc1,CHEMBL95109
1,CC12CCC3C(CC=C4CCCCC43C)C1CCC2=O,CHEMBL2311169
2,CC(=O)OC1CC2C3CCC(=O)C3(C)CCC2C2(C)CCCC=C12,CHEMBL304770
3,CCCCCC1CC2C(CCC3(C)C(O)CCC23)C2(C)CCCC=C12,CHEMBL73219
4,O=C(c1ccc(-c2ccccc2)cc1)c1ccccc1Cn1ccnc1,CHEMBL243960
...,...,...
3229,CSC1=C2CCC3C4CCC(=O)C4(C)CCC3C2(C)CCC1=O,CHEMBL3752500
3230,CC(=O)OC1CC2C3CCC(=O)C3(C)CCC2C2(C)CCC(=O)C=C12,CHEMBL5274290
3231,CC(=O)N1N=C2C(CC3C4CCC5=CC(=O)C=CC5(C)C4CCC23C...,CHEMBL5270376
3232,COc1ccc(C2C3CC4C5CCC6CC(O)CCC6(C)C5CCC4(C)C3=N...,CHEMBL5289628


In [11]:
from padelpy import padeldescriptor # Import the padeldescriptor function from the padelpy module

import pandas as pd
fingerprint_list = ['AtomPairs2D', 'EState', 'CDKextended', 'CDK', 'CDKgraphonly',
                    'KlekotaRothCount', 'KlekotaRoth', 'MACCS', 'PubChem',
                    'SubstructureCount', 'Substructure']

for fingerprint in fingerprint_list:
  fingerprint_output_file = ''.join([fingerprint,'.csv'])
  fingerprint_descriptortypes = fp[fingerprint]

  padeldescriptor(mol_dir='molecule.smi', # Call padeldescriptor function
                  d_file=fingerprint_output_file,
                  descriptortypes=fingerprint_descriptortypes,
                  detectaromaticity=True,
                  standardizenitro=True,
                  standardizetautomers=True,
                  threads=2,
                  removesalt=True,
                  log=True,
                  fingerprints=True)

  descriptors = pd.read_csv(fingerprint_output_file)
  # You can now process or store the descriptors dataframe as needed
  print(f"Descriptors for {fingerprint} calculated and saved to {fingerprint_output_file}")
  # Example: store the dataframe in a dictionary or list
  #your_dataframe_dict[fingerprint] = descriptors

Descriptors for AtomPairs2D calculated and saved to AtomPairs2D.csv
Descriptors for EState calculated and saved to EState.csv
Descriptors for CDKextended calculated and saved to CDKextended.csv
Descriptors for CDK calculated and saved to CDK.csv
Descriptors for CDKgraphonly calculated and saved to CDKgraphonly.csv
Descriptors for KlekotaRothCount calculated and saved to KlekotaRothCount.csv
Descriptors for KlekotaRoth calculated and saved to KlekotaRoth.csv
Descriptors for MACCS calculated and saved to MACCS.csv
Descriptors for PubChem calculated and saved to PubChem.csv
Descriptors for SubstructureCount calculated and saved to SubstructureCount.csv
Descriptors for Substructure calculated and saved to Substructure.csv


In [12]:
# prompt: from all the calculated AtomPairs2D',
# #  'EState',
# #  'CDKextended',
# #  'CDK',
# #  'CDKgraphonly',
# #  'KlekotaRothCount',
# #  'KlekotaRoth',
# #  'MACCS',
# #  'PubChem',
# #  'SubstructureCount',
# #  'Substructure descriptors add in each new colunm label from df4 and store as dataframe

import pandas as pd
fingerprint_list = ['AtomPairs2D', 'EState', 'CDKextended', 'CDK', 'CDKgraphonly',
                    'KlekotaRothCount', 'KlekotaRoth', 'MACCS', 'PubChem',
                    'SubstructureCount', 'Substructure']

# Create a dictionary to store DataFrames
descriptors_dfs = {}

# Loop through the list of fingerprints
for fingerprint in fingerprint_list:
  fingerprint_output_file = ''.join([fingerprint,'.csv'])  # Construct the file name

  try:
    descriptors_df = pd.read_csv(fingerprint_output_file)
    descriptors_df = pd.concat([descriptors_df, df3['label']], axis=1)  # Add the 'label' column from df4
    descriptors_dfs[fingerprint] = descriptors_df  # Store the DataFrame in the dictionary
  except FileNotFoundError:
    print(f"File not found for {fingerprint}: {fingerprint_output_file}")

# prompt: for each developed dataframe name them descriptor_fringerprint

# Create a dictionary to store DataFrames with descriptive names
descriptors_dfs_renamed = {}

# Loop through the list of fingerprints and rename DataFrames
for fingerprint, df in descriptors_dfs.items():
  if 'Name' in df.columns:
    df = df.drop('Name', axis=1)
  descriptors_dfs_renamed[f'descriptors_{fingerprint}'] = df

# Now you have a dictionary 'descriptors_dfs_renamed' where the keys are like 'descriptors_MACCS',
# 'descriptors_AtomPairs2D', etc., and the values are the corresponding DataFrames.

# You can access the DataFrames using these keys:
# Example:
# descriptors_MACCS_df = descriptors_dfs_renamed['descriptors_MACCS']

# Now you have a dictionary 'descriptors_dfs' where the keys are the fingerprint names
# and the values are the corresponding DataFrames with 'label' added.
# prompt: print them

# prompt: in all the developed Dataframe, remove the colunm:"Name" and print them

for fingerprint, df in descriptors_dfs.items():
  if 'Name' in df.columns:
    df = df.drop('Name', axis=1)
    print(f"\n--- {fingerprint} Fingerprint DataFrame (Name column removed) ---")
    print(df.head())
  else:
    print(f"\n--- {fingerprint} Fingerprint DataFrame (Name column not found) ---")
    print(df.head())


--- AtomPairs2D Fingerprint DataFrame (Name column removed) ---
   AD2D1  AD2D2  AD2D3  AD2D4  AD2D5  AD2D6  AD2D7  AD2D8  AD2D9  AD2D10  ...  \
0      1      1      1      0      0      0      0      0      0       0  ...   
1      1      0      1      0      0      0      0      0      0       0  ...   
2      1      0      1      0      0      0      0      0      0       0  ...   
3      1      0      1      0      0      0      0      0      0       0  ...   
4      1      1      1      0      0      0      0      0      0       0  ...   

   AD2D772  AD2D773  AD2D774  AD2D775  AD2D776  AD2D777  AD2D778  AD2D779  \
0        0        0        0        0        0        0        0        0   
1        0        0        0        0        0        0        0        0   
2        0        0        0        0        0        0        0        0   
3        0        0        0        0        0        0        0        0   
4        0        0        0        0        0        0        

Randomforest

In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix
)

# Helper function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Calculates performance metrics."""
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted')
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted')
    metrics['F1-Score'] = f1_score(y_true, y_pred, average='weighted')
    try:
        metrics['ROC AUC'] = roc_auc_score(y_true, y_pred)
    except ValueError:
        metrics['ROC AUC'] = 0  # Handle non-binary classification
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['Kappa'] = cohen_kappa_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['Sensitivity'] = tp / (tp + fn) if (tp + fn) > 0 else 0
    metrics['Specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0
    return metrics

# Data import and setup
descriptor_dfs = {
    'AP2D': descriptors_dfs_renamed['descriptors_AtomPairs2D'],
    'EST': descriptors_dfs_renamed['descriptors_EState'],
    'MACCS': descriptors_dfs_renamed['descriptors_MACCS'],
    'PUBC': descriptors_dfs_renamed['descriptors_PubChem'],
    'FP4': descriptors_dfs_renamed['descriptors_Substructure'],  # Substructure
    'KR': descriptors_dfs_renamed['descriptors_KlekotaRoth'],
    'KRC': descriptors_dfs_renamed['descriptors_KlekotaRothCount'],
    'FP4C': descriptors_dfs_renamed['descriptors_SubstructureCount'],  # SubstructureCount
    'CDKEXT': descriptors_dfs_renamed['descriptors_CDKextended'],
    'CDKGR': descriptors_dfs_renamed['descriptors_CDKgraphonly']
}

# Initialize classifiers for each descriptor type
classifiers = {}
for descriptor_name in descriptor_dfs.keys():
    Rf = RandomForestClassifier(max_depth = None, min_samples_split= 2, n_estimators = 100, random_state=42)
    classifiers[descriptor_name] = Rf

# Training and evaluating each classifier
results = {}
for name, df in descriptor_dfs.items():
    X = df.drop(["label"], axis=1)
    y = df["label"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    Rf = classifiers[name]
    Rf.fit(X_train, y_train)
    y_train_pred = Rf.predict(X_train)
    y_test_pred = Rf.predict(X_test)
    train_metrics = calculate_metrics(y_train, y_train_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)
    results[name] = {'train': train_metrics, 'test': test_metrics}

    # Save metrics to Excel sheets
    with pd.ExcelWriter(f'Rf_{name}.xlsx') as writer:
        train_df = pd.DataFrame(train_metrics, index=[0])
        test_df = pd.DataFrame(test_metrics, index=[0])
        train_df.to_excel(writer, sheet_name='Train', index=False)
        test_df.to_excel(writer, sheet_name='Test', index=False)

# Stacking classifiers
estimator_list = [(name, classifiers[name]) for name in classifiers]
stack_Rf = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression(), cv=5
)
stack_Rf.fit(X_train, y_train)
y_train_pred_stack = stack_Rf.predict(X_train)
y_test_pred_stack = stack_Rf.predict(X_test)
stack_train_metrics = calculate_metrics(y_train, y_train_pred_stack)
stack_test_metrics = calculate_metrics(y_test, y_test_pred_stack)

# Save stacked model metrics to Excel
with pd.ExcelWriter('stacked_Rf_metrics.xlsx') as writer:
    pd.DataFrame(stack_train_metrics, index=[0]).to_excel(writer, sheet_name='Train', index=False)
    pd.DataFrame(stack_test_metrics, index=[0]).to_excel(writer, sheet_name='Test', index=False)


Adaboost

In [15]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix
)

# Helper function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Calculates performance metrics."""
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted')
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted')
    metrics['F1-Score'] = f1_score(y_true, y_pred, average='weighted')
    try:
        metrics['ROC AUC'] = roc_auc_score(y_true, y_pred)
    except ValueError:
        metrics['ROC AUC'] = 0  # Handle non-binary classification
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['Kappa'] = cohen_kappa_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['Sensitivity'] = tp / (tp + fn) if (tp + fn) > 0 else 0
    metrics['Specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0
    return metrics

# Data import and setup
descriptor_dfs = {
    'AP2D': descriptors_dfs_renamed['descriptors_AtomPairs2D'],
    'EST': descriptors_dfs_renamed['descriptors_EState'],
    'MACCS': descriptors_dfs_renamed['descriptors_MACCS'],
    'PUBC': descriptors_dfs_renamed['descriptors_PubChem'],
    'FP4': descriptors_dfs_renamed['descriptors_Substructure'],  # Substructure
    'KR': descriptors_dfs_renamed['descriptors_KlekotaRoth'],
    'KRC': descriptors_dfs_renamed['descriptors_KlekotaRothCount'],
    'FP4C': descriptors_dfs_renamed['descriptors_SubstructureCount'],  # SubstructureCount
    'CDKEXT': descriptors_dfs_renamed['descriptors_CDKextended'],
    'CDKGR': descriptors_dfs_renamed['descriptors_CDKgraphonly']
}

# Initialize classifiers for each descriptor type
classifiers = {}
for descriptor_name in descriptor_dfs.keys():
    Ada = AdaBoostClassifier(learning_rate = 0.1, n_estimators = 200, random_state=42)
    classifiers[descriptor_name] = Ada

# Training and evaluating each classifier
results = {}
for name, df in descriptor_dfs.items():
    X = df.drop(["label"], axis=1)
    y = df["label"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    Ada = classifiers[name]
    Ada.fit(X_train, y_train)
    y_train_pred = Ada.predict(X_train)
    y_test_pred = Ada.predict(X_test)
    train_metrics = calculate_metrics(y_train, y_train_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)
    results[name] = {'train': train_metrics, 'test': test_metrics}

    # Save metrics to Excel sheets
    with pd.ExcelWriter(f'Ada_{name}.xlsx') as writer:
        train_df = pd.DataFrame(train_metrics, index=[0])
        test_df = pd.DataFrame(test_metrics, index=[0])
        train_df.to_excel(writer, sheet_name='Train', index=False)
        test_df.to_excel(writer, sheet_name='Test', index=False)

# Stacking classifiers
estimator_list = [(name, classifiers[name]) for name in classifiers]
stack_Ada = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression(), cv=5
)
stack_Ada.fit(X_train, y_train)
y_train_pred_stack = stack_Ada.predict(X_train)
y_test_pred_stack = stack_Ada.predict(X_test)
stack_train_metrics = calculate_metrics(y_train, y_train_pred_stack)
stack_test_metrics = calculate_metrics(y_test, y_test_pred_stack)

# Save stacked model metrics to Excel
with pd.ExcelWriter('stacked_Ada_metrics.xlsx') as writer:
    pd.DataFrame(stack_train_metrics, index=[0]).to_excel(writer, sheet_name='Train', index=False)
    pd.DataFrame(stack_test_metrics, index=[0]).to_excel(writer, sheet_name='Test', index=False)

Gradient Boosting

In [16]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix
)

# Helper function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Calculates performance metrics."""
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted')
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted')
    metrics['F1-Score'] = f1_score(y_true, y_pred, average='weighted')
    try:
        metrics['ROC AUC'] = roc_auc_score(y_true, y_pred)
    except ValueError:
        metrics['ROC AUC'] = 0  # Handle non-binary classification
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['Kappa'] = cohen_kappa_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['Sensitivity'] = tp / (tp + fn) if (tp + fn) > 0 else 0
    metrics['Specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0
    return metrics

# Data import and setup
descriptor_dfs = {
    'AP2D': descriptors_dfs_renamed['descriptors_AtomPairs2D'],
    'EST': descriptors_dfs_renamed['descriptors_EState'],
    'MACCS': descriptors_dfs_renamed['descriptors_MACCS'],
    'PUBC': descriptors_dfs_renamed['descriptors_PubChem'],
    'FP4': descriptors_dfs_renamed['descriptors_Substructure'],  # Substructure
    'KR': descriptors_dfs_renamed['descriptors_KlekotaRoth'],
    'KRC': descriptors_dfs_renamed['descriptors_KlekotaRothCount'],
    'FP4C': descriptors_dfs_renamed['descriptors_SubstructureCount'],  # SubstructureCount
    'CDKEXT': descriptors_dfs_renamed['descriptors_CDKextended'],
    'CDKGR': descriptors_dfs_renamed['descriptors_CDKgraphonly']
}

# Initialize classifiers for each descriptor type
classifiers = {}
for descriptor_name in descriptor_dfs.keys():
    GB = GradientBoostingClassifier(learning_rate = 0.1, max_depth = 5, n_estimators = 100, random_state=42)
    classifiers[descriptor_name] = GB

# Training and evaluating each classifier
results = {}
for name, df in descriptor_dfs.items():
    X = df.drop(["label"], axis=1)
    y = df["label"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    GB = classifiers[name]
    GB.fit(X_train, y_train)
    y_train_pred = GB.predict(X_train)
    y_test_pred = GB.predict(X_test)
    train_metrics = calculate_metrics(y_train, y_train_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)
    results[name] = {'train': train_metrics, 'test': test_metrics}

    # Save metrics to Excel sheets
    with pd.ExcelWriter(f'GB_{name}.xlsx') as writer:
        train_df = pd.DataFrame(train_metrics, index=[0])
        test_df = pd.DataFrame(test_metrics, index=[0])
        train_df.to_excel(writer, sheet_name='Train', index=False)
        test_df.to_excel(writer, sheet_name='Test', index=False)

# Stacking classifiers
estimator_list = [(name, classifiers[name]) for name in classifiers]
stack_GB = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression(), cv=5
)
stack_GB.fit(X_train, y_train)
y_train_pred_stack = stack_GB.predict(X_train)
y_test_pred_stack = stack_GB.predict(X_test)
stack_train_metrics = calculate_metrics(y_train, y_train_pred_stack)
stack_test_metrics = calculate_metrics(y_test, y_test_pred_stack)

# Save stacked model metrics to Excel
with pd.ExcelWriter('stacked_GB_metrics.xlsx') as writer:
    pd.DataFrame(stack_train_metrics, index=[0]).to_excel(writer, sheet_name='Train', index=False)
    pd.DataFrame(stack_test_metrics, index=[0]).to_excel(writer, sheet_name='Test', index=False)

Decision Tree

In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix
)

# Helper function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Calculates performance metrics."""
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted')
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted')
    metrics['F1-Score'] = f1_score(y_true, y_pred, average='weighted')
    try:
        metrics['ROC AUC'] = roc_auc_score(y_true, y_pred)
    except ValueError:
        metrics['ROC AUC'] = 0  # Handle non-binary classification
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['Kappa'] = cohen_kappa_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['Sensitivity'] = tp / (tp + fn) if (tp + fn) > 0 else 0
    metrics['Specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0
    return metrics

# Data import and setup
descriptor_dfs = {
    'AP2D': descriptors_dfs_renamed['descriptors_AtomPairs2D'],
    'EST': descriptors_dfs_renamed['descriptors_EState'],
    'MACCS': descriptors_dfs_renamed['descriptors_MACCS'],
    'PUBC': descriptors_dfs_renamed['descriptors_PubChem'],
    'FP4': descriptors_dfs_renamed['descriptors_Substructure'],  # Substructure
    'KR': descriptors_dfs_renamed['descriptors_KlekotaRoth'],
    'KRC': descriptors_dfs_renamed['descriptors_KlekotaRothCount'],
    'FP4C': descriptors_dfs_renamed['descriptors_SubstructureCount'],  # SubstructureCount
    'CDKEXT': descriptors_dfs_renamed['descriptors_CDKextended'],
    'CDKGR': descriptors_dfs_renamed['descriptors_CDKgraphonly']
}

# Initialize classifiers for each descriptor type
classifiers = {}
for descriptor_name in descriptor_dfs.keys():
    DT = DecisionTreeClassifier(max_depth = None, min_samples_split = 5, random_state=42)
    classifiers[descriptor_name] = DT

# Training and evaluating each classifier
results = {}
for name, df in descriptor_dfs.items():
    X = df.drop(["label"], axis=1)
    y = df["label"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    DT = classifiers[name]
    DT.fit(X_train, y_train)
    y_train_pred = DT.predict(X_train)
    y_test_pred = DT.predict(X_test)
    train_metrics = calculate_metrics(y_train, y_train_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)
    results[name] = {'train': train_metrics, 'test': test_metrics}

    # Save metrics to Excel sheets
    with pd.ExcelWriter(f'DT_{name}.xlsx') as writer:
        train_df = pd.DataFrame(train_metrics, index=[0])
        test_df = pd.DataFrame(test_metrics, index=[0])
        train_df.to_excel(writer, sheet_name='Train', index=False)
        test_df.to_excel(writer, sheet_name='Test', index=False)

# Stacking classifiers
estimator_list = [(name, classifiers[name]) for name in classifiers]
stack_DT = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression(), cv=5
)
stack_DT.fit(X_train, y_train)
y_train_pred_stack = stack_DT.predict(X_train)
y_test_pred_stack = stack_DT.predict(X_test)
stack_train_metrics = calculate_metrics(y_train, y_train_pred_stack)
stack_test_metrics = calculate_metrics(y_test, y_test_pred_stack)

# Save stacked model metrics to Excel
with pd.ExcelWriter('stacked_DT_metrics.xlsx') as writer:
    pd.DataFrame(stack_train_metrics, index=[0]).to_excel(writer, sheet_name='Train', index=False)
    pd.DataFrame(stack_test_metrics, index=[0]).to_excel(writer, sheet_name='Test', index=False)

SVM

In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix
)

# Helper function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Calculates performance metrics."""
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted')
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted')
    metrics['F1-Score'] = f1_score(y_true, y_pred, average='weighted')
    try:
        metrics['ROC AUC'] = roc_auc_score(y_true, y_pred)
    except ValueError:
        metrics['ROC AUC'] = 0  # Handle non-binary classification
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['Kappa'] = cohen_kappa_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['Sensitivity'] = tp / (tp + fn) if (tp + fn) > 0 else 0
    metrics['Specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0
    return metrics

# Data import and setup
descriptor_dfs = {
    'AP2D': descriptors_dfs_renamed['descriptors_AtomPairs2D'],
    'EST': descriptors_dfs_renamed['descriptors_EState'],
    'MACCS': descriptors_dfs_renamed['descriptors_MACCS'],
    'PUBC': descriptors_dfs_renamed['descriptors_PubChem'],
    'FP4': descriptors_dfs_renamed['descriptors_Substructure'],  # Substructure
    'KR': descriptors_dfs_renamed['descriptors_KlekotaRoth'],
    'KRC': descriptors_dfs_renamed['descriptors_KlekotaRothCount'],
    'FP4C': descriptors_dfs_renamed['descriptors_SubstructureCount'],  # SubstructureCount
    'CDKEXT': descriptors_dfs_renamed['descriptors_CDKextended'],
    'CDKGR': descriptors_dfs_renamed['descriptors_CDKgraphonly']
}
# Initialize classifiers for each descriptor type
classifiers = {}
for descriptor_name in descriptor_dfs.keys():
    SVM = SVC(C = 10, kernel = 'rbf', random_state=42, probability=True)
    classifiers[descriptor_name] = SVM

# Training and evaluating each classifier
results = {}
for name, df in descriptor_dfs.items():
    X = df.drop(["label"], axis=1)
    y = df["label"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    SVM = classifiers[name]
    SVM.fit(X_train, y_train)
    y_train_pred = SVM.predict(X_train)
    y_test_pred = SVM.predict(X_test)
    train_metrics = calculate_metrics(y_train, y_train_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)
    results[name] = {'train': train_metrics, 'test': test_metrics}

    # Save metrics to Excel sheets
    with pd.ExcelWriter(f'SVM_{name}.xlsx') as writer:
        train_df = pd.DataFrame(train_metrics, index=[0])
        test_df = pd.DataFrame(test_metrics, index=[0])
        train_df.to_excel(writer, sheet_name='Train', index=False)
        test_df.to_excel(writer, sheet_name='Test', index=False)

# Stacking classifiers
estimator_list = [(name, classifiers[name]) for name in classifiers]
stack_SVM = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression(), cv=5
)
stack_SVM.fit(X_train, y_train)
y_train_pred_stack = stack_SVM.predict(X_train)
y_test_pred_stack = stack_SVM.predict(X_test)
stack_train_metrics = calculate_metrics(y_train, y_train_pred_stack)
stack_test_metrics = calculate_metrics(y_test, y_test_pred_stack)

# Save stacked model metrics to Excel
with pd.ExcelWriter('stacked_SVM_metrics.xlsx') as writer:
    pd.DataFrame(stack_train_metrics, index=[0]).to_excel(writer, sheet_name='Train', index=False)
    pd.DataFrame(stack_test_metrics, index=[0]).to_excel(writer, sheet_name='Test', index=False)

KNN

In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix
)

# Helper function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Calculates performance metrics."""
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted')
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted')
    metrics['F1-Score'] = f1_score(y_true, y_pred, average='weighted')
    try:
        metrics['ROC AUC'] = roc_auc_score(y_true, y_pred)
    except ValueError:
        metrics['ROC AUC'] = 0  # Handle non-binary classification
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['Kappa'] = cohen_kappa_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['Sensitivity'] = tp / (tp + fn) if (tp + fn) > 0 else 0
    metrics['Specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0
    return metrics

# Data import and setup
descriptor_dfs = {
    'AP2D': descriptors_dfs_renamed['descriptors_AtomPairs2D'],
    'EST': descriptors_dfs_renamed['descriptors_EState'],
    'MACCS': descriptors_dfs_renamed['descriptors_MACCS'],
    'PUBC': descriptors_dfs_renamed['descriptors_PubChem'],
    'FP4': descriptors_dfs_renamed['descriptors_Substructure'],  # Substructure
    'KR': descriptors_dfs_renamed['descriptors_KlekotaRoth'],
    'KRC': descriptors_dfs_renamed['descriptors_KlekotaRothCount'],
    'FP4C': descriptors_dfs_renamed['descriptors_SubstructureCount'],  # SubstructureCount
    'CDKEXT': descriptors_dfs_renamed['descriptors_CDKextended'],
    'CDKGR': descriptors_dfs_renamed['descriptors_CDKgraphonly']
}

# Initialize classifiers for each descriptor type
classifiers = {}
for descriptor_name in descriptor_dfs.keys():
    KNN =  KNeighborsClassifier(n_neighbors =5)
    classifiers[descriptor_name] = KNN

# Training and evaluating each classifier
results = {}
for name, df in descriptor_dfs.items():
    X = df.drop(["label"], axis=1)
    y = df["label"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    KNN = classifiers[name]
    KNN.fit(X_train, y_train)
    y_train_pred = KNN.predict(X_train)
    y_test_pred = KNN.predict(X_test)
    train_metrics = calculate_metrics(y_train, y_train_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)
    results[name] = {'train': train_metrics, 'test': test_metrics}

    # Save metrics to Excel sheets
    with pd.ExcelWriter(f'KNN_{name}.xlsx') as writer:
        train_df = pd.DataFrame(train_metrics, index=[0])
        test_df = pd.DataFrame(test_metrics, index=[0])
        train_df.to_excel(writer, sheet_name='Train', index=False)
        test_df.to_excel(writer, sheet_name='Test', index=False)

# Stacking classifiers
estimator_list = [(name, classifiers[name]) for name in classifiers]
stack_KNN = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression(), cv=5
)
stack_SVM.fit(X_train, y_train)
y_train_pred_stack = stack_KNN.predict(X_train)
y_test_pred_stack = stack_KNN.predict(X_test)
stack_train_metrics = calculate_metrics(y_train, y_train_pred_stack)
stack_test_metrics = calculate_metrics(y_test, y_test_pred_stack)

# Save stacked model metrics to Excel
with pd.ExcelWriter('stacked_KNN_metrics.xlsx') as writer:
    pd.DataFrame(stack_train_metrics, index=[0]).to_excel(writer, sheet_name='Train', index=False)
    pd.DataFrame(stack_test_metrics, index=[0]).to_excel(writer, sheet_name='Test', index=False)

NotFittedError: This StackingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

MLP

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix
)

# Helper function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Calculates performance metrics."""
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted')
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted')
    metrics['F1-Score'] = f1_score(y_true, y_pred, average='weighted')
    try:
        metrics['ROC AUC'] = roc_auc_score(y_true, y_pred)
    except ValueError:
        metrics['ROC AUC'] = 0  # Handle non-binary classification
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['Kappa'] = cohen_kappa_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['Sensitivity'] = tp / (tp + fn) if (tp + fn) > 0 else 0
    metrics['Specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0
    return metrics

# Data import and setup
descriptor_dfs = {
    'AP2D': descriptors_dfs_renamed['descriptors_AtomPairs2D'],
    'EST': descriptors_dfs_renamed['descriptors_EState'],
    'MACCS': descriptors_dfs_renamed['descriptors_MACCS'],
    'PUBC': descriptors_dfs_renamed['descriptors_PubChem'],
    'FP4': descriptors_dfs_renamed['descriptors_Substructure'],  # Substructure
    'KR': descriptors_dfs_renamed['descriptors_KlekotaRoth'],
    'KRC': descriptors_dfs_renamed['descriptors_KlekotaRothCount'],
    'FP4C': descriptors_dfs_renamed['descriptors_SubstructureCount'],  # SubstructureCount
    'CDKEXT': descriptors_dfs_renamed['descriptors_CDKextended'],
    'CDKGR': descriptors_dfs_renamed['descriptors_CDKgraphonly']
}

# Initialize classifiers for each descriptor type
classifiers = {}
for descriptor_name in descriptor_dfs.keys():
    MLP =  MLPClassifier(alpha = 0.0001, hidden_layer_sizes = (100,), random_state=42)
    classifiers[descriptor_name] = MLP

# Training and evaluating each classifier
results = {}
for name, df in descriptor_dfs.items():
    X = df.drop(["label"], axis=1)
    y = df["label"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    MLP = classifiers[name]
    MLP.fit(X_train, y_train)
    y_train_pred = MLP.predict(X_train)
    y_test_pred = MLP.predict(X_test)
    train_metrics = calculate_metrics(y_train, y_train_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)
    results[name] = {'train': train_metrics, 'test': test_metrics}

    # Save metrics to Excel sheets
    with pd.ExcelWriter(f'MLP_{name}.xlsx') as writer:
        train_df = pd.DataFrame(train_metrics, index=[0])
        test_df = pd.DataFrame(test_metrics, index=[0])
        train_df.to_excel(writer, sheet_name='Train', index=False)
        test_df.to_excel(writer, sheet_name='Test', index=False)

# Stacking classifiers
estimator_list = [(name, classifiers[name]) for name in classifiers]
stack_MLP = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression(), cv=5
)
stack_MLP.fit(X_train, y_train)
y_train_pred_stack = stack_MLP.predict(X_train)
y_test_pred_stack = stack_MLP.predict(X_test)
stack_train_metrics = calculate_metrics(y_train, y_train_pred_stack)
stack_test_metrics = calculate_metrics(y_test, y_test_pred_stack)

# Save stacked model metrics to Excel
with pd.ExcelWriter('stacked_MLP_metrics.xlsx') as writer:
    pd.DataFrame(stack_train_metrics, index=[0]).to_excel(writer, sheet_name='Train', index=False)
    pd.DataFrame(stack_test_metrics, index=[0]).to_excel(writer, sheet_name='Test', index=False)

Naive Bayes

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix
)

# Helper function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Calculates performance metrics."""
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted')
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted')
    metrics['F1-Score'] = f1_score(y_true, y_pred, average='weighted')
    try:
        metrics['ROC AUC'] = roc_auc_score(y_true, y_pred)
    except ValueError:
        metrics['ROC AUC'] = 0  # Handle non-binary classification
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['Kappa'] = cohen_kappa_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['Sensitivity'] = tp / (tp + fn) if (tp + fn) > 0 else 0
    metrics['Specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0
    return metrics

# Data import and setup
descriptor_dfs = {
    'AP2D': descriptors_dfs_renamed['descriptors_AtomPairs2D'],
    'EST': descriptors_dfs_renamed['descriptors_EState'],
    'MACCS': descriptors_dfs_renamed['descriptors_MACCS'],
    'PUBC': descriptors_dfs_renamed['descriptors_PubChem'],
    'FP4': descriptors_dfs_renamed['descriptors_Substructure'],  # Substructure
    'KR': descriptors_dfs_renamed['descriptors_KlekotaRoth'],
    'KRC': descriptors_dfs_renamed['descriptors_KlekotaRothCount'],
    'FP4C': descriptors_dfs_renamed['descriptors_SubstructureCount'],  # SubstructureCount
    'CDKEXT': descriptors_dfs_renamed['descriptors_CDKextended'],
    'CDKGR': descriptors_dfs_renamed['descriptors_CDKgraphonly']
}

# Initialize classifiers for each descriptor type
classifiers = {}
for descriptor_name in descriptor_dfs.keys():
    NB = GaussianNB()
    classifiers[descriptor_name] = MLP

# Training and evaluating each classifier
results = {}
for name, df in descriptor_dfs.items():
    X = df.drop(["label"], axis=1)
    y = df["label"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    NB = classifiers[name]
    NB.fit(X_train, y_train)
    y_train_pred = NB.predict(X_train)
    y_test_pred = NB.predict(X_test)
    train_metrics = calculate_metrics(y_train, y_train_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)
    results[name] = {'train': train_metrics, 'test': test_metrics}

    # Save metrics to Excel sheets
    with pd.ExcelWriter(f'NB_{name}.xlsx') as writer:
        train_df = pd.DataFrame(train_metrics, index=[0])
        test_df = pd.DataFrame(test_metrics, index=[0])
        train_df.to_excel(writer, sheet_name='Train', index=False)
        test_df.to_excel(writer, sheet_name='Test', index=False)

# Stacking classifiers
estimator_list = [(name, classifiers[name]) for name in classifiers]
stack_NB = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression(), cv=5
)
stack_NB.fit(X_train, y_train)
y_train_pred_stack = stack_NB.predict(X_train)
y_test_pred_stack = stack_NB.predict(X_test)
stack_train_metrics = calculate_metrics(y_train, y_train_pred_stack)
stack_test_metrics = calculate_metrics(y_test, y_test_pred_stack)

# Save stacked model metrics to Excel
with pd.ExcelWriter('stacked_NB_metrics.xlsx') as writer:
    pd.DataFrame(stack_train_metrics, index=[0]).to_excel(writer, sheet_name='Train', index=False)
    pd.DataFrame(stack_test_metrics, index=[0]).to_excel(writer, sheet_name='Test', index=False)

Logistic Regression

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix
)

# Helper function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Calculates performance metrics."""
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted')
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted')
    metrics['F1-Score'] = f1_score(y_true, y_pred, average='weighted')
    try:
        metrics['ROC AUC'] = roc_auc_score(y_true, y_pred)
    except ValueError:
        metrics['ROC AUC'] = 0  # Handle non-binary classification
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['Kappa'] = cohen_kappa_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['Sensitivity'] = tp / (tp + fn) if (tp + fn) > 0 else 0
    metrics['Specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0
    return metrics

# Data import and setup
descriptor_dfs = {
    'AP2D': descriptors_dfs_renamed['descriptors_AtomPairs2D'],
    'EST': descriptors_dfs_renamed['descriptors_EState'],
    'MACCS': descriptors_dfs_renamed['descriptors_MACCS'],
    'PUBC': descriptors_dfs_renamed['descriptors_PubChem'],
    'FP4': descriptors_dfs_renamed['descriptors_Substructure'],  # Substructure
    'KR': descriptors_dfs_renamed['descriptors_KlekotaRoth'],
    'KRC': descriptors_dfs_renamed['descriptors_KlekotaRothCount'],
    'FP4C': descriptors_dfs_renamed['descriptors_SubstructureCount'],  # SubstructureCount
    'CDKEXT': descriptors_dfs_renamed['descriptors_CDKextended'],
    'CDKGR': descriptors_dfs_renamed['descriptors_CDKgraphonly']
}

# Initialize classifiers for each descriptor type
classifiers = {}
for descriptor_name in descriptor_dfs.keys():
    LR = LogisticRegression(C = 1, penalty = 'l2', random_state=42, max_iter=1000)
    classifiers[descriptor_name] = LR

# Training and evaluating each classifier
results = {}
for name, df in descriptor_dfs.items():
    X = df.drop(["label"], axis=1)
    y = df["label"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    LR = classifiers[name]
    LR.fit(X_train, y_train)
    y_train_pred = LR.predict(X_train)
    y_test_pred = LR.predict(X_test)
    train_metrics = calculate_metrics(y_train, y_train_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)
    results[name] = {'train': train_metrics, 'test': test_metrics}

    # Save metrics to Excel sheets
    with pd.ExcelWriter(f'LR_{name}.xlsx') as writer:
        train_df = pd.DataFrame(train_metrics, index=[0])
        test_df = pd.DataFrame(test_metrics, index=[0])
        train_df.to_excel(writer, sheet_name='Train', index=False)
        test_df.to_excel(writer, sheet_name='Test', index=False)

# Stacking classifiers
estimator_list = [(name, classifiers[name]) for name in classifiers]
stack_LR = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression(), cv=5
)
stack_LR.fit(X_train, y_train)
y_train_pred_stack = stack_LR.predict(X_train)
y_test_pred_stack = stack_LR.predict(X_test)
stack_train_metrics = calculate_metrics(y_train, y_train_pred_stack)
stack_test_metrics = calculate_metrics(y_test, y_test_pred_stack)

# Save stacked model metrics to Excel
with pd.ExcelWriter('stacked_LR_metrics.xlsx') as writer:
    pd.DataFrame(stack_train_metrics, index=[0]).to_excel(writer, sheet_name='Train', index=False)
    pd.DataFrame(stack_test_metrics, index=[0]).to_excel(writer, sheet_name='Test', index=False)

Bagging

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix
)

# Helper function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Calculates performance metrics."""
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted')
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted')
    metrics['F1-Score'] = f1_score(y_true, y_pred, average='weighted')
    try:
        metrics['ROC AUC'] = roc_auc_score(y_true, y_pred)
    except ValueError:
        metrics['ROC AUC'] = 0  # Handle non-binary classification
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['Kappa'] = cohen_kappa_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['Sensitivity'] = tp / (tp + fn) if (tp + fn) > 0 else 0
    metrics['Specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0
    return metrics

# Data import and setup
descriptor_dfs = {
    'AP2D': descriptors_dfs_renamed['descriptors_AtomPairs2D'],
    'EST': descriptors_dfs_renamed['descriptors_EState'],
    'MACCS': descriptors_dfs_renamed['descriptors_MACCS'],
    'PUBC': descriptors_dfs_renamed['descriptors_PubChem'],
    'FP4': descriptors_dfs_renamed['descriptors_Substructure'],  # Substructure
    'KR': descriptors_dfs_renamed['descriptors_KlekotaRoth'],
    'KRC': descriptors_dfs_renamed['descriptors_KlekotaRothCount'],
    'FP4C': descriptors_dfs_renamed['descriptors_SubstructureCount'],  # SubstructureCount
    'CDKEXT': descriptors_dfs_renamed['descriptors_CDKextended'],
    'CDKGR': descriptors_dfs_renamed['descriptors_CDKgraphonly']
}

# Initialize classifiers for each descriptor type
classifiers = {}
for descriptor_name in descriptor_dfs.keys():
    BG =BaggingClassifier(random_state=42, n_estimators = 50)
    classifiers[descriptor_name] = BG

# Training and evaluating each classifier
results = {}
for name, df in descriptor_dfs.items():
    X = df.drop(["label"], axis=1)
    y = df["label"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    BG = classifiers[name]
    BG.fit(X_train, y_train)
    y_train_pred = BG.predict(X_train)
    y_test_pred = BG.predict(X_test)
    train_metrics = calculate_metrics(y_train, y_train_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)
    results[name] = {'train': train_metrics, 'test': test_metrics}

    # Save metrics to Excel sheets
    with pd.ExcelWriter(f'LR_{name}.xlsx') as writer:
        train_df = pd.DataFrame(train_metrics, index=[0])
        test_df = pd.DataFrame(test_metrics, index=[0])
        train_df.to_excel(writer, sheet_name='Train', index=False)
        test_df.to_excel(writer, sheet_name='Test', index=False)

# Stacking classifiers
estimator_list = [(name, classifiers[name]) for name in classifiers]
stack_BG = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression(), cv=5
)
stack_BG.fit(X_train, y_train)
y_train_pred_stack = stack_BG.predict(X_train)
y_test_pred_stack = stack_BG.predict(X_test)
stack_train_metrics = calculate_metrics(y_train, y_train_pred_stack)
stack_test_metrics = calculate_metrics(y_test, y_test_pred_stack)

# Save stacked model metrics to Excel
with pd.ExcelWriter('stacked_BG_metrics.xlsx') as writer:
    pd.DataFrame(stack_train_metrics, index=[0]).to_excel(writer, sheet_name='Train', index=False)
    pd.DataFrame(stack_test_metrics, index=[0]).to_excel(writer, sheet_name='Test', index=False)

In [None]:
# Define estimators
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

estimator_list = [
    ('stack_BG',stack_BG),
    ('stack_LG',stack_LG),
    ('stack_NB',stack_NB),
    ('stack_MLP',stack_MLP),
    ('stack_Rf',stack_Rf),
    ('stack_SVM',stack_SVM),
    ('stack_GB',stack_GB),
    ('stack_KNN',stack_KNN),
    ('stack_DT',stack_DT),
    ('stack_Ada',stack_Ada)
    ]

# Build stack model
stack = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression()
)

# Train stacked model
stack.fit(X_train, y_train)

# Make predictions
y_train_pred_stack = stack.predict(X_train)
y_test_pred_stack = stack.predict(X_test)
# Create DataFrames for training and test metrics
train_metrics_stack = pd.DataFrame(columns=['Metric', 'Value'])
test_metrics_stack = pd.DataFrame(columns=['Metric', 'Value'])


def calculate_metrics(y_true, y_pred):
  """Calculates performance metrics."""
  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred, average='weighted')
  recall = recall_score(y_true, y_pred, average='weighted')
  f1 = f1_score(y_true, y_pred, average='weighted')
  try:
      roc_auc = roc_auc_score(y_true, y_pred)
  except ValueError:
      roc_auc = 0
  mcc = matthews_corrcoef(y_true, y_pred)
  balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
  kappa = cohen_kappa_score(y_true, y_pred)
  tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
  sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
  specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
  return {
      'Accuracy': accuracy,
      'Precision': precision,
      'Recall': recall,
      'F1-Score': f1,
      'ROC AUC': roc_auc,
      'MCC': mcc,
      'Balanced Accuracy': balanced_accuracy,
      'Kappa': kappa,
      'Sensitivity': sensitivity,
      'Specificity': specificity
  }

# Calculate metrics for training set
train_metrics_dict_stack = calculate_metrics(y_train, y_train_pred_stack)
for metric, value in train_metrics_dict.items():
    train_metrics_stack = pd.concat([train_metrics_stack, pd.DataFrame({'Metric': [metric], 'Value': [value]})], ignore_index=True)


# Calculate metrics for test set
test_metrics_dict = calculate_metrics(y_test, y_test_pred_stack)
for metric, value in test_metrics_dict.items():
  test_metrics_stack = pd.concat([test_metrics_stack, pd.DataFrame({'Metric': [metric], 'Value': [value]})], ignore_index=True)


# Save metrics to Excel sheets
with pd.ExcelWriter('stack.xlsx') as writer:
    train_metrics.to_excel(writer, sheet_name='Train', index=False)
    test_metrics.to_excel(writer, sheet_name='Test', index=False)