In [1]:
# Data process
!pip install rdkit-pypi
import pandas as pd
import numpy as np
from sklearn.utils import resample
import os

import pandas as pd
import numpy as np
from sklearn.utils import resample
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize

def process_chembl_data(filepath, upsample=True, verbose=True):
    """Clean a ChEMBL activity export and build a labelled, standardized dataset.

    Pipeline:
      1. Read the semicolon-separated CSV at ``filepath``.
      2. Keep the ID, SMILES, type, relation, value and units columns and drop
         rows with any missing value among them.
      3. Keep rows whose 'Standard Units' contains 'nM'.
      4. Keep rows whose 'Standard Type' contains 'IC50' but is not
         'pIC50' / 'Log IC50'.
      5. Keep, per 'Molecule ChEMBL ID', the record with the lowest
         'Standard Value'.
      6. Label 'Standard Value' <= 100 as active (label=1) and >= 1000 as
         inactive (label=0); values in between are discarded.
      7. Optionally upsample the smaller class (with replacement) to the size
         of the larger one.
      8. Write 'aromatase_filtered.csv', add a 'Standardized_Smiles' column and
         write 'aromatase_standardized.csv' (both in the working directory).

    Parameters
    ----------
    filepath : str or path-like
        Location of the semicolon-separated ChEMBL export.
    upsample : bool, default True
        Balance the two classes by resampling the smaller class with
        replacement (random_state=123). NOTE: the upsampled rows are exact
        duplicates; if the returned data is later split into train/test sets,
        copies of one molecule can land on both sides of the split and inflate
        test metrics. Pass ``upsample=False`` to balance after splitting.
    verbose : bool, default True
        Print the exploratory summaries (heads, shapes, value counts).

    Returns
    -------
    pandas.DataFrame
        The labelled rows (actives first, then inactives) with the added
        'label' and 'Standardized_Smiles' columns. 'Standardized_Smiles' is
        None for SMILES that RDKit cannot parse.
    """

    def report(*items):
        # Diagnostics only; never affects the returned data.
        if verbose:
            for item in items:
                print(item)

    def standardize_smiles(smiles):
        """Return a stereo-free, charge-parent SMILES, or None if unparsable."""
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        Chem.rdmolops.RemoveStereochemistry(mol)
        mol = rdMolStandardize.ChargeParent(mol)
        return Chem.MolToSmiles(mol, isomericSmiles=False)

    # Read data
    raw = pd.read_csv(filepath, sep=";")
    report(raw.head(), raw.shape, raw.columns)

    # Select specific columns. Sorting/filtering below always builds new
    # frames instead of sorting derived frames in place, which avoids
    # SettingWithCopyWarning.
    wanted_columns = ['Molecule ChEMBL ID', 'Smiles', 'Standard Type',
                      'Standard Relation', 'Standard Value', 'Standard Units']
    activities = raw[wanted_columns]
    report(
        activities.head(),
        activities.shape,
        activities['Standard Units'].value_counts(),
        activities['Standard Relation'].value_counts(),
        activities['Standard Type'].value_counts(),
    )

    # Sorting and cleaning data
    activities = activities.sort_values('Standard Units', ascending=True).dropna()

    # Keep nanomolar measurements only
    nanomolar = activities[activities['Standard Units'].str.contains('nM')]
    report(
        nanomolar['Standard Type'].value_counts(),
        nanomolar['Molecule ChEMBL ID'].value_counts(),
    )

    if verbose:
        # Exploratory spot check on IDs containing 'CHEMBL488' (diagnostic only).
        spot_check = nanomolar[nanomolar['Molecule ChEMBL ID'].str.contains('CHEMBL488')]
        report(
            spot_check.head(20),
            spot_check.tail(),
            spot_check['Standard Value'].min(),
            spot_check.sort_values('Standard Value', ascending=True),
            nanomolar.head(),
        )

    # Restrict to IC50 records BEFORE de-duplicating. De-duplicating first
    # would keep each molecule's lowest-valued record of any type, and a
    # molecule whose lowest record is not an IC50 would then vanish entirely
    # even though it has usable IC50 data.
    standard_type = nanomolar['Standard Type']
    is_ic50 = (standard_type.str.contains('IC50')
               & ~standard_type.str.contains('pIC50|Log IC50'))
    ic50 = (
        nanomolar[is_ic50]
        .sort_values('Standard Value', ascending=True)
        .drop_duplicates(subset=['Molecule ChEMBL ID'], keep='first')
    )
    report(ic50.shape, ic50['Standard Type'].value_counts())

    # Classifying compounds
    active = ic50.loc[ic50['Standard Value'] <= 100]
    inactive = ic50.loc[ic50['Standard Value'] >= 1000]

    # Handling data imbalance: upsample whichever class is smaller. Skipped
    # when a class is empty (resampling an empty frame raises) or when the
    # classes are already the same size.
    if upsample and len(active) > 0 and len(inactive) > 0:
        if len(active) < len(inactive):
            active = resample(active, replace=True,
                              n_samples=len(inactive), random_state=123)
        elif len(inactive) < len(active):
            inactive = resample(inactive, replace=True,
                                n_samples=len(active), random_state=123)
    report(pd.concat([inactive, active])['Standard Value'].value_counts())

    # Label, combine (actives first) and defensively drop any 'ug.mL-1' units.
    # regex=False makes '.' a literal dot rather than a wildcard.
    combined = pd.concat([active.assign(label=1), inactive.assign(label=0)])
    combined = combined[~combined['Standard Units'].str.contains('ug.mL-1', regex=False)]
    combined.to_csv('aromatase_filtered.csv', index=False)

    # Standardize each distinct SMILES once; upsampling repeats rows, so a
    # lookup table avoids re-running RDKit on duplicates.
    standardized_by_smiles = {
        smiles: standardize_smiles(smiles) for smiles in combined['Smiles'].unique()
    }
    combined = combined.assign(
        Standardized_Smiles=combined['Smiles'].map(standardized_by_smiles)
    )
    combined.to_csv('aromatase_standardized.csv', index=False)
    return combined

# Run the cleaning pipeline on the local ChEMBL export and keep the result
# for the modelling steps that follow.
df_processed = process_chembl_data(filepath='data.csv')


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from rdkit import RDLogger
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict # Import cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier,
                              StackingClassifier, BaggingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from rdkit import RDLogger
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict # Import cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, confusion_matrix
import pandas as pd
import numpy as np
from sklearn.utils import resample
from rdkit import Chem
# Import AllChem explicitly
from rdkit.Chem import AllChem
from rdkit.Chem.MolStandardize import rdMolStandardize
# Build a new dataset from aromatase_standardized.csv, keeping only the columns
# Molecule ChEMBL ID, Standardized_Smiles and label.

import pandas as pd
# Load only the columns needed for featurization. Rows whose SMILES could not
# be standardized have a missing Standardized_Smiles value; drop them so no
# empty SMILES field is written to the .smi file.
new_df = pd.read_csv(
    'aromatase_standardized.csv',
    usecols=['Molecule ChEMBL ID', 'Standardized_Smiles', 'label'],
).dropna(subset=['Standardized_Smiles'])

# Two tab-separated columns, no header: SMILES, then the class label. The
# label comes back from the supplier as each molecule's '_Name' property.
new_df[['Standardized_Smiles', 'label']].to_csv("aromatase.smi", index=None, header=None, sep='\t')
t1 = Chem.SmilesMolSupplier('aromatase.smi', delimiter='\t', titleLine=False)

# Read the supplier once and derive both features and labels from the same
# list, so row i of `train` is guaranteed to match row i of `labels`.
mols = [mol for mol in t1 if mol]

# Morgan fingerprints, radius 2, folded to 2048 bits.
fp = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) for mol in mols]
train = np.asarray(fp, dtype=np.int32)

ids = [mol.GetProp('_Name') for mol in mols]
labels = np.asarray(ids, dtype=int).reshape(-1, 1)  # Use the built-in int instead of np.int

# Persist features with the label as the last column.
dataset = np.hstack([train, labels])
np.save('dataset_feature', dataset)

# Hold out a quarter of the samples for testing; the fixed seed keeps the
# shuffled partition reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    labels,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)
x_train.shape, x_test.shape, y_train.shape, y_test.shape
# Baseline classifiers keyed by display name. Insertion order is kept, so the
# models are trained and reported in the order listed here. Every estimator
# that accepts a seed is pinned to 42.
models = dict([
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    # probability=True enables predict_proba, which the ROC AUC needs
    ("SVM", SVC(random_state=42, probability=True)),
    ("KNN", KNeighborsClassifier()),
    ("MLP", MLPClassifier(random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')),
    ("Naive Bayes", GaussianNB()),
    ("Logistic Regression", LogisticRegression(random_state=42, max_iter=1000)),
    ("Bagging", BaggingClassifier(random_state=42)),
])

def calculate_metrics(y_true, y_pred, y_prob):
    """Compute the binary-classification metrics reported in this notebook.

    Parameters
    ----------
    y_true : array-like
        True 0/1 labels; a column vector is flattened to 1-D.
    y_pred : array-like
        Predicted 0/1 labels.
    y_prob : array-like
        Predicted probability of the positive class (used for ROC AUC).

    Returns
    -------
    dict
        Metric name -> value, with the same keys for every model so results
        can be tabulated together.
    """
    y_true = np.asarray(y_true).ravel()
    # labels=[0, 1] pins a 2x2 matrix, so the four-way unpack below stays
    # valid even if only one class is present in the evaluated split.
    tn, false_pos, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
        'ROC AUC': roc_auc_score(y_true, y_prob),
        'MCC': matthews_corrcoef(y_true, y_pred),
        'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred),
        'Kappa': cohen_kappa_score(y_true, y_pred),
        # Sensibility and Specificity from the confusion matrix
        'Sensibility': tp / (tp + fn) if (tp + fn) != 0 else 0,
        'Specificity': tn / (tn + false_pos) if (tn + false_pos) != 0 else 0,
    }


# The labels were reshaped to a column vector upstream; estimators expect a
# 1-D target, so flatten once and use the flat view everywhere below.
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)

# Fit each baseline model on the training split and score it on the test split.
results_baseline_models = {}
for name, model in models.items():
    model.fit(x_train, y_train_flat)
    y_pred = model.predict(x_test)
    y_prob = model.predict_proba(x_test)[:, 1]  # Probability for positive class

    results_baseline_models[name] = calculate_metrics(y_test_flat, y_pred, y_prob)

    print(f"\n{name}:")
    for metric, value in results_baseline_models[name].items():
        print(f"{metric}: {value}")
    print(classification_report(y_test_flat, y_pred))

# Convert results to DataFrame and save
results_df = pd.DataFrame.from_dict(results_baseline_models, orient='index')
results_df.to_csv('baseline_models_results.csv', index=True)

# Define the base models for stacking (StackingClassifier fits its own clones,
# so reusing the already-fitted baseline instances is safe).
base_models = list(models.items())

# Define final estimator for stacking
final_estimator = LogisticRegression(random_state=42)

# Create the stacking classifier
stacking_classifier = StackingClassifier(
    estimators=base_models,
    final_estimator=final_estimator,
    cv=5,  # 5-fold stratified cross-validation
    stack_method='auto',
    n_jobs=-1
)

# Train and evaluate the stacked model
stacking_classifier.fit(x_train, y_train_flat)
y_train_pred = stacking_classifier.predict(x_train)
y_train_proba = stacking_classifier.predict_proba(x_train)[:, 1]
y_test_pred = stacking_classifier.predict(x_test)
y_test_proba = stacking_classifier.predict_proba(x_test)[:, 1]

stack_train_metrics = calculate_metrics(y_train_flat, y_train_pred, y_train_proba)
stack_test_metrics = calculate_metrics(y_test_flat, y_test_pred, y_test_proba)

print("\nStacked Model - Training vs. Testing Performance Comparison:")
for metric in stack_train_metrics.keys():
    print(f"{metric} - Train: {stack_train_metrics[metric]:.4f}, Test: {stack_test_metrics[metric]:.4f}")

# Save the stacked model and metrics
joblib.dump(stacking_classifier, 'stacking_classifier.pkl')
stacked_results_df = pd.DataFrame([stack_train_metrics, stack_test_metrics], index=['Train', 'Test'])
stacked_results_df.to_csv('stacked_model_performance.csv')

Defaulting to user installation because normal site-packages is not writeable


ERROR: Could not find a version that satisfies the requirement rdkit-pypi (from versions: none)
ERROR: No matching distribution found for rdkit-pypi


ModuleNotFoundError: No module named 'rdkit'

In [2]:
conda install -c conda-forge rdkit

error: bad escape \P at position 28