<a href="https://colab.research.google.com/github/Yanbelo/Aromatase/blob/main/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# prompt: connect to google drive

from google.colab import drive
drive.mount('/content/drive', force_remount=True) # add the force_remount parameter

In [None]:
ls

In [None]:
import pandas as pd
import numpy as np

In [None]:
x = pd.read_csv('data.csv', sep = ";")

In [None]:
x.head()

In [None]:
x.columns

In [None]:
x1 =x[['Molecule ChEMBL ID','Smiles','Standard Type', 'Standard Relation', 'Standard Value',
       'Standard Units']]

In [None]:
x1.head()

In [None]:
x1.shape

In [None]:
x1['Standard Units'].value_counts()

In [None]:
x1['Standard Relation'].value_counts()

In [None]:
x1['Standard Type'].value_counts()

In [None]:
x1.sort_values('Standard Units',ascending= True)

In [None]:
x1=x1.dropna()

In [None]:
df =x1[x1['Standard Units'].str.contains('nM')]
df

In [None]:
df['Standard Type'].value_counts()

In [None]:
df['Molecule ChEMBL ID'].value_counts()

In [None]:
df3=df[df['Molecule ChEMBL ID'].str.contains('CHEMBL488')]
df3.head(20)

In [None]:
df3.tail()

In [None]:
df3['Standard Value'].min()

In [None]:
df3=df3.sort_values('Standard Value',ascending= True)
df3

In [None]:
df.head()

In [None]:
df=df.sort_values('Standard Value',ascending= True)
df

In [None]:
# Restrict to plain IC50 rows BEFORE de-duplicating. De-duplicating first keeps each
# molecule's lowest-valued row of *any* assay type; if that row is not an IC50, the
# molecule is lost entirely when the IC50 filter runs afterwards, even though it has
# IC50 measurements. The masks below mirror the notebook's own later filters
# (contains 'IC50', excluding 'pIC50' and 'Log IC50'), which become no-ops.
is_ic50 = df['Standard Type'].str.contains('IC50')
is_log_scaled = df['Standard Type'].str.contains('pIC50|Log IC50')
df = df[is_ic50 & ~is_log_scaled]

# df was sorted ascending by 'Standard Value' in the previous cell, so keep='first'
# retains the lowest IC50 reported for each molecule.
df = df.drop_duplicates(subset=['Molecule ChEMBL ID'], keep='first')
df.shape

In [None]:
df =df[df['Standard Type'].str.contains('IC50')]
df

In [None]:
df['Standard Type'].value_counts()

In [None]:
# prompt: remove pIC50 and Log IC50 in the standard type

df = df[~df['Standard Type'].str.contains('pIC50')]
df = df[~df['Standard Type'].str.contains('Log IC50')]
df['Standard Type'].value_counts()


In [None]:
df

In [None]:
active = df.loc[df['Standard Value']<= 100]
active

In [None]:
active.shape


In [None]:
inactive = df.loc[df['Standard Value']>= 1000]
inactive

In [None]:
# prompt: deal with the data imbalance

import pandas as pd
from sklearn.utils import resample

# Combine active and inactive datasets
combined_df = pd.concat([active, inactive])

# Split by activity class using the same thresholds that defined `active` / `inactive`
inactive_rows = combined_df[combined_df['Standard Value'] >= 1000]
active_rows = combined_df[combined_df['Standard Value'] <= 100]

# Decide majority/minority from the actual class sizes instead of assuming that the
# inactive class is always the larger one.
if len(inactive_rows) >= len(active_rows):
    df_majority, df_minority = inactive_rows, active_rows
else:
    df_majority, df_minority = active_rows, inactive_rows

# Upsample minority class
# NOTE(review): this resamples with replacement BEFORE any train/test split, so copies
# of the same molecule can end up on both sides of later splits and inflate the
# reported metrics. Consider splitting first and upsampling only the training part.
df_minority_upsampled = resample(df_minority,
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts (counting raw 'Standard Value's would not show class balance)
print((df_upsampled['Standard Value'] <= 100)
      .map({True: 'active', False: 'inactive'})
      .value_counts())


In [None]:
df_upsampled

In [None]:
# regex=False: match the unit string literally. As a regex, each '.' in 'ug.mL-1'
# would match any character rather than a literal dot.
df = df_upsampled[~df_upsampled['Standard Units'].str.contains('ug.mL-1', regex=False)]
df

In [None]:
active = df.loc[df['Standard Value']<= 100]
active

In [None]:
inactive = df.loc[df['Standard Value']>= 1000]
inactive

In [None]:
active['Standard Units'].value_counts()

In [None]:
inactive['Standard Units'].value_counts()

In [None]:
active = active.assign(label =1)
inactive = inactive.assign(label =0)

In [None]:
active

In [None]:
inactive

In [None]:
combined = pd.concat([active, inactive], axis = 0)
combined

In [None]:
combined.to_csv('aromatase_filtered.csv', index=False)

In [None]:
ls

In [None]:
combined[['Smiles','label']].to_csv("aromatase.smi", index=None, header =None, sep='\t')


In [None]:
ls

In [None]:
! pip install rdkit-pypi

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

In [None]:
Draw.MolsToGridImage([Chem.MolFromSmiles(smi) for smi in combined['Smiles'].iloc[0:10]])

In [None]:
# prompt: perform some analysis in the chemical structure

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import Lipinski

# Function to calculate molecular properties
def calculate_properties(smiles):
  """Return (molecular weight, LogP, H-bond donors, H-bond acceptors) for a SMILES.

  The SMILES is parsed with RDKit; if parsing fails (``MolFromSmiles`` returns
  None) a tuple of four ``None`` values is returned instead.
  """
  parsed = Chem.MolFromSmiles(smiles)
  # Guard clause: unparsable SMILES yields an all-None row.
  if parsed is None:
    return None, None, None, None
  return (
      Descriptors.MolWt(parsed),
      Descriptors.MolLogP(parsed),
      Lipinski.NumHDonors(parsed),
      Lipinski.NumHAcceptors(parsed),
  )

# Apply the function to the 'Smiles' column
combined['Molecular Weight'], combined['LogP'], combined['NumHDonors'], combined['NumHAcceptors'] = zip(*combined['Smiles'].apply(calculate_properties))

# Print the first few rows with the calculated properties
print(combined[['Smiles', 'Molecular Weight', 'LogP', 'NumHDonors', 'NumHAcceptors']].head())

# You can perform further analysis based on these properties,
# such as calculating the distribution of molecular weights,
# identifying compounds with specific properties, or
# creating visualizations to understand the data better.


In [None]:
# prompt: perform further chemical structure analysis

from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors

# Function to calculate additional molecular properties
def calculate_more_properties(smiles):
  """Return (TPSA, rotatable bonds, ring count, atom count) for a SMILES.

  ``GetNumAtoms()`` is called without arguments. If RDKit cannot parse the SMILES,
  a tuple of four ``None`` values is returned.
  """
  parsed = Chem.MolFromSmiles(smiles)
  # Guard clause: unparsable SMILES yields an all-None row.
  if parsed is None:
    return None, None, None, None
  return (
      rdMolDescriptors.CalcTPSA(parsed),
      Lipinski.NumRotatableBonds(parsed),
      Lipinski.RingCount(parsed),
      parsed.GetNumAtoms(),
  )


# Apply the function to the 'Smiles' column
combined['TPSA'], combined['NumRotatableBonds'], combined['NumRings'], combined['NumAtoms'] = zip(*combined['Smiles'].apply(calculate_more_properties))


# Print the first few rows with the calculated properties
print(combined[['Smiles', 'TPSA', 'NumRotatableBonds', 'NumRings', 'NumAtoms']].head())


# Further analysis, such as:
# - Correlation analysis between properties and activity (label)
# - Distribution analysis of different properties for active and inactive compounds
# - Identifying compounds with specific property ranges
# - Visualizing the properties in scatter plots or histograms
# - Clustering compounds based on their properties
# - Feature selection for building a predictive model



In [None]:
# prompt:  Further analysis, such as:
# # - Correlation analysis between properties and activity (label)
# # - Distribution analysis of different properties for active and inactive compounds
# # - Identifying compounds with specific property ranges
# # - Visualizing the properties in scatter plots or histograms
# # - Clustering compounds based on their properties
# # - Feature selection for building a predictive model

import seaborn as sns
import matplotlib.pyplot as plt

# Correlation analysis between properties and activity (label)
correlation_matrix = combined[['Molecular Weight', 'LogP', 'NumHDonors', 'NumHAcceptors', 'TPSA', 'NumRotatableBonds', 'NumRings', 'NumAtoms', 'label']].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Properties and Activity')
plt.show()

# Distribution analysis of different properties for active and inactive compounds
plt.figure(figsize=(12, 6))
sns.histplot(data=combined, x='Molecular Weight', hue='label', kde=True)
plt.title('Distribution of Molecular Weight for Active and Inactive Compounds')
plt.show()

plt.figure(figsize=(12, 6))
sns.histplot(data=combined, x='LogP', hue='label', kde=True)
plt.title('Distribution of LogP for Active and Inactive Compounds')
plt.show()

# Identifying compounds with specific property ranges
# For example, find compounds with LogP between 2 and 5
specific_compounds = combined[(combined['LogP'] >= 2) & (combined['LogP'] <= 5)]
print(specific_compounds)

# Visualizing the properties in scatter plots or histograms
plt.figure(figsize=(8, 6))
sns.scatterplot(data=combined, x='Molecular Weight', y='LogP', hue='label')
plt.title('Scatter Plot of Molecular Weight vs LogP')
plt.show()

# Clustering compounds based on their properties
# You can use KMeans clustering or hierarchical clustering
from sklearn.cluster import KMeans
X = combined[['Molecular Weight', 'LogP', 'NumHDonors', 'NumHAcceptors', 'TPSA', 'NumRotatableBonds', 'NumRings', 'NumAtoms']]
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
combined['cluster'] = kmeans.labels_
print(combined.groupby('cluster')['label'].mean()) # Check if clusters are related to activity

# Feature selection for building a predictive model
from sklearn.feature_selection import SelectKBest, f_classif
X = combined[['Molecular Weight', 'LogP', 'NumHDonors', 'NumHAcceptors', 'TPSA', 'NumRotatableBonds', 'NumRings', 'NumAtoms']]
y = combined['label']
selector = SelectKBest(f_classif, k=5) # Select top 5 features
X_new = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]
print(selected_features)


In [None]:
# prompt: more deep chemical analysis

from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import rdmolops
from rdkit.Chem import Lipinski

# Function to calculate more detailed chemical properties
def calculate_detailed_properties(smiles):
    """Compute ring, element-count and miscellaneous descriptors for one SMILES.

    Returns a 10-tuple, in this order:
    (aromatic rings, aliphatic rings, saturated rings, heterocycles,
     halogen atoms, nitrogen atoms, oxygen atoms, sulfur atoms,
     valence electrons, alkyl-halide fragment count).
    If RDKit cannot parse the SMILES, ten ``None`` values are returned.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return (None,) * 10

    # Ring-system descriptors
    num_aromatic_rings = rdMolDescriptors.CalcNumAromaticRings(mol)
    num_aliphatic_rings = rdMolDescriptors.CalcNumAliphaticRings(mol)
    num_saturated_rings = rdMolDescriptors.CalcNumSaturatedRings(mol)
    num_heterocycles = rdMolDescriptors.CalcNumHeterocycles(mol)

    # Element counts, taken from a single pass over the atoms
    atomic_numbers = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    # F, Cl, Br, I, At. Fluorine (9) was previously left out, so fluorinated
    # compounds were counted as halogen-free.
    halogen_numbers = {9, 17, 35, 53, 85}
    num_halogens = sum(number in halogen_numbers for number in atomic_numbers)
    num_nitrogens = atomic_numbers.count(7)
    num_oxygens = atomic_numbers.count(8)
    num_sulfurs = atomic_numbers.count(16)

    # Other descriptors
    num_valence_electrons = Descriptors.NumValenceElectrons(mol)
    # Use the actual alkyl-halide fragment counter. This column was previously filled
    # by CalcNumLipinskiHBA, i.e. an H-bond-acceptor count under the wrong name.
    fr_alkyl_halide = Descriptors.fr_alkyl_halide(mol)

    return (num_aromatic_rings, num_aliphatic_rings, num_saturated_rings, num_heterocycles,
            num_halogens, num_nitrogens, num_oxygens, num_sulfurs,
            num_valence_electrons, fr_alkyl_halide)


# Apply the function to the 'Smiles' column
(combined['NumAromaticRings'], combined['NumAliphaticRings'], combined['NumSaturatedRings'], combined['NumHeterocycles'],
 combined['NumHalogens'], combined['NumNitrogens'], combined['NumOxygens'], combined['NumSulfurs'],
 combined['NumValenceElectrons'], combined['fr_alkyl_halide']) = zip(*combined['Smiles'].apply(calculate_detailed_properties))

# Print the first few rows with the calculated properties
print(combined[['Smiles', 'NumAromaticRings', 'NumAliphaticRings', 'NumSaturatedRings', 'NumHeterocycles',
                'NumHalogens', 'NumNitrogens', 'NumOxygens', 'NumSulfurs', 'NumValenceElectrons', 'fr_alkyl_halide']].head())

# Further analysis using these properties:
# - Explore relationships between ring systems and activity
# - Identify compounds with specific functional groups
# - Analyze the influence of different atom types on activity
# - Use these properties in machine learning models for prediction

In [None]:
# prompt: store the the above output in csv file

combined.to_csv('aromatase_with_properties.csv', index=False)


In [None]:
# prompt: # Further analysis using these properties:
# # - Explore relationships between ring systems and activity
# # - Identify compounds with specific functional groups
# # - Analyze the influence of different atom types on activity
# # - Use these properties in machine learning models for prediction

import matplotlib.pyplot as plt
# Explore relationships between ring systems and activity
plt.figure(figsize=(8, 6))
sns.boxplot(x='label', y='NumAromaticRings', data=combined)
plt.title('Relationship between Number of Aromatic Rings and Activity')
plt.show()

plt.figure(figsize=(8, 6))
sns.boxplot(x='label', y='NumHeterocycles', data=combined)
plt.title('Relationship between Number of Heterocycles and Activity')
plt.show()

# Identify compounds with specific functional groups
halogen_compounds = combined[combined['NumHalogens'] > 0]
print("Compounds with Halogens:", halogen_compounds.shape[0])

nitrogen_compounds = combined[combined['NumNitrogens'] > 0]
print("Compounds with Nitrogens:", nitrogen_compounds.shape[0])

# Analyze the influence of different atom types on activity
# You can use statistical tests or machine learning models to analyze the relationship between atom counts and activity
# For example, you can use a t-test to compare the average number of nitrogens in active and inactive compounds
from scipy.stats import ttest_ind

active_nitrogens = combined[combined['label'] == 1]['NumNitrogens']
inactive_nitrogens = combined[combined['label'] == 0]['NumNitrogens']
t_statistic, p_value = ttest_ind(active_nitrogens, inactive_nitrogens)
print("T-test for Nitrogen count between active and inactive compounds:")
print("T-statistic:", t_statistic)
print("P-value:", p_value)

# Use these properties in machine learning models for prediction
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Select features for the model
# These are the descriptor columns added to `combined` by the property-calculation cells above.
features = ['Molecular Weight', 'LogP', 'NumHDonors', 'NumHAcceptors', 'TPSA', 'NumRotatableBonds', 'NumRings', 'NumAtoms',
            'NumAromaticRings', 'NumAliphaticRings', 'NumSaturatedRings', 'NumHeterocycles', 'NumHalogens', 'NumNitrogens',
            'NumOxygens', 'NumSulfurs', 'NumValenceElectrons', 'fr_alkyl_halide']
X = combined[features]
y = combined['label']

# Handle missing values
# Descriptor functions return None for unparsable SMILES; those rows become all-zero feature rows here.
X = X.fillna(0)

# Split data into training and testing sets
# NOTE(review): `combined` derives from `df_upsampled`, whose minority class was resampled
# with replacement, so identical molecules can appear in both train and test — the scores
# below are likely optimistic. Confirm and consider de-duplicating or splitting before upsampling.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
# `y_pred` is reused by later cells as the "Random Forest" predictions.
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize # Import the required module

def standardize_smiles(smiles):
  """Return a stereo-free, charge-parent canonical SMILES, or None if unparsable.

  Steps: parse with RDKit, strip stereochemistry in place, take the charge parent
  via ``rdMolStandardize.ChargeParent``, then write non-isomeric SMILES.
  """
  parsed = Chem.MolFromSmiles(smiles)
  if parsed is None:
    return None
  # RemoveStereochemistry modifies the molecule in place (its return value is unused).
  Chem.rdmolops.RemoveStereochemistry(parsed)
  charge_parent = rdMolStandardize.ChargeParent(parsed)
  return Chem.MolToSmiles(charge_parent, isomericSmiles=False)

# Apply the function to the 'Smiles' column
combined['Standardized_Smiles'] = combined['Smiles'].apply(standardize_smiles)

print(combined[['Smiles', 'Standardized_Smiles']].head())

In [None]:
combined


In [None]:
combined.to_csv('aromatase_standardized.csv', index=False)

In [None]:
# prompt: create a new data  from aromatase_standardized.csv, select : Filter
# Molecule ChEMBL ID, Standarized_Smiles, label

import pandas as pd
new_df = pd.read_csv('aromatase_standardized.csv', usecols=['Molecule ChEMBL ID', 'Standardized_Smiles', 'label'])
print(new_df.head())


In [None]:
new_df


In [None]:
t1 = Chem.SmilesMolSupplier('aromatase.smi', delimiter='\t', titleLine=False)
t1

In [None]:
fp = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) for mol in t1 if mol]
fp


In [None]:
train = np.asarray(fp, dtype= np.int32)
train

In [None]:
train.shape

In [None]:
ids = [mol.GetProp('_Name') for mol in t1 if mol]
ids

In [None]:
labels = np.asarray(ids, dtype = int).reshape(-1,1) # Use the built-in int instead of np.int
labels

In [None]:
dataset = np.hstack([train, labels])
dataset

In [None]:
np.save('dataset_feature', dataset)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [None]:
x_train, x_test, y_train, y_test = train_test_split(train, labels, test_size=0.25,shuffle= True,  random_state=42)

In [None]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
y_train

In [None]:
rf_Morgan = RandomForestClassifier(n_estimators=100, random_state=42)
rf_Morgan.fit(x_train, y_train.ravel())
preds = rf_Morgan.predict(x_test)
preds

In [None]:
 # ROC AUC is a ranking metric: score it on the predicted probability of class 1,
 # not on the hard 0/1 predictions.
 roc_auc_score(y_test, rf_Morgan.predict_proba(x_test)[:, 1])


In [None]:
# prompt: other metrics

from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate other metrics
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


In [None]:
# prompt: more metrics

from sklearn.metrics import matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score

# Calculate Matthews Correlation Coefficient (MCC)
mcc = matthews_corrcoef(y_test, preds)
print("Matthews Correlation Coefficient (MCC):", mcc)

# Calculate Balanced Accuracy Score
balanced_accuracy = balanced_accuracy_score(y_test, preds)
print("Balanced Accuracy Score:", balanced_accuracy)

# Calculate Cohen's Kappa Score
kappa = cohen_kappa_score(y_test, preds)
print("Cohen's Kappa Score:", kappa)


In [None]:
!pip install pycm # install the missing pycm module
from pycm import ConfusionMatrix  # explicit import instead of `from pycm import *`
# `y_test` is an (n, 1) column array here (labels were built with reshape(-1, 1));
# flatten it so the actual and predicted vectors are both 1-D.
cm = ConfusionMatrix(actual_vector=y_test.ravel(), predict_vector=preds)

In [None]:
from pycm import ConfusionMatrix
cm = ConfusionMatrix(y_test.reshape (-1),preds)
print(cm)

In [None]:
# prompt: train with other machine learning

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# ... (Your existing code for data preparation and feature extraction) ...


# Handle missing values
X = X.fillna(0)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Train a Logistic Regression model
model_lr = LogisticRegression(random_state=42)
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

# Train an SVM model
model_svm = SVC(random_state=42)
model_svm.fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

# Train a K-Nearest Neighbors model
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_test)
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))


In [None]:
# prompt: all metrics

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score
from pycm import ConfusionMatrix

def calculate_all_metrics(y_true, y_pred, y_score=None):
  """Calculates various classification metrics.

  Args:
    y_true: true binary labels; a pandas Series, list, 1-D array or (n, 1) array.
    y_pred: predicted hard labels, same length as ``y_true``.
    y_score: optional continuous scores (e.g. ``predict_proba(X)[:, 1]``). When
      given, ROC AUC is computed from these; otherwise it falls back to
      ``y_pred``, as before.

  Returns:
    dict with keys 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC',
    'MCC', 'Balanced Accuracy', 'Kappa' and 'Confusion Matrix' (a pycm
    ConfusionMatrix object).
  """
  # Flatten to plain 1-D arrays so Series, lists and column vectors all work;
  # calling .to_numpy() directly fails for anything that is not a pandas object.
  y_true = np.asarray(y_true).ravel()
  y_pred = np.asarray(y_pred).ravel()

  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)
  roc_auc = roc_auc_score(y_true, y_pred if y_score is None else np.asarray(y_score).ravel())
  mcc = matthews_corrcoef(y_true, y_pred)
  balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
  kappa = cohen_kappa_score(y_true, y_pred)

  cm = ConfusionMatrix(actual_vector=y_true, predict_vector=y_pred)

  metrics = {
      'Accuracy': accuracy,
      'Precision': precision,
      'Recall': recall,
      'F1-Score': f1,
      'ROC AUC': roc_auc,
      'MCC': mcc,
      'Balanced Accuracy': balanced_accuracy,
      'Kappa': kappa,
      'Confusion Matrix': cm
  }

  return metrics

# Example usage (replace y_test and y_pred with your actual values)
metrics = calculate_all_metrics(y_test, y_pred)
for metric_name, metric_value in metrics.items():
  print(f"{metric_name}: {metric_value}")

In [None]:
# prompt: detailed outcome for each single model

def model_details(model_name, y_true, y_pred):
  """Print a header for ``model_name`` followed by one "name: value" line per metric.

  The metrics come from ``calculate_all_metrics(y_true, y_pred)``.
  """
  print(f"\n--- {model_name} Model Details ---")

  # Header first, then compute and list every metric in dictionary order.
  for label, value in calculate_all_metrics(y_true, y_pred).items():
    print(f"{label}: {value}")


# Example usage for each model:
model_details("Random Forest", y_test, y_pred)
model_details("Logistic Regression", y_test, y_pred_lr)
model_details("SVM", y_test, y_pred_svm)
model_details("KNN", y_test, y_pred_knn)


In [None]:
# prompt: perform a model for MLP, Xgboost, Decision tree et 5 more MLs

from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

# ... (Your existing code for data preparation and feature extraction) ...

# Handle missing values
X = X.fillna(0)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an MLP model
model_mlp = MLPClassifier(random_state=42)
model_mlp.fit(X_train, y_train)
y_pred_mlp = model_mlp.predict(X_test)
model_details("MLP", y_test, y_pred_mlp)


# Train an XGBoost model
model_xgb = XGBClassifier(random_state=42)
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)
model_details("XGBoost", y_test, y_pred_xgb)


# Train a Decision Tree model
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)
model_details("Decision Tree", y_test, y_pred_dt)


# Train a Logistic Regression model
model_lr = LogisticRegression(random_state=42)
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)
model_details("Logistic Regression", y_test, y_pred_lr)


# Train an SVM model
model_svm = SVC(random_state=42)
model_svm.fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_test)
model_details("SVM", y_test, y_pred_svm)


# Train a K-Nearest Neighbors model
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_test)
model_details("KNN", y_test, y_pred_knn)


# Train a Naive Bayes model
model_nb = GaussianNB()
model_nb.fit(X_train, y_train)
y_pred_nb = model_nb.predict(X_test)
model_details("Naive Bayes", y_test, y_pred_nb)


# Train an AdaBoost model
model_ada = AdaBoostClassifier(random_state=42)
model_ada.fit(X_train, y_train)
y_pred_ada = model_ada.predict(X_test)
model_details("AdaBoost", y_test, y_pred_ada)


# Train a Gradient Boosting model
model_gb = GradientBoostingClassifier(random_state=42)
model_gb.fit(X_train, y_train)
y_pred_gb = model_gb.predict(X_test)
model_details("Gradient Boosting", y_test, y_pred_gb)


# Train a Bagging model
model_bag = BaggingClassifier(random_state=42)
model_bag.fit(X_train, y_train)
y_pred_bag = model_bag.predict(X_test)
model_details("Bagging", y_test, y_pred_bag)


In [None]:
# prompt: store all the model performance in a csv file for all the above models

import csv

def store_model_performance(filename, model_name, y_true, y_pred):
  """Append one row of scalar metrics for ``model_name`` to the CSV ``filename``.

  Metrics come from ``calculate_all_metrics``; the confusion matrix is not
  written. The file is opened in append mode and the header row is emitted only
  when the write position is 0.
  """
  scores = calculate_all_metrics(y_true, y_pred)
  metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC', 'MCC', 'Balanced Accuracy', 'Kappa']

  with open(filename, 'a', newline='') as handle:
    out = csv.DictWriter(handle, fieldnames=['Model'] + metric_columns)

    # Position 0 right after opening in append mode means nothing has been written yet.
    if handle.tell() == 0:
      out.writeheader()

    record = {'Model': model_name}
    for column in metric_columns:
      record[column] = scores[column]
    out.writerow(record)

# Create a CSV file to store the results
# NOTE(review): despite the "morgan" file name, in top-to-bottom order `y_test` and the
# `y_pred*` vectors here come from models fitted on the descriptor frame `X`
# (combined[features]), not on the Morgan fingerprint matrix `train` — confirm the
# intended feature set and rename the file if needed.
csv_filename = 'model_performance_morgan.csv'

# Store the performance of each model
# store_model_performance opens the file in append mode, so re-running this cell adds
# a second copy of every row.
# `y_pred` is the Random Forest prediction vector from the earlier descriptor-model cell.
store_model_performance(csv_filename, "Random Forest", y_test, y_pred)
store_model_performance(csv_filename, "Logistic Regression", y_test, y_pred_lr)
store_model_performance(csv_filename, "SVM", y_test, y_pred_svm)
store_model_performance(csv_filename, "KNN", y_test, y_pred_knn)
store_model_performance(csv_filename, "MLP", y_test, y_pred_mlp)
store_model_performance(csv_filename, "XGBoost", y_test, y_pred_xgb)
store_model_performance(csv_filename, "Decision Tree", y_test, y_pred_dt)
store_model_performance(csv_filename, "Naive Bayes", y_test, y_pred_nb)
store_model_performance(csv_filename, "AdaBoost", y_test, y_pred_ada)
store_model_performance(csv_filename, "Gradient Boosting", y_test, y_pred_gb)
store_model_performance(csv_filename, "Bagging", y_test, y_pred_bag)

print(f"Model performance saved to '{csv_filename}'")


In [None]:
# prompt: add more machine learning

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier


# Train a Gradient Boosting model
model_gb = GradientBoostingClassifier(random_state=42)
model_gb.fit(X_train, y_train)
y_pred_gb = model_gb.predict(X_test)
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print(classification_report(y_test, y_pred_gb))

# Train a Multi-Layer Perceptron (MLP) model
model_mlp = MLPClassifier(random_state=42)
model_mlp.fit(X_train, y_train)
y_pred_mlp = model_mlp.predict(X_test)
print("MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))
print(classification_report(y_test, y_pred_mlp))


# Train a Decision Tree model
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))


# Example usage for each model:
model_details("Random Forest", y_test, y_pred)
model_details("Logistic Regression", y_test, y_pred_lr)
model_details("SVM", y_test, y_pred_svm)
model_details("KNN", y_test, y_pred_knn)
model_details("Gradient Boosting", y_test, y_pred_gb)
model_details("MLP", y_test, y_pred_mlp)
model_details("Decision Tree", y_test, y_pred_dt)


In [None]:
# prompt: Now use the MACCS fingerprint and report these metrics as well: Accuracy,
#         Precision,
#         Recall,
#         F1-Score,
#         ROC AUC,
#         MCC,
#         Balanced Accuracy and Kappa

import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score
from pycm import ConfusionMatrix

# ... (Your existing code for data preparation and feature extraction) ...

# Generate MACCS fingerprints
def generate_maccs_fingerprint(smiles):
  """Return the MACCS keys fingerprint for a SMILES, or None if RDKit cannot parse it."""
  parsed = Chem.MolFromSmiles(smiles)
  return MACCSkeys.GenMACCSKeys(parsed) if parsed is not None else None

combined['MACCS_Fingerprint'] = combined['Smiles'].apply(generate_maccs_fingerprint)


# Convert fingerprints to a suitable format for machine learning.
# Rows whose SMILES could not be parsed (fingerprint is None) are dropped from BOTH
# the feature matrix and the label vector; filtering only the fingerprints would leave
# X_maccs shorter than y and shift every label after the first failure.
has_fingerprint = combined['MACCS_Fingerprint'].notna().to_numpy()
X_maccs = np.array([list(fp) for fp in combined['MACCS_Fingerprint'][has_fingerprint]])
y = combined['label'][has_fingerprint]

# Handle missing values (if any)
X_maccs = np.nan_to_num(X_maccs)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_maccs, y, test_size=0.2, random_state=42)

# Train a Random Forest model
model_rf_maccs = RandomForestClassifier(random_state=42)
model_rf_maccs.fit(X_train, y_train)

# Make predictions
y_pred_rf_maccs = model_rf_maccs.predict(X_test)

# Evaluate the model and calculate metrics
def calculate_all_metrics(y_true, y_pred, y_score=None):
  """Calculates various classification metrics.

  Args:
    y_true: true binary labels; a pandas Series, list, 1-D array or (n, 1) array.
    y_pred: predicted hard labels, same length as ``y_true``.
    y_score: optional continuous scores (e.g. ``predict_proba(X)[:, 1]``). When
      given, ROC AUC is computed from these; otherwise it falls back to
      ``y_pred``, as before.

  Returns:
    dict with keys 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC',
    'MCC', 'Balanced Accuracy', 'Kappa' and 'Confusion Matrix' (a pycm
    ConfusionMatrix object).
  """
  # Flatten to plain 1-D arrays so Series, lists and column vectors all work;
  # calling .to_numpy() directly fails for anything that is not a pandas object.
  y_true = np.asarray(y_true).ravel()
  y_pred = np.asarray(y_pred).ravel()

  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)
  roc_auc = roc_auc_score(y_true, y_pred if y_score is None else np.asarray(y_score).ravel())
  mcc = matthews_corrcoef(y_true, y_pred)
  balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
  kappa = cohen_kappa_score(y_true, y_pred)

  cm = ConfusionMatrix(actual_vector=y_true, predict_vector=y_pred)

  metrics = {
      'Accuracy': accuracy,
      'Precision': precision,
      'Recall': recall,
      'F1-Score': f1,
      'ROC AUC': roc_auc,
      'MCC': mcc,
      'Balanced Accuracy': balanced_accuracy,
      'Kappa': kappa,
      'Confusion Matrix': cm
  }
  return metrics


metrics_rf_maccs = calculate_all_metrics(y_test, y_pred_rf_maccs)

for metric_name, metric_value in metrics_rf_maccs.items():
  print(f"{metric_name}: {metric_value}")


In [None]:
# prompt: perform for Rf, KNN, MLP, Xgboost , Adaboost, decision tree, SVM, Gradient boosting, naive bayes, Logistic Regression, Bagging

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score
from pycm import ConfusionMatrix


# ... (Your existing code for data preparation and feature extraction) ...

# Handle missing values
# NOTE(review): `X` is not created in this cell; the output filename below
# suggests the MACCS features were intended - confirm `X` is the right matrix.
X = X.fillna(0)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'MLP': MLPClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Bagging': BaggingClassifier(random_state=42)
}


results = {}
# Test-set predictions per model, kept so that each model is stored with its
# own predictions (previously every row was written from the last model's y_pred).
predictions = {}

for model_name, model in models.items():
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  predictions[model_name] = y_pred

  metrics = calculate_all_metrics(y_test, y_pred)
  results[model_name] = metrics

# Print and store the results
csv_filename = 'model_performance_MACCS.csv'
for model_name, metrics in results.items():
  print(f"\n--- {model_name} Model Details ---")
  for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")
  # Store once per model (not once per printed metric line)
  store_model_performance(csv_filename, model_name, y_test, predictions[model_name])


print(f"Model performance saved to '{csv_filename}'")



In [None]:
# prompt: Please store the  performance of all MLS-MACCS

# Assuming 'results' dictionary contains the model performance metrics from the previous code

# Create a CSV file to store the results
import csv  # imported here because no earlier import of csv is visible before this cell

csv_filename = 'model_performance_MLS-MACCS.csv'
fieldnames = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC', 'MCC', 'Balanced Accuracy', 'Kappa']

# Open the file once in append mode and write one row per model.
# NOTE: append mode means re-running this cell adds the same rows again.
with open(csv_filename, 'a', newline='') as csvfile:
  # extrasaction='ignore' drops keys that are not CSV columns ('Confusion Matrix')
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')

  # Write the header only if the file is newly created
  if csvfile.tell() == 0:
    writer.writeheader()

  for model_name, metrics in results.items():
    writer.writerow({'Model': model_name, **metrics})

print(f"Model performance saved to '{csv_filename}'")


In [None]:
# prompt: use the  2D atom pair with all the above MLs

import numpy as np
from rdkit.Chem import rdMolDescriptors

# ... (Your existing code for data preparation and feature extraction) ...

# Generate 2D Atom Pair Fingerprints
def generate_atompair_fingerprint(smiles):
  """Return the hashed atom-pair bit-vector fingerprint for a SMILES string.

  Returns None when RDKit cannot parse the SMILES into a molecule.
  """
  molecule = Chem.MolFromSmiles(smiles)
  if molecule is None:
    return None
  return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(molecule)

combined['AtomPair_Fingerprint'] = combined['Smiles'].apply(generate_atompair_fingerprint)


# Convert fingerprints to a suitable format for machine learning.
# Rows whose SMILES could not be parsed (fingerprint is None) are removed from
# BOTH the features and the labels so that X and y keep the same length.
has_fingerprint = combined['AtomPair_Fingerprint'].notna()
X_atompair = np.array([list(bitvect) for bitvect in combined.loc[has_fingerprint, 'AtomPair_Fingerprint']])
y = combined.loc[has_fingerprint, 'label']

# Handle missing values (if any)
X_atompair = np.nan_to_num(X_atompair)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_atompair, y, test_size=0.2, random_state=42)


models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'MLP': MLPClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Bagging': BaggingClassifier(random_state=42)
}


results = {}
# Test-set predictions per model, kept so that each model is stored with its
# own predictions (previously every row was written from the last model's y_pred).
predictions = {}

for model_name, model in models.items():
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  predictions[model_name] = y_pred

  metrics = calculate_all_metrics(y_test, y_pred)
  results[model_name] = metrics

# Print and store the results
csv_filename = 'model_performance_atompair.csv'
for model_name, metrics in results.items():
  print(f"\n--- {model_name} Model Details ---")
  for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")
  # Store once per model (not once per printed metric line)
  store_model_performance(csv_filename, model_name, y_test, predictions[model_name])


print(f"Model performance saved to '{csv_filename}'")


In [None]:
! pip install padelpy

In [None]:
! wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
! unzip fingerprints_xml.zip

--2024-10-02 18:50:27--  https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip [following]
--2024-10-02 18:50:28--  https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10871 (11K) [application/zip]
Saving to: ‘fingerprints_xml.zip.1’


2024-10-02 18:50:28 (69.8 MB/s) - ‘fingerprints_xml.zip.1’ saved [10871/10871]

Archive:  fingerprints_xml.zip
replace AtomPairs2DFingerprintCount.xml? [y]es, [n]o, [A]ll, [N]one, [

In [None]:
import glob
# Every *.xml file in the working directory, sorted by filename
xml_files = glob.glob("*.xml")
xml_files.sort()
xml_files

In [None]:
# Fingerprint names; they are paired positionally with the sorted `xml_files`
# list when the `fp` mapping is built, so the order here matters.
FP_list = ['AtomPairs2DCount',
 'AtomPairs2D',
 'EState',
 'CDKextended',
 'CDK',
 'CDKgraphonly',
 'KlekotaRothCount',
 'KlekotaRoth',
 'MACCS',
 'PubChem',
 'SubstructureCount',
 'Substructure']

In [None]:
# zip() silently truncates to the shorter input, and the glob picks up every
# *.xml in the working directory, so fail loudly if the counts do not match.
if len(FP_list) != len(xml_files):
  raise ValueError(f"Expected {len(FP_list)} fingerprint XML files, found {len(xml_files)}: {xml_files}")

# Fingerprint name -> descriptor XML file (paired by position)
fp = dict(zip(FP_list, xml_files))
fp

In [None]:
# `df` becomes another name for the same object as `combined` (no copy is made)
df = combined
df

In [None]:
# The bare `new_df` expression has no effect here (only a cell's last expression is displayed).
new_df
# `df3` is another name for the same object as `new_df` (no copy is made)
df3 = new_df
df3

In [None]:
# prompt: change the name of Standardized_Smiles to Smiles

# inplace=True mutates the object `df3` refers to. `new_df` is that same object
# (df3 = new_df above), so `new_df` also gets the 'Smiles' column name, which a
# later cell reads via new_df['Smiles'].
df3.rename(columns={'Standardized_Smiles': 'Smiles'}, inplace=True)
df3


In [None]:
# Two-column table (SMILES, ChEMBL ID) written tab-separated without header or index;
# 'molecule.smi' is the file passed to padeldescriptor as mol_dir below.
df2 = pd.concat( [df3['Smiles'],df3['Molecule ChEMBL ID']], axis=1 )
df2.to_csv('molecule.smi', sep='\t', index=False, header=False)
df2

In [None]:
from padelpy import padeldescriptor

fingerprint = 'MACCS'

fingerprint_output_file_MACCS = ''.join([fingerprint,'.csv']) #MACCS.csv
fingerprint_descriptortypes = fp[fingerprint]

# Compute the MACCS fingerprint for every molecule in molecule.smi.
# retainorder=True keeps the output rows in input order; the labels are
# attached to this table by row position further down, so the order must match.
padeldescriptor(mol_dir='molecule.smi',
                d_file=fingerprint_output_file_MACCS, #'MACCS.csv'
                #descriptortypes='MACCSFingerprinter.xml',
                descriptortypes= fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                removesalt=True,
                retainorder=True,
                log=True,
                fingerprints=True)

MACCS machine learning

In [None]:
# Load the MACCS fingerprint table written by padeldescriptor (MACCS.csv)
descriptors_MACCS = pd.read_csv(fingerprint_output_file_MACCS)
descriptors_MACCS

In [None]:
# Label column on its own; a later cell attaches df4 to each fingerprint table
df4= df3['label']
df4


In [None]:
# Labels are attached by row position. pd.concat(axis=1) aligns on the index,
# so reset the label index to 0..n-1 (matching the freshly read CSV) and
# refuse to continue if the two tables do not have the same number of rows.
if len(descriptors_MACCS) != len(df3):
  raise ValueError(f"MACCS descriptors have {len(descriptors_MACCS)} rows but df3 has {len(df3)}")
df1= pd.concat([descriptors_MACCS, df3['label'].reset_index(drop=True)], axis=1)
# 'Name' is an identifier column, not a feature
df1 = df1.drop('Name', axis=1)
df1

In [None]:
# prompt: build 10 MLs classifies modeld with df1 data and with the metrics used preciously

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score
from pycm import ConfusionMatrix
import csv

# Prepare your data: every column except 'label' is a feature
X = df1.drop('label', axis=1)
y = df1['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'MLP': MLPClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Bagging': BaggingClassifier(random_state=42)
}


results = {}
# Test-set predictions per model, kept so that each model is stored with its
# own predictions (previously every row was written from the last model's y_pred).
predictions = {}

for model_name, model in models.items():
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  predictions[model_name] = y_pred

  metrics = calculate_all_metrics(y_test, y_pred)
  results[model_name] = metrics

# Print and store the results
csv_filename = 'model_performance_MACCS_MLS2.csv'
for model_name, metrics in results.items():
  print(f"\n--- {model_name} Model Details ---")
  for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")
  # Store once per model (not once per printed metric line)
  store_model_performance(csv_filename, model_name, y_test, predictions[model_name])


print(f"Model performance saved to '{csv_filename}'")


In [None]:
# prompt: provides the metrics for the testing dataset as independent dataset. for the above Mls models

# Assuming you have already trained your models and have X_test and y_test

# Metrics and predictions of each already-fitted model on the held-out test split
test_results = {}
test_predictions = {}

# Loop through each model and make predictions on the test data
for model_name, model in models.items():
    y_pred = model.predict(X_test)
    test_predictions[model_name] = y_pred

    # Calculate the metrics for the test data
    metrics = calculate_all_metrics(y_test, y_pred)
    test_results[model_name] = metrics

# Print and store the results for each model
csv_filename = 'model_performance_MACCS_MLS2_test.csv'
for model_name, metrics in test_results.items():
    print(f"\n--- {model_name} Model Details (Test Data) ---")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")

    # Store each model with its OWN predictions (previously the last model's
    # y_pred left over from the loop above was written for every model).
    store_model_performance(csv_filename, model_name, y_test, test_predictions[model_name])


print(f"Model performance (Test Data) saved to '{csv_filename}'")


In [None]:
# prompt: check the overfit of the developped models

# Assuming you have already trained your models and have X_train, y_train, X_test, y_test

# Per-model metrics on both splits, so training and testing scores can be compared
train_test_results = {}

# Score every fitted model on the data it was trained on and on the held-out data
for model_name, model in models.items():
    y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

    train_metrics = calculate_all_metrics(y_train, y_train_pred)
    test_metrics = calculate_all_metrics(y_test, y_test_pred)
    train_test_results[model_name] = dict(train=train_metrics, test=test_metrics)

# Report both splits for each model; a training score well above the testing
# score is the usual sign of overfitting.
for model_name, metrics in train_test_results.items():
    print(f"\n--- {model_name} Model Overfit Analysis ---")
    for heading, split in (("Training Data Metrics:", 'train'), ("\nTesting Data Metrics:", 'test')):
        print(heading)
        for metric_name, metric_value in metrics[split].items():
            print(f"{metric_name}: {metric_value}")


# Write one CSV row per (model, split); the file is overwritten on every run
csv_filename = 'model_overfit_analysis.csv'
with open(csv_filename, 'w', newline='') as csvfile:
    fieldnames = ['Model', 'Data_Split', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC', 'MCC', 'Balanced Accuracy', 'Kappa']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for model_name, metrics in train_test_results.items():
        for data_split, metric_dict in metrics.items():
            # Identification columns first, then the metric columns in header order
            csv_row = {'Model': model_name, 'Data_Split': data_split}
            csv_row.update((column, metric_dict[column]) for column in fieldnames[2:])
            writer.writerow(csv_row)

print(f"Model Overfit Analysis saved to '{csv_filename}'")


In [None]:
# prompt: calculate for the following fingerprint :'AtomPairs2DCount', 'AtomPairs2D', 'EState', 'CDKextended', 'CDK', 'CDKgraphonly' save the output for each descriptor in dataframe

import pandas as pd
fingerprint_list = ['AtomPairs2DCount', 'AtomPairs2D', 'EState', 'CDKextended', 'CDK', 'CDKgraphonly']

# Every computed table, keyed by fingerprint name (previously only the last
# table survived the loop in `descriptors`).
descriptors_by_fingerprint = {}

for fingerprint in fingerprint_list:
  fingerprint_output_file = ''.join([fingerprint,'.csv'])
  fingerprint_descriptortypes = fp[fingerprint]

  # retainorder=True keeps the output rows in input order; labels are attached
  # to these tables by row position in later cells.
  padeldescriptor(mol_dir='molecule.smi',
                  d_file=fingerprint_output_file,
                  descriptortypes=fingerprint_descriptortypes,
                  detectaromaticity=True,
                  standardizenitro=True,
                  standardizetautomers=True,
                  threads=2,
                  removesalt=True,
                  retainorder=True,
                  log=True,
                  fingerprints=True)

  descriptors = pd.read_csv(fingerprint_output_file)
  descriptors_by_fingerprint[fingerprint] = descriptors
  print(f"Descriptors for {fingerprint} calculated and saved to {fingerprint_output_file}")



Machine learning for AtomPairs2D

In [None]:
# Load the AtomPairs2D fingerprint table written by padeldescriptor
descriptors_AtomPairs2D = pd.read_csv('AtomPairs2D.csv')
descriptors_AtomPairs2D

In [None]:
# Label column on its own (same assignment as in the MACCS section)
df4= df3['label']
df4


In [None]:
# Labels are attached by row position. pd.concat(axis=1) aligns on the index,
# so reset the label index to 0..n-1 (matching the freshly read CSV) and
# refuse to continue if the two tables do not have the same number of rows.
if len(descriptors_AtomPairs2D) != len(df3):
  raise ValueError(f"AtomPairs2D descriptors have {len(descriptors_AtomPairs2D)} rows but df3 has {len(df3)}")
df2 =pd.concat([descriptors_AtomPairs2D, df3['label'].reset_index(drop=True)], axis=1)
# 'Name' is an identifier column, not a feature
df2 = df2.drop('Name', axis=1)
df2

In [None]:
# prompt: build 10 MLs classifies modeld with df1 data and with the metrics used preciously

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score
from pycm import ConfusionMatrix

# This section models the AtomPairs2D fingerprints, so the features and labels
# come from df2 (AtomPairs2D descriptors + label, built in the cell above).
# The previous code read df1, which holds the MACCS table.
X = df2.drop('label', axis=1)
y = df2['label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define your models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'MLP': MLPClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Bagging': BaggingClassifier(random_state=42)
}

# Function to calculate metrics
def calculate_all_metrics(y_true, y_pred):
  """Score predicted labels against the true labels.

  Returns a dict keyed by metric name, in reporting order, with a pycm
  ConfusionMatrix stored under 'Confusion Matrix'. `y_true` must provide
  `.to_numpy()` (e.g. a pandas Series).
  """
  scorers = (
      ('Accuracy', accuracy_score),
      ('Precision', precision_score),
      ('Recall', recall_score),
      ('F1-Score', f1_score),
      ('ROC AUC', roc_auc_score),
      ('MCC', matthews_corrcoef),
      ('Balanced Accuracy', balanced_accuracy_score),
      ('Kappa', cohen_kappa_score),
  )
  report = {label: scorer(y_true, y_pred) for label, scorer in scorers}
  report['Confusion Matrix'] = ConfusionMatrix(actual_vector=y_true.to_numpy(), predict_vector=y_pred)
  return report


# Fit each classifier on the training split and score it on the test split
results = {}
for model_name, model in models.items():
  y_pred = model.fit(X_train, y_train).predict(X_test)
  metrics = calculate_all_metrics(y_test, y_pred)
  results[model_name] = metrics

# Report every metric of every model
for model_name, metrics in results.items():
  print(f"\n--- {model_name} Model Details ---")
  for metric_name in metrics:
    metric_value = metrics[metric_name]
    print(f"{metric_name}: {metric_value}")


In [None]:
# Load the AtomPairs2DCount fingerprint table written by padeldescriptor
Descriptors_AtomPairs2DCounts = pd.read_csv('AtomPairs2DCount.csv')
Descriptors_AtomPairs2DCounts

In [None]:
# Load the EState fingerprint table written by padeldescriptor
Descriptors_EState = pd.read_csv('EState.csv')
Descriptors_EState

In [None]:
# Load the CDKextended fingerprint table written by padeldescriptor
Descriptors_CDKextended = pd.read_csv('CDKextended.csv')
Descriptors_CDKextended

In [None]:
# Load the CDK fingerprint table written by padeldescriptor
Descriptors_CDK = pd.read_csv('CDK.csv')
Descriptors_CDK

In [None]:
# prompt: calculate for the following fingerprint :'AtomPairs2DCount', 'AtomPairs2D', 'EState', 'CDKextended', 'CDK', 'CDKgraphonly' save the output for each descriptor in dataframe

import pandas as pd
fingerprint_list = ['PubChem','SubstructureCount','Substructure']

for fingerprint in fingerprint_list:
  fingerprint_output_file = ''.join([fingerprint,'.csv'])
  fingerprint_descriptortypes = fp[fingerprint]

  # retainorder=True keeps the output rows in input order; labels are attached
  # to these tables by row position in later cells.
  padeldescriptor(mol_dir='molecule.smi',
                  d_file=fingerprint_output_file,
                  descriptortypes=fingerprint_descriptortypes,
                  detectaromaticity=True,
                  standardizenitro=True,
                  standardizetautomers=True,
                  threads=2,
                  removesalt=True,
                  retainorder=True,
                  log=True,
                  fingerprints=True)

  # `descriptors` holds only the most recent table; every table is also on disk as <fingerprint>.csv
  descriptors = pd.read_csv(fingerprint_output_file)
  print(f"Descriptors for {fingerprint} calculated and saved to {fingerprint_output_file}")

In [None]:
 # prompt: output for each fingerprint

import pandas as pd
# Assuming 'combined' DataFrame and 'new_df' DataFrame are already defined

# List of fingerprints to calculate
fingerprint_list = ['KlekotaRothCount', 'KlekotaRoth']

# Map fingerprint names to XML files using the FULL ordered name list.
# Zipping only the two names above against the sorted `xml_files` paired them
# with the first two XML files in that list rather than the KlekotaRoth ones,
# and replaced the complete `fp` mapping with a two-entry one.
fp = dict(zip(FP_list, xml_files))

# The PaDEL input file does not depend on the fingerprint, so write it once
df2 = pd.concat([new_df['Smiles'], new_df['Molecule ChEMBL ID']], axis=1)
df2.to_csv('molecule.smi', sep='\t', index=False, header=False)

# Iterate through the fingerprints and calculate them
for fingerprint in fingerprint_list:
  fingerprint_output_file = ''.join([fingerprint, '.csv'])
  fingerprint_descriptortypes = fp[fingerprint]

  # retainorder=True keeps the output rows in input order; labels are attached
  # to these tables by row position in later cells.
  padeldescriptor(mol_dir='molecule.smi',
                  d_file=fingerprint_output_file,
                  descriptortypes=fingerprint_descriptortypes,
                  detectaromaticity=True,
                  standardizenitro=True,
                  standardizetautomers=True,
                  threads=2,
                  removesalt=True,
                  retainorder=True,
                  log=True,
                  fingerprints=True)

  descriptors = pd.read_csv(fingerprint_output_file)
  print(f"\n--- {fingerprint} Fingerprint Output ---")
  print(descriptors.head())  # Print the first few rows of the calculated descriptors


In [None]:
# prompt: store the output of the calculation in dataframe

import pandas as pd

# Assuming you have already calculated the descriptors and they are stored in a CSV file named 'fingerprint_output_file'

# Fingerprint name -> descriptor table, for every fingerprint whose CSV exists
descriptors_dict = {}

# Read <fingerprint>.csv for each entry of the current fingerprint_list
for fingerprint in fingerprint_list:
  fingerprint_output_file = f"{fingerprint}.csv"
  try:
    loaded_descriptors = pd.read_csv(fingerprint_output_file)
  except FileNotFoundError:
    # A missing file is reported and skipped
    print(f"File not found for {fingerprint}: {fingerprint_output_file}")
  else:
    descriptors_dict[fingerprint] = loaded_descriptors

# You can access the dataframes using the fingerprint names as keys in the dictionary
# For example, to access the dataframe for 'MACCS':
# descriptors_dict['MACCS']



In [None]:
# Pull the table stored under 'KlekotaRoth' out of the dictionary
Descriptors_KlekotaRoth= descriptors_dict['KlekotaRoth']
Descriptors_KlekotaRoth

In [None]:
# Display the same table again
Descriptors_KlekotaRoth


In [None]:
# prompt: from all the calculated AtomPairs2D',
# #  'EState',
# #  'CDKextended',
# #  'CDK',
# #  'CDKgraphonly',
# #  'KlekotaRothCount',
# #  'KlekotaRoth',
# #  'MACCS',
# #  'PubChem',
# #  'SubstructureCount',
# #  'Substructure descriptors add in each new colunm label from df4 and store as dataframe

import pandas as pd
fingerprint_list = ['AtomPairs2D', 'EState', 'CDKextended', 'CDK', 'CDKgraphonly',
                    'KlekotaRothCount', 'KlekotaRoth', 'MACCS', 'PubChem',
                    'SubstructureCount', 'Substructure']

# Create a dictionary to store DataFrames
descriptors_dfs = {}

# Labels are attached by row position; pd.concat(axis=1) aligns on the index,
# so give the labels a 0..n-1 index matching the freshly read CSVs.
labels_by_position = df4.reset_index(drop=True)

# Loop through the list of fingerprints
for fingerprint in fingerprint_list:
  fingerprint_output_file = ''.join([fingerprint,'.csv'])  # Construct the file name

  try:
    descriptors_df = pd.read_csv(fingerprint_output_file)
  except FileNotFoundError:
    print(f"File not found for {fingerprint}: {fingerprint_output_file}")
    continue

  # A row-count mismatch would leave NaN labels/features after the concat
  if len(descriptors_df) != len(labels_by_position):
    raise ValueError(f"{fingerprint_output_file} has {len(descriptors_df)} rows but there are {len(labels_by_position)} labels")

  descriptors_df = pd.concat([descriptors_df, labels_by_position], axis=1)  # Add the 'label' column from df4
  descriptors_dfs[fingerprint] = descriptors_df  # Store the DataFrame in the dictionary

# Now you have a dictionary 'descriptors_dfs' where the keys are the fingerprint names
# and the values are the corresponding DataFrames with 'label' added.


In [None]:
# prompt: print them

# Preview the first rows of every fingerprint table.
# NOTE: the loop variable `df` rebinds the notebook-level name `df` assigned earlier.
for fingerprint, df in descriptors_dfs.items():
  print(f"\n--- {fingerprint} Fingerprint DataFrame ---")
  print(df.head())


In [None]:
# prompt: in all the developed Dataframe, remove the colunm:"Name" and print them

# Remove the 'Name' identifier column from every stored table and preview the result
for fingerprint, df in descriptors_dfs.items():
  if 'Name' in df.columns:
    df = df.drop('Name', axis=1)
    # Write the result back: drop() returns a new frame, so without this the
    # dictionary would still hold the table that includes 'Name'.
    # (Replacing the value of an existing key is safe while iterating.)
    descriptors_dfs[fingerprint] = df
    status = 'removed'
  else:
    status = 'not found'
  print(f"\n--- {fingerprint} Fingerprint DataFrame (Name column {status}) ---")
  print(df.head())


In [None]:
# prompt: print descriptor foe each fingerprint

# Requires `descriptors_MACCS` from the MACCS section above
print("MACCS Descriptors:")
print(descriptors_MACCS.head())  # Print the first few rows of MACCS descriptors

# Assuming you have other descriptor DataFrames (e.g., descriptors_AtomPairs2D)
# Print descriptors for each calculated fingerprint
# for example
# print("AtomPairs2D Descriptors:")
# print(descriptors_AtomPairs2D.head())

# You can continue this pattern for all the fingerprint descriptors you calculated


In [None]:
# prompt: for each developed dataframe name them descriptor_fringerprint

# Same tables as descriptors_dfs, keyed 'descriptors_<fingerprint>' and without the 'Name' column
descriptors_dfs_renamed = {}

for fingerprint, df in descriptors_dfs.items():
  # Drop 'Name' only where it is present; otherwise keep the table as it is
  df = df.drop('Name', axis=1) if 'Name' in df.columns else df
  descriptors_dfs_renamed[f'descriptors_{fingerprint}'] = df

# Now you have a dictionary 'descriptors_dfs_renamed' where the keys are like 'descriptors_MACCS',
# 'descriptors_AtomPairs2D', etc., and the values are the corresponding DataFrames.

# You can access the DataFrames using these keys:
# Example:
# descriptors_MACCS_df = descriptors_dfs_renamed['descriptors_MACCS']


In [None]:
# prompt: print all the dataframe

# Print every renamed table (pandas abbreviates large frames when printing)
for df_name, df in descriptors_dfs_renamed.items():
  print(f"\n--- {df_name} DataFrame ---")
  print(df)


In [None]:
# prompt: for each descriptors dataframe, build  12MLs, provide all the above metrics. Please each model must be name ML-fringerprint.

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score
from pycm import ConfusionMatrix
import csv

# Assuming descriptors_dfs_renamed is your dictionary of DataFrames

def calculate_all_metrics(y_true, y_pred):
  """Score predicted labels against the true labels.

  Returns a dict keyed by metric name, in reporting order, with a pycm
  ConfusionMatrix stored under 'Confusion Matrix'. `y_true` must provide
  `.to_numpy()` (e.g. a pandas Series).
  """
  scorers = (
      ('Accuracy', accuracy_score),
      ('Precision', precision_score),
      ('Recall', recall_score),
      ('F1-Score', f1_score),
      ('ROC AUC', roc_auc_score),
      ('MCC', matthews_corrcoef),
      ('Balanced Accuracy', balanced_accuracy_score),
      ('Kappa', cohen_kappa_score),
  )
  report = {label: scorer(y_true, y_pred) for label, scorer in scorers}
  report['Confusion Matrix'] = ConfusionMatrix(actual_vector=y_true.to_numpy(), predict_vector=y_pred)
  return report

# Classifiers to compare, all at library defaults (seeded where supported)
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'MLP': MLPClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Bagging': BaggingClassifier(random_state=42)
}

# One pass per fingerprint table: split, then fit and score every classifier
for df_name, df in descriptors_dfs_renamed.items():
  print(f"\n--- Building ML Models for {df_name} ---")

  # Labels are the 'label' column; every other column is a feature
  y = df['label']
  X = df.drop(columns='label')

  # 80/20 split with a fixed seed
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  for model_name, model in models.items():
    model_fingerprint_name = f"{model_name}-{df_name}"  # "<model>-<fingerprint>" label used in the printout
    print(f"\nTraining {model_fingerprint_name}")

    y_pred = model.fit(X_train, y_train).predict(X_test)
    metrics = calculate_all_metrics(y_test, y_pred)

    print(f"--- {model_fingerprint_name} Metrics ---")
    for metric_name in metrics:
      metric_value = metrics[metric_name]
      print(f"{metric_name}: {metric_value}")

    # Append this result to the CSV straight away, so finished rows are on
    # disk even if a later model fails.
    csv_filename = 'model_performance_ML_fingerprints.csv'
    with open(csv_filename, 'a', newline='') as csvfile:
      fieldnames = ['Model', 'Fingerprint', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC', 'MCC', 'Balanced Accuracy', 'Kappa']
      writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
      if csvfile.tell() == 0:  # header only when the file is still empty
        writer.writeheader()

      # Identification columns first, then the metric columns in header order
      csv_row = {'Model': model_name, 'Fingerprint': df_name}
      csv_row.update((column, metrics[column]) for column in fieldnames[2:])
      writer.writerow(csv_row)

print(f"Model Performance (ML-Fingerprint) saved to '{csv_filename}'")


In [None]:
# prompt: perform individual model analysis

import pandas as pd
# Assuming descriptors_dfs_renamed is your dictionary of DataFrames

def calculate_all_metrics(y_true, y_pred):
  """Score predicted labels against the true labels.

  Returns a dict keyed by metric name, in reporting order, with a pycm
  ConfusionMatrix stored under 'Confusion Matrix'. `y_true` must provide
  `.to_numpy()` (e.g. a pandas Series).
  """
  scorers = (
      ('Accuracy', accuracy_score),
      ('Precision', precision_score),
      ('Recall', recall_score),
      ('F1-Score', f1_score),
      ('ROC AUC', roc_auc_score),
      ('MCC', matthews_corrcoef),
      ('Balanced Accuracy', balanced_accuracy_score),
      ('Kappa', cohen_kappa_score),
  )
  report = {label: scorer(y_true, y_pred) for label, scorer in scorers}
  report['Confusion Matrix'] = ConfusionMatrix(actual_vector=y_true.to_numpy(), predict_vector=y_pred)
  return report

# Classifiers to compare, all at library defaults (seeded where supported)
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'MLP': MLPClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Bagging': BaggingClassifier(random_state=42)
}

# One pass per fingerprint table: split, fit and score every classifier,
# and list the top Random Forest features
for df_name, df in descriptors_dfs_renamed.items():
  print(f"\n--- Individual Model Analysis for {df_name} ---")

  # Labels are the 'label' column; every other column is a feature
  y = df['label']
  X = df.drop(columns='label')

  # 80/20 split with a fixed seed
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  for model_name, model in models.items():
    model_fingerprint_name = f"{model_name}-{df_name}"  # "<model>-<fingerprint>" label used in the printout
    print(f"\nAnalyzing {model_fingerprint_name}")

    y_pred = model.fit(X_train, y_train).predict(X_test)
    metrics = calculate_all_metrics(y_test, y_pred)

    print(f"--- {model_fingerprint_name} Metrics ---")
    for metric_name in metrics:
      metric_value = metrics[metric_name]
      print(f"{metric_name}: {metric_value}")

    # Only the Random Forest gets the extra feature-importance report
    if model_name != 'Random Forest':
      continue
    try:
      importances = model.feature_importances_
      feature_importance_df = (
          pd.DataFrame({'Feature': X.columns, 'Importance': importances})
          .sort_values('Importance', ascending=False)
      )
      print("\n--- Top 10 Important Features for Random Forest ---")
      print(feature_importance_df.head(10))
    except Exception as e:
      # Report the problem and carry on with the remaining models
      print(f"Error calculating feature importance for Random Forest: {e}")

print("Individual Model Analysis completed.")


In [None]:

# prompt: perform 10 fold cross validation to enchance the individual model performance and output for the metrics

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer

# Assuming descriptors_dfs_renamed is your dictionary of DataFrames

def calculate_all_metrics(y_true, y_pred, y_score=None):
  """Compute a dictionary of binary-classification metrics.

  Args:
    y_true: Ground-truth labels (pandas Series, NumPy array or list).
    y_pred: Predicted hard class labels, same length as ``y_true``.
    y_score: Optional continuous scores for the positive class, e.g.
      ``model.predict_proba(X)[:, 1]`` or ``model.decision_function(X)``.
      When given, ROC AUC is computed from these scores. When omitted, ROC AUC
      is computed from ``y_pred`` exactly as before.

  Returns:
    dict with the keys 'Accuracy', 'Precision', 'Recall', 'F1-Score',
    'ROC AUC', 'MCC', 'Balanced Accuracy', 'Kappa' (floats) and
    'Confusion Matrix' (a pycm ``ConfusionMatrix`` object).
  """
  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)
  # ROC AUC is a ranking metric. Fed with hard 0/1 predictions the curve has a
  # single operating point and the value is the same number as balanced
  # accuracy, so prefer real scores whenever the caller can supply them.
  roc_auc = roc_auc_score(y_true, y_pred if y_score is None else y_score)
  mcc = matthews_corrcoef(y_true, y_pred)
  balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
  kappa = cohen_kappa_score(y_true, y_pred)
  # np.asarray accepts Series, ndarray and list alike; the previous
  # y_true.to_numpy() raised AttributeError for anything but a pandas object.
  cm = ConfusionMatrix(actual_vector=np.asarray(y_true), predict_vector=np.asarray(y_pred))

  metrics = {
      'Accuracy': accuracy,
      'Precision': precision,
      'Recall': recall,
      'F1-Score': f1,
      'ROC AUC': roc_auc,
      'MCC': mcc,
      'Balanced Accuracy': balanced_accuracy,
      'Kappa': kappa,
      'Confusion Matrix': cm
  }
  return metrics

# Baseline classifiers, keyed by the display name that is used in the printed
# reports and written to the 'Model' column of the results CSV.
# No hyperparameters are set here apart from random_state=42, which is pinned on
# every estimator except KNN and Naive Bayes (those two are constructed without
# a seed argument).
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'MLP': MLPClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Bagging': BaggingClassifier(random_state=42)
}

# 10-fold stratified cross-validation of every baseline model on every
# descriptor/fingerprint table. Mean and standard deviation of each metric are
# printed, and the means are appended to a CSV file (one row per model/table).

# csv is used for the results file; cross_validate scores several metrics from
# a single round of fits. Imported here so the cell does not depend on an
# import that happens in a later cell.
import csv
from sklearn.model_selection import cross_validate

# Defined before the loops so the final print still works when
# descriptors_dfs_renamed (or models) is empty.
csv_filename = 'model_performance_CV_ML_fingerprints.csv'
fieldnames = ['Model', 'Fingerprint', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC']
cv_scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

for df_name, df in descriptors_dfs_renamed.items():
  print(f"\n--- 10-Fold Cross-Validation for {df_name} ---")

  # Split data into features (X) and labels (y)
  X = df.drop('label', axis=1)
  y = df['label']

  # Define the cross-validation strategy
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

  # Train and evaluate each model using cross-validation
  for model_name, model in models.items():
    model_fingerprint_name = f"{model_name}-{df_name}"  # Create a unique name for each model
    print(f"\nAnalyzing {model_fingerprint_name} with 10-Fold CV")

    # One cross_validate call fits the model once per fold and applies all five
    # scorers to that fit (10 fits instead of the 50 needed by five separate
    # cross_val_score calls). The splitter is seeded, so the folds are the same
    # ones each separate call would have used.
    cv_results = cross_validate(model, X, y, cv=cv, scoring=cv_scoring)
    accuracy_scores = cv_results['test_accuracy']
    precision_scores = cv_results['test_precision']
    recall_scores = cv_results['test_recall']
    f1_scores = cv_results['test_f1']
    roc_auc_scores = cv_results['test_roc_auc']

    print(f"--- {model_fingerprint_name} Cross-Validation Metrics ---")
    print(f"Accuracy: {accuracy_scores.mean():.4f} (+/- {accuracy_scores.std():.4f})")
    print(f"Precision: {precision_scores.mean():.4f} (+/- {precision_scores.std():.4f})")
    print(f"Recall: {recall_scores.mean():.4f} (+/- {recall_scores.std():.4f})")
    print(f"F1-Score: {f1_scores.mean():.4f} (+/- {f1_scores.std():.4f})")
    print(f"ROC AUC: {roc_auc_scores.mean():.4f} (+/- {roc_auc_scores.std():.4f})")

    # Store the results in a CSV file (optional). The file is opened in append
    # mode per model so that results already computed survive a later failure;
    # note that re-running the cell therefore adds duplicate rows.
    with open(csv_filename, 'a', newline='') as csvfile:
      writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
      if not csvfile.tell():  # Write header only if the file is empty
        writer.writeheader()

      writer.writerow({
          'Model': model_name,
          'Fingerprint': df_name,
          'Accuracy': accuracy_scores.mean(),
          'Precision': precision_scores.mean(),
          'Recall': recall_scores.mean(),
          'F1-Score': f1_scores.mean(),
          'ROC AUC': roc_auc_scores.mean()
      })

print(f"Model Performance (CV-ML-Fingerprint) saved to '{csv_filename}'")

In [None]:
# prompt: also add sensitivity and specificity metrics

from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.utils import resample
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import Lipinski
from rdkit.Chem import rdMolDescriptors
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, f_classif
from rdkit.Chem import rdmolops
from scipy.stats import ttest_ind
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from rdkit.Chem.MolStandardize import rdMolStandardize # Import the required module
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score
from pycm import ConfusionMatrix
import csv
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

# ... (Your existing code) ...

def calculate_all_metrics(y_true, y_pred, y_score=None):
  """Compute binary-classification metrics, including sensitivity and specificity.

  Args:
    y_true: Ground-truth labels (pandas Series, NumPy array or list).
    y_pred: Predicted hard class labels, same length as ``y_true``.
    y_score: Optional continuous scores for the positive class, e.g.
      ``model.predict_proba(X)[:, 1]`` or ``model.decision_function(X)``.
      When given, ROC AUC is computed from these scores. When omitted, ROC AUC
      is computed from ``y_pred`` exactly as before.

  Returns:
    dict with the keys 'Accuracy', 'Precision', 'Recall', 'F1-Score',
    'ROC AUC', 'MCC', 'Balanced Accuracy', 'Kappa', 'Sensitivity',
    'Specificity' (floats) and 'Confusion Matrix' (a pycm ``ConfusionMatrix``
    object). 'Specificity' is 0.0 when scikit-learn rejects label 0 as a class
    with ``ValueError``.
  """
  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)
  # ROC AUC is a ranking metric. Fed with hard 0/1 predictions the curve has a
  # single operating point and the value is the same number as balanced
  # accuracy, so prefer real scores whenever the caller can supply them.
  roc_auc = roc_auc_score(y_true, y_pred if y_score is None else y_score)
  mcc = matthews_corrcoef(y_true, y_pred)
  balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
  kappa = cohen_kappa_score(y_true, y_pred)
  # np.asarray accepts Series, ndarray and list alike; the previous
  # y_true.to_numpy() raised AttributeError for anything but a pandas object.
  cm = ConfusionMatrix(actual_vector=np.asarray(y_true), predict_vector=np.asarray(y_pred))

  # Sensitivity is the recall of the positive class (label 1), which is exactly
  # what `recall` above already holds, so the value is reused instead of being
  # recomputed behind a guard that could never trigger.
  sensitivity = recall

  # Specificity is the recall of the negative class (label 0). Kept best-effort:
  # if scikit-learn refuses label 0 the metric is reported as 0.0 rather than
  # aborting the whole evaluation.
  try:
      specificity = recall_score(y_true, y_pred, pos_label=0)
  except ValueError:
      specificity = 0.0

  metrics = {
      'Accuracy': accuracy,
      'Precision': precision,
      'Recall': recall,
      'F1-Score': f1,
      'ROC AUC': roc_auc,
      'MCC': mcc,
      'Balanced Accuracy': balanced_accuracy,
      'Kappa': kappa,
      'Sensitivity': sensitivity,
      'Specificity': specificity,
      'Confusion Matrix': cm
  }
  return metrics


# ... (Your existing code for model training and evaluation) ...

# Inside your loop for each model:
# ...
# NOTE(review): as written, the statements below sit at cell level, not inside a
# loop. They read y_test, y_pred, model_fingerprint_name, model_name and df_name
# from whatever an earlier cell left in the kernel, i.e. only the last model /
# descriptor table that was processed. Confirm this is intended, or move the
# code into the training loop.
metrics = calculate_all_metrics(y_test, y_pred)

# Print every metric returned for this model, including the confusion matrix
# object.
print(f"--- {model_fingerprint_name} Metrics ---")
for metric_name, metric_value in metrics.items():
  print(f"{metric_name}: {metric_value}")

# ... (Rest of your code for storing results in CSV) ...

# Example of how to update your CSV writer:
# Appends one row with the scalar metrics; the 'Confusion Matrix' entry of
# `metrics` is not written.
# NOTE(review): the 10-fold cross-validation cell appends to this same file
# name with a 7-column layout (no MCC, Balanced Accuracy, Kappa, Sensitivity,
# Specificity). The header is only written when the file is empty, so if that
# cell ran first the 12-column rows written here end up under the shorter
# header. Consider a separate file for these hold-out results.
csv_filename = 'model_performance_CV_ML_fingerprints.csv'
with open(csv_filename, 'a', newline='') as csvfile:
  fieldnames = ['Model', 'Fingerprint', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC', 'MCC', 'Balanced Accuracy', 'Kappa', 'Sensitivity', 'Specificity']
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  if not csvfile.tell():  # Write header only if the file is empty
    writer.writeheader()

  writer.writerow({
      'Model': model_name,
      'Fingerprint': df_name,
      'Accuracy': metrics['Accuracy'],
      'Precision': metrics['Precision'],
      'Recall': metrics['Recall'],
      'F1-Score': metrics['F1-Score'],
      'ROC AUC': metrics['ROC AUC'],
      'MCC': metrics['MCC'],
      'Balanced Accuracy': metrics['Balanced Accuracy'],
      'Kappa': metrics['Kappa'],
      'Sensitivity': metrics['Sensitivity'],
      'Specificity': metrics['Specificity']
  })


In [None]:
# prompt: count the total number of baseline ML models developed

# One baseline model per (descriptor table, classifier) pairing.
total_models = len(descriptors_dfs_renamed) * len(models)
print(f"Total number of baseline ML models developed: {total_models}")
