# Assess Drug-Likeness

This notebook trains a toxicity model on the Tox21 dataset (or restores a previously trained checkpoint), after filtering the data to molecules with up to 50 atoms and atom types C, H, N, O, Cl, F and S. It also assesses Lipinski's Rule of Five and synthetic accessibility, the latter using Ertl's algorithm.

## Generate molecules from trained models

In [None]:
# code to generate molecules and create them in the folders of each model

## Train Toxicology model

In [None]:
import os
import deepchem as dc
import matplotlib.pyplot as plt
import numpy as np
from rdkit import Chem
from deepchem.data import DiskDataset

import tensorflow as tf
# Turn on memory growth for every GPU TensorFlow can see
gpus = tf.config.experimental.list_physical_devices('GPU')
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

# Load Tox21 with featurizer='Raw' and unpack the three splits
tox21 = dc.molnet.load_tox21(featurizer='Raw')
tasks, datasets, transformers = tox21
train_dataset, valid_dataset, test_dataset = datasets

# Define a function to filter the molecules
def filter_molecules(dataset, max_atoms=50, allowed_atoms=frozenset(['C','H','N','O','F','S','Cl'])):
    """Return the subset of ``dataset`` whose molecules pass the size/element filter.

    Each entry of ``dataset.ids`` is parsed as a SMILES string. A molecule is
    kept when it parses successfully, ``GetNumAtoms()`` is at most
    ``max_atoms`` and every atom symbol is contained in ``allowed_atoms``.

    Args:
        dataset: Object exposing ``ids`` (SMILES strings) and ``select(indices)``.
        max_atoms: Largest accepted value of ``molecule.GetNumAtoms()``.
            NOTE(review): for SMILES-derived molecules this presumably counts
            only atoms present in the graph (no implicit hydrogens) - confirm.
        allowed_atoms: Iterable of accepted element symbols.

    Returns:
        Whatever ``dataset.select`` returns for the indices that passed.
    """
    # Normalise once so membership tests behave the same for any iterable.
    allowed = set(allowed_atoms)
    # Read the ids a single time instead of re-fetching them on every iteration.
    smiles_list = dataset.ids

    valid_inds = []
    for i, smiles in enumerate(smiles_list):
        molecule = Chem.MolFromSmiles(smiles)
        # MolFromSmiles returns None for SMILES it cannot parse; skip those
        # instead of crashing on molecule.GetNumAtoms().
        if molecule is None:
            continue
        if molecule.GetNumAtoms() > max_atoms:
            continue
        if all(atom.GetSymbol() in allowed for atom in molecule.GetAtoms()):
            valid_inds.append(i)
    return dataset.select(valid_inds)

# Run the same molecule filter over the three splits
train_dataset, valid_dataset, test_dataset = [
    filter_molecules(split)
    for split in (train_dataset, valid_dataset, test_dataset)
]

# Report how many molecules survived the filter in each split
print(f"Number of molecules in training dataset after filtering: {len(train_dataset)}")
print(f"Number of molecules in validation dataset after filtering: {len(valid_dataset)}")
print(f"Number of molecules in test dataset after filtering: {len(test_dataset)}")

featurizer = dc.feat.ConvMolFeaturizer()

# The raw splits hold RDKit molecules in X; turn them back into SMILES strings
train_mols = list(map(Chem.MolToSmiles, train_dataset.X))
valid_mols = list(map(Chem.MolToSmiles, valid_dataset.X))
test_mols = list(map(Chem.MolToSmiles, test_dataset.X))

# Graph-convolution features for each split
train_features, valid_features, test_features = [
    featurizer.featurize(smiles_batch)
    for smiles_batch in (train_mols, valid_mols, test_mols)
]

# Rebuild each split as a DiskDataset, reusing its labels, weights and ids
train_dataset = DiskDataset.from_numpy(
    train_features, y=train_dataset.y, w=train_dataset.w, ids=train_dataset.ids
)
valid_dataset = DiskDataset.from_numpy(
    valid_features, y=valid_dataset.y, w=valid_dataset.w, ids=valid_dataset.ids
)
test_dataset = DiskDataset.from_numpy(
    test_features, y=test_dataset.y, w=test_dataset.w, ids=test_dataset.ids
)

# Graph-convolution classifier with one output per task
model = dc.models.GraphConvModel(n_tasks=len(tasks), mode='classification')

# Per-epoch evaluation results; each entry is the dict returned by model.evaluate
train_scores = []
valid_scores = []

# Directory to save or load the model
model_dir = "toxicity_model"

# One metric object serves both the training curves and the final test evaluation
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)

# A checkpoint counts as available only if the directory exists and is non-empty
checkpoint_available = os.path.exists(model_dir) and bool(os.listdir(model_dir))

if not checkpoint_available:
    # Train from scratch, one epoch at a time, so scores can be recorded per epoch
    num_epochs = 100

    for epoch in range(num_epochs):
        loss = model.fit(train_dataset, nb_epoch=1)

        train_score = model.evaluate(train_dataset, [metric], transformers)
        valid_score = model.evaluate(valid_dataset, [metric], transformers)
        train_scores.append(train_score)
        valid_scores.append(valid_score)

        print(f"Epoch {epoch+1}, Loss: {loss}, Train AUC: {train_score['mean-roc_auc_score']}, Valid AUC: {valid_score['mean-roc_auc_score']}")

    # Persist the trained weights so later runs take the restore branch
    model.save_checkpoint(max_checkpoints_to_keep=100, model_dir=model_dir)

    # Learning curves: mean ROC AUC against 1-based epoch number
    epoch_axis = np.arange(num_epochs) + 1
    train_curve = [score['mean-roc_auc_score'] for score in train_scores]
    valid_curve = [score['mean-roc_auc_score'] for score in valid_scores]

    plt.figure(figsize=(10, 5))
    plt.plot(epoch_axis, train_curve, label='Training')
    plt.plot(epoch_axis, valid_curve, label='Validation')
    plt.xlabel('Epoch')
    plt.ylabel('Mean ROC AUC')
    plt.legend()

    # Write the figure to disk before displaying it
    plt.savefig("performance_plot.png")

    plt.show()
else:
    # Reuse the weights stored by a previous run
    model.restore(model_dir=model_dir)
    print("Model loaded from checkpoint.")

# Evaluate on the test set
print(model.evaluate(test_dataset, [metric], transformers))

## Make predictions for toxicity

In [None]:
from tqdm import tqdm
import pandas as pd
import os
from rdkit import Chem
import deepchem as dc

# List of directories containing the .mol files
mol_dirs = [
    "training_models/model2_zinc15ii/generated_molecules/",
    "training_models/model3_zinc15ii/generated_molecules/",
    "training_models/model3_zinc15iii/generated_molecules/"
]

# Define target names in the correct order
target_names = [
    "NR-AhR", "NR-AR", "NR-AR-LBD", "NR-Aromatase", "NR-ER",
    "NR-ER-LBD", "NR-PPAR-gamma", "SR-ARE", "SR-ATAD5",
    "SR-HSE", "SR-MMP", "SR-p53"
]

# Featurizer used in training
featurizer = dc.feat.ConvMolFeaturizer()

# Iterate through each model directory
for mol_dir in mol_dirs:

    # Leaf directory name. For every entry of mol_dirs this is
    # "generated_molecules", NOT the generator's name. It is kept unchanged
    # because the later cells read "generated_molecules_predicted_toxicity.csv".
    model_name = os.path.basename(os.path.normpath(mol_dir))

    # Name of the parent directory (e.g. "model2_zinc15ii"); used only for
    # messages so the progress bars can be told apart.
    source_model = os.path.basename(os.path.dirname(os.path.normpath(mol_dir)))

    # Create a directory to save the CSV file for this model
    output_dir = os.path.join(mol_dir, "predicted_toxicity")
    os.makedirs(output_dir, exist_ok=True)

    # Get the list of .mol files (directory order is preserved on purpose:
    # the CSV rows follow it)
    mol_files = [f for f in os.listdir(mol_dir) if f.endswith('.mol')]

    # RDKit Mol objects and the filenames they came from, kept in lockstep
    molecules_mols = []
    filenames = []

    for mol_file in tqdm(mol_files, desc=f"Processing .mol files in {source_model}"):
        mol = Chem.MolFromMolFile(os.path.join(mol_dir, mol_file))
        if mol is not None:  # MolFromMolFile returns None for unreadable files
            molecules_mols.append(mol)
            filenames.append(mol_file)

    # Nothing to featurize or predict: skip instead of pushing an empty batch
    # through the featurizer and the model.
    if not molecules_mols:
        print(f"No readable .mol files found in {mol_dir}; skipping toxicity prediction.")
        continue

    # Featurize the RDKit Mol objects
    molecules_features = featurizer.featurize(molecules_mols)

    # Create a DiskDataset from the features
    molecules_dataset = dc.data.DiskDataset.from_numpy(molecules_features, ids=filenames)

    # Predict toxicity using the trained model
    predictions = model.predict(molecules_dataset)

    # One column per target, one row per molecule.
    # NOTE(review): assumes predictions is (n_molecules, n_tasks, n_classes)
    # with index 1 being the "active" class - confirm against the model output.
    model_predictions = pd.DataFrame(predictions[:, :, 1], columns=target_names)
    model_predictions['Molecule_File'] = filenames

    # Save the DataFrame to a CSV file inside the model folder
    output_csv = os.path.join(output_dir, f"{model_name}_predicted_toxicity.csv")
    model_predictions.to_csv(output_csv, index=False)


In [None]:
import pandas as pd
import os

# Model label -> directory holding that model's generated molecules
model_names = {
    "model2_zinc15ii": "training_models/model2_zinc15ii/generated_molecules/",
    "model3_zinc15ii": "training_models/model3_zinc15ii/generated_molecules/",
    "model3_zinc15iii": "training_models/model3_zinc15iii/generated_molecules/"
}

# Show the first rows of every prediction file that exists
for model_name, mol_dir in model_names.items():
    # The CSV file name is built from the leaf directory name, not the model label
    derived_model_name = os.path.basename(os.path.normpath(mol_dir))
    csv_path = os.path.join(
        mol_dir,
        "predicted_toxicity",
        f"{derived_model_name}_predicted_toxicity.csv",
    )

    # Guard clause: report the missing file and move on to the next model
    if not os.path.exists(csv_path):
        print(f"No prediction CSV found for {model_name}.\n")
        continue

    df = pd.read_csv(csv_path)

    # Model label, first 10 rows, then a separator line
    print(f"Predictions for {model_name}:\n")
    print(df.head(10))
    print("\n" + "-"*80 + "\n")

In [None]:
import os
import pandas as pd

# Dictionary containing model names and their corresponding directories
model_to_dir = {
    "model2_zinc15ii": "training_models/model2_zinc15ii/generated_molecules/",
    "model3_zinc15ii": "training_models/model3_zinc15ii/generated_molecules/",
    "model3_zinc15iii": "training_models/model3_zinc15iii/generated_molecules/"
}

# Target names, in the column order used by the prediction CSV files
target_names = [
    "NR-AhR", "NR-AR", "NR-AR-LBD", "NR-Aromatase", "NR-ER",
    "NR-ER-LBD", "NR-PPAR-gamma", "SR-ARE", "SR-ATAD5",
    "SR-HSE", "SR-MMP", "SR-p53"
]

# A prediction strictly above this value counts as active, anything else as
# inactive. This is the same boundary (> 0.5 / <= 0.5) used by the plotting
# and statistics-table cells, so the numbers printed here agree with theirs.
threshold = 0.5

# One DataFrame per model whose prediction CSV exists
dfs = []

# Load and append predictions for each model
for model_name, mol_dir in model_to_dir.items():
    csv_path = os.path.join(mol_dir, "predicted_toxicity", "generated_molecules_predicted_toxicity.csv")

    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        df['Model'] = model_name  # Add a 'Model' column to specify the model's name
        dfs.append(df)

# Concatenate all loaded DataFrames
final_predictions = pd.concat(dfs, ignore_index=True)

# Print mean toxicity and active/inactive counts per target for each model
for model_name in model_to_dir:
    # Rows belonging to the current model
    model_predictions = final_predictions[final_predictions['Model'] == model_name]

    print(f"\nStatistics for {model_name}:\n")

    active_counts = {}
    inactive_counts = {}

    for target in target_names:
        scores = model_predictions[target]

        mean_toxicity = scores.mean()
        print(f"Mean toxicity for {target}: {mean_toxicity:.2f}")

        # int() keeps the printed dictionaries free of numpy scalar wrappers
        active_counts[target] = int((scores > threshold).sum())
        inactive_counts[target] = int((scores <= threshold).sum())

    # Print the frequency of active and inactive targets
    print("\nActive counts:", active_counts)
    print("Inactive counts:", inactive_counts)
    print("-" * 50)  # Line separator for clarity


In [None]:
import os
import matplotlib.pyplot as plt

# Model label -> directory holding that model's generated molecules
model_to_dir = {
    "model2_zinc15ii": "training_models/model2_zinc15ii/generated_molecules/",
    "model3_zinc15ii": "training_models/model3_zinc15ii/generated_molecules/",
    "model3_zinc15iii": "training_models/model3_zinc15iii/generated_molecules/"
}

# Model labels in dictionary order
model_names = list(model_to_dir.keys())

# Collect the prediction table of every model whose CSV exists
dfs = []
for model_name, mol_dir in model_to_dir.items():
    csv_path = os.path.join(mol_dir, "predicted_toxicity", "generated_molecules_predicted_toxicity.csv")
    if not os.path.exists(csv_path):
        continue
    df = pd.read_csv(csv_path)
    df['Model'] = model_name  # tag each row with the model it belongs to
    dfs.append(df)

# Single table holding the predictions of all models
final_predictions = pd.concat(dfs, ignore_index=True)

# One grouped bar chart per model
for model_name in model_names:
    mol_dir = model_to_dir[model_name]

    is_current_model = final_predictions['Model'] == model_name
    model_predictions = final_predictions[is_current_model]
    print(model_name, model_predictions.shape)

    # Boolean table: True where a prediction is above 0.5
    above_cutoff = model_predictions[target_names] > 0.5
    at_or_below_cutoff = model_predictions[target_names] <= 0.5

    # Per-target counts, followed by the "all" category
    # (molecules active, respectively inactive, on every target)
    active_counts = {target: above_cutoff[target].sum() for target in target_names}
    inactive_counts = {target: at_or_below_cutoff[target].sum() for target in target_names}
    all_inactive_count = at_or_below_cutoff.all(axis=1).sum()
    all_active_count = above_cutoff.all(axis=1).sum()
    active_counts['all'] = all_active_count
    inactive_counts['all'] = all_inactive_count

    active_values = list(active_counts.values())
    inactive_values = list(inactive_counts.values())

    # Axis categories: every target plus "all"
    target_names_with_all = target_names + ['all']

    fig, ax = plt.subplots(figsize=(12,6))

    # Two bars per category, side by side
    bar_width = 0.35
    index = range(len(target_names_with_all))
    shifted_positions = [i + bar_width for i in index]
    tick_positions = [i + bar_width/2 for i in index]

    bar1 = ax.bar(index, active_values, bar_width, label="Active")
    bar2 = ax.bar(shifted_positions, inactive_values, bar_width, label="Inactive")

    # Labels, title and ticks (tick sits between the two bars of a category)
    ax.set_xlabel('Targets')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Frequency of Active and Inactive Predictions for {model_name}')
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(target_names_with_all, rotation=90)
    ax.legend()

    plt.tight_layout()
    plt.show()

In [None]:
import os
from tabulate import tabulate
import pandas as pd

# List of directories containing the .mol files
mol_dirs = [
    "training_models/model2_zinc15ii/generated_molecules/",
    "training_models/model3_zinc15ii/generated_molecules/",
    "training_models/model3_zinc15iii/generated_molecules/"
]

# Column headers shared by the printed tables and the saved CSV. Defined
# before the loops so the final DataFrame can be built even if no directory
# is processed.
headers = ['Model', 'Target', 'Mean Toxicity', 'Median Toxicity', 'Active Count', 'Inactive Count']

# Rows of every model, accumulated for model_statistics.csv
master_data = []

# One DataFrame per model whose prediction CSV exists
dfs = []

# Load and append predictions for each model
for mol_dir in mol_dirs:
    csv_path = os.path.join(mol_dir, "predicted_toxicity", "generated_molecules_predicted_toxicity.csv")

    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        # The model name is the grandparent component of the path
        # (mol_dir ends with "/", so the first dirname only strips that slash)
        model_name = os.path.basename(os.path.dirname(os.path.dirname(mol_dir)))
        df['Model'] = model_name  # Add a 'Model' column to specify the model's name
        dfs.append(df)

# Concatenate all loaded DataFrames
final_predictions = pd.concat(dfs, ignore_index=True)

# Build, print and accumulate one statistics table per model
for mol_dir in mol_dirs:
    model_name = os.path.basename(os.path.dirname(os.path.dirname(mol_dir)))
    model_predictions = final_predictions[final_predictions['Model'] == model_name]

    # Mean and median predicted toxicity for each target
    mean_values = {target: model_predictions[target].mean() for target in target_names}
    median_values = {target: model_predictions[target].median() for target in target_names}

    # "all" category: statistics of the per-molecule mean across targets
    per_molecule_mean = model_predictions[target_names].mean(axis=1)
    mean_values['all'] = per_molecule_mean.mean()
    median_values['all'] = per_molecule_mean.median()

    # Active (> 0.5) and inactive (<= 0.5) counts per target; "all" counts
    # molecules that are active, respectively inactive, on every target
    active_counts = {target: (model_predictions[target] > 0.5).sum() for target in target_names}
    inactive_counts = {target: (model_predictions[target] <= 0.5).sum() for target in target_names}
    all_inactive_count = (model_predictions[target_names] <= 0.5).all(axis=1).sum()
    all_active_count = (model_predictions[target_names] > 0.5).all(axis=1).sum()
    active_counts['all'] = all_active_count
    inactive_counts['all'] = all_inactive_count

    # Build each row once; the same rows feed the printed table and the CSV
    target_names_with_all = target_names + ['all']
    table_data = [
        [model_name, target, mean_values[target], median_values[target], active_counts[target], inactive_counts[target]]
        for target in target_names_with_all
    ]
    master_data.extend(table_data)

    # Print the table
    print(tabulate(table_data, headers=headers))
    print("\n")

# Convert master_data to DataFrame and save to CSV
df_master = pd.DataFrame(master_data, columns=headers)
df_master.to_csv("model_statistics.csv", index=False)

## Assess Lipinski rule of five and synthetic accessibility

In [None]:
from rdkit.Chem import Lipinski, Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit import Chem
import sascorer
import os

def lipinski_rule_of_five(molecule):
    """Return True only if ``molecule`` meets all four thresholds checked here.

    The thresholds are: molecular weight <= 500, logP <= 5, hydrogen-bond
    donors <= 5 and hydrogen-bond acceptors <= 10. Any single violation
    makes the result False.
    """
    criteria = (
        Descriptors.MolWt(molecule) <= 500,
        Descriptors.MolLogP(molecule) <= 5,
        Lipinski.NumHDonors(molecule) <= 5,
        Lipinski.NumHAcceptors(molecule) <= 10,
    )
    return all(criteria)

def synthetic_accessibility(molecule):
    """Return the score that ``sascorer.calculateScore`` gives for ``molecule``."""
    return sascorer.calculateScore(molecule)

# List of directories containing the .mol files
mol_dirs = [
    "training_models/model2_zinc15ii/generated_molecules/",
    "training_models/model3_zinc15ii/generated_molecules/",
    "training_models/model3_zinc15iii/generated_molecules/"
]

for mol_dir in mol_dirs:
    # The model name is the grandparent component of the path
    # (mol_dir ends with "/", so the first dirname only strips that slash)
    model_name = os.path.basename(os.path.dirname(os.path.dirname(mol_dir)))
    mol_files = [f for f in os.listdir(mol_dir) if f.endswith('.mol')]

    # Readable molecules and the filenames they came from, kept in lockstep
    molecules_mols = []
    filenames = []

    for mol_file in mol_files:
        mol_path = os.path.join(mol_dir, mol_file)
        mol = Chem.MolFromMolFile(mol_path)
        if mol is not None:  # MolFromMolFile returns None for unreadable files
            molecules_mols.append(mol)
            filenames.append(mol_file)

    # Calculate Lipinski and SAS for the molecules from the current model
    lipinski_results = [lipinski_rule_of_five(mol) for mol in molecules_mols]
    sas_results = [synthetic_accessibility(mol) for mol in molecules_mols]

    # Key the results by filename so each value is attached to the row of the
    # same molecule. Assigning the lists positionally would depend on the
    # directory listing having the same order and length as the prediction
    # rows, and would fail or mislabel molecules otherwise.
    lipinski_by_file = dict(zip(filenames, lipinski_results))
    sas_by_file = dict(zip(filenames, sas_results))

    # Update the final_predictions DataFrame (rows of this model only);
    # files without a computed value are left as NaN
    model_mask = final_predictions['Model'] == model_name
    model_files = final_predictions.loc[model_mask, 'Molecule_File']
    final_predictions.loc[model_mask, 'Lipinski_Rule_of_Five'] = model_files.map(lipinski_by_file)
    final_predictions.loc[model_mask, 'Synthetic_Accessibility'] = model_files.map(sas_by_file)

# Save the updated DataFrame to a CSV file
final_predictions.to_csv("predicted_toxicity_with_properties.csv", index=False)

In [None]:
# Split `final_predictions` into one DataFrame per model, in order of first appearance
model_labels = final_predictions['Model'].unique()
all_predictions = [final_predictions[final_predictions['Model'] == model] for model in model_labels]

# Take each label from the data itself. Indexing mol_dirs by position would
# print the wrong name whenever a model has no rows in final_predictions.
for model_name, df in zip(model_labels, all_predictions):
    print(f"\nModel: {model_name}")
    print(df.head(10))
    print("--------------------------------------------------")

In [None]:
import matplotlib.pyplot as plt
from tabulate import tabulate
import os
import matplotlib
matplotlib.use('Agg')
import pandas as pd

import bisect

print("Current Directory:", os.getcwd())

# This dictionary will help map the 'mol_dir' to its respective model name.
model_name_dict = {mol_dir: os.path.basename(os.path.dirname(os.path.dirname(mol_dir))) for mol_dir in mol_dirs}

# One list of SA scores per entry of mol_dirs, read back from the saved table
all_sas_results = []
final_predictions = pd.read_csv("predicted_toxicity_with_properties.csv")
for mol_dir in mol_dirs:
    model_name = model_name_dict[mol_dir]
    all_sas_results.append(final_predictions.loc[final_predictions['Model'] == model_name, 'Synthetic_Accessibility'].tolist())

# Display ranges (used for labels) and the colour of each range
ranges_colors = [((0, 0.99), 'red'), ((1, 3), 'blue'), ((3.01, 6), 'green'), ((6.01, 8), 'yellow'), ((8.01, 10), 'purple')]
ranges = [r for r, _ in ranges_colors]
range_labels = [f"{r[0]}-{r[1]}" for r in ranges]
range_colors = [color for _, color in ranges_colors]

# Contiguous bin edges. The table, the histogram and the scatter colours all
# use these edges, so a score such as 3.005 (which lies between the display
# ranges "1-3" and "3.01-6") is still counted and drawn.
bin_edges = [0, 1, 3.01, 6.01, 8.01, 10]
bin_centers = [(lo + hi) / 2 for lo, hi in zip(bin_edges, bin_edges[1:])]


def _sa_bin_index(score):
    """Return the index of the bin holding ``score``, or None if it is NaN or out of range.

    Bins are half-open ``[lo, hi)`` except the last one, which also includes
    its upper edge.
    """
    if not (bin_edges[0] <= score <= bin_edges[-1]):
        return None
    return min(bisect.bisect_right(bin_edges, score) - 1, len(bin_edges) - 2)


for idx, sas_results in enumerate(all_sas_results):
    model_name = model_name_dict[mol_dirs[idx]]
    print(f"Processing Model: {model_name}")

    # Plots are stored in a 'plot_results' folder inside the molecule directory
    save_directory = os.path.join(mol_dirs[idx], "plot_results")
    os.makedirs(save_directory, exist_ok=True)

    # Bin every score once; reused by the frequency table and the scatter plot
    bin_of_score = [_sa_bin_index(score) for score in sas_results]

    # Frequency per display range
    frequency = {label: 0 for label in range_labels}
    for bin_idx in bin_of_score:
        if bin_idx is not None:
            frequency[range_labels[bin_idx]] += 1

    # Create a histogram
    plt.figure(figsize=[10,6])
    plt.hist(sas_results, bins=bin_edges, edgecolor='black')
    plt.xlabel('SA Score')
    plt.ylabel('Frequency')
    plt.title(f'Frequency of SA Scores for {model_name}')
    # Put each label in the middle of the bar it describes
    plt.xticks(bin_centers, range_labels)

    # Save the histogram, then release the figure so figures do not pile up
    plt.savefig(os.path.join(save_directory, f"{model_name}_histogram.png"))
    plt.close()

    # Print the tabulate table
    table_data = [[key, value] for key, value in frequency.items()]
    headers = ['SA Score Range', 'Frequency']
    print(tabulate(table_data, headers=headers, tablefmt='grid'))

    # Create a scatter plot: gather the points first and draw them in one call
    point_x = []
    point_y = []
    point_colors = []
    for i, (score, bin_idx) in enumerate(zip(sas_results, bin_of_score)):
        if bin_idx is None:
            continue
        point_x.append(i)
        point_y.append(score)
        point_colors.append(range_colors[bin_idx])

    plt.figure(figsize=[10,6])
    plt.scatter(point_x, point_y, c=point_colors)

    plt.xlabel('Molecule Index')
    plt.ylabel('SA Score')
    plt.title(f'Synthetic Accessibility Scores for {model_name}')
    plt.yticks([i + 1 for i in range(10)])
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Create a custom legend
    legend_elements = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=f'{r[0]}-{r[1]}') for r, color in ranges_colors]
    plt.legend(handles=legend_elements, title="SA Score Range")

    # Save the scatter plot, then release the figure
    plt.savefig(os.path.join(save_directory, f"{model_name}_scatter_plot.png"))
    plt.close()

    print("--------------------------------------------------")

In [None]:
import pandas as pd
from tabulate import tabulate

# Read back the table that holds the Lipinski and SA columns
final_predictions = pd.read_csv("predicted_toxicity_with_properties.csv")

# mol_dir -> model name (grandparent component of the directory path)
model_name_dict = {d: os.path.basename(os.path.dirname(os.path.dirname(d))) for d in mol_dirs}

# Column titles of the pass/fail table
headers_lipinski = ['Lipinski Rule of Five', 'Frequency']

for mol_dir in mol_dirs:
    model_name = model_name_dict[mol_dir]
    print(f"Model: {model_name}")

    # Rows of the current model only
    belongs_to_model = final_predictions['Model'] == model_name
    model_data = final_predictions[belongs_to_model]

    # Passes are the sum of the Lipinski column; fails are the remaining rows
    lipinski_flags = model_data['Lipinski_Rule_of_Five']
    pass_count = sum(lipinski_flags)
    fail_count = len(model_data) - pass_count

    # Two-row summary table
    table_data_lipinski = [
        ['Pass', pass_count],
        ['Fail', fail_count],
    ]
    print(tabulate(table_data_lipinski, headers=headers_lipinski, tablefmt='grid'))

    print("--------------------------------------------------")