<a href="https://colab.research.google.com/github/aliu-7/Molecular-Property-Prediction-and-Optimization/blob/main/4_4_2_Implementing_a_Neural_Network_with_Molecular_Descriptors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install dependencies
!pip install -q rdkit-pypi pandas scikit-learn tensorflow

# Step 2: Load the BBBP dataset
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

# Raw CSV served from GitHub; the later steps read its 'smiles' and 'p_np'
# columns.
url = (
    "https://raw.githubusercontent.com/Data-Chemist-Handbook/"
    "Data-Chemist-Handbook.github.io/refs/heads/master/_pages/BBBP.csv"
)
data = pd.read_csv(url)

# Step 3: Define a function to compute molecular descriptors
def compute_descriptors(smiles):
    """Compute six RDKit descriptors for a single SMILES string.

    Args:
        smiles: SMILES representation of one molecule.

    Returns:
        A list ``[MolWt, MolLogP, TPSA, NumRotatableBonds, NumHDonors,
        NumHAcceptors]`` in that order, or ``None`` when ``smiles`` is not a
        non-blank string or RDKit cannot parse it.
    """
    # Missing values (None, or NaN coming out of a pandas column) and blank
    # strings are reported the same way as unparsable SMILES, so callers can
    # drop every bad row with a single ``None`` check instead of getting an
    # exception from the RDKit call.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    return [
        Descriptors.MolWt(mol),                      # Molecular weight
        Descriptors.MolLogP(mol),                    # LogP
        Descriptors.TPSA(mol),                       # Topological polar surface area
        Descriptors.NumRotatableBonds(mol),          # Rotatable bonds
        Descriptors.NumHDonors(mol),                 # H-bond donors
        Descriptors.NumHAcceptors(mol),              # H-bond acceptors
    ]

# Step 4: Apply descriptor function to SMILES
descriptor_data = data['smiles'].apply(compute_descriptors)

# compute_descriptors yields None for SMILES it cannot handle; keep only the
# rows that actually produced a descriptor list.
valid_mask = descriptor_data.notna()
valid_descriptors = descriptor_data.loc[valid_mask]

# One row per valid molecule, columns in the order compute_descriptors
# returns its values.
df_desc = pd.DataFrame(
    list(valid_descriptors),
    columns=['MolWt', 'LogP', 'TPSA', 'RotatableBonds', 'HDonors', 'HAcceptors'],
)

# The new frame has a fresh 0..n-1 index, so the labels are attached as a
# plain array (positionally) rather than aligned on the original index.
df_desc = df_desc.assign(Label=data['p_np'][valid_mask].to_numpy())

# Step 5: Train/test split
from sklearn.model_selection import train_test_split

# Every column except 'Label' is a feature; 'Label' is the target.
X = df_desc.drop(columns='Label').to_numpy()
y = df_desc['Label'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Normalize the features
from sklearn.preprocessing import StandardScaler

# Fit the scaling statistics on the training rows only, then apply the same
# transformation to both splits so no test information leaks into training.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 7: Build a neural network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input

# The input width is declared with an explicit Input layer. Passing
# `input_shape` to the first Dense layer is deprecated in Keras 3 and makes
# Sequential models emit a UserWarning when the layer is constructed.
model = Sequential([
    Input(shape=(X_train.shape[1],)),  # one value per descriptor column
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Step 8: Train the model
# A fifth of the training rows is held out by Keras to report the
# per-epoch validation metrics.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

# Step 9: Evaluate the model
# evaluate() returns the loss followed by the compiled metrics (accuracy).
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[23:08:09] Explicit valence for atom # 1 N, 4, is greater than permitted
[23:08:09] Explicit valence for atom # 6 N, 4, is greater than permitted
[23:08:09] Explicit valence for atom # 6 N, 4, is greater than permitted
[23:08:09] Explicit valence for atom # 11 N, 4, is greater than permitted
[23:08:09] Explicit valence for atom # 12 N, 4, is greater than permitted
[23:08:09] Explicit valence for atom # 5 N, 4, is greater than permitted
[23:08:09] Explicit valence for atom # 5 N, 4, is greater than permitted
[23:08:09] Explicit valence for atom # 5 N, 4, is greater than permitted
[23:08:09] Explicit valence for atom # 5 N, 4, is greater than permitted
[23:08:09] Explicit valence for atom # 5 N, 4, is greater than permitted
[23:08:09] Explicit valence for atom # 5 N, 4, is greater than permitted
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
41/41 ━━━━━━━━━━━━━━━━━━━━ 2s 9ms/step - accuracy: 0.7369 - loss: 0.6099 - val_accuracy: 0.8593 - val_loss: 0.3901
Epoch 2/10
41/41 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.8157 - loss: 0.4409 - val_accuracy: 0.8624 - val_loss: 0.3408
Epoch 3/10
41/41 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.8411 - loss: 0.4326 - val_accuracy: 0.8502 - val_loss: 0.3378
Epoch 4/10
41/41 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.8506 - loss: 0.3954 - val_accuracy: 0.8654 - val_loss: 0.3352
Epoch 5/10
41/41 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.8517 - loss: 0.3978 - val_accuracy: 0.8502 - val_loss: 0.3362
Epoch 6/10
41/41 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.8354 - loss: 0.4146 - val_accuracy: 0.8654 - val_loss: 0.3335
Epoch 7/10
41/41 ━━━━━━━━━━

# Predict on New Molecules

In [None]:
new_smiles = ["CCN(CC)CC", "c1ccccc1O", "CC(=O)OC1=CC=CC=C1C(=O)O"]
new_desc = [compute_descriptors(smi) for smi in new_smiles]

# compute_descriptors returns None for SMILES it cannot parse. Feeding a None
# row to scaler.transform would raise, so report those entries and score only
# the molecules that produced descriptors.
scorable = []
for smi, desc in zip(new_smiles, new_desc):
    if desc is None:
        print(f"{smi} → could not be parsed; skipped")
    else:
        scorable.append((smi, desc))

if scorable:
    # Apply the scaler fitted on the training data so the new molecules are
    # normalized exactly like the rows the model was trained on.
    new_X = scaler.transform([desc for _, desc in scorable])
    predictions = model.predict(new_X)

    for (smi, _), row in zip(scorable, predictions):
        prob = row[0]  # the model has a single sigmoid output per molecule
        print(f"{smi} → Predicted BBB permeability: {prob:.2f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step
CCN(CC)CC → Predicted BBB permeability: 0.98
c1ccccc1O → Predicted BBB permeability: 0.96
CC(=O)OC1=CC=CC=C1C(=O)O → Predicted BBB permeability: 0.91
