<a href="https://colab.research.google.com/github/aliu-7/Molecular-Property-Prediction-and-Optimization/blob/main/4_4_3_Training_and_Evaluating_a_Neural_Network_for_Property_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install RDKit (if not already installed)
!pip install -q rdkit

# Step 2: Import required libraries
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential

# Step 3: Load the dataset
# The CSV is read directly from its raw GitHub URL into a DataFrame; the
# 'smiles' and 'p_np' columns are consumed by the later steps.
url = "https://raw.githubusercontent.com/Data-Chemist-Handbook/Data-Chemist-Handbook.github.io/refs/heads/master/_pages/BBBP.csv"
data = pd.read_csv(filepath_or_buffer=url)

# Step 4: Define descriptor calculation function
def compute_descriptors(smiles):
    """Compute six RDKit descriptors for a single SMILES string.

    Args:
        smiles: SMILES representation of a molecule. Non-string values
            (for example ``None`` or the ``NaN`` that pandas produces for an
            empty CSV cell) are treated as invalid input.

    Returns:
        A list ``[MolWt, MolLogP, TPSA, NumRotatableBonds, NumHDonors,
        NumHAcceptors]`` in that order, or ``None`` when ``smiles`` is not a
        string or RDKit cannot parse it.
    """
    # Guard before touching RDKit: MolFromSmiles rejects non-string arguments
    # with an exception rather than returning None, which would abort a whole
    # Series.apply() over a column that contains missing values.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit could not parse/sanitize the SMILES (e.g. invalid valence).
        return None
    return [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.TPSA(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
    ]

# Step 5: Apply descriptor calculation
# Each SMILES maps to a descriptor list, or to None when it could not be parsed.
descriptor_data = data['smiles'].map(compute_descriptors)
# Keep only rows with descriptors; the surviving index is reused below so the
# labels stay aligned with the feature rows.
descriptor_data = descriptor_data[descriptor_data.notna()]
X = np.asarray(descriptor_data.tolist())
y = data.loc[descriptor_data.index, 'p_np'].to_numpy()

# Step 6: Split data, then scale descriptors
# Split BEFORE fitting the scaler so that the test rows' mean/variance never
# influence the transformation applied to the training data (no data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)  # reuse the training statistics
# Full matrix scaled with the training-set statistics, kept for any later code
# that expects `X_scaled` to exist.
X_scaled = scaler.transform(X)

# Step 7: Define and compile neural network
# The input width is declared with an explicit Input layer: passing
# `input_shape` to the first Dense layer is discouraged by Keras 3 and emits a
# UserWarning when the layer is constructed.
model = Sequential([
    Input(shape=(X_train.shape[1],)),  # one feature per computed descriptor
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),  # single probability for the binary label
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 8: Train the model
# 20% of the training rows are held out by Keras as a validation set.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
)

# Step 9: Evaluate the model
# The sigmoid output is a probability; threshold at 0.5 to get class labels.
predicted_probabilities = model.predict(X_test)
y_pred = (predicted_probabilities > 0.5).astype(int)
report = classification_report(y_test, y_pred)
print(report)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25h

[23:12:39] Explicit valence for atom # 1 N, 4, is greater than permitted
[23:12:39] Explicit valence for atom # 6 N, 4, is greater than permitted
[23:12:40] Explicit valence for atom # 6 N, 4, is greater than permitted
[23:12:40] Explicit valence for atom # 11 N, 4, is greater than permitted
[23:12:40] Explicit valence for atom # 12 N, 4, is greater than permitted
[23:12:40] Explicit valence for atom # 5 N, 4, is greater than permitted
[23:12:40] Explicit valence for atom # 5 N, 4, is greater than permitted
[23:12:40] Explicit valence for atom # 5 N, 4, is greater than permitted
[23:12:40] Explicit valence for atom # 5 N, 4, is greater than permitted
[23:12:40] Explicit valence for atom # 5 N, 4, is greater than permitted
[23:12:40] Explicit valence for atom # 5 N, 4, is greater than permitted
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.4757 - loss: 0.7142 - val_accuracy: 0.8287 - val_loss: 0.5001
Epoch 2/10
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8270 - loss: 0.4987 - val_accuracy: 0.8777 - val_loss: 0.3742
Epoch 3/10
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8437 - loss: 0.4208 - val_accuracy: 0.8807 - val_loss: 0.3374
Epoch 4/10
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8557 - loss: 0.3883 - val_accuracy: 0.8838 - val_loss: 0.3308
Epoch 5/10
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8472 - loss: 0.3758 - val_accuracy: 0.8807 - val_loss: 0.3295
Epoch 6/10
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8481 - loss: 0.3876 - val_accuracy: 0.8777 - val_loss: 0.3294
Epoch 7/10
[1m41/41[0m [32m━━━━━━━━━