<a href="https://colab.research.google.com/github/aliu-7/Molecular-Property-Prediction-and-Optimization/blob/main/4_1_1_BBBP_Dataset_Classification_Using_RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install dependencies
!pip install -q rdkit pandas scikit-learn tensorflow

# Step 2: Load the BBBP dataset from GitHub
import pandas as pd
url = "https://raw.githubusercontent.com/Data-Chemist-Handbook/Data-Chemist-Handbook.github.io/refs/heads/master/_pages/BBBP.csv"
data = pd.read_csv(url)

# Step 3: Extract SMILES and labels
smiles_list = data['smiles']
labels = data['p_np']

# Step 4: Tokenize SMILES at character level
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(smiles_list)
sequences = tokenizer.texts_to_sequences(smiles_list)
X = pad_sequences(sequences, padding='post', maxlen=120)
y = labels.values

# Step 5: Train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Build GRU model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=32, input_length=120))
model.add(GRU(units=64))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Step 7: Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Step 8: Evaluate performance
loss, acc = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {acc:.2f}')




Epoch 1/5
41/41 ━━━━━━━━━━━━━━━━━━━━ 7s 79ms/step - accuracy: 0.6981 - loss: 0.6149 - val_accuracy: 0.7896 - val_loss: 0.5176
Epoch 2/5
41/41 ━━━━━━━━━━━━━━━━━━━━ 3s 69ms/step - accuracy: 0.7638 - loss: 0.5354 - val_accuracy: 0.8018 - val_loss: 0.5091
Epoch 3/5
41/41 ━━━━━━━━━━━━━━━━━━━━ 5s 114ms/step - accuracy: 0.7743 - loss: 0.5352 - val_accuracy: 0.8018 - val_loss: 0.5051
Epoch 4/5
41/41 ━━━━━━━━━━━━━━━━━━━━ 3s 67ms/step - accuracy: 0.8021 - loss: 0.5094 - val_accuracy: 0.8018 - val_loss: 0.5326
Epoch 5/5
41/41 ━━━━━━━━━━━━━━━━━━━━ 3s 65ms/step - accuracy: 0.7896 - loss: 0.5353 - val_accuracy: 0.8018 - val_loss: 0.5019
13/13 ━━━━━━━━━━━━━━━━━━━━ 0s 19ms/step - accuracy: 0.7730 - loss: 0.5354
Test Accuracy: 0.79
