<a href="https://colab.research.google.com/github/aliu-7/Molecular-Property-Prediction-and-Optimization/blob/main/4_1_3_Encoding_and_Embedding_Strategies_for_SMILES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install and import dependencies
!pip install -q rdkit pandas scikit-learn tensorflow

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import GRU, Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Step 2: Load the BBBP dataset
# The CSV is read straight from the handbook repository; only two columns are
# used below: 'smiles' (the molecule strings) and 'p_np' (the target label).
url = "https://raw.githubusercontent.com/Data-Chemist-Handbook/Data-Chemist-Handbook.github.io/refs/heads/master/_pages/BBBP.csv"
data = pd.read_csv(url)

smiles_list, labels = data['smiles'], data['p_np']
# Plain NumPy array of the labels, as consumed by train_test_split and Keras.
y = labels.to_numpy()

# Step 3: Tokenize SMILES and pad
# Character-level tokenization: each distinct character gets its own integer id.
# lower=False is required for SMILES because case carries chemical meaning
# ('C' is an aliphatic carbon, 'c' an aromatic one; likewise N/n, O/o, S/s).
# Tokenizer lowercases its input by default, which would merge those symbols
# into a single token and discard aromaticity information.
tokenizer = Tokenizer(char_level=True, lower=False)
tokenizer.fit_on_texts(smiles_list)
sequences = tokenizer.texts_to_sequences(smiles_list)
# Bring every sequence to exactly 120 ids; shorter ones are zero-padded at the
# end (padding='post').
X_seq = pad_sequences(sequences, padding='post', maxlen=120)
# +1 leaves room for id 0, which the padding above uses and which the
# tokenizer never assigns to a character.
vocab_size = len(tokenizer.word_index) + 1

# Step 4: One-hot encode the sequences
# Each integer id becomes a vocab_size-long indicator vector, so X_onehot has
# one extra trailing axis compared to X_seq.
X_onehot = to_categorical(X_seq, num_classes=vocab_size)

# Step 5: Train-test split
# Hold out 20% of the molecules for testing, with a fixed seed for repeatability.
# stratify=y keeps the class proportions of the binary label identical in the
# train and test sets; an unstratified random split can skew that ratio and
# make the reported test accuracy harder to compare between runs/encodings.
X_train, X_test, y_train, y_test = train_test_split(
    X_onehot,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 6: GRU model without embedding
# The one-hot tensor is fed directly into the recurrent layer (no Embedding).
# The input shape is declared with an explicit Input layer instead of passing
# input_shape= to the GRU, which Keras warns about inside a Sequential model.
# The shape (timesteps, vocab_size) is read from the training data so it cannot
# drift from the padding length chosen during preprocessing.
model = Sequential([
    Input(shape=X_train.shape[1:]),
    GRU(units=64),
    # Single sigmoid unit: outputs a probability for the binary label.
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 7: Train and evaluate
# Fit for 5 epochs in mini-batches of 32, with 20% of the training data set
# aside by Keras for per-epoch validation.
model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
)

# evaluate() returns the loss followed by the compiled metrics (accuracy here).
test_metrics = model.evaluate(X_test, y_test)
loss, acc = test_metrics
print(f"Test Accuracy (One-Hot): {acc:.2f}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25h

  super().__init__(**kwargs)


Epoch 1/5
41/41 ━━━━━━━━━━━━━━━━━━━━ 7s 76ms/step - accuracy: 0.7029 - loss: 0.6166 - val_accuracy: 0.7896 - val_loss: 0.5027
Epoch 2/5
41/41 ━━━━━━━━━━━━━━━━━━━━ 5s 64ms/step - accuracy: 0.7576 - loss: 0.5513 - val_accuracy: 0.8018 - val_loss: 0.5022
Epoch 3/5
41/41 ━━━━━━━━━━━━━━━━━━━━ 3s 85ms/step - accuracy: 0.7735 - loss: 0.5407 - val_accuracy: 0.8018 - val_loss: 0.5009
Epoch 4/5
41/41 ━━━━━━━━━━━━━━━━━━━━ 4s 61ms/step - accuracy: 0.7523 - loss: 0.5675 - val_accuracy: 0.8018 - val_loss: 0.4979
Epoch 5/5
41/41 ━━━━━━━━━━━━━━━━━━━━ 3s 65ms/step - accuracy: 0.7745 - loss: 0.5357 - val_accuracy: 0.8018 - val_loss: 0.5024
13/13 ━━━━━━━━━━━━━━━━━━━━ 0s 22ms/step - accuracy: 0.7730 - loss: 0.5341
Test Accuracy (One-Hot): 0.79
