In [1]:
import pandas as pd
import random

# Example SMILES and Target Protein IDs (can be expanded)
smiles_list = ['CCO', 'CCN(CC)CC', 'CCC(C)CO', 'CC(C)C', 'CCOC', 'CCCC']
target_proteins = ['P12345', 'P67890', 'P13579', 'P24680']

# Generate synthetic data
data = []
for i in range(1, 101):  # 100 samples
    mol_id = f"MOL{i:03}"
    smiles = random.choice(smiles_list)
    mol_weight = round(random.uniform(40, 500), 2)  # Random molecular weight
    logp = round(random.uniform(-2, 5), 2)  # LogP value
    num_hba = random.randint(0, 10)  # Hydrogen bond acceptors
    num_hbd = random.randint(0, 5)   # Hydrogen bond donors
    target = random.choice(target_proteins)
    activity = round(random.uniform(0.1, 100), 2)  # IC50 value in µM
    response = random.choice(['Stable', 'Partial', 'No'])

    data.append([mol_id, smiles, mol_weight, logp, num_hba, num_hbd, target, activity, response])

# Create DataFrame
df = pd.DataFrame(data, columns=['Molecule_ID', 'SMILES', 'MolWeight', 'LogP', 'Num_HBA', 'Num_HBD',
                                 'Target_Protein_ID', 'Activity (IC50)', 'Response'])

# Save to CSV
df.to_csv('synthetic_drug_discovery_dataset.csv', index=False)

print("Dataset created successfully!")

Dataset created successfully!


In [3]:
!pip3 install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [4]:
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_ecfp(smiles, radius=2, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

In [5]:
import tensorflow as tf

class DiffusionModel(tf.keras.Model):
    def __init__(self, input_shape):
        super(DiffusionModel, self).__init__()
        self.dense1 = tf.keras.layers.Dense(512, activation='relu')
        self.dense2 = tf.keras.layers.Dense(256, activation='relu')
        self.dense3 = tf.keras.layers.Dense(128, activation='relu')
        self.output_layer = tf.keras.layers.Dense(input_shape, activation='linear')

    def call(self, x, t):
        # Inject noise level 't' into the model
        xt = tf.concat([x, t], axis=-1)
        x = self.dense1(xt)
        x = self.dense2(x)
        x = self.dense3(x)
        return self.output_layer(x)

In [6]:
import numpy as np

def forward_diffusion(x, t, noise_std=0.1):
    noise = np.random.normal(0, noise_std, x.shape)
    xt = x * (1 - t) + noise * np.sqrt(t)
    return xt

In [7]:
def reverse_diffusion_step(model, xt, t):
    pred_noise = model(xt, t)
    x0_pred = xt - pred_noise * np.sqrt(t)
    return x0_pred

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset with SMILES and activity labels
data = pd.read_csv('/content/synthetic_drug_discovery_dataset.csv')
X = np.array([smiles_to_ecfp(s).ToBitString() for s in data['SMILES']], dtype='int8')

# Train-Test split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Training loop
model = DiffusionModel(input_shape=X_train.shape[1])

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss_fn = tf.keras.losses.MeanSquaredError()

# Training for 100 diffusion steps
epochs = 20
for epoch in range(epochs):
    epoch_loss = 0
    for step in range(100):  # 100 diffusion steps
        t = np.random.uniform(0, 1, size=(X_train.shape[0], 1))  # Noise level
        xt = forward_diffusion(X_train, t)  # Add noise
        with tf.GradientTape() as tape:
            pred_noise = model(xt, t)
            loss = loss_fn(pred_noise, xt - X_train)  # Compare predicted noise
        grads = tape.gradient(loss, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        epoch_loss += loss.numpy()
    print(f'Epoch {epoch+1}, Loss: {epoch_loss / 100:.4f}')




OverflowError: Python int too large to convert to C long